In [9]:
import pandas as pd

# 读取数据
df = pd.read_csv('data_processed_std.csv')

In [None]:


# 1. 筛选男胎孕妇（Y染色体浓度非空）
df_male = df[df['Y染色体浓度'].notna()].copy()

df_male['孕周_float'] = df_male['检测孕周_周数']

# 3. 找到每个孕妇的最早达标时间
threshold = 0.04  # 4%
earliest_times = df_male[df_male['Y染色体浓度'] >= threshold] \
    .groupby('孕妇代码')['孕周_float'].min().reset_index()

earliest_times.rename(columns={'孕周_float': '最早达标孕周'}, inplace=True)

# 4. 合并回原数据（保留BMI）
df_bmi = df_male[['孕妇代码', '孕妇BMI']].drop_duplicates()
df_merged = pd.merge(earliest_times, df_bmi, on='孕妇代码', how='left')

print(df_merged.head())


   孕妇代码    最早达标孕周     孕妇BMI
0  A001  0.935433 -1.392187
1  A001  0.935433 -1.260680
2  A001  0.935433 -1.129174
3  A002 -0.672166  0.360731
4  A002 -0.672166  0.664012


In [3]:
df_unique = df_merged.drop_duplicates(subset=['孕妇代码']).copy()
df_unique

Unnamed: 0,孕妇代码,最早达标孕周,孕妇BMI
0,A001,0.935433,-1.392187
3,A002,-0.672166,0.360731
6,A003,-0.197194,-0.511092
10,A004,-0.197194,-1.218390
13,A005,-1.074066,-0.775917
...,...,...,...
759,A263,-1.000994,-0.980897
763,A264,-1.220212,0.070946
767,A265,-1.220212,0.491302
770,A266,-0.708703,-0.042159


In [4]:
from sklearn.cluster import KMeans
import numpy as np

# 提取BMI列
X = df_unique[['孕妇BMI']].values

# K-means 聚类
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)
df_unique['BMI_group'] = kmeans.fit_predict(X)

# 查看每组的BMI范围
group_ranges = df_unique.groupby('BMI_group')['孕妇BMI'].agg(['min','max','mean','count'])
print(group_ranges)


                min       max      mean  count
BMI_group                                     
0          0.828365  2.370669  1.354451     34
1         -1.899078 -0.865509 -1.155833     59
2         -0.847072 -0.160427 -0.558177     83
3         -0.116925  0.777691  0.284216     72


In [5]:
import pandas as pd
from sklearn.cluster import KMeans

# 读取数据
df = pd.read_csv('data_processed_std.csv')
df_mean_std = pd.read_csv('feature_mean_std.csv')

# 提取BMI均值和标准差
bmi_mean = df_mean_std.loc[df_mean_std['特征名'] == '孕妇BMI', '均值'].values[0]
bmi_std = df_mean_std.loc[df_mean_std['特征名'] == '孕妇BMI', '标准差'].values[0]

# 只保留男胎孕妇
df_male = df[df['Y染色体浓度'].notna()].copy()



df_male['孕周_float'] = df_male['检测孕周_周数'].astype(float)

# 找最早达标孕周
threshold = 0.04
earliest_times = df_male[df_male['Y染色体浓度'] >= threshold] \
    .groupby('孕妇代码')['孕周_float'].min().reset_index()
earliest_times.rename(columns={'孕周_float': '最早达标孕周'}, inplace=True)

# 合并BMI（标准化值）
df_bmi = df_male[['孕妇代码', '孕妇BMI']].drop_duplicates()
df_merged = pd.merge(earliest_times, df_bmi, on='孕妇代码', how='left')

# 去重
df_unique = df_merged.drop_duplicates(subset=['孕妇代码']).copy()

# 反标准化BMI
df_unique['BMI_real'] = df_unique['孕妇BMI'] * bmi_std + bmi_mean

# K-means 分组（假设4组，可调）
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)
df_unique['BMI_group'] = kmeans.fit_predict(df_unique[['BMI_real']])

# 查看每组的真实BMI范围
group_ranges = df_unique.groupby('BMI_group')['BMI_real'].agg(['min','max','mean','count']).sort_values('min')
print(group_ranges)


                 min        max       mean  count
BMI_group                                        
1          26.619343  29.689436  28.827062     59
2          29.744200  31.783795  30.602328     83
3          31.913011  34.570361  33.104554     72
0          34.720883  39.302112  36.283557     34


In [7]:
import numpy as np

# 假设最晚检测孕周
T_deadline = 20
w1, w2 = 1, 1

best_times = []

for g in sorted(df_unique['BMI_group'].unique()):
    group_data = df_unique[df_unique['BMI_group'] == g]
    earliest_weeks = group_data['最早达标孕周'].dropna().values
    
    candidate_t = np.arange(10, 25, 0.1)  # 从10周到24周，每0.1周一个点
    risks = []
    
    for t in candidate_t:
        risk_early = np.mean(earliest_weeks > t)  # 未达标风险
        risk_late = 1.0 if t > T_deadline else 0.0  # 晚发现风险（硬阈值）
        risk = w1 * risk_early + w2 * risk_late
        risks.append(risk)
    
    t_best = candidate_t[np.argmin(risks)]
    best_times.append((g, t_best, np.min(risks)))

best_df = pd.DataFrame(best_times, columns=['BMI_group', '最佳孕周', '最小风险'])
print(best_df)


   BMI_group  最佳孕周  最小风险
0          0  10.0   0.0
1          1  10.0   0.0
2          2  10.0   0.0
3          3  10.0   0.0
