In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score
from joblib import Parallel, delayed
from tqdm import tqdm
import pickle
import numpy as np 
from ML_Models import ModelSelector
from logging_config import setup_logger

# Single seed shared by the negative-sample draw, the train/test split, SMOTE and
# ModelSelector further down, so those steps are seeded consistently across runs.
random_state = 3220821

### 读取数据

In [10]:
# Load the feature table (later joined on FID) and the label table (later joined on eid).
mm_prs = pd.read_csv(r'mm_prs.csv')
labels = pd.read_csv(r'clean_lables.csv')

# Every column after the first is treated as a feature / label name.
# NOTE(review): assumes the first column of each file is the sample identifier - confirm.
X_columns, Y_columns = list(mm_prs.columns[1:]), list(labels.columns[1:])

In [11]:
# Display the (rows, columns) of both tables side by side before they are merged.
mm_prs.shape, labels.shape 

((336978, 38), (336979, 105))

### 合并数据

In [12]:
# Inner-join features and labels on the sample identifier (FID on the left, eid on
# the right), then discard the now-duplicated eid key column.
df = (
    mm_prs
    .merge(labels, left_on='FID', right_on='eid', how='inner')
    .drop(columns='eid')
)

### 检查数据

In [13]:
# Show the samples whose label columns are all 0.
df.loc[df[Y_columns].eq(0).all(axis=1)]

Unnamed: 0,FID,HF,AAM,AMD,AD,AST,AF,BD,BMI,CRC,...,Class_I48,Class_I63,Class_I10,Class_F00,Class_F20,Class_C50,Class_C18,Class_C56,Class_J45,Class_M05
0,1000011,0.390474,0.591999,0.553127,0.397860,0.513557,0.504414,0.375549,0.445283,0.545092,...,0,0,0,0,0,0,0,0,0,0
2,1000044,0.468422,0.444217,0.388766,0.557332,0.456365,0.524752,0.537074,0.507530,0.517431,...,0,0,0,0,0,0,0,0,0,0
4,1000116,0.351678,0.624132,0.488704,0.486897,0.452750,0.481785,0.402263,0.388306,0.513450,...,0,0,0,0,0,0,0,0,0,0
7,1000143,0.426496,0.419968,0.453513,0.530163,0.420282,0.461987,0.291205,0.440422,0.477416,...,0,0,0,0,0,0,0,0,0,0
9,1000168,0.458818,0.280702,0.431210,0.550208,0.509028,0.441621,0.477845,0.374281,0.531192,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336959,6023308,0.443033,0.284950,0.484217,0.384919,0.225018,0.574028,0.380861,0.493368,0.360054,...,0,0,0,0,0,0,0,0,0,0
336966,6023394,0.518329,0.372964,0.522717,0.529828,0.495431,0.448520,0.519743,0.449305,0.571605,...,0,0,0,0,0,0,0,0,0,0
336969,6023440,0.377258,0.421311,0.439327,0.415556,0.432147,0.375610,0.388640,0.495169,0.480763,...,0,0,0,0,0,0,0,0,0,0
336971,6023464,0.465263,0.591730,0.420846,0.530429,0.515539,0.518991,0.520347,0.507308,0.675221,...,0,0,0,0,0,0,0,0,0,0


#### 筛选部分数据集

In [None]:
import pandas as pd


# True for rows whose label columns are all 0; computed once and reused for both
# partitions instead of re-evaluating the comparison over every label column twice.
all_negative = (df[Y_columns] == 0).all(axis=1)

# Samples whose labels are all 0
df1 = df[all_negative]

# Samples with at least one label different from 0
df2 = df[~all_negative]

# Randomly keep at most 10,000 all-negative samples. Capping n at len(df1) avoids
# the ValueError DataFrame.sample raises when n exceeds the number of rows
# (sampling without replacement); random_state makes the draw reproducible.
df3 = df1.sample(n=min(10000, len(df1)), random_state=random_state)

# Combine the labelled samples with the down-sampled negatives and persist them.
# NOTE: this rebinds `df`, so re-running the cell operates on the already-filtered frame.
df = pd.concat([df2, df3], ignore_index=True)
df.to_csv(r'train_df.csv',index=False)

### 读取处理后的数据集

In [None]:
# Reload the filtered dataset from train_df.csv (the file written by the filtering
# step above), so the cells below can start from the saved file.
df = pd.read_csv(r'train_df.csv')

In [16]:
# Positional split of the reloaded table: column 0 is skipped, columns 1-37 form the
# feature matrix and every column from position 38 onward is a label.
# NOTE(review): assumes column 0 is the FID identifier and that the feature block is
# exactly 37 columns wide (mm_prs has 38 columns) - confirm if the input schema changes.
X, Y = df.iloc[:, 1:38], df.iloc[:, 38:]

# Refresh the name lists so they match the frames used for training.
X_columns, Y_columns = list(X.columns), list(Y.columns)

### 建立ml模型

In [22]:
from tools import save_json, load_json

In [None]:
import os 


# Logger used by process_column below. setup_logger is project-local; the argument
# is presumably the log file path - confirm against logging_config.
logger = setup_logger(r'train_model_ml.log')

def _model_path(model_name, column):
    """Return the pickle path where the model for one label column is cached."""
    return f'Simple_test/{model_name}_{column}.pkl'


def _evaluate(model, X_test, y_test, column):
    """Score a fitted model on the held-out rows.

    Returns:
        tuple: ``(classification_report_text, roc_auc)``. ``roc_auc`` is ``None``
        when ``y_test`` contains fewer than two classes, because ROC AUC is
        undefined there and ``roc_auc_score`` may raise instead of returning.
    """
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    if y_test.nunique() < 2:
        logger.warning(f"ROC AUC is undefined for column {column}: the test split contains a single class")
        return report, None
    return report, roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])


def process_column(column, model_name='decision_tree', k_neighbors=2):
    """Train (or reuse) a classifier for one label column and score it.

    Reads the module-level ``X``, ``Y``, ``random_state`` and ``logger``.

    Args:
        column: Name of the column of ``Y`` used as the target.
        model_name: Key passed to ``ModelSelector.get_model``; also part of the
            cached pickle's file name.
        k_neighbors: Forwarded to ``SMOTE(k_neighbors=...)``.

    Returns:
        ``(classification_report_text, roc_auc)`` computed on the 30% hold-out
        split (``roc_auc`` is ``None`` when it is undefined), or ``None`` when
        the training split holds a single class and no model is trained.
    """
    y = Y.loc[:, column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )

    model_path = _model_path(model_name, column)
    if os.path.exists(model_path):
        # Re-score the model saved by an earlier run instead of returning None, so
        # resuming an interrupted run does not replace saved metrics with nulls.
        # The split above is seeded, so it presumably yields the same held-out rows
        # as the original run as long as X, Y and random_state are unchanged.
        # NOTE: pickle.load can execute arbitrary code; only load files that this
        # notebook wrote itself.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        logger.info(f'Loaded cached model for column {column}, skipping training')
        return _evaluate(model, X_test, y_test, column)

    # Resampling cannot introduce a class that is absent from y_train, so check
    # before calling SMOTE rather than after it has failed and logged an error.
    if y_train.nunique() < 2:
        logger.info('The y label has only 1 value and is not trained')
        return None

    # Try SMOTE oversampling; fall back to the raw training data when the
    # minority class is too small for the requested number of neighbours.
    try:
        smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    except ValueError as e:
        logger.error(f"Error in SMOTE for column {column}: {str(e)}")
        X_train_smote, y_train_smote = X_train, y_train
        logger.info(f"Using original data for training due to insufficient samples in column {column}")

    logger.info(f'Start training feature column {column}...')
    model = ModelSelector(random_state).get_model(model_name)
    model.fit(X_train_smote, y_train_smote)

    # Create the output directory on demand, and write through a temporary file so
    # an interrupted run never leaves a truncated pickle that looks like a valid cache.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    tmp_path = f'{model_path}.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(model, f)
    os.replace(tmp_path, model_path)
    logger.info('The training is complete, and the model is saved')

    return _evaluate(model, X_test, y_test, column)

# Label columns are still processed one at a time in a plain loop; tqdm reports progress.
model_name = 'decision_tree'
results = {}
for column in tqdm(Y_columns):
    results[column] = process_column(column, model_name=model_name)

# Persist the collected per-column results once the loop has finished.
save_json(results, rf'{model_name}_results.json')

  0%|          | 0/104 [00:00<?, ?it/s]2024-10-31 17:38:29,793 - logging_config - INFO - Start training feature column Class_A00...
2024-10-31 17:40:22,628 - logging_config - INFO - The training is complete, and the model is saved
  1%|          | 1/104 [01:53<3:15:28, 113.86s/it]2024-10-31 17:40:23,270 - logging_config - INFO - Start training feature column Class_A01...
2024-10-31 17:42:22,082 - logging_config - INFO - The training is complete, and the model is saved
  2%|▏         | 2/104 [03:53<3:19:09, 117.15s/it]2024-10-31 17:42:22,720 - logging_config - INFO - Start training feature column Class_A02...
2024-10-31 17:44:01,593 - logging_config - INFO - The training is complete, and the model is saved
  3%|▎         | 3/104 [05:32<3:03:39, 109.10s/it]2024-10-31 17:44:02,238 - logging_config - INFO - Start training feature column Class_A03...
2024-10-31 17:45:51,703 - logging_config - INFO - The training is complete, and the model is saved
  4%|▍         | 4/104 [07:22<3:02:29, 109.

#### 查看结果

In [None]:
# Read the per-column results back from the JSON file named after model_name
# (the same path the training cell passes to save_json).
results = load_json(rf'{model_name}_results.json')

In [20]:
# Print the classification report and then the ROC AUC stored for Class_A00, one per line.
print(*results['Class_A00'][:2], sep='\n')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     72361
           1       0.00      0.00      0.00         4

    accuracy                           1.00     72365
   macro avg       0.50      0.50      0.50     72365
weighted avg       1.00      1.00      1.00     72365

0.4998618040104476


In [26]:
# Print the stored metrics for every label column.
for column, metrics in results.items():
    # process_column returns None for columns it skipped, so there is nothing to
    # index; report that instead of failing with a TypeError on None[0].
    if metrics is None:
        print(f"结果对于标签列 {column}: 无可用结果（已跳过）", end='\n\n')
        continue
    print(f"结果对于标签列 {column}:")
    print(" 分类报告:\n", metrics[0])
    print(" AUC-ROC:",  metrics[1],end='\n\n')

结果对于标签列 Class_A00:
 分类报告:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     72361
           1       0.00      0.00      0.00         4

    accuracy                           1.00     72365
   macro avg       0.50      0.50      0.50     72365
weighted avg       1.00      1.00      1.00     72365

 AUC-ROC: 0.4998618040104476

结果对于标签列 Class_A01:
 分类报告:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     72348
           1       0.00      0.00      0.00        17

    accuracy                           1.00     72365
   macro avg       0.50      0.50      0.50     72365
weighted avg       1.00      1.00      1.00     72365

 AUC-ROC: 0.49912229778293804

结果对于标签列 Class_A02:
 分类报告:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     72194
           1       0.01      0.03      0.01       171

    accuracy                           0