# 导入相关库

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# 读数据

In [2]:
data = pd.read_excel('./2023年中国研究生数学建模竞赛赛题/E题/data3a.xlsx')
data_image = pd.read_excel('./2023年中国研究生数学建模竞赛赛题/E题/竞赛发布数据/表3-患者影像信息血肿及水肿的形状及灰度分布.xlsx')


In [3]:
data = data.rename(columns={'入院首次影像检查流水号':'流水号'})

In [4]:
data = data.merge(data_image, on='流水号', how='left')

In [5]:
train = data.iloc[:100, :]
test = data.iloc[100:, :]

In [6]:
label = train['90天mRS']

In [7]:
label.value_counts()

3.0    20
2.0    20
1.0    19
5.0    15
4.0    12
0.0    10
6.0     4
Name: 90天mRS, dtype: int64

# 特征工程

In [8]:
train.drop(['ID', '流水号', '90天mRS'], axis=1, inplace=True)
test.drop(['ID', '流水号'], axis=1, inplace=True)
train['高压'] = train['血压'].apply(lambda x: int(x.split('/')[0]))
train['低压'] = train['血压'].apply(lambda x: int(x.split('/')[1]))
test['高压'] = test['血压'].apply(lambda x: int(x.split('/')[0]))
test['低压'] = test['血压'].apply(lambda x: int(x.split('/')[1]))
train.drop(['血压'], axis=1, inplace=True)
test.drop(['血压'], axis=1, inplace=True)
train['性别'] = train['性别'].map({'男':0, '女':1})
test['性别'] = test['性别'].map({'男':0, '女':1})
feat = train.columns
scaler = StandardScaler()
train[feat] = scaler.fit_transform(train[feat])
test[feat] = scaler.transform(test[feat])
print(len(feat))

73


In [9]:
feat = ['糖尿病史', 'HM_Cerebellum_R_Ratio', 'original_shape_Maximum2DDiameterSlice', 'NCCT_original_firstorder_InterquartileRange', 'original_shape_Elongation', 'ED_Pons_Medulla_L_Ratio', 'NCCT_original_firstorder_Range', '降颅压治疗', 'HM_Pons_Medulla_R_Ratio', '吸烟史', '饮酒史', 'ED_Cerebellum_R_Ratio', 'HM_volume']

# 交叉验证

In [10]:
# oof = np.zeros(len(train))
# feat_imp = np.zeros(len(feat))
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# for i, (tra_index, val_index) in enumerate(skf.split(train, label)):
#     print(f'第{i+1}折....')
#     tra_x = train.iloc[tra_index, :][feat].values
#     tra_y = label.iloc[tra_index].values
#     val_x = train.iloc[val_index, :][feat].values
#     val_y = label.iloc[val_index].values
#     model = LogisticRegression(random_state=42)
#     model.fit(tra_x, tra_y)
#     feat_imp += abs(np.mean(model.coef_, axis=0))
#     pred_soft = model.predict_proba(val_x)[:, 1]
#     pred = model.predict(val_x)
#     score = accuracy_score(val_y, pred)
#     print(f'准确率为{score}')
#     oof[val_index] = pred
# oof_score = accuracy_score(label.values, oof)
# feat_imp /= 5
# print('*' * 30)
# print(f'OOF 准确率 = {oof_score}')

In [11]:
def Range_Deviation(y_true, y_pred):
    n = len(y_true)
    m = np.sum(np.abs(y_true - y_pred) <= 1)
    
    return m / n

In [12]:
oof = np.zeros(len(train))
feat_imp = np.zeros(len(feat))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {'booster': 'gbtree', 
         'device': 'cpu', 
         'eta': 0.01, 
         'max_depth': 6, 
         'objective': 'multi:softmax',
          'num_round': 300, 
         'seed': 42, 
         'num_class': 7}
for i, (tra_index, val_index) in enumerate(skf.split(train, label)):
    print(f'第{i+1}折....')
    tra_x = train.iloc[tra_index, :][feat]
    tra_y = label.iloc[tra_index].values
    val_x = train.iloc[val_index, :][feat]
    val_y = label.iloc[val_index].values
    model = xgb.XGBClassifier(params, early_stopping_rounds=30)
    model.fit(tra_x, tra_y, eval_set=[(val_x, val_y)])
    feat_imp += model.feature_importances_
#    pred_soft = model.predict_proba(val_x)[:, 1]
    pred = model.predict(val_x)
    score = Range_Deviation(val_y, pred)
    print(f'准确率为{score}')
    oof[val_index] = pred
oof_score = Range_Deviation(label.values, oof)
feat_imp /= 5
print('*' * 30)
print(f'OOF 准确率 = {oof_score}')

第1折....
[0]	validation_0-mlogloss:1.82578
[1]	validation_0-mlogloss:1.75660
[2]	validation_0-mlogloss:1.75550
[3]	validation_0-mlogloss:1.72111
[4]	validation_0-mlogloss:1.68137
[5]	validation_0-mlogloss:1.67242
[6]	validation_0-mlogloss:1.68061
[7]	validation_0-mlogloss:1.66534
[8]	validation_0-mlogloss:1.65914
[9]	validation_0-mlogloss:1.66001
[10]	validation_0-mlogloss:1.67582
[11]	validation_0-mlogloss:1.68229
[12]	validation_0-mlogloss:1.70923
[13]	validation_0-mlogloss:1.70886
[14]	validation_0-mlogloss:1.71735
[15]	validation_0-mlogloss:1.71913
[16]	validation_0-mlogloss:1.70389
[17]	validation_0-mlogloss:1.71870
[18]	validation_0-mlogloss:1.73565
[19]	validation_0-mlogloss:1.75216
[20]	validation_0-mlogloss:1.75934
[21]	validation_0-mlogloss:1.76656
[22]	validation_0-mlogloss:1.77116
[23]	validation_0-mlogloss:1.77909
[24]	validation_0-mlogloss:1.78150
[25]	validation_0-mlogloss:1.78888
[26]	validation_0-mlogloss:1.79851
[27]	validation_0-mlogloss:1.79914
[28]	validation_0-mlog

# 特征选择

In [13]:
# feature_import = pd.DataFrame()
# feature_import['columns'] = feat
# feature_import['score'] = feat_imp
# feature_import = feature_import.sort_values(by='score', ascending=False)
# best_thre = 0
# best_score = 0
# for j in np.arange(0, 0.03, 0.001):
#     print(f'阈值={j}')
#     feat = list(feature_import[feature_import['score'] >= j]['columns'].values)
#     oof = np.zeros(len(train))
#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     for i, (tra_index, val_index) in enumerate(skf.split(train, label)):
#         tra_x = train.iloc[tra_index, :][feat]
#         tra_y = label.iloc[tra_index].values
#         val_x = train.iloc[val_index, :][feat]
#         val_y = label.iloc[val_index].values
#         model = xgb.XGBClassifier(params, early_stopping_rounds=30)
#         model.fit(tra_x, tra_y, eval_set=[(val_x, val_y)])
#         pred = model.predict(val_x)
#         score = Range_Deviation(val_y, pred)
#         oof[val_index] = pred
#     oof_score = Range_Deviation(label.values, oof)
#     if oof_score > best_score:
#         best_score = oof_score
#         best_thre = j
#     print(f'OOF 准确率 = {oof_score}')
#     print('*' * 30)

# print(list(feature_import[feature_import['score'] >= best_thre]['columns'].values))
# print(best_score)

# 全数据训练和预测

In [14]:
params = {'booster': 'gbtree', 
         'device': 'cpu', 
         'eta': 0.01, 
         'max_depth': 6, 
         'objective': 'multi:softmax',
          'num_round': 5, 
         'seed': 42, 
         'num_class': 7}
model = xgb.XGBClassifier(params)
model.fit(tra_x, tra_y)
train_pred = model.predict(train[feat])
test_pred = model.predict(test[feat])
all_pred = np.concatenate([train_pred, test_pred])
data['pred'] = all_pred
data[['ID', 'pred']].to_csv('3a.csv', index=None)

In [15]:
#可改进的地方：特征工程，使用多个模型，特征选择，调参，多模型集成