# 导入相关库

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import warnings
# Suppress every warning category so the cell outputs stay compact.
# NOTE(review): this is a blanket filter, so diagnostics raised by later cells
# are silenced as well.
warnings.filterwarnings('ignore')
# Allow up to 100 columns and 100 rows when a frame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

# 读数据

In [2]:
# Input files, resolved relative to the current working directory.
MAIN_TABLE_PATH = 'data1b.xlsx'
IMAGE_TABLE_PATH = '表3-患者影像信息血肿及水肿的形状及灰度分布.xlsx'
LABEL_PATH = 'fin_result.csv'

# Main table; its 'Unnamed: 0' column is renamed to 'ID' in the next cell.
data = pd.read_excel(MAIN_TABLE_PATH)
# Imaging table, joined to `data` on '流水号' in the next cell.
data_image = pd.read_excel(IMAGE_TABLE_PATH)
# Label table; only its 'ID' and 'label' columns are used.
label = pd.read_csv(LABEL_PATH)

In [3]:
# Build one modelling table: normalise the key column names, then left-join the
# imaging table (on '流水号') and the label column (on 'ID') onto the main table.
# Left joins keep every row of `data`; rows without a match get NaN.
data = (
    data
    .rename(columns={'Unnamed: 0': 'ID', '入院首次影像检查流水号': '流水号'})
    .merge(data_image, how='left', on='流水号')
    .merge(label[['ID', 'label']], how='left', on='ID')
)

In [4]:
# Positional split: the first N_TRAIN rows are used for fitting, the remaining
# rows are only scored.
# NOTE(review): this assumes the merges above neither reordered nor duplicated
# rows - confirm that the join keys are unique.
N_TRAIN = 100
# .copy() gives each frame its own data. Later cells drop and add columns on
# `train`/`test`; doing that on a bare slice of `data` is chained assignment,
# which pandas warns about (and the warning is silenced in this notebook).
train = data.iloc[:N_TRAIN, :].copy()
test = data.iloc[N_TRAIN:, :].copy()

In [5]:
# Target vector for the training rows.
# NOTE: this rebinds `label`, which until now held the DataFrame read from
# 'fin_result.csv'. The merge cell above indexes `label[['ID', 'label']]`, so it
# cannot be re-run after this cell without reloading the data first.
label = train['label']

In [6]:
# Show how many training rows fall into each class.
label.value_counts()

0.0    77
1.0    23
Name: label, dtype: int64

# 特征工程

In [7]:
def _prepare_features(frame, drop_cols):
    """Return a new frame with the raw columns turned into numeric features.

    The same steps are applied to the train and the test frame:
      * drop the columns listed in ``drop_cols`` (identifiers / target);
      * split the 'a/b' string in '血压' into two integer columns,
        '高压' (part before the slash) and '低压' (part after it), and
        drop the original '血压' column;
      * encode '性别' as 男 -> 0, 女 -> 1 (any other value becomes NaN).

    ``frame`` itself is not modified; column order matches what sequential
    in-place edits would produce ('性别' stays in place, '高压' and '低压'
    are appended at the end).
    """
    pressure = frame['血压'].str.split('/', expand=True)
    engineered = frame.drop(columns=drop_cols).assign(**{
        '高压': pressure[0].astype('int64'),
        '低压': pressure[1].astype('int64'),
        '性别': frame['性别'].map({'男': 0, '女': 1}),
    })
    return engineered.drop(columns=['血压'])


# One shared code path for both frames, so train and test preprocessing cannot
# drift apart. Only train carries a usable 'label' to remove here.
train = _prepare_features(train, ['ID', '流水号', 'label'])
test = _prepare_features(test, ['ID', '流水号'])

# Standardise every training column; test is transformed with the statistics
# learned from train.
# NOTE(review): the scaler is fitted on all training rows before the
# cross-validation below, so each validation fold has already contributed to
# the scaling statistics.
feat = train.columns
scaler = StandardScaler()
train[feat] = scaler.fit_transform(train[feat])
test[feat] = scaler.transform(test[feat])
print(len(feat))

73


In [8]:
# Hand-picked subset of columns used by the models below. This replaces the
# full column index assigned to `feat` in the previous cell. Order matters:
# per-feature importances are accumulated positionally against this list.
feat = [
    "饮酒史",
    "ED_PCA_R_Ratio",
    "卒中病史",
    "HM_ACA_R_Ratio",
    "止血治疗",
    "脑出血前mRS评分",
    "NCCT_original_firstorder_Kurtosis",
    "发病到首次影像检查时间间隔",
    "营养神经",
    "HM_PCA_L_Ratio",
    "original_shape_Maximum3DDiameter",
    "降压治疗",
    "ED_MCA_R_Ratio",
    "original_shape_Maximum2DDiameterColumn",
    "冠心病史",
    "NCCT_original_firstorder_Minimum",
    "低压",
]

# 交叉验证

In [9]:
# Stratified k-fold cross-validation of a class-weighted logistic regression on
# the selected features. Prints per-fold accuracy and the out-of-fold (OOF)
# accuracy over all training rows.
# NOTE(review): with the recorded 77/23 class split, always predicting class 0
# would score 0.77 accuracy, so accuracy alone is a weak yardstick here.
N_SPLITS = 5  # single source of truth for the fold count (also used for averaging)
oof = np.zeros(len(train))       # OOF hard predictions, one slot per training row
feat_imp = np.zeros(len(feat))   # running sum of |coefficient| per feature
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
for i, (tra_index, val_index) in enumerate(skf.split(train, label)):
    print(f'第{i+1}折....')
    tra_x = train.iloc[tra_index, :][feat].values
    tra_y = label.iloc[tra_index].values
    val_x = train.iloc[val_index, :][feat].values
    val_y = label.iloc[val_index].values
    model = LogisticRegression(class_weight='balanced', random_state=42)
    model.fit(tra_x, tra_y)
    # Binary problem: coef_ has a single row, flattened to one weight per feature.
    feat_imp += np.abs(model.coef_.ravel())
    pred = model.predict(val_x)
    score = accuracy_score(val_y, pred)
    print(f'准确率为{score}')
    oof[val_index] = pred
oof_score = accuracy_score(label.values, oof)
# Average over the folds actually run instead of repeating a literal 5.
feat_imp /= N_SPLITS
print('*' * 30)
print(f'OOF 准确率 = {oof_score}')

第1折....
准确率为0.65
第2折....
准确率为0.65
第3折....
准确率为0.75
第4折....
准确率为0.8
第5折....
准确率为0.8
******************************
OOF 准确率 = 0.73


In [10]:
# Disabled experiment: the same 5-fold loop as the cell above, with a
# GradientBoostingClassifier and its feature_importances_ in place of the
# logistic regression. Kept commented out; it does not run.
# oof = np.zeros(len(train))
# feat_imp = np.zeros(len(feat))
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# for i, (tra_index, val_index) in enumerate(skf.split(train, label)):
#     print(f'第{i+1}折....')
#     tra_x = train.iloc[tra_index, :][feat].values
#     tra_y = label.iloc[tra_index].values
#     val_x = train.iloc[val_index, :][feat].values
#     val_y = label.iloc[val_index].values
#     model = GradientBoostingClassifier(random_state=42)
#     model.fit(tra_x, tra_y)
#     feat_imp += model.feature_importances_
#     pred_soft = model.predict_proba(val_x)[:, 1]
#     pred = model.predict(val_x)
#     score = accuracy_score(val_y, pred)
#     print(f'准确率为{score}')
#     oof[val_index] = pred
# oof_score = accuracy_score(label.values, oof)
# feat_imp /= 5
# print('*' * 30)
# print(f'OOF 准确率 = {oof_score}')

# 特征选择

In [11]:
# Disabled experiment: importance-threshold sweep. For each threshold from 0.20
# to 0.90 (step 0.01) it keeps the features whose averaged importance is at
# least the threshold, re-runs the 5-fold logistic regression and prints the
# OOF accuracy.
# NOTE(review): if re-enabled, this overwrites `feat` on every iteration, and
# the threshold is chosen on the same folds that later report the score.
# feature_import = pd.DataFrame()
# feature_import['columns'] = feat
# feature_import['score'] = feat_imp
# feature_import = feature_import.sort_values(by='score', ascending=False)
# for j in np.arange(0.2, 0.91, 0.01):
#     print(f'阈值={j}')
#     feat = list(feature_import[feature_import['score'] >= j]['columns'].values)
#     oof = np.zeros(len(train))
#     feat_imp = np.zeros(len(feat))
#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     for i, (tra_index, val_index) in enumerate(skf.split(train, label)):
#         tra_x = train.iloc[tra_index, :][feat].values
#         tra_y = label.iloc[tra_index].values
#         val_x = train.iloc[val_index, :][feat].values
#         val_y = label.iloc[val_index].values
#         model = LogisticRegression(class_weight='balanced', random_state=42)
#         model.fit(tra_x, tra_y)
#         pred_soft = model.predict_proba(val_x)[:, 1]
#         pred = model.predict(val_x)
#         score = accuracy_score(val_y, pred)
#         oof[val_index] = pred
#     oof_score = accuracy_score(label.values, oof)
#     print(f'OOF 准确率 = {oof_score}')
#     print('*' * 30)


In [12]:
# Disabled: lists the features whose importance score is at least 0.48.
# NOTE(review): presumably the source of the hard-coded `feat` list - confirm.
# list(feature_import[feature_import['score'] >= 0.48]['columns'].values)

# 全数据训练和预测

In [13]:
# Refit the logistic regression on all training rows, then score every row of
# `data` and export ID + predicted probability to '1b.csv'.
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(train[feat].values, label.values)
# Column 1 of predict_proba is the probability of the second class in
# model.classes_. The train predictions are in-sample (the model has seen
# these rows).
train_pred = model.predict_proba(train[feat].values)[:, 1]
test_pred = model.predict_proba(test[feat].values)[:, 1]
# `train` and `test` are the leading and trailing row blocks of `data`, so
# concatenating in this order lines the predictions up with data's rows.
all_pred = np.concatenate([train_pred, test_pred])
data['pred'] = all_pred
# `index` is documented as a bool; pass False explicitly rather than relying on
# None being falsy.
data[['ID', 'pred']].to_csv('1b.csv', index=False)

In [14]:
# Possible improvements: feature engineering, trying several models, feature selection, hyperparameter tuning, multi-model ensembling