In [None]:
%load_ext autoreload
from __future__ import print_function, division

In [None]:
%autoreload

import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve

import matplotlib.pyplot as plt

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from main import MIMICDATASET

#### 导入数据

训练集

In [None]:
%%time

# Training split: raw data and mort labels.
# The feature CSV is read with a 3-level row index and a 4-level column header;
# the label CSV is read with a 2-level row index.
train_raw_x = pd.read_csv('m_train.csv', index_col=[0, 1, 2], header = [0, 1, 2, 3])
train_mort_y = pd.read_csv('my_train.csv', index_col=[0, 1])

# Diagnosis data, intended for validation later on.
diagnosis = pd.read_csv('ms_train.csv', index_col=[0, 1, 2])

测试集

In [None]:
# Test split, read with the same index/header arguments as the training files.
test_raw_x = pd.read_csv('m_test.csv', index_col=[0, 1, 2], header = [0, 1, 2, 3])
test_mort_y = pd.read_csv('my_test.csv', index_col=[0, 1])

normalization

检查 head 确保导入成功

In [None]:
# Preview the first rows of the training features to confirm the load succeeded.
train_raw_x.head()

In [None]:
# Preview the first rows of the training labels.
train_mort_y.head()

In [None]:
# Preview the first rows of the diagnosis data.
diagnosis.head()

In [None]:
# Preview the first rows of the test features.
test_raw_x.head()

In [None]:
# Preview the first rows of the test labels.
test_mort_y.head()

## SMOTE 辅助数据

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification


In [None]:
# Oversample the training set with SMOTE using a fixed seed.
# The resampled features/labels are rebound to the same names, replacing the loaded frames.
# NOTE(review): train_mort_y is passed exactly as loaded; the single 'mort_icu' column is
# only selected in a later cell — confirm the label frame is a valid target for fit_resample.
# NOTE(review): resampling runs before the standardization step further down — confirm
# that ordering is intended.
sm = SMOTE(random_state=42)
train_raw_x, train_mort_y = sm.fit_resample(train_raw_x, train_mort_y)

#### 显微镜

In [None]:
# Build the project's MIMICDATASET from the test-split CSV files.
mimic_dataset = MIMICDATASET(x_path='m_test.csv', x_s_path='ms_test.csv', y_path='my_test.csv')

观察数据集大小

In [None]:
# Number of items the dataset reports.
print(len(mimic_dataset))

观察单个数据点

In [None]:
# Fetch the first item; indexing is unpacked into a (sample, label) pair.
sample, sample_y = mimic_dataset[0]
print(f"Sample: {sample}, Sample_y: {sample_y}")

查看原始dataframe

In [None]:
# Retrieve the two objects returned by return_data() and print the head of each.
ehr_data, label_data = mimic_dataset.return_data()
print(ehr_data.head())
print(label_data.head())


批量观察

In [None]:
from itertools import islice

from torch.utils.data import DataLoader

# A few batches are enough to see what the loader yields; iterating the whole
# dataset here would print every record it contains.
PREVIEW_BATCHES = 3

dataloader = DataLoader(mimic_dataset, batch_size=4, shuffle=True)

# islice stops the iteration after PREVIEW_BATCHES batches without touching the loader itself.
for i, data in enumerate(islice(dataloader, PREVIEW_BATCHES)):
    print(f"Batch {i} -> Number of elements in tuple: {len(data)}")

    # Each batch is a sequence of elements; show every one of them.
    for j, element in enumerate(data):
        print(f"Element {j} in batch {i}: {element}")



## 识别 numerical data 后 normalization

In [None]:
# 选择数值列
# Collect the labels of all numeric-dtype columns in the training features.
numeric_cols = train_raw_x.select_dtypes(include=['number']).columns.tolist()

In [None]:
# 选择标准差大于阈值的列
# Keep only the numeric columns whose standard deviation exceeds the threshold.
threshold = 0.5
std_devs = train_raw_x[numeric_cols].std()
numeric_cols_filtered = std_devs[std_devs > threshold].index.tolist()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the selected numeric columns. The scaler is fitted on the training
# split only, so no test-set statistics leak into the preprocessing.
scaler = StandardScaler()
train_raw_x[numeric_cols_filtered] = scaler.fit_transform(train_raw_x[numeric_cols_filtered])

# Apply the same fitted transformation to the test split. Without this step the
# models are trained on standardized features but evaluated on raw ones.
test_raw_x[numeric_cols_filtered] = scaler.transform(test_raw_x[numeric_cols_filtered])

In [None]:
# Show the columns selected by the standard-deviation filter.
print(numeric_cols_filtered)


# Logistic Regression 训练

In [None]:
# 实例化 lr
# Instantiate the logistic regression classifier with max_iter raised to 1000.
clf = LogisticRegression(max_iter=1000)

处理 mort column，保证只有一个 mort column

In [None]:
# Keep only the 'mort_icu' column as the target for both splits.
# NOTE: the names are rebound in place, so re-running this cell would index the
# already-selected result with 'mort_icu' again.
train_mort_y = train_mort_y['mort_icu']
test_mort_y = test_mort_y['mort_icu']

In [None]:
# Fit the logistic regression model on the training features and labels.
clf.fit(train_raw_x, train_mort_y)

In [None]:
# Predict class labels for the test features with the logistic regression model.
y_pred = clf.predict(test_raw_x)

accuracy, recall, precision, f1

In [None]:
# 计算准确率
# Score the current predictions against the test labels:
# accuracy, recall, precision and F1, in that order.
accuracy = accuracy_score(test_mort_y, y_pred)
recall = recall_score(test_mort_y, y_pred)
precision = precision_score(test_mort_y, y_pred)
f1 = f1_score(test_mort_y, y_pred)

# Report all four scores together.
print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1}')

In [None]:
# Predicted probability of the positive class ('1') for every test row.
y_pred_prob = clf.predict_proba(test_raw_x)[:, 1]

# Area under the ROC curve.
roc_auc = roc_auc_score(test_mort_y, y_pred_prob)
print(f'ROC AUC: {roc_auc}')

# Points of the ROC curve; the thresholds are not needed for the plot.
fpr, tpr, _ = roc_curve(test_mort_y, y_pred_prob)

# Draw the curve on an explicit figure/axes pair, with the chance diagonal for reference.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic',
)
ax.legend(loc="lower right")
plt.show()


# Random Forest 训练

In [None]:
# Random forest classifier with library-default hyperparameters. The seed is pinned
# (same value as the SMOTE step) so repeated runs build the same forest and the
# reported metrics are reproducible.
rdm = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training features and labels.
rdm.fit(train_raw_x, train_mort_y)

In [None]:
# Predict class labels for the test features with the random forest.
# This rebinds y_pred, replacing the logistic regression predictions.
y_pred = rdm.predict(test_raw_x)

evaluation

In [None]:
# 计算准确率
# Score the current predictions against the test labels:
# accuracy, recall, precision and F1, in that order.
accuracy = accuracy_score(test_mort_y, y_pred)
recall = recall_score(test_mort_y, y_pred)
precision = precision_score(test_mort_y, y_pred)
f1 = f1_score(test_mort_y, y_pred)

# Report all four scores together.
print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1}')


AUC-ROC

In [None]:

# Predicted probability of the positive class ('1') from the random forest.
# The probabilities must come from `rdm`; using `clf` here would re-plot the
# logistic regression ROC under the random forest heading.
y_pred_prob = rdm.predict_proba(test_raw_x)[:, 1]

# Area under the ROC curve.
roc_auc = roc_auc_score(test_mort_y, y_pred_prob)
print(f'ROC AUC: {roc_auc}')

# Points of the ROC curve; the thresholds are not needed for the plot.
fpr, tpr, _ = roc_curve(test_mort_y, y_pred_prob)

# Draw the curve.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# SVM