In [190]:
%load_ext autoreload
from __future__ import print_function, division

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [191]:
%autoreload

import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve

import matplotlib.pyplot as plt

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from main import MIMICDATASET

#### 导入数据

训练集

In [206]:
%%time

# Raw feature table and mortality labels for the training split
train_raw_x = pd.read_csv(
    'm_train.csv',
    index_col=[0, 1, 2],
    header=[0, 1, 2, 3],
)
train_mort_y = pd.read_csv('my_train.csv', index_col=[0, 1])

# Diagnosis table, used later for validation
diagnosis = pd.read_csv('ms_train.csv')

CPU times: total: 7.59 s
Wall time: 18.2 s


测试集

In [193]:
# Raw feature table and mortality labels for the test split
test_raw_x = pd.read_csv(
    'm_test.csv',
    index_col=[0, 1, 2],
    header=[0, 1, 2, 3],
)
test_mort_y = pd.read_csv('my_test.csv', index_col=[0, 1])

normalization

检查 head 确保导入成功

In [None]:
# Preview the first rows of the frame read from m_train.csv to confirm the load worked.
train_raw_x.head()

In [None]:
# Preview the first rows of the frame read from my_train.csv.
train_mort_y.head()

In [207]:
# Preview the first rows of the frame read from ms_train.csv.
diagnosis.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,diagnosis,ethnicity,admission_type
0,3,145834,211552,HYPOTENSION,WHITE,EMERGENCY
1,6,107064,228232,CHRONIC RENAL FAILURE/SDA,WHITE,ELECTIVE
2,9,150750,220597,HEMORRHAGIC CVA,UNKNOWN/NOT SPECIFIED,EMERGENCY
3,11,194540,229441,BRAIN MASS,WHITE,EMERGENCY
4,12,112213,232669,PANCREATIC CANCER/SDA,WHITE,ELECTIVE


In [None]:
# Preview the first rows of the frame read from m_test.csv.
test_raw_x.head()

In [None]:
# Preview the first rows of the frame read from my_test.csv.
test_mort_y.head()

## SMOTE 辅助数据

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification


In [None]:
# Resample the training data with SMOTE; random_state is fixed so the result is repeatable.
# NOTE(review): train_mort_y is still the full label frame at this point (its
# 'mort_icu' column is only selected in a later cell) — confirm SMOTE is meant
# to receive the target in this form.
# NOTE(review): the outputs are bound back to the same names, so re-running this
# cell resamples the already-resampled data.
sm = SMOTE(random_state=42)
train_raw_x, train_mort_y = sm.fit_resample(train_raw_x, train_mort_y)

#### 显微镜

In [None]:
# Build the project's dataset wrapper from the three test-split CSV files.
mimic_dataset = MIMICDATASET(x_path='m_test.csv', x_s_path='ms_test.csv', y_path='my_test.csv')

观察数据集大小

In [None]:
# Size of the dataset as reported by len().
print(len(mimic_dataset))

观察单个数据点

In [None]:
# Index the dataset once; the first item unpacks into two values, which are printed as-is.
sample, sample_y = mimic_dataset[0]
print(f"Sample: {sample}, Sample_y: {sample_y}")

查看原始dataframe

In [None]:
# Fetch the two objects exposed by return_data() and preview the head of each.
# presumably these are the underlying feature and label DataFrames — verify against main.MIMICDATASET
ehr_data, label_data = mimic_dataset.return_data()
print(ehr_data.head())
print(label_data.head())


批量观察

In [None]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of four items each
dataloader = DataLoader(mimic_dataset, batch_size=4, shuffle=True)

# Dump every batch: first how many components it has, then each component in turn
for batch_idx, batch in enumerate(dataloader):
    print(f"Batch {batch_idx} -> Number of elements in tuple: {len(batch)}")
    for part_idx, part in enumerate(batch):
        print(f"Element {part_idx} in batch {batch_idx}: {part}")



## 识别 numerical data 后 normalization

In [None]:
# Select the numeric columns of the training features
numeric_cols = train_raw_x.select_dtypes(include=['number']).columns.tolist()

In [None]:
# Keep only the numeric columns whose standard deviation exceeds the threshold
threshold = 0.5
std_devs = train_raw_x[numeric_cols].std()
numeric_cols_filtered = [col for col, sd in std_devs.items() if sd > threshold]


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected numeric columns. The scaler is fitted on the training
# split only, so no statistics from the test split leak into it.
scaler = StandardScaler()
train_raw_x[numeric_cols_filtered] = scaler.fit_transform(train_raw_x[numeric_cols_filtered])

# Apply the same train-fitted transform to the test split. Without this the
# classifiers below are trained on standardised features but evaluated on raw
# ones. Assumes test_raw_x carries the same columns as train_raw_x (both are
# read with the same header layout) — TODO confirm.
test_raw_x[numeric_cols_filtered] = scaler.transform(test_raw_x[numeric_cols_filtered])

In [None]:
# Show which columns passed the standard-deviation filter.
print(numeric_cols_filtered)


# Logistic Regression 训练

In [None]:
# Instantiate the logistic-regression classifier
clf = LogisticRegression(max_iter=1000)

处理 mort column，保证只有一个 mort column

In [None]:
# Reduce both label objects to their 'mort_icu' column.
# NOTE(review): the names are rebound to the selected column, so re-running this
# cell without reloading the labels indexes the result of the previous run.
train_mort_y = train_mort_y['mort_icu']
test_mort_y = test_mort_y['mort_icu']

In [None]:
# Fit the logistic-regression classifier on the training features and labels.
clf.fit(train_raw_x, train_mort_y)

In [None]:
# Class predictions of the logistic-regression classifier for the test features.
y_pred = clf.predict(test_raw_x)

accuracy, recall, precision, f1

In [None]:
# Score the class predictions against the test labels
accuracy = accuracy_score(test_mort_y, y_pred)
recall = recall_score(test_mort_y, y_pred)
precision = precision_score(test_mort_y, y_pred)
f1 = f1_score(test_mort_y, y_pred)

# Report the four metrics in a fixed order
for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Recall', recall),
    ('Precision', precision),
    ('F1 Score', f1),
):
    print(f'{metric_label}: {metric_value}')

In [None]:
# Predicted probability of the positive class ('1') from the logistic regression
y_pred_prob = clf.predict_proba(test_raw_x)[:, 1]

# Area under the ROC curve
roc_auc = roc_auc_score(test_mort_y, y_pred_prob)
print(f'ROC AUC: {roc_auc}')

# Points of the ROC curve (the thresholds are not needed)
fpr, tpr, _ = roc_curve(test_mort_y, y_pred_prob)

# Draw the curve together with the chance diagonal
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', label=f'ROC curve (area = {roc_auc:0.2f})')
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


# Random Forest 训练

In [None]:
# Random-forest classifier. random_state is fixed (same seed as the SMOTE step)
# so that the randomised tree construction — and therefore the metrics reported
# below — is repeatable across kernel restarts.
rdm = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random-forest classifier on the training features and labels.
rdm.fit(train_raw_x, train_mort_y)

In [None]:
# Class predictions of the random forest for the test features (rebinds y_pred).
y_pred = rdm.predict(test_raw_x)

evaluation

In [None]:
# Score the class predictions against the test labels
accuracy = accuracy_score(test_mort_y, y_pred)
recall = recall_score(test_mort_y, y_pred)
precision = precision_score(test_mort_y, y_pred)
f1 = f1_score(test_mort_y, y_pred)

# Report the four metrics in a fixed order
for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Recall', recall),
    ('Precision', precision),
    ('F1 Score', f1),
):
    print(f'{metric_label}: {metric_value}')


AUC-ROC

In [None]:

# Predicted probability of the positive class ('1').
# These must come from the random forest (rdm): this section evaluates that
# model, and taking them from clf would re-report the logistic-regression curve.
y_pred_prob = rdm.predict_proba(test_raw_x)[:, 1]

# Area under the ROC curve
roc_auc = roc_auc_score(test_mort_y, y_pred_prob)
print(f'ROC AUC: {roc_auc}')

# Points of the ROC curve (the thresholds are not needed)
fpr, tpr, _ = roc_curve(test_mort_y, y_pred_prob)

# Draw the curve together with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Diagnosis处理

In [220]:
# Global lookup tables used by the transform_* helpers.
# g_map: admission-type label -> integer code (0 = empty / unknown / other).
g_map = {
    'ELECTIVE': 1,
    'URGENT': 2,
    'EMERGENCY': 3,
    '': 0,
    'NaN': 0,
    'Unknown': 0,
    'Other': 0,
}
# e_map: ethnicity label -> integer code (0 = empty / unknown).
e_map = {
    'ASIAN': 1,
    'BLACK': 2,
    'AFRICAN AMERICAN': 2,
    'WHITE': 3,
    'HISPANIC': 4,
    'LATINO': 4,
    'NATIVE': 5,
    'NaN': 0,
    '': 0,
}

# Encode admission-type labels as integers (result is keyed 'gender' for historical reasons)
def transform_ad(gender_series):
    """Map each label in ``gender_series`` to its integer code in ``g_map``.

    Despite the parameter name, ``g_map`` holds admission-type labels
    (ELECTIVE / URGENT / EMERGENCY). Missing values are treated as the empty
    string, and any label absent from ``g_map`` receives the code stored
    under ``''``.

    Returns:
        A one-entry dict ``{'gender': <Series of codes>}``.
    """
    fallback_code = g_map.get('')
    labels = gender_series.fillna('')
    codes = labels.apply(lambda label: g_map.get(label, fallback_code))
    return {'gender': codes}
    
# Encode ethnicity labels as integers
def transform_eth(ethnicity_series):
    """Map each label in ``ethnicity_series`` to its integer code in ``e_map``.

    Missing values are treated as the empty string, and any label absent from
    ``e_map`` receives the code stored under ``''``.

    NOTE(review): the lookup is an exact match on the whole label, so compound
    labels (e.g. 'UNKNOWN/NOT SPECIFIED') fall back to the ``''`` code —
    confirm that is intended for every value present in the data.

    Returns:
        A one-entry dict ``{'ethnicity': <Series of codes>}``.
    """
    fallback_code = e_map.get('')
    labels = ethnicity_series.fillna('')
    codes = labels.apply(lambda label: e_map.get(label, fallback_code))
    return {'ethnicity': codes}

# Convert diagnosis into numbers, considering multiple diagnoses separated by semicolon
def transform_dx_into_id(df, column_name='diagnosis'):
    """Replace every diagnosis label in ``df[column_name]`` with an integer id.

    A cell may hold several diagnoses separated by ``';'``. Each distinct label
    gets an id in order of first appearance (0, 1, 2, ...), and the cell is
    rewritten as the ``';'``-joined ids, e.g. ``'A;B'`` -> ``'0;1'``. Missing
    values are first replaced by the placeholder label ``'nodx'``, which gets
    an id like any other label.

    Args:
        df: DataFrame containing ``column_name``; the column is overwritten.
        column_name: name of the diagnosis column.

    Returns:
        The same ``df`` object, with ``column_name`` holding the id strings.
    """
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the df[column_name] selection: that form is chained assignment, which
    # warns on recent pandas and leaves df untouched under Copy-on-Write, after
    # which the split below would fail on the remaining NaN values.
    df[column_name] = df[column_name].fillna('nodx')

    # Distinct labels across all cells, in order of first appearance
    all_diagnoses = df[column_name].apply(lambda x: x.split(';')).explode().unique()

    # label -> id; the labels are already unique, so the position is the id
    dictionary = {label: idx for idx, label in enumerate(all_diagnoses)}

    # Rewrite each cell as the ';'-joined ids of its labels
    df[column_name] = df[column_name].apply(
        lambda x: ';'.join(str(dictionary[label]) for label in x.split(';'))
    )

    return df

# Admission-type and ethnicity maps.
# NOTE(review): ad_map has the same contents as g_map and is not referenced by
# the helper functions in this cell; e_map is re-assigned here with the same
# contents as its earlier definition.
ad_map = {'ELECTIVE': 1, 'URGENT': 2, 'EMERGENCY': 3, '': 0, 'NaN': 0, 'Unknown': 0, 'Other': 0}
e_map = {'ASIAN': 1, 'BLACK': 2, 'AFRICAN AMERICAN': 2, 'WHITE': 3, 'HISPANIC': 4, 'LATINO':4, 'NATIVE': 5, 'NaN': 0, '': 0}


In [223]:
import pandas as pd

# Load ms_train.csv into a DataFrame
df = pd.read_csv('ms_train.csv')

# Encode the three categorical columns. The diagnosis column is rewritten
# directly; the other two helpers each return a one-entry dict holding the codes.
df = transform_dx_into_id(df, 'diagnosis')
transformed_gender_series = transform_ad(df['admission_type'])  # admission-type codes, despite the name
transformed_ethnicity_series = transform_eth(df['ethnicity'])

# Attach the encoded values as new columns (ethnicity first, then admission type)
df = df.assign(
    transformed_ethnicity=transformed_ethnicity_series['ethnicity'],
    transformed_admission_type=transformed_gender_series['gender'],
)

# Write the transformed table to a new CSV file, without the row index
df.to_csv('ms_train_transformed.csv', index=False)
