<a href="https://colab.research.google.com/github/lindapu-1/30DaysReadmission/blob/main/ALL_Machine_Learning_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import zipfile
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/gdrive')
zip_data_path = "/gdrive/MyDrive/all_data_3612.zip"

!mkdir all_data_3612
extract_path = '/content/all_data_3612'

with zipfile.ZipFile(zip_data_path, 'r') as zip_ref:
   zip_ref.extractall(extract_path)

with open('/content/all_data_3612/ehr_preprocessed_seq_by_day_cat_embedding.pkl', 'rb') as f:
  data = pickle.load(f)

data.keys()

Mounted at /gdrive


dict_keys(['feat_dict', 'feature_cols', 'cat_idxs', 'cat_dims', 'demo_cols', 'icd_cols', 'lab_cols', 'med_cols'])

In [None]:
import pandas as pd

# Read the train / valid / test CSV files from the extracted archive,
# in that order, into three separate DataFrames.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/all_data_3612/train.csv",
        "/content/all_data_3612/valid.csv",
        "/content/all_data_3612/test.csv",
    )
)

In [None]:
# '1.csv' is resolved relative to the current working directory.
# NOTE(review): presumably this file holds the test-set labels - confirm.
df_labels = pd.read_csv('1.csv')
# Join on the shared 'id' column (pandas' default join type is used).
# Re-running this cell merges the label columns into df_test a second time.
df_test = pd.merge(df_test, df_labels, on='id')

In [None]:
# Stack the train and valid rows into one frame; the original row index of
# each frame is kept (ignore_index is not set).
df = pd.concat(objs=[df_train, df_valid])

# Data preparation

In [None]:
#@title 正常dataset
import torch
from torch.utils.data import Dataset
import pandas as pd

class CustomDataset(Dataset):
    """Dataset that serves one feature array per unique ``id``.

    Each item is the entry of ``feat_dict`` for one ``id`` of ``dataframe``
    (rows repeating an id are dropped, keeping the first), converted to a
    float32 tensor and optionally paired with the
    ``readmitted_within_30days`` value as a long tensor.

    Args:
        dataframe: DataFrame with an ``id`` column and, when
            ``include_labels`` is true, a ``readmitted_within_30days`` column.
        feat_dict: Mapping from id to an array-like of features. All entries
            are concatenated along dim 0 to compute the statistics.
        include_labels: When true, items are ``(features, label)``; otherwise
            only ``features``.
        normalize: When true, items are standardised as
            ``(features - mean) / (std + 1e-6)``. Defaults to ``False``, which
            returns the raw features exactly as before.
        stats: Optional ``(mean, std)`` pair used instead of statistics
            computed from this dataset, e.g. to normalise an evaluation split
            with the statistics of the training split.

    Attributes:
        mean, std: Statistics reduced over dim 0 of the concatenated features
            (or the values passed via ``stats``).
    """

    def __init__(self, dataframe, feat_dict, include_labels=True, normalize=False, stats=None):
        # One sample per id: later rows repeating an id are discarded.
        self.df = dataframe.drop_duplicates(subset=['id'], keep='first')
        self.feat_dict = feat_dict
        self.include_labels = include_labels
        self.normalize = normalize
        if stats is None:
            self.mean, self.std = self.compute_stats()
        else:
            self.mean, self.std = stats

    def compute_stats(self):
        """Return ``(mean, std)`` over dim 0 of all feature arrays of ``self.df['id']``."""
        # Concatenate the feature arrays of every id into one large tensor.
        stacked = torch.cat(
            [torch.tensor(self.feat_dict[sample_id], dtype=torch.float32) for sample_id in self.df['id']]
        )
        return stacked.mean(dim=0), stacked.std(dim=0)

    def __len__(self):
        """Number of unique ids."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the features (and label, if enabled) of the ``idx``-th row."""
        row = self.df.iloc[idx]
        features = torch.tensor(self.feat_dict[row['id']], dtype=torch.float32)

        if self.normalize:
            # The small epsilon guards against division by zero.
            features = (features - self.mean) / (self.std + 1e-6)

        if self.include_labels:
            label = row['readmitted_within_30days']
            return features, torch.tensor(label, dtype=torch.long)
        return features

# Wrap the combined train+valid frame and the labelled test frame; both read
# their features from the same feature dictionary.
train_dataset = CustomDataset(dataframe=df, feat_dict=data['feat_dict'])
test_dataset = CustomDataset(dataframe=df_test, feat_dict=data['feat_dict'], include_labels=True)

In [None]:
import torch

def extract_features_and_labels(dataset, feature_indices=None, retain_TS=False, seq_len=30):
    """Collect a dataset of ``(features, label)`` pairs into two stacked tensors.

    Args:
        dataset: Iterable yielding ``(features, label)`` pairs. ``features``
            is indexed as ``features[:, feature_indices]`` with time on dim 0;
            ``label`` is a tensor.
        feature_indices: Optional column indices to keep; ``None`` keeps all
            columns.
        retain_TS: If true, keep the time dimension and force it to exactly
            ``seq_len`` steps: shorter sequences are padded at the end with
            copies of their last step, longer ones keep only their last
            ``seq_len`` steps. If false, average over the time dimension and
            append the number of time steps as one extra feature.
        seq_len: Number of time steps per sample when ``retain_TS`` is true.
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        ``(features_tensor, labels_tensor)``, each built with ``torch.stack``
        over all samples.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
        IndexError: If ``retain_TS`` is true and a sample that needs padding
            has no time steps (there is no last step to repeat).
    """
    if seq_len < 1:
        # A window of 0 would make the `[-seq_len:]` slice keep everything.
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    features_list = []
    labels_list = []

    for features, label in dataset:
        # Restrict to the requested columns when indices are given.
        selected = features if feature_indices is None else features[:, feature_indices]

        if retain_TS:
            n_steps = selected.shape[0]
            if n_steps < seq_len:
                # Pad with the last observed step. `expand` builds the padding
                # on the sample's own dtype and device.
                padding = selected[-1].expand(seq_len - n_steps, *selected.shape[1:])
                selected = torch.cat([selected, padding], dim=0)
            else:
                # Keep only the most recent `seq_len` steps.
                selected = selected[-seq_len:]
        else:
            # Average over the time dimension ...
            selected = selected.mean(dim=0)
            # ... and append the number of time steps (length of stay in days)
            # as an additional feature.
            num_days = features.shape[0]
            selected = torch.cat([selected, torch.tensor([num_days], dtype=torch.float32)])

        features_list.append(selected)
        labels_list.append(label)

    # Convert the per-sample lists into batch tensors.
    features_tensor = torch.stack(features_list)
    labels_tensor = torch.stack(labels_list)

    return features_tensor, labels_tensor

In [None]:
#@title indice
# Column positions of the four feature groups. The groups are laid out back
# to back, so each list starts right after the last index of the previous one.
demo_indices = [*range(3)]  # demo features: first 3 positions

icd_cols_indices = [len(demo_indices) + k for k in range(91)]  # icd_cols: next 91 positions

lab_cols_indices = [icd_cols_indices[-1] + 1 + k for k in range(36)]  # lab_cols: next 36 positions
# TODO: add a time feature

med_cols_indices = [lab_cols_indices[-1] + 1 + k for k in range(41)]  # med_cols: next 41 positions
# TODO: add a feature that is the standardized sum of these values (?)

# Show the resulting index lists.
print("demo_indices:", demo_indices)
print("icd_cols_indices:", icd_cols_indices)
print("lab_cols_indices:", lab_cols_indices)
print("med_cols_indices:", med_cols_indices)

demo_indices: [0, 1, 2]
icd_cols_indices: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93]
lab_cols_indices: [94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129]
med_cols_indices: [130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170]


In [None]:
# 提取训练和测试特征
# x1: all feature groups, averaged over time (plus the stay length appended by
#     extract_features_and_labels).
# x2: lab columns only, kept as a time series.
feature_indices_x1 = demo_indices + icd_cols_indices + lab_cols_indices + med_cols_indices
feature_indices_x2 = lab_cols_indices

train_features_1, train_labels = extract_features_and_labels(train_dataset, feature_indices_x1, retain_TS=False)
train_features_2, train_labels = extract_features_and_labels(train_dataset, feature_indices_x2, retain_TS=True)

# Fix: the test tensors must come from test_dataset. They were previously
# extracted from train_dataset, so every "test" metric was measured on the
# training data.
test_features_1, test_labels = extract_features_and_labels(test_dataset, feature_indices_x1, retain_TS=False)
test_features_2, test_labels = extract_features_and_labels(test_dataset, feature_indices_x2, retain_TS=True)  # last 30 days
# train_features_1, train_features_2 -> train_labels: training
# test_features_1, test_features_2 -> test_labels: evaluation

In [None]:
# Inspect the shape of the time-series features extracted with retain_TS=True.
train_features_2.shape

In [None]:
#@title 三个model
import torch.nn as nn

class SimpleNN(nn.Module):
    """One-hidden-layer network: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of each output vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are declared in the order in which forward() applies them.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

class LSTMModel(nn.Module):
    """Stacked LSTM followed by two linear heads applied at every time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden size.
        output_size: Output width of the ``fc_token`` head.
        num_layers: Number of stacked LSTM layers (default 3).
        dropout_rate: Dropout probability used between LSTM layers and on the
            LSTM output (default 0.8).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=3, dropout_rate=0.8):
        super().__init__()
        # Multi-layer LSTM over batch-first input, with dropout between layers.
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_rate,
        )
        # Dropout applied to the LSTM output sequence.
        self.dropout = nn.Dropout(dropout_rate)
        # Head predicting the token change; width given by `output_size`.
        self.fc_token = nn.Linear(hidden_size, output_size)
        # Head predicting a 3-way label at every time step.
        self.fc_label = nn.Linear(hidden_size, 3)

    def forward(self, x):
        """Return ``(token_change, labels)``, both computed for every time step."""
        hidden_seq, _state = self.lstm(x)
        hidden_seq = self.dropout(hidden_seq)
        return self.fc_token(hidden_seq), self.fc_label(hidden_seq)


class LogisticRegression(nn.Module):
    """Single linear layer that returns its raw output (no sigmoid applied).

    NOTE(review): a later cell of this notebook declares another
    ``LogisticRegression`` that does apply a sigmoid; whichever cell ran last
    determines what the name refers to.

    Args:
        input_dim: Size of each input vector.
        output_dim: Size of each output vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``self.fc(x)`` without any activation."""
        return self.fc(x)

class SimpleFusionNetwork(nn.Module):
    """Two-stage fusion head: (Linear, ReLU, Dropout) then (Linear, Sigmoid).

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Width of the hidden layer.
        dropout_rate: Dropout probability after the hidden activation
            (default 0.5).
    """

    def __init__(self, input_dim, hidden_dim, dropout_rate=0.5):
        super().__init__()

        # Hidden stage.
        hidden_stage = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
        ]
        self.input_layer = nn.Sequential(*hidden_stage)

        # Output stage: the prediction is a probability, so the sigmoid
        # squashes it into the 0-1 range.
        output_stage = [nn.Linear(hidden_dim, 1), nn.Sigmoid()]
        self.output_layer = nn.Sequential(*output_stage)

    def forward(self, x):
        """Return one sigmoid-activated value per input row."""
        return self.output_layer(self.input_layer(x))

class VerySimpleFusionNetwork(nn.Module):
    """Minimal fusion head: one linear layer followed by a sigmoid.

    Args:
        input_dim: Size of each input vector.
        output_dim: Size of each output vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(fc(x))``."""
        return self.sigmoid(self.fc(x))


class FusionModel(nn.Module):
    """Combine an ``LSTMModel`` branch and a ``LogisticRegression`` branch.

    The label head of the LSTM branch at the last time step is concatenated
    with the output of the logistic-regression branch and passed through a
    ``SimpleFusionNetwork``.

    Args:
        lstm_input_size, lstm_hidden_size, lstm_output_size: Forwarded to
            ``LSTMModel``.
        nn_input_dim, nn_output_dim: Forwarded to ``LogisticRegression``.
        fusion_input_dim, fusion_hidden_dim: Forwarded to
            ``SimpleFusionNetwork``. ``fusion_input_dim`` has to match the
            width of the concatenation built in ``forward``.
        dropout_rate: Dropout probability shared by the LSTM branch and the
            fusion network (default 0.5).
    """

    def __init__(self, lstm_input_size, lstm_hidden_size, lstm_output_size, nn_input_dim, nn_output_dim, fusion_input_dim, fusion_hidden_dim, dropout_rate=0.5):
        super().__init__()

        # Time-series branch.
        self.lstm_model = LSTMModel(
            lstm_input_size, lstm_hidden_size, lstm_output_size, dropout_rate=dropout_rate
        )
        # Static-feature branch.
        self.nn_model = LogisticRegression(nn_input_dim, nn_output_dim)
        # Head that fuses both branch outputs.
        self.fusion_model = SimpleFusionNetwork(
            fusion_input_dim, fusion_hidden_dim, dropout_rate=dropout_rate
        )

    def forward(self, batch_features_1, batch_features_2):
        """Return the fusion output for static (``_1``) and sequence (``_2``) features."""
        # Only the second output of the LSTM branch (its label head) is used.
        step_labels = self.lstm_model(batch_features_2)[1]
        static_out = self.nn_model(batch_features_1)

        # Use the label-head output at the final time step only.
        last_step = step_labels[:, -1, :]

        fused_input = torch.cat((static_out, last_step), dim=1)
        return self.fusion_model(fused_input)

In [None]:
import torch.nn as nn


class LogisticRegression(nn.Module):
    """Linear layer followed by a sigmoid.

    NOTE(review): this cell re-declares ``LogisticRegression``. An earlier
    cell defines a class of the same name whose forward pass returns the
    linear output without a sigmoid; once this cell has run, this definition
    is the one bound to the name.

    Args:
        input_dim: Size of each input vector.
        output_dim: Size of each output vector.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(fc(x))``."""
        return self.sigmoid(self.fc(x))


class SimpleNN(nn.Module):
    """Feed-forward network with one hidden layer and a sigmoid output.

    NOTE(review): a class named ``SimpleNN`` with the same layers is also
    declared in an earlier cell; this declaration replaces it when run.

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of each output vector.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        # The sigmoid maps the final layer's output into the [0, 1] range.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply fc1 -> relu -> fc2 -> sigmoid to ``x``."""
        activations = x
        for layer in (self.fc1, self.relu, self.fc2, self.sigmoid):
            activations = layer(activations)
        return activations

In [None]:
class LSTMModel(nn.Module):
    """Multi-layer LSTM with a token-change head and a 3-way label head.

    NOTE(review): a class named ``LSTMModel`` with the same layers is also
    declared in an earlier cell; this declaration replaces it when run.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden size.
        output_size: Output width of the ``fc_token`` head.
        num_layers: Number of stacked LSTM layers (default 3).
        dropout_rate: Dropout probability between LSTM layers and on the LSTM
            output (default 0.8).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=3, dropout_rate=0.8):
        super().__init__()
        lstm_options = dict(num_layers=num_layers, batch_first=True, dropout=dropout_rate)
        # Stacked LSTM with dropout between consecutive layers.
        self.lstm = nn.LSTM(input_size, hidden_size, **lstm_options)
        # Dropout on the sequence produced by the LSTM.
        self.dropout = nn.Dropout(dropout_rate)
        # Linear heads, both applied at every time step.
        self.fc_token = nn.Linear(hidden_size, output_size)  # token-change prediction
        self.fc_label = nn.Linear(hidden_size, 3)  # per-step label prediction

    def forward(self, x):
        """Return ``(token_change, labels)`` for every time step of ``x``."""
        sequence_out = self.lstm(x)[0]
        sequence_out = self.dropout(sequence_out)
        token_change = self.fc_token(sequence_out)
        labels = self.fc_label(sequence_out)
        return token_change, labels

In [None]:
# Initialise models + dataloaders
from torch.utils.data import TensorDataset, DataLoader

# The extracted features are already tensors. torch.as_tensor converts the
# dtype without the copy-construct UserWarning that torch.tensor(tensor) emits.
train_features_1 = torch.as_tensor(train_features_1, dtype=torch.float32)
train_features_2 = torch.as_tensor(train_features_2, dtype=torch.float32)
train_labels = torch.as_tensor(train_labels, dtype=torch.float32)

test_features_1 = torch.as_tensor(test_features_1, dtype=torch.float32)
test_features_2 = torch.as_tensor(test_features_2, dtype=torch.float32)
test_labels = torch.as_tensor(test_labels, dtype=torch.float32)

# Layer sizes are derived from the data instead of hard-coded. The previous
# input_dim=136 did not match the static features (171 selected columns plus
# the appended stay length = 172), and the fusion input_dim=2 did not match
# the concatenation of the nn_model output (1) with the LSTM label head (3).
nn_model = LogisticRegression(input_dim=train_features_1.shape[1], output_dim=1)
lstm_model = LSTMModel(input_size=train_features_2.shape[-1], hidden_size=128, output_size=50)
fusion_model = VerySimpleFusionNetwork(
    input_dim=nn_model.fc.out_features + lstm_model.fc_label.out_features,
    output_dim=1,
)

# NOTE(review): these two names previously held CustomDataset objects.
# Re-running the feature-extraction cell after this one will therefore fail
# unless the CustomDataset cell is run again first.
train_dataset = TensorDataset(train_features_1, train_features_2, train_labels)
test_dataset = TensorDataset(test_features_1, test_features_2, test_labels)

# Dataloaders: shuffle only the training data.
batch_size = 1000
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

  train_features_1 = torch.tensor(train_features_1, dtype=torch.float32)
  train_features_2 = torch.tensor(train_features_2, dtype=torch.float32)
  train_labels = torch.tensor(train_labels, dtype=torch.float32)
  test_features_1 = torch.tensor(test_features_1, dtype=torch.float32)
  test_features_2 = torch.tensor(test_features_2, dtype=torch.float32)
  test_labels = torch.tensor(test_labels, dtype=torch.float32)


In [None]:
# NOTE(review): these four lists repeat the definitions of the earlier
# "indice" cell with identical values; they are kept so this cell can run on
# its own.
demo_indices = list(range(3))  # demo features: first 3 positions

icd_cols_indices = list(range(3, 3+91))  # icd_cols: next 91 positions


lab_cols_indices = list(range(3+91, 3+91+36))  # lab_cols: next 36 positions
# TODO: add a time feature

med_cols_indices = list(range(3+91+36, 3+91+36+41))  # med_cols: next 41 positions


# Show the ICD slice (positions 3..93) of the second row (index 1) for one
# sample id. The identical print used to be issued twice; once is enough.
print(data['feat_dict']['11674366_29673314'][1][3:3+91])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [None]:
#@title all model train and evaluate:
from sklearn.metrics import roc_auc_score
import torch.optim as optim

num_epochs = 5
learning_rate = 1e-4

criterion = torch.nn.BCELoss()
# One optimiser updates the parameters of all three sub-models jointly.
optimizer = optim.SGD(
    list(nn_model.parameters()) + list(lstm_model.parameters()) + list(fusion_model.parameters()),
    lr=learning_rate,
)

for epoch in range(num_epochs):
    for batch_features_1, batch_features_2, batch_labels in train_dataloader:
        # Clear gradients before the forward/backward pass so nothing left
        # over from a previous (possibly interrupted) run is accumulated.
        optimizer.zero_grad()

        nn_out = nn_model(batch_features_1)
        _, lstm_out = lstm_model(batch_features_2)

        # Use the label-head output of the last time step only.
        lstm_out_last = lstm_out[:, -1, :]

        fusion_in = torch.cat((nn_out, lstm_out_last), dim=1)
        fusion_out = fusion_model(fusion_in)

        # BCELoss needs targets shaped like the prediction, i.e. (batch, 1).
        batch_labels = batch_labels.unsqueeze(1)
        loss = criterion(fusion_out, batch_labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
        print(f'Loss: {loss.item()}')

    # End of epoch: switch to evaluation mode (disables dropout).
    nn_model.eval()
    lstm_model.eval()
    fusion_model.eval()

    preds = []
    true_labels = []

    # Inference + evaluation on the test dataloader.
    with torch.no_grad():
        for batch_features_1, batch_features_2, batch_labels in test_dataloader:
            nn_out = nn_model(batch_features_1)
            _, lstm_out = lstm_model(batch_features_2)
            lstm_out_last = lstm_out[:, -1, :]
            fusion_in = torch.cat((nn_out, lstm_out_last), dim=1)
            fusion_out = fusion_model(fusion_in)

            # reshape(-1) always yields a 1-D tensor. The former .squeeze()
            # produced a 0-d array for a final batch of size 1, which
            # list.extend cannot iterate.
            preds.extend(fusion_out.reshape(-1).cpu().tolist())
            true_labels.extend(batch_labels.reshape(-1).cpu().tolist())

    # AUC of this epoch; the reported loss is that of the last training batch.
    auc = roc_auc_score(true_labels, preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test AUC: {auc}')

    # Back to training mode for the next epoch.
    nn_model.train()
    lstm_model.train()
    fusion_model.train()


In [None]:
#train and evaluate: part 1
from sklearn.metrics import roc_auc_score
import torch.optim as optim

# Input width is taken from the static features instead of a hard-coded 172,
# so it follows any change in the feature selection.
input_dim = train_features_1.shape[1]
hidden_dim = 10  # example value; not used by LogisticRegression below
output_dim = 1  # single output for the binary target

nn_model = LogisticRegression(input_dim, output_dim)

num_epochs = 10
learning_rate = 1e-4

criterion = torch.nn.BCELoss()
optimizer = optim.SGD(nn_model.parameters(), lr=learning_rate)  # only nn_model is optimised

for epoch in range(num_epochs):
    # Fix: train on the training dataloader. The loop used to iterate over
    # test_dataloader, i.e. the model was fitted on the very data it is
    # evaluated on below.
    for batch_features_1, _, batch_labels in train_dataloader:  # feature set 1 and labels only
        optimizer.zero_grad()

        nn_out = nn_model(batch_features_1)

        # BCELoss needs targets shaped like the prediction, i.e. (batch, 1).
        batch_labels = batch_labels.unsqueeze(1)
        loss = criterion(nn_out, batch_labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
        print(f'Loss: {loss.item()}')

    # End of epoch: evaluate.
    nn_model.eval()

    preds = []
    true_labels = []

    with torch.no_grad():
        for batch_features_1, _, batch_labels in test_dataloader:  # feature set 1 and labels only
            nn_out = nn_model(batch_features_1)

            # reshape(-1) always yields a 1-D tensor. The former .squeeze()
            # produced a 0-d array for a final batch of size 1, which
            # list.extend cannot iterate.
            preds.extend(nn_out.reshape(-1).cpu().tolist())
            true_labels.extend(batch_labels.reshape(-1).cpu().tolist())

    # AUC of this epoch; the reported loss is that of the last training batch.
    auc = roc_auc_score(true_labels, preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test AUC: {auc}')

    # Back to training mode for the next epoch.
    nn_model.train()

In [None]:
# Instantiate the model. The input width must equal the width of the static
# features served by train_dataloader (the former value of 100 did not).
input_dim = train_features_1.shape[1]
output_dim = 1
model = LogisticRegression(input_dim, output_dim)

# Loss and optimiser. This notebook declares LogisticRegression twice (with
# and without a final sigmoid), so the loss is chosen to match the definition
# actually in effect: BCELoss for probabilities, BCEWithLogitsLoss for raw
# scores.
criterion = nn.BCELoss() if hasattr(model, 'sigmoid') else nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Train the model.
num_epochs = 10
for epoch in range(num_epochs):
    # train_dataloader yields (features_1, features_2, labels). Unpacking only
    # two values raised "ValueError: too many values to unpack".
    for batch_features, _, batch_labels in train_dataloader:
        # Forward pass; targets are reshaped to (batch, 1) to match the output.
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels.unsqueeze(1))

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report the loss of the last batch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

ValueError: ignored

In [None]:
# Debug inspection: show the labels of the most recently iterated batch.
batch_labels

tensor([0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 1., 0., 0., 0., 0.])