# **Application of machine learning in healthcare**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Load data (assumes a CSV with features and a 'readmitted' binary target)
df = pd.read_csv('/kaggle/input/ecr-healthcare/ehr_readmission.csv')

# Define static and dynamic feature lists (example)
static_features = ['age', 'gender', 'race', 'admission_type']
continuous_features = ['lab_creatinine', 'lab_glucose', 'vital_hr', 'vital_bp_systolic']
categorical_features = ['gender', 'race', 'admission_type']
target = 'readmitted'

# Split data
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df[target], random_state=42)
X_train = df_train[static_features + continuous_features]
X_test = df_test[static_features + continuous_features]
y_train = df_train[target].values
y_test = df_test[target].values

# Preprocessing pipeline
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, continuous_features),
    ('cat', cat_pipeline, categorical_features)
])

# Logistic Regression baseline
logreg_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000))
])

logreg_pipeline.fit(X_train, y_train)
y_pred_proba = logreg_pipeline.predict_proba(X_test)[:, 1]
print("Logistic Regression AUC-ROC:", roc_auc_score(y_test, y_pred_proba))
print("Logistic Regression AUC-PR:", average_precision_score(y_test, y_pred_proba))

# Prepare data for PyTorch MLP
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

if hasattr(X_train_proc, 'toarray'):
    X_train_proc = X_train_proc.toarray()
    X_test_proc = X_test_proc.toarray()

tensor_x_train = torch.tensor(X_train_proc, dtype=torch.float32)
tensor_y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
tensor_x_test = torch.tensor(X_test_proc, dtype=torch.float32)
tensor_y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
test_dataset = TensorDataset(tensor_x_test, tensor_y_test)

dataloader_train = DataLoader(train_dataset, batch_size=64, shuffle=True)
dataloader_test = DataLoader(test_dataset, batch_size=64)

class ReadmissionMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ReadmissionMLP(tensor_x_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(1, 11):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in dataloader_train:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch}: Train loss = {epoch_loss / len(dataloader_train):.4f}")

model.eval()
with torch.no_grad():
    preds = []
    targets = []
    for X_batch, y_batch in dataloader_test:
        X_batch = X_batch.to(device)
        y_pred = model(X_batch).cpu().numpy()
        preds.extend(y_pred.flatten().tolist())
        targets.extend(y_batch.numpy().flatten().tolist())

print("MLP AUC-ROC:", roc_auc_score(targets, preds))
print("MLP AUC-PR:", average_precision_score(targets, preds))
print(classification_report(targets, [1 if p > 0.5 else 0 for p in preds], zero_division=0))


Logistic Regression AUC-ROC: 0.5227373722411495
Logistic Regression AUC-PR: 0.21483853220711757
Epoch 1: Train loss = 0.6189
Epoch 2: Train loss = 0.5302
Epoch 3: Train loss = 0.5395
Epoch 4: Train loss = 0.5231
Epoch 5: Train loss = 0.5248
Epoch 6: Train loss = 0.5205
Epoch 7: Train loss = 0.5198
Epoch 8: Train loss = 0.5241
Epoch 9: Train loss = 0.5223
Epoch 10: Train loss = 0.5264
MLP AUC-ROC: 0.5288105465856909
MLP AUC-PR: 0.2181835338355407
              precision    recall  f1-score   support

         0.0       0.79      1.00      0.88       157
         1.0       0.00      0.00      0.00        43

    accuracy                           0.79       200
   macro avg       0.39      0.50      0.44       200
weighted avg       0.62      0.79      0.69       200

