# setup

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split



In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [None]:
# Read the cleaned NAFL CSV (path below) into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/nobackup/users/ericason/mlhc-final-project/clean_data/nafl/combined.large.nafl.csv"
)

In [None]:
# Build the feature table X and the label table Y, both indexed by StudyID.

# 'DaysUntilFirstProgression' and 'Censored' are removed up front; 'Outcome'
# is kept as the prediction target.
# (Disabled alternative: drop 'Outcome' instead and use
#  'DaysUntilFirstProgression' as the target.)
data = data.drop(columns=['DaysUntilFirstProgression', 'Censored'])

# Labels: one 'Outcome' value per StudyID.
Y = data[['StudyID', 'Outcome']].set_index('StudyID')

# Features: every remaining column except the label and the two BMI-category
# columns.
X = data.drop(
    columns=['Outcome', 'mean_BMI_category', 'last_BMI_category']
).set_index('StudyID')

In [None]:
# Cast the label table to integers (True/False becomes 1/0).
Y = Y.astype(dtype=int)


In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")
print(f"Shape of X: {X.shape}. Shape of Y: {Y.shape}.")

In [None]:
# Preview the first rows of the feature table (BMI, lab and age features).
X.head()

In [None]:
# Count the rows per Outcome value to inspect the class balance.
Y.value_counts()

# establish the model

In [None]:
#reweighted
class MAFLDDataset(Dataset):
    """Map-style dataset of feature rows, labels and optional sample weights.

    Args:
        X: Feature matrix as a tensor, NumPy array, pandas object or nested
            sequence. Stored as an independent float32 tensor.
        y: Labels aligned with the rows of ``X``. Stored as float32.
        w: Optional per-sample weights aligned with the rows of ``X``. When
            omitted, items are ``(X, y)`` pairs instead of ``(X, y, w)``
            triples, so the same class serves unweighted training loops.
    """

    def __init__(self, X, y, w=None):
        self.X = self._to_float_tensor(X)
        self.y = self._to_float_tensor(y)
        self.w = None if w is None else self._to_float_tensor(w)

    @staticmethod
    def _to_float_tensor(values):
        """Return an independent float32 tensor copy of ``values``."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct warning;
            # detach().clone() is the supported way to copy a tensor.
            return values.detach().clone().to(torch.float32)
        # np.asarray lets pandas DataFrames/Series through as well as arrays
        # and nested lists; torch.tensor then copies the data.
        return torch.tensor(np.asarray(values), dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.w is None:
            return self.X[idx], self.y[idx]
        return self.X[idx], self.y[idx], self.w[idx]

In [None]:
# define by subclassing nn.Module and initialize the neural network layers in __init__.
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per row.

    Args:
        input_dim: Number of input features. When omitted, falls back to the
            column count of the notebook-level feature table ``X`` (the
            previous hard-wired behaviour).
    """

    def __init__(self, input_dim=None):
        super().__init__()  # inherit init from parent class
        if input_dim is None:
            input_dim = X.shape[1]
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )
        # Not applied in forward(): the network returns logits, which is the
        # input BCEWithLogitsLoss expects.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the un-activated logits for ``x``."""
        logits = self.linear_relu_stack(x)
        return logits

In [None]:
# Instantiate the network and place it on the selected device.
model = NeuralNetwork()
model = model.to(device)
# print(model)

# Loss function and optimizer.
# Alternatives that were tried and left disabled:
#   loss_fn = nn.MSELoss()
#   loss_fn = nn.BCELoss()  # takes probabilities; BCEWithLogitsLoss takes raw logits
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)  # baseline learning rate

# train model on train/test split



In [None]:

#reweight
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import torch
import pandas as pd

# Split into training and testing sets.
# The outcome classes are imbalanced (see the minority weight below), so the
# split is stratified on the label to keep the positive rate equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

# Identify columns to scale: anything named like BMI/age plus numeric lab columns.
cols_bmi_age = [col for col in X.columns if 'bmi' in col.lower() or 'age' in col.lower()]
cols_lab = [col for col in X.columns if col.startswith("Lab_") and pd.api.types.is_numeric_dtype(X[col])]
# A column can satisfy both rules (e.g. a "Lab_" column whose name contains
# "age"); de-duplicate in order so no column is selected twice.
cols_to_scale = list(dict.fromkeys(cols_bmi_age + cols_lab))

# Apply scaling. The scaler is fitted on the training rows only and then
# reused for the test rows, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
if cols_to_scale:  # StandardScaler cannot be fitted on an empty column selection
    X_train_scaled[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
    X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Convert to numpy arrays
X_train_np = X_train_scaled.values.astype(np.float32)
y_train_np = y_train.values.astype(np.float32)
X_test_np = X_test_scaled.values.astype(np.float32)
y_test_np = y_test.values.astype(np.float32)

# Compute class weights
counts = Counter(y_train_np.ravel())
total = sum(counts.values())
# Inverse-frequency alternative (disabled):
# class_weights = {cls: total/count for cls, count in counts.items()}
class_weights = {
    0.0: 1.0,   # majority class
    1.0: 50.0   # minority class – give 50x more importance
}

# One weight per training row, looked up from its label.
sample_weights = np.array([class_weights[y] for y in y_train_np.ravel()], dtype=np.float32)

# Convert to torch tensors. Training tensors stay on the CPU (batches are moved
# to the device inside the training loop); the test tensors go to the device now.
X_torch = torch.tensor(X_train_np, dtype=torch.float32)
Y_torch = torch.tensor(y_train_np, dtype=torch.float32)
W_torch = torch.tensor(sample_weights, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32).to(device)
Y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32).to(device)


In [None]:

from torch.utils.data import DataLoader
# Wrap the weighted training tensors in a dataset and serve them in shuffled
# mini-batches of 64.
train_data = MAFLDDataset(X=X_torch, y=Y_torch, w=W_torch)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

In [None]:
import torch.nn as nn

class SimpleNN(nn.Module):
    """Small MLP: input -> 128 -> 64 -> 1, with ReLU and dropout after each hidden layer."""

    def __init__(self, input_dim):
        super().__init__()
        stack = []
        in_features = input_dim
        # Two hidden stages, each Linear -> ReLU -> Dropout(0.3).
        for width in (128, 64):
            stack += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(0.3)]
            in_features = width
        # Single-unit output layer with no activation.
        self.model = nn.Sequential(*stack, nn.Linear(in_features, 1))

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its raw output."""
        output = self.model(x)
        return output

In [None]:
from sklearn.metrics import roc_auc_score
# Weighted training with early stopping on validation AUC.
# NOTE(review): the held-out test split doubles as the early-stopping
# validation set here, so AUCs measured on it afterwards are optimistic;
# consider carving a separate validation split.

model = SimpleNN(X_torch.shape[1]).to(device)
loss_fn = nn.BCEWithLogitsLoss(reduction='none')  # per-sample losses; weighted manually below
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)

num_epochs = 30
patience = 5  # epochs without AUC improvement tolerated before stopping
best_auc = 0
best_state = None  # copy of the weights that produced best_auc
counter = 0

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0  # sum of the per-batch losses for this epoch

    for batch_X, batch_y, batch_w in train_loader:
        batch_X = batch_X.to(device)
        # reshape(-1) flattens (B, 1) to (B,) and, unlike squeeze(), keeps a
        # final batch of size 1 one-dimensional.
        batch_y = batch_y.to(device).reshape(-1)
        batch_w = batch_w.to(device).reshape(-1)

        optimizer.zero_grad()
        outputs = model(batch_X).reshape(-1)
        # Scale each sample's loss by its class weight before averaging.
        loss = (loss_fn(outputs, batch_y) * batch_w).mean()

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # validation
    model.eval()
    with torch.no_grad():
        val_preds = model(X_test_tensor).reshape(-1)
        val_probs = torch.sigmoid(val_preds)
        val_auc = roc_auc_score(y_test_np.ravel(), val_probs.cpu().numpy())

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Val AUC: {val_auc:.4f}")

    # early stopping
    if val_auc > best_auc:
        best_auc = val_auc
        counter = 0
        # state_dict() returns references to the live parameters, so clone
        # them; otherwise later optimizer steps would overwrite the snapshot.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Without this the model would keep the weights from the last epoch run, which
# is up to `patience` epochs past the best validation AUC.
if best_state is not None:
    model.load_state_dict(best_state)


In [None]:
# reweight
# Original weighted training loop (no early stopping), kept commented out for reference.
# num_epochs = 20

# for epoch in range(num_epochs):
#     print(f"Epoch {epoch+1}/{num_epochs}")
    
#     for batch_X, batch_y, batch_w in train_data:  # train_data must return sample weights too
#         # Move to device
#         batch_X = torch.tensor(batch_X).to(device)
#         batch_y = torch.tensor(batch_y).to(device)
#         batch_w = torch.tensor(batch_w).to(device)

#         # Zero gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(batch_X)

#         # Compute weighted loss (e.g., Binary Cross Entropy)
#         loss = (loss_fn(outputs, batch_y) * batch_w).mean()

#         # Backward and optimize
#         loss.backward()
#         optimizer.step()

#     print(f"Loss: {loss.item():.4f}")


In [None]:
#reweight
# Run model on test data
X_test_tensor = torch.tensor(X_test_scaled.astype(np.float32).values, dtype=torch.float32).to(device)

# Inference only: switch dropout off and skip gradient tracking.
model.eval()
with torch.no_grad():
    Y_hat_test = model(X_test_tensor)
    Y_hat_probs = torch.sigmoid(Y_hat_test)
# Hard 0/1 labels at a 0.2 probability cut-off.
Y_pred_binary = (Y_hat_probs > 0.2).float()

from sklearn.metrics import roc_auc_score
# ROC AUC is a ranking metric, so it is scored on the continuous probabilities.
# Feeding it the thresholded 0/1 labels collapses the curve to a single
# operating point and misreports the AUC.
print(roc_auc_score(y_test.values.ravel(), Y_hat_probs.cpu().numpy().ravel()))



In [None]:
# Display the predicted probabilities (sigmoid of the model outputs) for the test rows.
Y_hat_probs

In [None]:
# Per-class precision / recall / F1 for the thresholded test predictions.
print(
    classification_report(
        y_true=y_test,
        y_pred=Y_pred_binary.detach().cpu().numpy(),
    )
)

In [None]:
from sklearn.metrics import roc_curve
# ROC curve points (false/true positive rate per threshold) from the predicted
# probabilities; both inputs are flattened to 1-D before the call.
fpr, tpr, thresholds = roc_curve(
    y_test_np.ravel(),
    Y_hat_probs.detach().cpu().numpy().ravel(),
)



In [None]:
!pip install matplotlib

# tweaking model design

In [None]:
# original model
class NeuralNetwork(nn.Module):
    """Baseline fully connected network: five ReLU hidden layers, one output unit.

    Args:
        input_dim: Number of input features. When omitted, falls back to the
            column count of the notebook-level feature table ``X`` (the
            previous hard-wired behaviour).
    """

    def __init__(self, input_dim=None):
        super().__init__()  # inherit init from parent class
        if input_dim is None:
            input_dim = X.shape[1]
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),  # no activation follows this layer
        )

    def forward(self, x):
        """Return the raw output of the layer stack for ``x``."""
        pred = self.linear_relu_stack(x)
        return pred

In [None]:
# adding dropout, switching to LeakyReLU, adding batchnorm layers
class NeuralNetwork(nn.Module):
    """MLP variant with LeakyReLU activations, dropout and one BatchNorm layer.

    Args:
        input_dim: Number of input features. When omitted, falls back to the
            column count of the notebook-level feature table ``X`` (the
            previous hard-wired behaviour).
    """

    def __init__(self, input_dim=None):
        super().__init__()  # inherit init from parent class
        if input_dim is None:
            input_dim = X.shape[1]
        self.linear_relu_stack = nn.Sequential(
            # Only this first hidden layer is batch-normalized.
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Dropout(0.2),

            nn.Linear(512, 256),
            nn.LeakyReLU(),

            # Here dropout precedes the activation, unlike the first layer.
            nn.Linear(256, 128),
            nn.Dropout(0.2),
            nn.LeakyReLU(),

            nn.Linear(128, 64),
            nn.LeakyReLU(),

            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return the raw output of the layer stack for ``x``."""
        pred = self.linear_relu_stack(x)
        return pred

In [None]:
# attempting skip connections
class ResidualBlock(nn.Module):
    """Two Linear+BatchNorm layers wrapped in an identity skip connection.

    The block maps ``dim`` features to ``dim`` features so that its output can
    be added element-wise to its input.
    """

    def __init__(self, dim):
        super().__init__()
        first_stage = [nn.Linear(dim, dim), nn.BatchNorm1d(dim), nn.ReLU()]
        second_stage = [nn.Linear(dim, dim), nn.BatchNorm1d(dim)]
        self.block = nn.Sequential(*first_stage, *second_stage)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(x + block(x))``."""
        transformed = self.block(x)
        summed = x + transformed  # skip connection
        return self.relu(summed)

class NeuralNetwork(nn.Module):
    """Residual MLP: a 256-wide input projection, three residual blocks, one output unit.

    Args:
        input_dim: Number of input features.
    """

    def __init__(self, input_dim):
        super().__init__()
        width = 256
        self.input_layer = nn.Linear(input_dim, width)

        self.resblock1 = ResidualBlock(width)
        self.resblock2 = ResidualBlock(width)
        self.resblock3 = ResidualBlock(width)

        self.output_layer = nn.Linear(width, 1)

    def forward(self, x):
        """Project ``x``, pass it through the three blocks in order, and return the output layer's result."""
        hidden = self.input_layer(x)
        for block in (self.resblock1, self.resblock2, self.resblock3):
            hidden = block(hidden)
        return self.output_layer(hidden)

In [None]:
# creating an experiment manager that can test run the various edits we want to make
from itertools import product

# Hyper-parameter grid: each key maps to the candidate values to try.
search_space = dict(
    hidden_sizes=[[512, 128], [1024, 512, 128]],
    activation=["relu", "leaky_relu"],
    dropout=[0.0, 0.2],
    use_batchnorm=[True, False],
    learning_rate=[1e-3, 1e-4],
)

# Expand the grid into one config dict per combination (Cartesian product of
# the value lists, in key order).
all_configs = [
    {name: value for name, value in zip(search_space, combo)}
    for combo in product(*search_space.values())
]

In [None]:
import torch.nn as nn

def get_activation(name):
    """Return a new activation module for ``name`` ("relu" or "leaky_relu").

    Raises:
        KeyError: If ``name`` is not one of the supported names.
    """
    factories = {
        "relu": nn.ReLU,
        "leaky_relu": lambda: nn.LeakyReLU(0.01),
    }
    build = factories[name]
    return build()

class FlexibleNetwork(nn.Module):
    """MLP assembled from a configuration, ending in a single output unit.

    Args:
        input_dim: Number of input features.
        hidden_sizes: Width of each hidden layer, in order.
        activation: Activation name understood by ``get_activation``.
        dropout: Dropout probability; a Dropout layer is added only when > 0.
        use_batchnorm: Whether to insert BatchNorm1d after each hidden Linear.
    """

    def __init__(self, input_dim, hidden_sizes, activation, dropout, use_batchnorm):
        super().__init__()
        widths = [input_dim, *hidden_sizes]
        modules = []
        # Each hidden stage: Linear -> [BatchNorm] -> activation -> [Dropout].
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            if use_batchnorm:
                modules.append(nn.BatchNorm1d(fan_out))
            modules.append(get_activation(activation))
            if dropout > 0.0:
                modules.append(nn.Dropout(dropout))
        # Output layer takes the last hidden width (or input_dim when there are no hidden layers).
        modules.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        """Return the raw output of the assembled stack for ``x``."""
        output = self.model(x)
        return output


In [None]:
# training loop and evaluator
def train_model(model, train_loader, val_loader, lr, device="cpu", epochs=10, loss_fn=None):
    """Train ``model`` with Adam and return its validation loss after the last epoch.

    Args:
        model: Network whose output has one value per input row.
        train_loader: Iterable of batches. Each batch is a sequence whose first
            two items are ``(features, targets)``; further items (for example
            sample weights) are ignored.
        val_loader: Iterable of validation batches in the same format.
        lr: Learning rate for the Adam optimizer.
        device: Device that the model and every batch are moved to.
        epochs: Number of passes over ``train_loader``; must be at least 1.
        loss_fn: Loss returning a scalar per batch. Defaults to ``nn.MSELoss()``.

    Returns:
        float: Mean of the per-batch validation losses after the final epoch.

    Raises:
        ValueError: If ``epochs`` is below 1 or ``val_loader`` yields no batches.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    model.to(device)
    if loss_fn is None:
        loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def _batch_loss(batch):
        x, y = batch[0].to(device), batch[1].to(device)
        # Flatten both sides to 1-D. Comparing squeezed (B,) predictions with
        # (B, 1) targets would broadcast to a (B, B) matrix inside the loss.
        return loss_fn(model(x).reshape(-1), y.reshape(-1))

    final_val_loss = None
    for _ in range(epochs):
        model.train()
        for batch in train_loader:
            loss = _batch_loss(batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate
        model.eval()
        running_loss = 0.0
        n_batches = 0
        with torch.no_grad():
            for batch in val_loader:
                running_loss += _batch_loss(batch).item()
                n_batches += 1
        if n_batches == 0:
            raise ValueError("val_loader yielded no batches")
        final_val_loss = running_loss / n_batches

    return final_val_loss  # validation loss of the last epoch


In [None]:
# run experiments
def run_experiments(X_train, y_train, X_val, y_val, configs=None, device="cpu"):
    """Train one FlexibleNetwork per configuration and rank them by validation loss.

    Args:
        X_train, y_train: Training features and targets (DataFrame, array or tensor).
        X_val, y_val: Validation features and targets in the same formats.
        configs: Iterable of config dicts with the keys ``hidden_sizes``,
            ``activation``, ``dropout``, ``use_batchnorm`` and
            ``learning_rate``. Defaults to the notebook-level ``all_configs``.
        device: Device handed to ``train_model``.

    Returns:
        list: ``(config, val_loss)`` tuples sorted by ascending validation loss.
    """
    from torch.utils.data import DataLoader, TensorDataset

    batch_size = 64

    def _to_float_tensor(values):
        if isinstance(values, torch.Tensor):
            return values.detach().to(dtype=torch.float32, device="cpu")
        # np.asarray accepts pandas DataFrames/Series as well as arrays.
        return torch.as_tensor(np.asarray(values), dtype=torch.float32)

    def _as_dataset(features, targets):
        # Unweighted (x, y) pairs, which is the batch format train_model
        # unpacks; targets are flattened to match its squeezed predictions.
        return TensorDataset(_to_float_tensor(features), _to_float_tensor(targets).reshape(-1))

    if configs is None:
        configs = all_configs

    # The data does not depend on the config, so build the datasets once.
    train_set = _as_dataset(X_train, y_train)
    val_set = _as_dataset(X_val, y_val)
    input_dim = train_set.tensors[0].shape[1]
    # BatchNorm1d cannot run in training mode on a batch of one row, so drop a
    # trailing single-row batch if the training set size would produce one.
    drop_last = len(train_set) % batch_size == 1

    results = []
    for config in configs:
        print(f"Running config: {config}")
        model = FlexibleNetwork(
            input_dim=input_dim,
            hidden_sizes=config["hidden_sizes"],
            activation=config["activation"],
            dropout=config["dropout"],
            use_batchnorm=config["use_batchnorm"]
        )

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=drop_last)
        val_loader = DataLoader(val_set, batch_size=batch_size)

        val_loss = train_model(model, train_loader, val_loader, lr=config["learning_rate"], device=device)
        results.append((config, val_loss))
        print(f"Validation loss: {val_loss:.4f}")

    return sorted(results, key=lambda item: item[1])  # sorted by val loss


In [None]:
# Search over the standardized arrays: X_train / X_test are the unscaled
# DataFrames, while the *_np arrays carry the StandardScaler output.
# The ranking is kept in a variable so it is not lost after display.
experiment_results = run_experiments(X_train_np, y_train_np, X_test_np, y_test_np)
experiment_results

In [None]:
# Split the scaled training tensors once more (70/30), rebinding
# X_train / X_test / y_train / y_test to the resulting tensor subsets.
# The sample weights are split together with X and y so they stay aligned, and
# so that MAFLDDataset receives the weight argument it requires.
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X_torch, Y_torch, W_torch, test_size=0.3, random_state=42
)

train_dataset = MAFLDDataset(X_train, y_train, w_train)
# Note: train_data is rebound here to a DataLoader over the new subset.
train_data = DataLoader(train_dataset, shuffle=True, batch_size=64)

In [None]:
# Number of feature columns in the current training split.
X_train.shape[1]

In [None]:
# Shape of the current training labels.
y_train.shape