In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE

import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random

## Prepare Data
Separate out the outcomes. Keep only mean and last BMI. 

Split into train and test datasets.

Standardize the lab values, age at first diagnosis, and BMI (train the scaler on the training set and then use it on the test set).

Convert to tensors

In [None]:
# Read the combined NAFL table (comma-separated, first row holds the column names)
# and preview the first rows.
DATA_PATH = "/nobackup/users/ericason/mlhc-final-project/clean_data/nafl/combined.large.nafl.csv"
df = pd.read_csv(DATA_PATH, sep=",", header=0)
df.head()

In [None]:
# Column bookkeeping used by the cells below.
outcome_cols = ["Outcome", "DaysUntilFirstProgression"]  # outcome columns
drop_cols = ["StudyID"]  # non-outcome columns that are removed from the features


def _needs_scaling(column_name):
    """Return True when the column name marks a lab, age, or non-categorical BMI column."""
    lowered = column_name.lower()
    if "lab" in lowered or "age" in lowered:
        return True
    return "bmi" in lowered and "category" not in lowered


# columns that should be scaled later (matched by substring on the lowercased name)
numerical_cols = [column_name for column_name in df.columns if _needs_scaling(column_name)]


In [None]:
# Feature frame: everything except the outcome columns and the dropped identifier columns.
X = df.drop(columns=[*outcome_cols, *drop_cols])
X.head()

In [None]:
# Outcome frame: a single-column DataFrame holding only the "Outcome" label
# (DaysUntilFirstProgression is not carried along).
Y = df.loc[:, ["Outcome"]]

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts
shape_report = f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}, Y_train shape: {Y_train.shape}, Y_test shape: {Y_test.shape}'
print(shape_report)

In [None]:
# standardize numerical columns
scaler = StandardScaler()
# Work on a copy: a plain assignment would only alias X_train, so the scaling
# below would overwrite the unscaled training frame, and re-running this cell
# would fit the scaler on values that were already scaled.
X_train_scaled = X_train.copy()
# fit the scaler on the training rows only and replace the numerical columns
X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_train_scaled.head()

In [None]:
# scale numerical test features
# Copy first so X_test itself stays unscaled; otherwise re-running this cell
# would apply the scaler a second time to already-scaled values.
X_test_scaled = X_test.copy()
# reuse the scaler fitted on the training set (no refitting on test rows)
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")
print(f"Shape of X: {X.shape}. Shape of Y: {Y.shape}.")

In [None]:
# Training split -> float32 numpy arrays -> torch tensors.
# torch.from_numpy shares memory with the numpy array it wraps.
X_train_numpy = X_train_scaled.to_numpy().astype(np.float32)
Y_train_numpy = Y_train.to_numpy().astype(np.float32)

X_train_torch = torch.from_numpy(X_train_numpy)
Y_train_torch = torch.from_numpy(Y_train_numpy)

In [None]:
# Test split -> float32 numpy arrays -> torch tensors.
# torch.from_numpy shares memory with the numpy array it wraps.
X_test_numpy = X_test_scaled.to_numpy().astype(np.float32)
Y_test_numpy = Y_test.to_numpy().astype(np.float32)

X_test_torch = torch.from_numpy(X_test_numpy)
Y_test_torch = torch.from_numpy(Y_test_numpy)

In [None]:
# curate the dataset
class MAFLDDataset(Dataset): # must contain init, len, and getitem
    """Map-style dataset pairing feature rows with their labels.

    Parameters
    ----------
    X : array-like or torch.Tensor
        Features; stored as an independent float32 tensor.
    y : array-like or torch.Tensor
        Labels; stored as an independent float32 tensor with the same
        first-dimension length as ``X``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X, y):
        # as_tensor + clone copies the data like torch.tensor did, but without the
        # "copy construct from a tensor" UserWarning when a tensor is passed in.
        self.X = torch.as_tensor(X, dtype=torch.float32).detach().clone()
        self.y = torch.as_tensor(y, dtype=torch.float32).detach().clone()
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        # number of rows
        return len(self.X)

    def __getitem__(self, idx):
        # (features, label) for row idx
        return self.X[idx], self.y[idx]

# Wrap the held-out split for batched evaluation.
test_dataset = MAFLDDataset(X_test_torch, Y_test_torch)
# Evaluation batches are not shuffled, so their order stays aligned with Y_test.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

## Establish Binary NN Model

Model architecture and hyperparameter definition

In [None]:
# hyperparameters
NUM_EPOCHS = 30
LEARNING_RATE = 1e-2
# Number of training rows labelled 1; used below as the number of class-0 rows to
# keep when downsampling. Stored as a plain int (not a 0-dim tensor) so it can be
# used directly as a slice bound and prints as a number.
NUM_SAMPLES = int((Y_train_torch == 1).sum().item())

In [None]:
# define by subclassing nn.Module and initialize the neural network layers in __init__.
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier that outputs one logit per row.

    Layer widths: in_features -> 1024 -> 512 -> 128 -> 64 -> 32 -> 1, with a ReLU
    after every layer except the last.

    Parameters
    ----------
    in_features : int, optional
        Number of input features. When omitted, the width of the notebook-level
        feature frame ``X`` (``X.shape[1]``) is used, as before.
    """

    def __init__(self, in_features=None):
        super().__init__() # inherit init from parent class
        if in_features is None:
            # backward-compatible default: size the first layer from the global X
            in_features = X.shape[1]
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )
        # not applied in forward(); callers use it to turn logits into probabilities
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return raw logits (no sigmoid), matching what BCEWithLogitsLoss expects."""
        logits = self.linear_relu_stack(x)
        return logits

## Train a Binary NN on the Downsampled Data
Downsample the majority (label 0) class of the training data and train on the balanced sample.
Repeat 10 times and average the metrics?

In [None]:
def train_model(model, data_loader, num_epochs=30, lr=1e-3):
    """Train ``model`` in place with Adam and binary cross-entropy on logits.

    Parameters
    ----------
    model : nn.Module
        Network returning one logit per row; it must already be on its target device.
    data_loader : iterable
        Yields ``(batch_X, batch_y)`` tensor pairs; ``batch_y`` must have the same
        shape as the model output.
    num_epochs : int
        Number of passes over ``data_loader``.
    lr : float
        Adam learning rate.

    Prints one progress line per epoch showing the loss of that epoch's last batch.
    Epochs that see no batches print nothing.
    """
    model.train()
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr) # start with this baseline learning rate
    # Send batches to wherever the model's parameters live, rather than relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    model_device = next(model.parameters()).device
    for epoch in range(num_epochs):
        loss = None
        for batch_X, batch_y in data_loader:
            # the loader already yields tensors: move them, do not re-wrap them
            batch_X = batch_X.to(model_device)
            batch_y = batch_y.to(model_device)

            # initialize the gradients to zero
            optimizer.zero_grad()

            # forward pass
            outputs = model(batch_X)

            # compute loss
            loss = loss_fn(outputs, batch_y)

            # gradient descent and update the weights
            loss.backward()
            optimizer.step()

        # report once per epoch (previously this ran only once, after the last epoch)
        if loss is not None:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
# Set the random seed to 42 for every RNG the notebook imports (Python's random,
# numpy and torch), so sampling, shuffling and weight initialisation are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# store models
models = []

# Train 10 models, each on all label-1 rows plus a fresh random subset of label-0 rows.
for i in range(10):
    # downsample the 0 class
    mask = (Y_train_torch == 0).squeeze()  # True for label-0 training rows
    X_0 = X_train_torch[mask]  # X with 0
    Y_0 = Y_train_torch[mask]  # Y with 0

    # sample_indices index into the class-0 subset (X_0 / Y_0), NOT into the full training set
    sample_indices = torch.randperm(X_0.size(0))[:int(NUM_SAMPLES)]
    X_0 = X_0[sample_indices, :]
    Y_0 = Y_0[sample_indices, :]

    # sampled label-0 rows followed by every row that is not label 0
    X_concat_0 = torch.cat((X_0, X_train_torch[~mask]), dim=0)
    Y_concat_0 = torch.cat((Y_0, Y_train_torch[~mask]), dim=0)

    print(Y_concat_0.sum())

    # Train on the downsampled set built above. Previously the dataset was built from
    # X_train_torch[sample_indices], which applied class-0-subset positions to the
    # full training set and therefore selected unrelated rows.
    train_dataset = MAFLDDataset(X_concat_0, Y_concat_0)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) # batch size 64

    # create an instance of NeuralNetwork, move to device, and train it
    model = NeuralNetwork().to(device)
    train_model(model, train_loader, NUM_EPOCHS, LEARNING_RATE)

    models.append(model)

In [None]:
# placeholder for validation losses (not populated in this cell)
val_losses = []
# auroc
test_auroc = []
train_auroc = []
Y_pred_train = []
Y_pred_test = []

# separate train / test AUROC for every trained model
for model in models:
    model.eval()

    # inference only: no autograd graph is needed
    with torch.no_grad():
        # train auroc
        X_input = X_train_torch.to(device=device, dtype=torch.float32)
        Y_hat = torch.sigmoid(model(X_input))  # logits -> probabilities
        train_probs = Y_hat.cpu().numpy()

        # test auroc (sigmoid applied here too, so both splits hold probabilities)
        X_input = X_test_torch.to(device=device, dtype=torch.float32)
        Y_hat = torch.sigmoid(model(X_input))
        test_probs = Y_hat.cpu().numpy()

    # AUROC is a ranking metric: score the probabilities themselves, not 0/1
    # predictions thresholded at 0.5.
    train_auroc.append(roc_auc_score(Y_train_numpy.ravel(), train_probs.ravel()))
    Y_pred_train.append(train_probs)

    test_auroc.append(roc_auc_score(Y_test_numpy.ravel(), test_probs.ravel()))
    Y_pred_test.append(test_probs)


In [None]:
# per-model training AUROC (one entry per model in `models`)
train_auroc

In [None]:
# mean training AUROC across the trained models
sum(train_auroc) / len(train_auroc)

In [None]:
# per-model test AUROC (one entry per model in `models`)
test_auroc

In [None]:
# mean test AUROC across the trained models (divide by the length of the list being summed)
sum(test_auroc) / len(test_auroc)

In [None]:
# Ensemble the models: element-wise mean of the per-model prediction arrays
# (axis 0 runs over the models).
Y_pred_train_numpy = np.mean(np.asarray(Y_pred_train), axis=0)
Y_pred_test_numpy = np.mean(np.asarray(Y_pred_test), axis=0)

In [None]:
# AUROC of the model-averaged predictions. The continuous averaged scores are passed
# directly: thresholding at 0.5 first would discard the ranking that AUROC measures.
print("AUROC of averaged predictions")
print("Train: ", roc_auc_score(Y_train_numpy.ravel(), Y_pred_train_numpy.ravel()))
print("Test: ", roc_auc_score(Y_test_numpy.ravel(), Y_pred_test_numpy.ravel()))

## Downsampling with Reweighting

Class weights, weighted dataset, and weighted training loop definition

In [None]:
from collections import Counter

In [None]:
# Tally how many training rows carry each label.
counts = Counter(Y_train_numpy.ravel())
total = 0
for label_count in counts.values():
    total += label_count
# class_weights = {cls: total/count for cls, count in counts.items()}
# Fixed weights are used instead of the inverse-frequency variant commented out above.
class_weights = dict([(1, 50), (0, 1)])
print(class_weights)

In [None]:
# Look up the class weight of every training label, in row order.
sample_weights = np.array(
    list(map(class_weights.__getitem__, Y_train_numpy.ravel())),
    dtype=np.float32,
)

In [None]:
# Per-row training weights as a tensor (shares memory with the numpy array).
W_train_torch = torch.as_tensor(sample_weights)

In [None]:
#reweighted
class MAFLDDataset(Dataset):
    """Map-style dataset of feature rows, labels and optional per-row loss weights.

    Parameters
    ----------
    X : array-like or torch.Tensor
        Features; stored as an independent float32 tensor.
    y : array-like or torch.Tensor
        Labels; stored as an independent float32 tensor.
    w : array-like or torch.Tensor, optional
        Per-row weights. When omitted the dataset behaves like the earlier
        two-argument ``MAFLDDataset`` and items are ``(X, y)`` pairs, so cells
        written against that definition keep working after this one runs.

    Raises
    ------
    ValueError
        If ``X``, ``y`` (and ``w`` when given) differ in number of rows.
    """

    def __init__(self, X, y, w=None):
        # as_tensor + clone copies the data like torch.tensor did, but without the
        # "copy construct from a tensor" UserWarning when a tensor is passed in.
        self.X = torch.as_tensor(X, dtype=torch.float32).detach().clone()
        self.y = torch.as_tensor(y, dtype=torch.float32).detach().clone()
        self.w = None if w is None else torch.as_tensor(w, dtype=torch.float32).detach().clone()
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(self.X)} and {len(self.y)}"
            )
        if self.w is not None and len(self.w) != len(self.X):
            raise ValueError(
                f"w must have one entry per row of X, got {len(self.w)} and {len(self.X)}"
            )

    def __len__(self):
        # number of rows
        return len(self.X)

    def __getitem__(self, idx):
        # (features, label) without weights, (features, label, weight) with them
        if self.w is None:
            return self.X[idx], self.y[idx]
        return self.X[idx], self.y[idx], self.w[idx]

In [None]:
def train_model_reweight(model, data_loader, num_epochs=30, lr=1e-3):
    """Train ``model`` in place with Adam and a per-sample-weighted BCE-with-logits loss.

    Parameters
    ----------
    model : nn.Module
        Network returning one logit per row; it must already be on its target device.
    data_loader : iterable
        Yields ``(batch_X, batch_y, batch_w)`` tensor triples. ``batch_y`` must have
        the same shape as the model output and ``batch_w`` one weight per row.
    num_epochs : int
        Number of passes over ``data_loader``.
    lr : float
        Adam learning rate.

    Prints one progress line per epoch showing the weighted loss of that epoch's
    last batch. Epochs that see no batches print nothing.
    """
    model.train()
    # reduction="none" keeps one loss value per sample so each can be weighted; the
    # default "mean" collapses the batch to a scalar before any weight can be applied.
    loss_fn = nn.BCEWithLogitsLoss(reduction="none")
    optimizer = optim.Adam(model.parameters(), lr=lr) # start with this baseline learning rate
    # send batches to wherever the model's parameters live
    model_device = next(model.parameters()).device
    for epoch in range(num_epochs):
        loss = None
        for batch_X, batch_y, batch_w in data_loader:
            # the loader already yields tensors: move them, do not re-wrap them
            batch_X = batch_X.to(model_device)
            batch_y = batch_y.to(model_device)
            batch_w = batch_w.to(model_device)

            # initialize the gradients to zero
            optimizer.zero_grad()

            # forward pass (once per batch)
            outputs = model(batch_X)

            # manually reweight the loss; reshape the weights to the per-sample loss
            # shape so (B,) weights do not broadcast against (B, 1) losses into (B, B)
            per_sample_loss = loss_fn(outputs, batch_y)
            loss = (per_sample_loss * batch_w.reshape(per_sample_loss.shape)).mean()

            # gradient descent and update the weights
            loss.backward()
            optimizer.step()

        # report once per epoch
        if loss is not None:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
# store models
models = []

# Train 10 weighted models, each on all label-1 rows plus a fresh random subset of label-0 rows.
for i in range(10):
    # downsample the 0 class
    mask = (Y_train_torch == 0).squeeze()  # True for label-0 training rows
    X_0 = X_train_torch[mask]  # X with 0
    Y_0 = Y_train_torch[mask]  # Y with 0
    W_0 = W_train_torch[mask]  # weights of the label-0 rows

    # sample_indices index into the class-0 subset, NOT into the full training set
    sample_indices = torch.randperm(X_0.size(0))[:int(NUM_SAMPLES)]
    X_0 = X_0[sample_indices, :]
    Y_0 = Y_0[sample_indices, :]
    W_0 = W_0[sample_indices]

    # sampled label-0 rows followed by every row that is not label 0
    X_concat_0 = torch.cat((X_0, X_train_torch[~mask]), dim=0)
    Y_concat_0 = torch.cat((Y_0, Y_train_torch[~mask]), dim=0)
    W_concat_0 = torch.cat((W_0, W_train_torch[~mask]), dim=0)

    print(Y_concat_0.sum())

    # Train on the downsampled set built above, with weights kept aligned row by row.
    # Previously X/Y/W were indexed from the full training tensors with class-0-subset
    # positions, which selected unrelated rows.
    train_dataset = MAFLDDataset(X_concat_0, Y_concat_0, W_concat_0)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) # batch size 64

    # create an instance of NeuralNetwork, move to device, and train it
    model = NeuralNetwork().to(device)
    train_model_reweight(model, train_loader, NUM_EPOCHS, LEARNING_RATE)

    models.append(model)

In [None]:
# placeholder for validation losses (not populated in this cell)
val_losses = []
# auroc
test_auroc = []
train_auroc = []
Y_pred_train = []
Y_pred_test = []

# separate train / test AUROC for every trained model
for model in models:
    model.eval()

    # inference only: no autograd graph is needed
    with torch.no_grad():
        # train auroc
        X_input = X_train_torch.to(device=device, dtype=torch.float32)
        Y_hat = torch.sigmoid(model(X_input))  # logits -> probabilities
        train_probs = Y_hat.cpu().numpy()

        # test auroc (sigmoid applied here too, so both splits hold probabilities)
        X_input = X_test_torch.to(device=device, dtype=torch.float32)
        Y_hat = torch.sigmoid(model(X_input))
        test_probs = Y_hat.cpu().numpy()

    # AUROC is a ranking metric: score the probabilities themselves, not 0/1
    # predictions thresholded at 0.5.
    train_auroc.append(roc_auc_score(Y_train_numpy.ravel(), train_probs.ravel()))
    Y_pred_train.append(train_probs)

    test_auroc.append(roc_auc_score(Y_test_numpy.ravel(), test_probs.ravel()))
    Y_pred_test.append(test_probs)


In [None]:
# per-model training AUROC for the reweighted models (one entry per model in `models`)
train_auroc

In [None]:
sum(train_auroc) / len(train_auroc) # mean training AUROC across the trained models

In [None]:
# per-model test AUROC for the reweighted models (one entry per model in `models`)
test_auroc

In [None]:
# mean test AUROC across the trained models (divide by the length of the list being summed)
sum(test_auroc) / len(test_auroc)

In [None]:
# Ensemble the reweighted models: element-wise mean of the per-model prediction
# arrays (axis 0 runs over the models).
Y_pred_train_numpy = np.mean(np.asarray(Y_pred_train), axis=0)
Y_pred_test_numpy = np.mean(np.asarray(Y_pred_test), axis=0)

In [None]:
# AUROC of the model-averaged predictions. The continuous averaged scores are passed
# directly: thresholding at 0.5 first would discard the ranking that AUROC measures.
print("AUROC of averaged predictions")
print("Train: ", roc_auc_score(Y_train_numpy.ravel(), Y_pred_train_numpy.ravel()))
print("Test: ", roc_auc_score(Y_test_numpy.ravel(), Y_pred_test_numpy.ravel()))

In [None]:
# Inspect the label tensor left over from the last loop iteration above:
# first the sum of its labels, then its number of rows.
for summary_value in (Y_concat_0.sum(), len(Y_concat_0)):
    print(summary_value)

In [None]:
# mean of the model-averaged training predictions
Y_pred_train_numpy.mean()