In [None]:
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split

# Load the raw table into `data`; the path is relative to the notebook's working directory.
data = pd.read_csv("../clean_data/nafl/combined.large.nafl.csv")

In [None]:
# Fraction of rows whose 'Outcome' flag is set, relative to the total row count.
n_progressed = np.sum(data['Outcome'])
n_rows = data.shape[0]
print(f"{(n_progressed / n_rows)} of patients progressed in our dataset.")

In [None]:
# (rows, columns) of the frame as loaded from disk
data.shape

In [None]:
# Feature frame: everything except the two outcome columns, keyed by StudyID.
X = (
    data
    .drop(columns=['DaysUntilFirstProgression', 'Outcome'])
    .set_index('StudyID')
)
X.shape

In [None]:
# create the X and Y datasets
#
# The regression target is 'DaysUntilFirstProgression'; 'Outcome' and 'Censored'
# are excluded from the features. The columns are dropped into a *new* frame
# instead of reassigning `data`, so this cell (and the earlier cells that still
# read data['Outcome']) can be re-run without a KeyError.
model_frame = data.drop(columns=['Outcome', 'Censored'])

# Both frames are indexed by StudyID so rows stay aligned between X and Y.
Y = model_frame[['StudyID', 'DaysUntilFirstProgression']].set_index('StudyID')
X = model_frame.drop(columns='DaysUntilFirstProgression').set_index('StudyID')

In [None]:
# check if GPU is enabled
# `device` is the string later passed to `.to(...)` for the model and tensors.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")
print(f"Shape of X: {X.shape}. Shape of Y: {Y.shape}.")

In [None]:
# convert data to tensors
# NOTE(review): the int64 cast discards any fractional part in the feature columns, and
# MAFLDDataset converts everything to float32 afterwards anyway. Confirm that every
# column is integer-valued, or cast straight to a float dtype instead.
X_numpy = X.values.astype(np.int64) # turn into a numpy array
X_torch = torch.from_numpy(X_numpy)  # shares memory with X_numpy

# Y has a single column, so Y_numpy / Y_torch are 2-D with one column.
Y_numpy = Y.values.astype(np.int64) # turn into a numpy array
Y_torch = torch.from_numpy(Y_numpy)  # shares memory with Y_numpy

In [None]:
# Bucket the target into day ranges and report how many rows fall in each bucket.
labels = ['0-30', '31-90', '91-180', '181-365', '366-1000', '1001-2000']
bins = [0, 30, 90, 180, 365, 1000, 2000]

# include_lowest=True makes the first interval closed on the left, so 0 is counted.
Y_binned = pd.cut(
    Y['DaysUntilFirstProgression'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

bin_counts = Y_binned.value_counts()
bin_counts = bin_counts.sort_index()
print(bin_counts)

In [None]:
import matplotlib.pyplot as plt
# Distribution of the regression target with matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(Y['DaysUntilFirstProgression'])
ax.set(
    title='Distribution of DaysUntilFirstProgression',
    xlabel='DaysUntilFirstProgression',
    ylabel='Count',
)
plt.show()  # also keeps the raw (counts, edges, patches) tuple out of the cell output

In [None]:
# Same distribution, using the hand-picked `bins` edges defined in an earlier cell.
fig, ax = plt.subplots()
ax.hist(Y['DaysUntilFirstProgression'], bins=bins)
ax.set(
    title='Distribution of DaysUntilFirstProgression (custom bins)',
    xlabel='DaysUntilFirstProgression',
    ylabel='Count',
)
plt.show()  # also keeps the raw (counts, edges, patches) tuple out of the cell output

# establish model

In [None]:
# curate the dataset
class MAFLDDataset(Dataset):
    """Map-style dataset pairing feature rows with target rows.

    A map-style ``Dataset`` must provide ``__len__`` and ``__getitem__``.

    Args:
        X: Features, either a ``torch.Tensor`` or anything ``torch.tensor``
            accepts (nested lists, NumPy arrays). Stored as float32.
        y: Targets, same accepted types as ``X``. Stored as float32.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X, y):
        self.X = self._to_float_tensor(X)
        self.y = self._to_float_tensor(y)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(self.X)} and {len(self.y)}"
            )

    @staticmethod
    def _to_float_tensor(values):
        """Return an independent float32 copy of ``values`` as a tensor."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(<Tensor>) emits a copy-construct UserWarning;
            # detach().clone() is the recommended way to copy a tensor.
            return values.detach().clone().to(torch.float32)
        return torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

# Dataset and loader over the full, unsplit tensors.
# NOTE(review): the training loop in this notebook iterates `train_data` (built from the
# train split), not `train_loader`; these two objects look unused. Confirm before removing.
dataset = MAFLDDataset(X_torch, Y_torch)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True) # batch size 64

In [None]:
# define by subclassing nn.Module and initialize the neural network layers in __init__.
class NeuralNetwork(nn.Module):
    """Fully connected regressor: six Linear layers with LeakyReLU in between.

    Args:
        in_features: Width of the input vectors. When omitted, the width is
            read from the notebook-level feature frame ``X`` (``X.shape[1]``),
            which is what the original zero-argument constructor did.
    """

    def __init__(self, in_features=None):
        super().__init__() # inherit init from parent class
        if in_features is None:
            # Backward-compatible default: size the first layer from the global X.
            in_features = X.shape[1]
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.LeakyReLU(),
            nn.Linear(1024, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 32),
            nn.LeakyReLU(),
            nn.Linear(32, 1), # no activation follows this layer
        )

    def forward(self, x):
        """Return one raw prediction per input row, shape ``(..., 1)``."""
        return self.linear_relu_stack(x)

# Build the network and move its parameters to the selected device
# (Module.to works in place on a module and returns the same object).
model = NeuralNetwork()
model = model.to(device)

# Adam over all model parameters, trained against mean squared error.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

In [None]:
# Show the weight matrix of every Linear layer.
# The stack alternates Linear and LeakyReLU modules; LeakyReLU has no `weight`
# attribute, so reading it unconditionally raises AttributeError on the second layer.
for layer in model.linear_relu_stack:
    if isinstance(layer, nn.Linear):
        print(layer.weight)

In [None]:
# split into train/test
# 70/30 split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_torch,
    Y_torch,
    random_state=42,
    test_size=0.3,
)

# Only the training portion is wrapped in a loader; it is reshuffled each epoch.
train_dataset = MAFLDDataset(X=X_train, y=y_train)
train_data = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

In [None]:
# Bin edges/labels for the target (defined once; the original cell repeated them).
bins = [0, 30, 90, 180, 365, 1000, 2000]
labels = ['0-30', '31-90', '91-180', '181-365', '366-1000', '1001-2000']

# Bin occupancy over the whole dataset, printed for reference.
Y_binned = pd.cut(Y['DaysUntilFirstProgression'], bins=bins, labels=labels, include_lowest=True)

bin_counts = Y_binned.value_counts().sort_index()
print(bin_counts)

# Bin occupancy over the training targets only; from here on `bin_counts`
# refers to the training split.
y_train_df = pd.DataFrame({'y': y_train.cpu().numpy().squeeze()})
y_train_binned = pd.cut(y_train_df['y'], bins=bins, labels=labels, include_lowest=True)
bin_counts = y_train_binned.value_counts().sort_index()

# Inverse-frequency weights, normalised to sum to 1.
# value_counts on a categorical also lists bins with zero rows; 1/0 would give
# inf and the normalisation would then turn every weight into 0 or NaN.
# Empty bins get weight 0 instead (no training row can fall in them).
bin_weights = (1 / bin_counts.where(bin_counts > 0)).fillna(0.0)
weight_total = bin_weights.sum()
if weight_total > 0:
    bin_weights = bin_weights / weight_total

def get_weight_from_y(y_val):
    """Look up the bin weight for a single target value.

    Walks the consecutive edge pairs of the module-level ``bins`` and returns
    ``bin_weights`` for the first bin whose closed range ``[lower, upper]``
    contains ``y_val``. A value sitting exactly on a shared edge therefore
    resolves to the lower bin.

    Returns 1.0 when no bin matches (value outside the edges, or NaN).
    """
    for lower, upper, label in zip(bins, bins[1:], labels):
        if lower <= y_val <= upper:
            return bin_weights[label]
    # fallback in case of unexpected value
    return 1.0

In [None]:
# train model for 30 epochs
num_epochs = 30 # typically between 10-50 for small datasets

model.train()
for epoch in range(num_epochs):
    epoch_loss_sum = 0.0  # sum of per-batch mean losses, weighted by batch size
    samples_seen = 0

    for batch_X, batch_y in train_data:
        # The loader already yields tensors; .to(device) moves them without the
        # extra copy (and UserWarning) that torch.tensor(<Tensor>) causes.
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # initialize the gradients to zero
        optimizer.zero_grad()

        # forward pass
        outputs = model(batch_X)

        # compute loss
        # NOTE: per-sample weighting (e.g. via get_weight_from_y) needs a loss built
        # with reduction='none'; multiplying the already-averaged scalar by a weight
        # vector does not weight individual samples.
        loss = loss_fn(outputs, batch_y)

        # gradient descent and update the weights
        loss.backward()
        optimizer.step()

        batch_size = batch_X.size(0)
        epoch_loss_sum += loss.item() * batch_size
        samples_seen += batch_size

    # Report once per epoch (the original print sat outside the loop and only ran
    # after the final epoch). The value is the mean loss over the epoch's samples.
    epoch_loss = epoch_loss_sum / max(samples_seen, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# run model on test data
# no_grad: inference does not need an autograd graph for the whole test set.
with torch.no_grad():
    Y_hat_test = model(X_test.float().to(device)) # run on testing data

# Convert the predictions once and reuse them for both metrics.
test_predictions = Y_hat_test.cpu().detach().numpy()

# evaluate via MSE
print(f'MSE: {mean_squared_error(y_test, test_predictions)}')
print(f'Absolute Error: {mean_absolute_error(y_test, test_predictions)}')

# training loss with regular loss: 5248.9624, 1136.1140
# training loss with weighted loss: 11919.7793
# training loss with leakyReLU architecture: 18536.0098, 668.5651

In [None]:
# Residuals on the test split: true value minus prediction.
# Both operands are converted to NumPy first so the subtraction does not mix a
# torch tensor with an ndarray.
residuals = (np.asarray(y_test) - Y_hat_test.cpu().detach().numpy()).ravel()

fig, ax = plt.subplots()
ax.hist(residuals, bins=np.linspace(-2500, 2500, 150))
ax.set(
    title='Test residuals',
    xlabel='y_test - prediction (days)',
    ylabel='Count',
)
plt.show()

In [None]:
# Distribution of the model's predictions on the test split.
fig, ax = plt.subplots()
ax.hist(Y_hat_test.cpu().detach().numpy(), bins=np.linspace(0, 3000, 10))
ax.set(
    title='Predicted DaysUntilFirstProgression (test split)',
    xlabel='Predicted days',
    ylabel='Count',
)
plt.show()

In [None]:
# True target distribution over the whole dataset, for comparison with the predictions.
fig, ax = plt.subplots()
ax.hist(Y['DaysUntilFirstProgression'])
ax.set(
    title='True DaysUntilFirstProgression (all rows)',
    xlabel='DaysUntilFirstProgression',
    ylabel='Count',
)
plt.show()

In [None]:
# Median of the true target (all rows of Y) next to the median test-split prediction.
true_median = Y['DaysUntilFirstProgression'].median()
print(f"True distribution median: {true_median}")

predicted_median = np.median(Y_hat_test.cpu().detach().numpy())
print(f'Predicted distribution median: {predicted_median}')

In [None]:
# Mean of the true target (all rows of Y) next to the mean test-split prediction.
true_mean = Y['DaysUntilFirstProgression'].mean()
print(f"True distribution mean: {true_mean}")

predicted_mean = np.mean(Y_hat_test.cpu().detach().numpy())
print(f'Predicted distribution mean: {predicted_mean}')