In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim


In [2]:
from Modules import Fingerprint_Generator

fingerprint = 'ecfp'

df = pd.read_csv('Transformed_Data/DILIst_DILI.csv', index_col=0)  # Map style dataset
df = Fingerprint_Generator.generate_fp_column(df, df.drug, fp_type=fingerprint)

df_fp = pd.DataFrame(df.iloc[:, 2])
df_fp.insert(len(df_fp.columns), 'DILI?', df.iloc[:, 1].astype(int)) # Insert 'DILI?' column as the last column


print(df_fp)

df_fp.to_csv("Transformed_Data/testo")



                                                   ecfp  DILI?
0     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...      1
1     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
2     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
3     [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
4     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
...                                                 ...    ...
1274  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
1275  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
1276  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
1277  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
1278  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...      1

[1100 rows x 2 columns]


In [3]:
DILIfeatures = df_fp["ecfp"]
DILIlabels = df_fp["DILI?"]

class DILIDataset():
    def __init__(self, features, labels) -> None:
        self.features = features
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, index):
        features = self.features[index]
        labels = self.labels[index]
        return torch.tensor([features], dtype=torch.float32), torch.tensor([labels], dtype=torch.float32)
    
dataset = DILIDataset(DILIfeatures, DILIlabels)

In [4]:
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

torch.manual_seed(42)

train_size = int(len(dataset) * 0.8)
validation_size = len(dataset) - train_size

train_data, validation_data = random_split(dataset, [train_size, validation_size])

if len(train_data) + len(validation_data) == len(dataset):
    print("Dataset split succeeded")
else:
    print("Dataset split failed")

train_dataloader = DataLoader(train_data, batch_size=12, shuffle=True)
test_dataloader = DataLoader(validation_data, batch_size=12, shuffle=True)

train_features, train_labels = next(iter(train_dataloader))

train_features = train_features.squeeze(1)


print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

print(train_features[0]); print(train_labels[0]) # Will be random bcs of dataloader shuffle

Dataset split succeeded
Feature batch shape: torch.Size([12, 2048])
Labels batch shape: torch.Size([12, 1])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([1.])


  return torch.tensor([features], dtype=torch.float32), torch.tensor([labels], dtype=torch.float32)


In [5]:
# Set the device to GPU if available
device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [6]:
class DILI_Predictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size) -> None:
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        model_output = self.fc1(x)
        model_output = torch.relu(model_output)
        model_output = self.fc2(model_output)
        model_output = torch.relu(model_output)
        model_output = self.fc3(model_output)
        return model_output
    
model0 = DILI_Predictor(2048, 128, 1).to(device)

In [7]:
# Define loss function and optimizer
loss_fn = nn.L1Loss()
optimizer = optim.Adam(model0.parameters(), lr=0.001)

writer = SummaryWriter()

In [31]:
from torchmetrics import classification

# Set the number of epochs (how many times the model will pass over the training data)
epochs = 11


b_auroc = classification.BinaryAUROC(thresholds=None)
bcm = classification.BinaryConfusionMatrix()

# Create empty loss lists to track values
train_loss_values = []
test_loss_values = []
epoch_count = []

for epoch in range(epochs):
    ### Training

    # Put model in training mode (this is the default state of a model)
    model0.train()

    # 1. Forward pass on train data using the forward() method inside 
    predicted_labels = model0(train_features)

    # 2. Calculate the loss (how different are our models predictions to the ground truth)
    train_loss = loss_fn(predicted_labels, train_labels)


    ### Back propagation
    optimizer.zero_grad() # 3. Zero grad of the optimizer
    train_loss.backward() # 4. Loss backwards
    optimizer.step() # 5. Progress the optimizer, update weights

    ### Testing

    # Put the model in evaluation mode
    model0.eval()

    with torch.inference_mode():
      # 1. Forward pass on test data
      test_pred = model0(train_features)

      # 2. Caculate loss on test data
      test_loss = loss_fn(test_pred, train_labels.type(torch.float)) # predictions come in torch.float datatype, so comparisons need to be done with tensors of the same type

      # Print out what's happening
      if epoch % 2 == 0:
            epoch_count.append(epoch)
            train_loss_values.append(train_loss.detach().numpy())
            test_loss_values.append(test_loss.detach().numpy())
            
            
            # print(test_pred); print(train_labels)
            print(f"Epoch: {epoch} | MAE Train Loss: {train_loss:.3f} | MAE Test Loss: {test_loss:.3f} | AUROC: {b_auroc(test_pred, train_labels)} \n Binary Confusion Matrix: \n {bcm(test_pred, train_labels)} \n {'-'*20} ")


Epoch: 0 | MAE Train Loss: 0.004 | MAE Test Loss: 0.007 | AUROC: 1.0 
 Binary Confusion Matrix: 
 tensor([[ 0,  2],
        [ 0, 10]]) 
 -------------------- 
Epoch: 2 | MAE Train Loss: 0.010 | MAE Test Loss: 0.008 | AUROC: 1.0 
 Binary Confusion Matrix: 
 tensor([[ 1,  1],
        [ 0, 10]]) 
 -------------------- 
Epoch: 4 | MAE Train Loss: 0.008 | MAE Test Loss: 0.009 | AUROC: 1.0 
 Binary Confusion Matrix: 
 tensor([[ 2,  0],
        [ 0, 10]]) 
 -------------------- 
Epoch: 6 | MAE Train Loss: 0.008 | MAE Test Loss: 0.009 | AUROC: 1.0 
 Binary Confusion Matrix: 
 tensor([[ 1,  1],
        [ 0, 10]]) 
 -------------------- 
Epoch: 8 | MAE Train Loss: 0.008 | MAE Test Loss: 0.011 | AUROC: 1.0 
 Binary Confusion Matrix: 
 tensor([[ 1,  1],
        [ 0, 10]]) 
 -------------------- 
Epoch: 10 | MAE Train Loss: 0.012 | MAE Test Loss: 0.007 | AUROC: 1.0 
 Binary Confusion Matrix: 
 tensor([[ 0,  2],
        [ 0, 10]]) 
 -------------------- 


In [32]:
from Modules import Save_PyTorch_Models

dummy_data = torch.rand([1, 2048])

dummy_data.shape

Save_PyTorch_Models.save(model0, "DILIst", dummy_data)

Directory for today's PyTorch models already exists at Pytorch_Models/08-11-2023. Placing saved modles in here.
verbose: False, log level: Level.ERROR



def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(train_dataloader):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model0(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_dataloader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

# Initializing in a separate cell so we can easily add more epochs to the same run
from datetime import datetime

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model0.train()
    avg_loss = train_one_epoch(epoch_number, writer)


    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model0.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vdata in enumerate(test_dataloader):
            vinputs, vlabels = vdata
            voutputs = model0(vinputs)
            vloss = loss_fn(voutputs, vlabels)
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print(f"MAE Train Loss: {avg_loss} | MAE Validation Loss: {avg_vloss}")

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model0.state_dict(), model_path)

    epoch_number += 1