In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim


In [2]:
from Modules import Fingerprint_Generator

Xu_df = pd.read_csv('Transformed_Data/Xu_DILI.csv', index_col=0)  # Map style dataset
Xu_df = Fingerprint_Generator.generate_fp_column(Xu_df, Xu_df.SMILES, 'ecfp')

Xu_df_fp = pd.DataFrame(Xu_df.iloc[:, 2])
Xu_df_fp.insert(len(Xu_df_fp.columns), 'DILI?', Xu_df['DILI?'].astype(int)) # Insert 'DILI?' column as the last column


print(Xu_df_fp)

                                                  ecfp  DILI?
0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...      0
1    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
2    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
4    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
..                                                 ...    ...
470  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
471  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
472  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1
473  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
474  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1

[475 rows x 2 columns]


In [3]:
DILIfeatures = Xu_df_fp["ecfp"]
DILIlabels = Xu_df_fp["DILI?"]

class DILIDataset():
    def __init__(self, features, labels) -> None:
        self.features = features
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, index):
        features = self.features[index]
        labels = self.labels[index]
        return torch.tensor([features], dtype=torch.float32), torch.tensor([labels], dtype=torch.float32)
    
dataset = DILIDataset(DILIfeatures, DILIlabels)

In [4]:
from torch.utils.data import DataLoader, random_split

dataset_split = [0.8, 0.2]

train_data, test_data = random_split(dataset, [0.8, 0.2])

if len(train_data) + len(test_data) == len(dataset) and len(train_data) == len(dataset)*dataset_split[0]:
    print("Dataset split succeeded")
else:
    print("Dataset split failed")


train_dataloader = DataLoader(train_data, shuffle=True)
test_dataloader = DataLoader(test_data, shuffle=True)

train_features, train_labels = next(iter(train_dataloader))

train_features = train_features.squeeze(1)

print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

print(train_features[0]); print(train_labels[0]) # Will be random bcs of dataloader shuffle

Dataset split succeeded
Feature batch shape: torch.Size([1, 2048])
Labels batch shape: torch.Size([1, 1])
tensor([0., 1., 0.,  ..., 0., 0., 0.])
tensor([1.])


  return torch.tensor([features], dtype=torch.float32), torch.tensor([labels], dtype=torch.float32)


In [5]:
# Set the device to GPU if available
device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [6]:
class DILI_Predictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size) -> None:
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        model_output = self.fc1(x)
        model_output = torch.relu(model_output)
        model_output = self.fc2(model_output)
        model_output = torch.relu(model_output)
        model_output = self.fc3(model_output)
        model_output = self.sigmoid(model_output)
        return model_output
    
model0 = DILI_Predictor(2048, 128, 1).to(device)

In [7]:
# Define loss function and optimizer
criterion = nn.L1Loss()
optimizer = optim.Adam(model0.parameters(), lr=0.001)

In [18]:
torch.manual_seed(42)

# Set the number of epochs (how many times the model will pass over the training data)
epochs = 11

# Create empty loss lists to track values
train_loss_values = []
test_loss_values = []
epoch_count = []

for epoch in range(epochs):
    ### Training

    # Put model in training mode (this is the default state of a model)
    model0.train()

    # 1. Forward pass on train data using the forward() method inside 
    predicted_labels = model0(train_features)
    # print(y_pred)

    # 2. Calculate the loss (how different are our models predictions to the ground truth)
    loss = criterion(predicted_labels, train_labels)

    # 3. Zero grad of the optimizer
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Progress the optimizer
    optimizer.step()

    ### Testing

    # Put the model in evaluation mode
    model0.eval()

    with torch.inference_mode():
      # 1. Forward pass on test data
      test_pred = model0(train_features)

      # 2. Caculate loss on test data
      test_loss = criterion(test_pred, train_labels.type(torch.float)) # predictions come in torch.float datatype, so comparisons need to be done with tensors of the same type

      # Print out what's happening
      if epoch % 2 == 0:
            epoch_count.append(epoch)
            train_loss_values.append(loss.detach().numpy())
            test_loss_values.append(test_loss.detach().numpy())
            print(f"Epoch: {epoch} | MAE Train Loss: {loss} | MAE Test Loss: {test_loss} ")


Epoch: 0 | MAE Train Loss: 7.796287536621094e-05 | MAE Test Loss: 7.641315460205078e-05 
Epoch: 2 | MAE Train Loss: 7.474422454833984e-05 | MAE Test Loss: 7.319450378417969e-05 
Epoch: 4 | MAE Train Loss: 7.152557373046875e-05 | MAE Test Loss: 6.985664367675781e-05 
Epoch: 6 | MAE Train Loss: 6.830692291259766e-05 | MAE Test Loss: 6.663799285888672e-05 
Epoch: 8 | MAE Train Loss: 6.496906280517578e-05 | MAE Test Loss: 6.341934204101562e-05 
Epoch: 10 | MAE Train Loss: 6.175041198730469e-05 | MAE Test Loss: 6.020069122314453e-05 
