In [249]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.optim import lr_scheduler 
from torch.utils.data import Dataset, DataLoader

#setting basic environment
torch.manual_seed(100)

#if we have a gpu then run network on that, if not use cpu which is much slower
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#" Varying learning rate between 0.0001 and 0.01 is considered optimal in most of the cases" - https://www.kdnuggets.com/2022/12/tuning-adam-optimizer-parameters-pytorch.html
lr = 0.001
batch_size = 25
num_epochs = 500
loss_fn = nn.BCELoss()

class CustomDataset(Dataset):
    def __init__(self, dataframe):
       self.dataframe = dataframe.apply(pd.to_numeric, errors='coerce')

    def __getitem__(self, index):
       row = self.dataframe.iloc[index].to_numpy()
       features = row[1:26]
       label = row[26]
       features_tensor = torch.tensor(features, dtype=torch.float32)
       label_tensor = torch.tensor(label, dtype=torch.float32)
       return features_tensor, label_tensor
    
    def __len__(self):
       return len(self.dataframe)
    
train_df = pd.read_csv('training data.csv')
train_df.set_index("PDB codes",inplace=True)
train_df = (train_df-train_df.min())/(train_df.max()-train_df.min())
test_df = pd.read_csv('testing data.csv')
test_df.set_index("PDB codes",inplace=True)
test_df = (test_df-test_df.min())/(test_df.max()-test_df.min())
train = CustomDataset(dataframe=train_df)
test = CustomDataset(dataframe=test_df)
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=True)



In [250]:
class Simple_NN(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(25, 50),
            #nn.BatchNorm1d(num_features=50),
            nn.LeakyReLU(),
            #nn.Dropout(0.1),
            nn.Linear(50, 100),
            #nn.BatchNorm1d(num_features=100),
            nn.LeakyReLU(),
            #nn.Dropout(0.1),
            nn.Linear(100, 50),
            #nn.BatchNorm1d(num_features=50),
            nn.LeakyReLU(),
            #last linear layer outputs the logits 
            nn.Linear(50, 1),
            #sigmoid layer converts logits to prediction probabilities
            nn.Sigmoid()
        )

    def forward(self, x):
        output = self.model(x)
        return output

def init_weights(layer):
    if isinstance(layer, nn.Linear):
        nn.init.kaiming_uniform(layer.weight)
        layer.bias.data.fill_(0.01)

#to run using GPU, we have instantiate as usual but then send to GPU w/ .to() the device
simple_NN = Simple_NN().to(device=device)
simple_NN.apply(init_weights)
optimizer_SNN = torch.optim.SGD(simple_NN.parameters(), lr=lr)

  nn.init.kaiming_uniform(layer.weight)


In [251]:
def accuracy_fn(y_true, y_pred):
    correct = 0
    for true,pred in zip(y_true,y_pred):
        if true.item()==pred.item():
            correct+=1
    return correct/len(y_true)

# #UNCOMMENT to see untrained model predictions on train set
# count = 0
# for i, data in enumerate(train_loader):
#     values, labels = data
#     values = values.to(torch.float32)
#     labels = labels.to(torch.float32)
#     untrained_preds = torch.round(simple_NN(values))
#     untrained_preds = untrained_preds.squeeze(dim=1)
#     untrained_loss = loss_fn(labels, untrained_preds)
#     untrained_acc = accuracy_fn(labels, untrained_preds)
#     print(untrained_acc)
#     count+=1
# count

In [252]:
def train(model, num_epochs, loss_fn, train_data, test_data, optimizer):
    epoch_loss1 = []
    train_acc = []
    train_losses = []
    scheduler = lr_scheduler.StepLR(optimizer, step_size = 100, gamma=0.9)

    for epoch in range(num_epochs):
        
        model.train()
        for i, data in enumerate(train_data):
            values, labels = data
            values = values.to(torch.float32)
            labels = labels.to(torch.float32)
            optimizer.zero_grad()
            #turn the prediction probabilities into prediction labels, i.e. determining the class of 0 or 1
            train_preds = torch.round(model(values))
            train_preds = train_preds.squeeze(dim=1)
            train_loss = loss_fn(train_preds, labels)
            epoch_loss1.append(train_loss.item())
            train_acc.append(accuracy_fn(labels, train_preds))
            train_loss.backward()
            optimizer.step()
            scheduler.step()
        train_losses.append(np.mean(epoch_loss1))

        if epoch % 20 == 0:
            print(f"Epoch {epoch}/{num_epochs}, Train Loss: {np.mean(epoch_loss1):.4f}, Train Accuracy: {np.mean(train_acc):.2f}") #Val Loss: {np.mean(val_losses):.4f} Val Accuracy: {np.mean(val_acc):.2f}")

    plt.plot(np.arange(num_epochs), train_losses)
    plt.title("Training Loss Graph")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()  

train(simple_NN, num_epochs, loss_fn, train_loader, test_loader, optimizer_SNN)

Epoch 0/500, Train Loss: 43.6757, Train Accuracy: 0.56
Epoch 20/500, Train Loss: 43.5727, Train Accuracy: 0.56
Epoch 40/500, Train Loss: 43.5768, Train Accuracy: 0.56
Epoch 60/500, Train Loss: 43.5800, Train Accuracy: 0.56
Epoch 80/500, Train Loss: 43.5829, Train Accuracy: 0.56
Epoch 100/500, Train Loss: 43.5906, Train Accuracy: 0.56
Epoch 120/500, Train Loss: 43.5877, Train Accuracy: 0.56
Epoch 140/500, Train Loss: 43.5890, Train Accuracy: 0.56
Epoch 160/500, Train Loss: 43.5884, Train Accuracy: 0.56


KeyboardInterrupt: 