# Fully connected neural network (FCNN)

In this notebook, we will use a fully-connected neural network and physicochemical descriptors to predict the value of blood brain barrier (BBB) penetration.

In [None]:
# Install conda
# Bootstrap conda inside the notebook runtime: pip-install the condacolab
# helper package, then let it install a conda distribution.
# NOTE(review): condacolab.install() presumably restarts the kernel, which would
# explain why the remaining setup lives in a separate cell — confirm.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Check conda installation
# (condacolab is imported again here; presumably because the previous cell
# restarts the kernel — TODO confirm)
import condacolab
condacolab.check()

# Install required packages
# Scientific stack comes from conda-forge via mamba; torch is installed with
# pip from the PyTorch wheel index.
# NOTE(review): the "cu126" index presumably selects CUDA 12.6 builds — confirm
# this matches the runtime. No package versions are pinned.
!mamba install python=3 pip
!mamba install -c conda-forge pandas numpy matplotlib rdkit scikit-learn
!pip3 install torch --index-url https://download.pytorch.org/whl/cu126

# Download required files
# model_saved.pt: a saved model file; B3DB_regression.tsv: the data table read
# later in this notebook. Note that the training cell also writes a file named
# "model_saved.pt" in the working directory.
!wget https://github.com/lillgroup/AIiDD/raw/main/Lab%202/Data/model_saved.pt
!wget https://github.com/lillgroup/AIiDD/raw/main/Lab%202/Data/B3DB_regression.tsv

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, Crippen, AllChem

There are some Warnings informing about future scipy changes.

We will suppress them here as they have no influence on the results.

In [None]:
# Pull in the warnings filter helper from the standard library
from warnings import simplefilter

# Silence every FutureWarning for the rest of the session
simplefilter("ignore", FutureWarning)

### Read data

Here we use regression data (experimental logBB values) on blood brain barrier (BBB) penetration.

In [None]:
# Load the raw tab-separated B3DB table
tmp = pd.read_table('B3DB_regression.tsv', sep='\t')

# Keep only the identifier/structure columns and the logBB target,
# drop rows without a logBB value and renumber the rows from 0.
columns_kept = ['compound_name', 'IUPAC_name', 'SMILES', 'logBB']
table = (
    tmp[columns_kept]
    .dropna(subset="logBB")
    .reset_index(drop=True)
)

table

### Molecular descriptors calculation

We can use RDKit to calculate several molecular descriptors (2D and 3D). Here we compute six 2D physicochemical descriptors from the SMILES strings.

In [None]:
# We will calculate the descriptors and add them to our table
def _rdkit_descriptors(smiles):
    """Compute the six physicochemical descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict
        Descriptor name -> value, in the column order used by the table.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES (MolFromSmiles returns None), so the
        offending input is reported instead of an opaque descriptor error.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return {
        # NOTE(review): ExactMolWt is stored under the name 'MolWt'; confirm
        # this is the intended mass definition.
        'MolWt': Descriptors.ExactMolWt(mol),
        'TPSA': Chem.rdMolDescriptors.CalcTPSA(mol),   # Topological Polar Surface Area
        'nRotB': Descriptors.NumRotatableBonds(mol),   # Number of rotatable bonds
        'HBD': Descriptors.NumHDonors(mol),            # Number of H bond donors
        'HBA': Descriptors.NumHAcceptors(mol),         # Number of H bond acceptors
        'LogP': Descriptors.MolLogP(mol),              # LogP
    }


# Build all descriptor rows first and attach them column by column; this avoids
# one scalar .loc write per (row, descriptor) pair. dtype=float keeps every
# descriptor column as floating point, as the element-wise assignment did.
descriptor_table = pd.DataFrame(
    [_rdkit_descriptors(smiles) for smiles in table['SMILES']],
    index=table.index,
    dtype=float,
)
for column in descriptor_table.columns:
    table[column] = descriptor_table[column]

table

### Model: We will use neural network regression

Generate x (descriptors) and y (logBB) vectors

In [None]:
# Feature columns fed to the models; the target is the logBB column
descriptors_selected = ['MolWt', 'TPSA', 'nRotB', 'HBD', 'HBA', 'LogP']

x = table[descriptors_selected].to_numpy()
y = table[['logBB']].to_numpy()   # double brackets keep y two-dimensional (n, 1)

# standardization/normalization
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on all samples before the train/valid/test
# split performed in the next cell, so held-out samples contribute to the
# scaling statistics. Consider fitting on the training subset only.
scaler = StandardScaler()
x = scaler.fit(x).transform(x)


### Split data into training and test set

Here, we use random splitting

In [None]:
# train test split
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples as the test set
x_trainval, x_test, y_trainval, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Step 2: take 25% of the remainder as the validation set
x_train, x_valid, y_train, y_valid = train_test_split(
    x_trainval, y_trainval, test_size=0.25, random_state=1
)

### Baseline: Multilinear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit an ordinary least-squares baseline on the training split only
reg = LinearRegression().fit(x_train, y_train)

pred_train = reg.predict(x_train)
pred_valid = reg.predict(x_valid)
pred_test = reg.predict(x_test)

# (name, ground truth, prediction, plot colour) for each split, so that the
# metrics and the scatter plot are produced by one loop instead of three copies
baseline_splits = [
    ("train", y_train, pred_train, "black"),
    ("valid", y_valid, pred_valid, "lightgreen"),
    ("test", y_test, pred_test, "red"),
]

# Mean squared error (%-5s pads the split name so the colons line up)
for split_name, y_true, y_pred, _ in baseline_splits:
    print("Mean squared error, %-5s: %.2f" % (split_name, mean_squared_error(y_true, y_pred)))
# The coefficient of determination: 1 is perfect prediction
for split_name, y_true, y_pred, _ in baseline_splits:
    print("Coefficient of determination, %s: %.2f" % (split_name, r2_score(y_true, y_pred)))

# Plot outputs: predicted vs. experimental value, one colour per split
plt.figure(figsize=(7, 7))
for split_name, y_true, y_pred, colour in baseline_splits:
    plt.scatter(y_true, y_pred, color=colour, s=10, label=split_name)

# Diagonal = perfect prediction
plt.plot([-3.0, 2.0], [-3.0, 2.0], color="black", linewidth=3)

plt.xlabel("ground truth")
plt.ylabel("predicted")
plt.legend()

plt.show()

### Generate Dataset for pytorch

We generate three instances (training, validation and test set) of a class Data that is derived from the pytorch class Dataset.

Then, we initiate three dataloaders for the three instances that will be used to provide data to the training, validation and test phases.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim

# Convert data to torch tensors
class Data(Dataset):
    """In-memory dataset pairing feature rows with their target values.

    Parameters
    ----------
    X : array-like
        Features; the first axis indexes the samples.
    y : array-like
        Targets; the first axis indexes the samples.

    Attributes
    ----------
    X, y : torch.Tensor
        float32 copies of the inputs.
    len : int
        Number of samples.

    Raises
    ------
    ValueError
        If X or y is a scalar, or if they contain different numbers of samples.
    """

    def __init__(self, X, y):
        # np.array always copies, so the tensors never alias the caller's arrays
        features = np.array(X, dtype=np.float32)
        targets = np.array(y, dtype=np.float32)
        if features.ndim == 0 or targets.ndim == 0:
            raise ValueError("X and y must be array-like with a sample axis")
        if features.shape[0] != targets.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {features.shape[0]} and {targets.shape[0]}"
            )
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        """Return the (features, target) pair at ``index``."""
        return self.X[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

batch_size = 64

# Instantiate training, validation and test data
# Only the training loader is shuffled; evaluation does not depend on the
# sample order, so the validation and test loaders keep a fixed order.
train_data = Data(x_train, y_train)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

valid_data = Data(x_valid, y_valid)
valid_dataloader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=False)

test_data = Data(x_test, y_test)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

# Check batches
# The loop variables are deliberately NOT called X / y: in a notebook they are
# globals, and reusing `y` would overwrite the full label array defined earlier.
for loader in [train_dataloader, valid_dataloader, test_dataloader]:
    print("------------------------------------------------------------")
    for batch, (X_batch, y_batch) in enumerate(loader):
        print(f"Batch: {batch+1}")
        print(f"X shape: {X_batch.shape}")
        print(f"y shape: {y_batch.shape}")



### Network architecture

We implement a simple two-layer neural network that uses ReLU activation.

The architecture is defined by the class NeuralNetwork, which is derived from PyTorch's nn.Module, the base class for all neural network modules built in PyTorch.

In [None]:
# Number of input descriptors per sample, derived from the selected descriptor
# list so it stays correct when other inputs are used
input_dim = len(descriptors_selected)
# Size of hidden layer
hidden_dim = 64
# Output is one float value representing logBB
output_dim = 1

class NeuralNetwork(nn.Module):
    """Two-layer fully connected regression network.

    Architecture: Linear(input_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> output_dim).
    The output layer is linear (no activation) so the network can predict
    negative as well as positive target values.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    hidden_dim : int
        Number of units in the hidden layer.
    output_dim : int
        Number of output values per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        nn.init.xavier_uniform_(self.layer_1.weight)
        self.layer_2 = nn.Linear(hidden_dim, output_dim)
        nn.init.xavier_uniform_(self.layer_2.weight)

    def forward(self, x):
        """Map a batch of inputs (last axis = input_dim) to predictions (last axis = output_dim)."""
        x = self.layer_1(x)
        x = torch.nn.functional.relu(x)
        # No ReLU after the output layer: it would clamp every prediction to
        # >= 0 and make negative targets unreachable.
        x = self.layer_2(x)

        return x

# Seed torch's global RNG so the weight initialisation (and the shuffling of
# the training loader that follows) is reproducible when the notebook is run
# top to bottom.
torch.manual_seed(1)

model = NeuralNetwork(input_dim, hidden_dim, output_dim)
print(model)
# Total number of scalar parameters, followed by the shape of each tensor
print('# of model parameters:', sum(p.numel() for p in model.parameters()))
for p in model.parameters():
    print(p.size())

### Optimizer and loss function

To train the model we define a loss function to calculate the gradients and an optimizer to update the parameters.

Here we use mean square error (MSE) and the ADAM optimizer with a learning rate of 0.01.

In [None]:
# Step size used by the optimizer
learning_rate = 0.01

# Mean squared error between predicted and experimental values
loss_fn = nn.MSELoss()

# Alternative optimizer, kept for experimentation:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

### Train model

In [None]:
num_epochs = 500
loss_values = []        # one entry per batch: [global batch counter, batch loss]
loss_epoch_train = []   # one entry per epoch: [global batch counter, mean training loss]
loss_epoch_valid = []   # one entry per epoch: [global batch counter, mean validation loss]

count_batches = 0
for epoch in range(num_epochs):
    model.train()
    loss_epoch = 0
    # Batch variables are not named X / y: they are notebook globals and `y`
    # would otherwise overwrite the full label array.
    for X_batch, y_batch in train_dataloader:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        pred = model(X_batch)
        loss = loss_fn(pred, y_batch)

        # save loss values (weighted by batch size: the last batch may be smaller)
        count_batches += 1
        loss_values.append([count_batches, loss.item()])
        loss_epoch += X_batch.shape[0]*loss.item()

        # optimize model
        loss.backward()
        optimizer.step()
    # save loss for each epoch
    loss_epoch_train.append([count_batches, loss_epoch/train_data.X.shape[0]])

    # validation loss: evaluated after the epoch's updates, without gradients
    model.eval()
    valid_loss_sum = 0
    with torch.no_grad():
        for X_batch, y_batch in valid_dataloader:
            valid_loss_sum += X_batch.shape[0]*loss_fn(model(X_batch), y_batch).item()
    loss_epoch_valid.append([count_batches, valid_loss_sum/valid_data.X.shape[0]])

# Persists the whole model object (not just the state dict); the evaluation
# cell loads it back with torch.load.
torch.save(model, "model_saved.pt")

print("Training Complete")

#### Plot training loss as function of number of batches (here: One epoch = 10 batches)

In [None]:
# Column 0: global batch counter, column 1: loss value
loss_values_np = np.asanyarray(loss_values)
loss_epoch_train_np = np.asanyarray(loss_epoch_train)

fig, ax = plt.subplots(figsize=(15,5))
# Loss of every individual batch, overlaid with the per-epoch mean (red)
ax.plot(loss_values_np[:,0], loss_values_np[:,1], "-", label="batch loss")
ax.plot(loss_epoch_train_np[:,0], loss_epoch_train_np[:,1], "-", c="r", label="epoch mean loss")
ax.set_title("Step-wise Loss")
ax.set_xlabel("Batches")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

#### Correlation between experimental and predicted logBB for training set

In [None]:
# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects. Only
# load a model_saved.pt that you trained yourself or obtained from a trusted
# source.
model = torch.load("model_saved.pt", weights_only=False)
model.eval()

# One forward pass without gradient tracking; the result is reused for the
# metrics and the plot below.
with torch.no_grad():
    pred_train = model(train_data.X).numpy()

# Mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_train, pred_train))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_train, pred_train))

# Plot outputs: predicted vs. experimental value; the diagonal marks a perfect prediction
plt.figure(figsize=(7, 7))
plt.scatter(y_train, pred_train, color="black", s=10)
plt.plot([-3.0, 2.0], [-3.0, 2.0], color="black", linewidth=3)

plt.xlabel("ground truth")
plt.ylabel("predicted")

plt.show()

#### Correlation between experimental and predicted logBB for test set

In [None]:
# One forward pass without gradient tracking; the result is reused for the
# metrics and the plot below.
with torch.no_grad():
    pred_test = model(test_data.X).numpy()

# Mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, pred_test))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, pred_test))

# Plot outputs: predicted vs. experimental value; the diagonal marks a perfect prediction
plt.figure(figsize=(7, 7))
plt.scatter(y_test, pred_test, color="red", s=10)
plt.plot([-3.0, 2.0], [-3.0, 2.0], color="black", linewidth=3)

plt.xlabel("ground truth")
plt.ylabel("predicted")

plt.show()

#### Comparison between training and validation loss

In [None]:
# Column 0: global batch counter at the end of the epoch, column 1: mean loss
loss_epoch_train_np = np.asanyarray(loss_epoch_train)
loss_epoch_valid_np = np.asanyarray(loss_epoch_valid)

# Convert the batch counter to an epoch number using the real number of
# batches per epoch instead of a hard-coded 10
batches_per_epoch = len(train_dataloader)

fig, ax = plt.subplots(figsize=(15,5))
ax.plot(loss_epoch_train_np[:,0]/batches_per_epoch, loss_epoch_train_np[:,1], "-", c="r", label="training")
if loss_epoch_valid_np.size:
    ax.plot(loss_epoch_valid_np[:,0]/batches_per_epoch, loss_epoch_valid_np[:,1], "-", c="g", label="validation")
else:
    # An empty list converts to a 1-D array, which cannot be indexed with [:, 0]
    print("No validation losses were recorded; only the training curve is shown.")
ax.set_title("Step-wise Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()