### Initial setup

In [1]:
from sklearn.model_selection import train_test_split

from src.features.build_features import Features

First we load the training features: `X` holds the feature vectors, `y` holds the integer class label for each row of `X`, and `y_transform` maps each integer label back to its text class name.

In [2]:
# Load the training features through the project's Features helper.
features = Features(project_base="../")
X, y, y_transform = features.load_training_features()

# Hold out 10% of the samples for testing. A fixed random_state makes the
# split (and therefore every accuracy figure reported below) reproducible
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

Next, we check the shape of the data.

In [3]:
# Dimensions of the full feature matrix and of the label vector.
for array in (X, y):
    print(array.shape)

# The number of class names and the number of distinct labels should be the same.
n_class_names = len(y_transform)
n_distinct_labels = len(set(y))
print(n_class_names, n_distinct_labels)

# Size of the training split.
print(X_train.shape)

(525782, 2048)
(525782,)
417 417
(473203, 2048)


So we have 525,782 data points, each with 2048 features. There are 417 different classes.

We hold out 10% of these for testing, so our training set has 473,203 data points.

In [4]:
# The train/test splits are all that is needed from here on; drop the
# original X and y to free up ~10GB of memory.
del X
del y

### Neural Net Model

As an experiment, we will try a simple neural network model in PyTorch. The model needs 2048 input neurons (one per feature) and 418 outputs (the size of the final layer defined below).

In [5]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# torch.as_tensor converts directly to the requested dtype. Unlike
# torch.Tensor(...).long() it does not route the integer labels through
# float32, and it also accepts inputs that are already tensors, so this cell
# can be re-run without raising.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.long)

# Training batches are reshuffled on every epoch.
my_dataset = TensorDataset(X_train, y_train)
trainloader = DataLoader(my_dataset, batch_size=100, shuffle=True, num_workers=2)

X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.long)

# Accuracy does not depend on sample order, so the test loader is not shuffled.
my_testset = TensorDataset(X_test, y_test)
testloader = DataLoader(my_testset, batch_size=100, shuffle=False, num_workers=2)

In [6]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    The default sizes reproduce the original fixed architecture
    (2048 -> 1024 -> 512 -> 418). The layer attribute names ``L1``, ``L2`` and
    ``output`` are unchanged, so ``state_dict`` keys stay the same.

    Args:
        in_features: Length of each input feature vector.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, in_features=2048, hidden1=1024, hidden2=512, num_classes=418):
        super().__init__()

        self.L1 = nn.Linear(in_features, hidden1)
        self.L2 = nn.Linear(hidden1, hidden2)
        self.output = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return the raw output-layer scores for ``x``.

        Args:
            x: Tensor whose last dimension has size ``in_features``.

        Returns:
            Tensor whose last dimension has size ``num_classes``. No softmax
            is applied; the values are unnormalised scores.
        """
        # Two hidden layers with ReLU activations, then a linear output layer.
        x = F.relu(self.L1(x))
        x = F.relu(self.L2(x))
        x = self.output(x)

        return x
    

# Build the network and move its parameters to the selected device.
# nn.Module.to() returns the module itself, so the call can be chained.
net = Net().to(device)


    
import torch.optim as optim

# Cross-entropy loss on the network outputs, optimised with SGD plus momentum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    params=net.parameters(),
    lr=0.1,
    momentum=0.9,
)




# Train for 30 full passes over the training data (counters are 1-based).
for epoch_number in range(1, 31):

    loss_since_report = 0.0
    for step, (batch_inputs, batch_labels) in enumerate(trainloader, start=1):
        # Move the current mini-batch to the same device as the model.
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        scores = net(batch_inputs)
        batch_loss = criterion(scores, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Report the mean loss over each window of 2000 mini-batches.
        loss_since_report += batch_loss.item()
        if step % 2000 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch_number, step, loss_since_report / 2000))
            loss_since_report = 0.0

print('Finished Training')

[1,  2000] loss: 1.971
[1,  4000] loss: 0.923
[2,  2000] loss: 0.630
[2,  4000] loss: 0.608
[3,  2000] loss: 0.488
[3,  4000] loss: 0.501
[4,  2000] loss: 0.425
[4,  4000] loss: 0.447
[5,  2000] loss: 0.391
[5,  4000] loss: 0.406
[6,  2000] loss: 0.370
[6,  4000] loss: 0.389
[7,  2000] loss: 0.357
[7,  4000] loss: 0.370
[8,  2000] loss: 0.346
[8,  4000] loss: 0.359
[9,  2000] loss: 0.333
[9,  4000] loss: 0.350
[10,  2000] loss: 0.328
[10,  4000] loss: 0.345
[11,  2000] loss: 0.323
[11,  4000] loss: 0.338
[12,  2000] loss: 0.316
[12,  4000] loss: 0.331
[13,  2000] loss: 0.311
[13,  4000] loss: 0.326
[14,  2000] loss: 0.307
[14,  4000] loss: 0.322
[15,  2000] loss: 0.302
[15,  4000] loss: 0.319
[16,  2000] loss: 0.304
[16,  4000] loss: 0.314
[17,  2000] loss: 0.297
[17,  4000] loss: 0.315
[18,  2000] loss: 0.297
[18,  4000] loss: 0.310
[19,  2000] loss: 0.293
[19,  4000] loss: 0.308
[20,  2000] loss: 0.292
[20,  4000] loss: 0.306
[21,  2000] loss: 0.291
[21,  4000] loss: 0.301
[22,  2000

In [9]:
# Persist only the learned parameters (the state dict), not the module object.
trained_weights = net.state_dict()
torch.save(trained_weights, "../models/simple_nn_model.pth")

In [7]:
# Measure top-1 and top-5 accuracy in a single pass over the test set, so each
# batch goes through the network once instead of twice.
net.eval()  # switch to inference mode before evaluating

correct = 0
total = 0
correct_k = 0
total_k = 0
with torch.no_grad():
    for data in testloader:
        inputs, labels = data[0].to(device), data[1].to(device)
        outputs = net(inputs)
        batch_size = labels.size(0)

        # Top-1: the index of the highest score must equal the label.
        _, predicted = torch.max(outputs, 1)
        total += batch_size
        correct += (predicted == labels).sum().item()

        # Top-5: the label must appear among the five highest-scoring indices.
        # Comparing against labels broadcast as a column checks the whole
        # batch at once instead of looping over samples in Python.
        _, pred = torch.topk(outputs, 5, 1)
        correct_k += (pred == labels.unsqueeze(1)).any(dim=1).sum().item()
        total_k += batch_size

print('Accuracy of the network on the test set: %d %%' % (100 * correct / total))
print('Top 5 accuracy of the network on the test set: %d %%' % (100 * correct_k / total_k))

Accuracy of the network on the test set: 79 %
Top 5 accuracy of the network on the test set: 97 %


In [8]:
# Predict the most likely classes for a single molecule given as a SMILES string.
smiles = "O=C1NC2=CC=C([N+]([O-])=O)C=C2C(C3=CC=CC=C3F)=NC1"
model_input = features.get_numpy_fingerprint_from_smiles(smiles)
model_input = torch.Tensor(model_input).float()
inputs = model_input.to(device)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    output = net(inputs)

# Indices of the 15 highest-scoring classes, best first.
# NOTE(review): assumes the fingerprint is a single 1-D vector, so `pred` is
# 1-D and tolist() yields plain Python ints -- confirm against the helper.
_, pred = torch.topk(output, 15)
preds = [y_transform[class_index] for class_index in pred.tolist()]
for rank, class_name in enumerate(preds, start=1):
    print("{}. {}".format(rank, class_name))

1. Phosphodiesterase 4D, Phosphodiesterase 4A, Phosphodiesterase 4B, Phosphodiesterase 3B, Phosphodiesterase 5A, Phosphodiesterase 4C, Phosphodiesterase 2A, Phosphodiesterase 7A, Phosphodiesterase 3A, Phosphodiesterase 10A, Phosphodiesterase 11A, Phosphodiesterase 9A, Phosphodiesterase 6C, Phosphodiesterase 1A, Phosphodiesterase 1C, Phosphodiesterase 7B, Phosphodiesterase 8B, Phosphodiesterase 8A, Phosphodiesterase 1B, Phosphodiesterase 6A, Phosphodiesterase 6D
2. Indoleamine 2,3-dioxygenase, Indoleamine 2,3-dioxygenase 2
3. Cyclin-dependent kinase 1/cyclin B1, Cyclin-dependent kinase 2/cyclin A, Cyclin-dependent kinase 1/cyclin B, Cyclin-dependent kinase 2/cyclin E, Cyclin-dependent kinase 4/cyclin D1, Cyclin-dependent kinase 2/cyclin E1, Cyclin-dependent kinase 4/cyclin D, Cyclin-dependent kinase 4/cyclin D2
4. Monoamine oxidase B, Monoamine oxidase A
5. Cytochrome P450 11B1, Cytochrome P450 11B2, Cytochrome P450 1B1, Cytochrome P450 17A1, Cytochrome P450 19A1, Cytochrome P450 2J2, C