In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from torch import optim
from tqdm import tqdm_notebook

In [4]:
from transformer.models import TrfmSeq2seqProp2

In [5]:
df = pd.read_csv("data/chembl_24_chemreps.csv")
df.head()

Unnamed: 0,canonical_smiles,sas,logP,qed
0,Cc1cc(cn1C)c2csc(N=C(N)N)n2,3.048474,1.36192,0.608781
1,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...,9.08459,-16.6611,0.016357
2,CCCC[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN...,9.752879,-12.1958,0.010798
3,CC(C)C[C@@H]1NC(=O)CNC(=O)[C@@H](NC(=O)[C@H](N...,9.146251,-5.5864,0.016726
4,Brc1cccc(Nc2ncnc3ccncc23)c1NCCN4CCOCC4,2.595185,3.275,0.623114


In [7]:
from transformer.dataset import Seq2seqDatasetProp
from transformer.build_vocab import WordVocab

In [18]:
vocab = WordVocab.load_vocab('data/vocab.pkl')
dataset = Seq2seqDatasetProp(df, vocab)
loader = DataLoader(dataset, batch_size=32, num_workers=56, pin_memory=True)

In [21]:
model = TrfmSeq2seqProp2(len(vocab), 256, len(vocab), 4)
model.load_state_dict(torch.load("exps/exp1/ST_49_1767-0.320578.pkl"))
model.eval()

TrfmSeq2seqProp2(
  (embed): Embedding(75, 256)
  (pe): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (trfm): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=256, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
  

In [26]:
with torch.no_grad():
    for i, (x, y) in enumerate(loader):
        _, p = model(x)
        print(y.numpy())
        print(p.numpy())
        print(model.encode(x))
        break

[[ 1.3619200e+00  6.0878122e-01  3.0484741e+00]
 [-1.6661100e+01  1.6356876e-02  9.0845900e+00]
 [-1.2195800e+01  1.0797775e-02  9.7528791e+00]
 [-5.5864000e+00  1.6725697e-02  9.1462507e+00]
 [ 3.2750001e+00  6.2311411e-01  2.5951846e+00]
 [ 2.1281700e+00  5.5315000e-01  2.6586289e+00]
 [ 2.0427999e+00  7.9487485e-01  1.9519814e+00]
 [ 4.3040001e-01  2.0208402e-01  6.3635716e+00]
 [ 1.1223000e+00  1.4987187e-01  3.7469871e+00]
 [ 3.9300001e+00  4.1431710e-02  7.7566690e+00]
 [ 6.8586998e+00  2.1149287e-01  4.2275925e+00]
 [ 1.1734000e+00  3.7493873e-02  5.4670529e+00]
 [ 6.3139999e-01  5.6085479e-02  3.0652828e+00]
 [ 4.0012002e+00  2.6559186e-01  6.6093149e+00]
 [ 4.9428000e+00  2.5068691e-01  3.8377149e+00]
 [ 2.5766001e+00  7.9252049e-02  7.4949985e+00]
 [ 3.5840001e+00  5.5678141e-01  3.2889094e+00]
 [ 1.4102000e+00  4.5694306e-01  1.9019942e+00]
 [ 4.5454998e+00  4.5240137e-01  3.5957432e+00]
 [ 2.7349200e+00  5.7064170e-01  2.2314897e+00]
 [ 4.2009702e+00  7.4722511e-01  2.32088

In [13]:
model

TrfmSeq2seqProp2(
  (embed): Embedding(75, 256)
  (pe): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (trfm): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=256, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
  

In [7]:
Xt = torch.load("X.tensor")
Y = df[["logP", "qed", "SAS"]].values[:Xt.size()[0]]
Yt = torch.from_numpy(Y)
Yt = Yt.to(torch.float32)

In [8]:
Xt = Xt.to(device='cuda')
Yt = Yt.to(device='cuda')
print(Xt.size(), Yt.size())

torch.Size([248592, 1024]) torch.Size([248592, 3])


In [9]:
dataset = TensorDataset(Xt, Yt)
test_size = int(0.1*len(dataset))
train, test = torch.utils.data.random_split(dataset, [len(dataset) - test_size, test_size])
batch_size = 64
epochs = 100
traindataloader = DataLoader(train, batch_size=batch_size, shuffle=True)#, num_workers=8, pin_memory=True)
testdataloader = DataLoader(test, batch_size=batch_size, shuffle=False)

In [10]:
def eval(model):
    model.eval()
    eval_loss = 0
    for i, (x, y) in enumerate(testdataloader):
        with torch.no_grad():
            prop = model(x)
            pred_loss = F.mse_loss(prop[:, 0], y[:, 0]) + \
                        F.mse_loss(prop[:, 1], y[:, 1]) + \
                        F.mse_loss(prop[:, 2], y[:, 2])
            eval_loss += pred_loss.item()

    return (eval_loss/len(testdataloader))

In [11]:
model = PredictorModel(1024, 3, 0.0).cuda(device="cuda")
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [12]:
model.train()
best_loss = None
for e in tqdm_notebook(range(epochs)):
    for i, (x, y) in enumerate(traindataloader):
        optimizer.zero_grad()
        prop = model(x)
        pred_loss = F.mse_loss(prop[:, 0], y[:, 0]) + \
                    F.mse_loss(prop[:, 1], y[:, 1]) + \
                    F.mse_loss(prop[:, 2], y[:, 2])
        pred_loss.backward()
        optimizer.step()
    
    
    eval_loss = eval(model)
    if e%5==0:
        print(f"Train Loss: {pred_loss.item()}")
        print(f"Eval Loss: {eval_loss}")
    if not best_loss or eval_loss < best_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), "./saved_models/%d_%f.mdl" % (e, best_loss))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Train Loss: 0.8560492396354675
Eval Loss: 0.6363265601712205
Train Loss: 0.28453484177589417
Eval Loss: 0.30980443004470865
Train Loss: 0.2623657286167145
Eval Loss: 0.2926026750882671
Train Loss: 0.2861662209033966
Eval Loss: 0.2848143145863379
Train Loss: 0.18335600197315216
Eval Loss: 0.2837755856676396
Train Loss: 0.2956135869026184
Eval Loss: 0.2869073555141611
Train Loss: 0.26382842659950256
Eval Loss: 0.2792869941511007
Train Loss: 0.17775124311447144
Eval Loss: 0.2811605188717879
Train Loss: 0.15396934747695923
Eval Loss: 0.28320693333841535
Train Loss: 0.15040983259677887
Eval Loss: 0.2817502891726236
Train Loss: 0.17452488839626312
Eval Loss: 0.2836908488546362
Train Loss: 0.2980740964412689
Eval Loss: 0.2972186971216398
Train Loss: 0.17469362914562225
Eval Loss: 0.2878703772638635
Train Loss: 0.124408058822155
Eval Loss: 0.2897227902568704
Train Loss: 0.17002902925014496
Eval Loss: 0.291902334072596
Train Loss: 0.1411111056804657
Eval Loss: 0.2942299855092497
Train Loss: 0.1

In [14]:
model.load_state_dict(torch.load('saved_models/27_0.274643.mdl'))

<All keys matched successfully>

In [15]:
import numpy as np

In [16]:
model.eval()
ytrue = np.empty((1000, 3), np.float32)
ypred = np.empty_like(ytrue, np.float32)
for i, (x,y) in enumerate(testdataloader):
    with torch.no_grad():
        p = model(x)
        for p, q in zip(y, p):
            ytrue[i*batch_size:i*batch_size+batch_size] = p.cpu().numpy()
            ypred[i*batch_size:i*batch_size+batch_size] = q.cpu().numpy()
#         break

`logp, qed, SAS`

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [19]:
[mean_absolute_error(ytrue[:, i], ypred[:, i]) for i in range(3)]

[0.31905702, 0.049619623, 0.11448679]

In [23]:
[np.sqrt(mean_squared_error(ytrue[:, i], ypred[:, i])) for i in range(3)]

[0.40899307, 0.06349394, 0.14717217]

Note: The model was trained to predict 3 labels which is harder than predicting just one. The encoded representations were obtained from a network that was trained probably one label (need to check). So what we can do is train the transformer model on entire chembl24 dataset and predict 3 labels. That way, this result would be better hopefully.