In [1]:
# Authenticate this Colab session, then mount Google Drive at /content/gdrive
# (later cells read the SMILES file and write checkpoints under that mount).
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
#Install miniconda and rdkit
# Download the Miniconda installer (-c resumes a partially downloaded file).
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# Run the installer with /usr/local as the target prefix (the captured log
# below reports PREFIX=/usr/local).
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit from the conda-forge channel (-q quiet, -y skip confirmation).
!time conda install -q -y -c conda-forge rdkit
import sys
# Make the conda site-packages directory importable from the running kernel.
# NOTE(review): this path hard-codes python3.7 while the installer above is
# "Miniconda3-latest"; confirm the installed Python version still matches,
# otherwise `from rdkit import Chem` in a later cell will fail.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

--2019-12-31 12:44:29--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.201.79, 104.18.200.79, 2606:4700::6812:c94f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.201.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71785000 (68M) [application/x-sh]
Saving to: ‘Miniconda3-latest-Linux-x86_64.sh’


2019-12-31 12:44:30 (160 MB/s) - ‘Miniconda3-latest-Linux-x86_64.sh’ saved [71785000/71785000]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - \ done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.2.0=py37_0
    - ca-certificates==2019.10.16=0
    - certifi==2019.9.11=py37_0
    - cffi==1.13.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0
    - co

In [3]:
#Read smiles files
# Each data line is expected to hold at least two whitespace-separated
# fields; the second field is the SMILES string. Only strings shorter than
# `maxlen` characters are kept.
maxlen = 64
smiles = []
with open('/content/gdrive/My Drive/Colab Notebooks/smiles.txt') as f:
  for line in f:
    fields = line.split()
    # Skip blank or malformed lines instead of raising IndexError.
    if len(fields) < 2:
      continue
    if len(fields[1]) < maxlen:
      smiles.append(fields[1])


#Characters of smiles
# Build the vocabulary from every character seen in the corpus. 'X' is the
# end-of-sequence / padding symbol and must be the LAST entry (the collate
# function pads with that index), so drop it from the sorted part first to
# avoid a duplicate entry if the data happens to contain an 'X'.
all_smiles = ''.join(smiles)
chars = sorted(set(all_smiles) - {'X'})
chars.append('X')
c_to_i = {c:i for i,c in enumerate(chars)}
i_to_c = {i:c for i,c in enumerate(chars)}

print ('Max len:', maxlen)
print ('Number of chars:', len(chars))
print (chars)
print (c_to_i)

Max len: 64
Number of chars: 46
['#', '(', ')', '+', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=', '@', 'B', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'S', 'V', 'Z', '[', '\\', ']', 'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 'u', 'X']
{'#': 0, '(': 1, ')': 2, '+': 3, '-': 4, '.': 5, '/': 6, '1': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '=': 14, '@': 15, 'B': 16, 'C': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'K': 22, 'L': 23, 'M': 24, 'N': 25, 'O': 26, 'P': 27, 'S': 28, 'V': 29, 'Z': 30, '[': 31, '\\': 32, ']': 33, 'a': 34, 'c': 35, 'e': 36, 'g': 37, 'i': 38, 'l': 39, 'n': 40, 'o': 41, 'r': 42, 's': 43, 'u': 44, 'X': 45}


In [0]:
#Dataset
from torch.utils.data import Dataset, DataLoader
import numpy as np

class MolDataset(Dataset):
    """Dataset of SMILES strings encoded as integer token sequences.

    Args:
        smiles: indexable collection of SMILES strings.
        c_to_i: mapping from a character to its integer index. It must
            contain every character occurring in `smiles` as well as the
            terminator character 'X'.
    """

    def __init__(self, smiles, c_to_i):
        self.smiles = smiles
        self.c_to_i = c_to_i

    def __len__(self):
        """Return the number of SMILES strings in the dataset."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Encode one SMILES string.

        Returns:
            dict with
              'X': 1-D int64 tensor of token indices, including the trailing
                   'X' terminator;
              'L': length of the string without the terminator.

        Raises:
            KeyError: if the string contains a character missing from
                `c_to_i`.
        """
        # 'X' is appended as the end-of-sequence token.
        s = self.smiles[idx] + 'X'
        # Use the mapping handed to this instance rather than a
        # notebook-level global of the same name.
        tokens = torch.tensor([self.c_to_i[c] for c in s], dtype=torch.long)
        return {'X': tokens, 'L': len(s) - 1}

In [0]:
#Collate fn
def my_collate(batch, padding_value=45):
    """Collate variable-length samples into one padded batch.

    Args:
        batch: list of dicts, each with 'X' (1-D tensor of token indices)
            and 'L' (a number, the sequence length).
        padding_value: index used to right-pad shorter sequences. The
            default 45 is the index of 'X' in the 46-character vocabulary
            built earlier in this notebook; pass a different value when the
            vocabulary changes.

    Returns:
        dict with
          'X': tensor of shape (batch, longest_sequence) padded with
               `padding_value`;
          'L': float tensor holding the 'L' value of every sample (callers
               in this notebook cast it with `.long()`).
    """
    padded = torch.nn.utils.rnn.pad_sequence(
        [b['X'] for b in batch],
        batch_first=True,
        padding_value=padding_value,
    )
    lengths = torch.Tensor([b['L'] for b in batch])
    return {'X': padded, 'L': lengths}

In [0]:
#Model
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNRegressor(torch.nn.Module):
  """GRU encoder/decoder with a Gaussian latent code (a sequence VAE).

  The encoder reads embedded tokens; its output at the last real token of
  every sequence is mapped to `mu` and `logvar`. The decoder receives, at
  every step, the previous token embedding concatenated with the sampled
  latent vector and predicts the next token.

  Args:
    n_feature: embedding size, GRU hidden size and latent size.
    n_rnn_layer: number of stacked GRU layers in encoder and decoder.
    n_char: vocabulary size.
  """

  def __init__(self, n_feature=128, n_rnn_layer = 1, n_char=46):
    super(RNNRegressor, self).__init__()
    self.n_feature = n_feature
    self.encoder = nn.GRU(input_size=n_feature, hidden_size=n_feature,
                          num_layers=n_rnn_layer)
    # Decoder input = token embedding concatenated with the latent vector.
    self.decoder = nn.GRU(input_size=n_feature*2, hidden_size=n_feature,
                          num_layers=n_rnn_layer)

    self.mu = nn.Linear(n_feature, n_feature)
    self.logvar = nn.Linear(n_feature, n_feature)

    self.fc = nn.Linear(n_feature, n_char)
    self.embedding = nn.Embedding(n_char, n_feature)

  def reparameterize(self, mu, logvar):
    """Draw z = mu + eps * exp(0.5 * logvar) with eps ~ N(0, I)."""
    std = torch.exp(0.5*logvar)
    eps = torch.randn_like(std)
    return mu + eps*std

  def forward(self, x, l):
    """Encode, sample a latent code, decode, and compute both loss terms.

    Args:
      x: integer tensor of token indices, shape (batch, seq).
      l: per-sample integer lengths; tokens 0..l[i] (inclusive) of sample i
        are scored, and encoder step l[i]-1 provides the latent statistics.

    Returns:
      (loss1, loss2, dec_output) where loss1 is the reconstruction
      cross-entropy term, loss2 the KL term summed over batch and latent
      dimensions, and dec_output the logits of shape (seq, batch, n_char).
    """
    x_emb = self.embedding(x).permute((1,0,2))          # (seq, batch, feat)
    enc_output, _ = self.encoder(x_emb)

    # Gather the encoder output at step l[i]-1 for every sample in one
    # indexing op (same result as looping over the batch).
    l = torch.as_tensor(l, dtype=torch.long, device=x.device)
    batch_idx = torch.arange(x.size(0), device=x.device)
    z = enc_output[l - 1, batch_idx]

    mu = self.mu(z)
    logvar = self.logvar(z)
    sampled_z = self.reparameterize(mu, logvar)

    # An all-ones vector acts as the start-of-sequence input. It is created
    # on the same device/dtype as the embeddings so the model also runs on
    # CPU-only hosts.
    start = x_emb.new_ones(1, x_emb.size(1), x_emb.size(2))
    dec_input = torch.cat([start, x_emb], 0)
    repeat_z = sampled_z.unsqueeze(0).expand(dec_input.size(0), -1, -1)
    dec_input = torch.cat([dec_input, repeat_z], -1)
    dec_output, _ = self.decoder(dec_input)
    # Drop the last step: its input is the final token, nothing to predict.
    dec_output = self.fc(dec_output[:-1])

    loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')
    n_tokens = (l + 1).tolist()   # one host sync instead of one per sample
    loss1 = 0
    for i, n in enumerate(n_tokens):
      loss1 = loss1 + loss_fn(dec_output[:n, i, :], x[i, :n])
    # NOTE(review): the per-sample losses are summed over the batch but
    # divided by the padded sequence length (len(dec_output)), not by the
    # batch size. Kept as-is to preserve the training objective; confirm
    # whether `len(l)` was intended.
    loss1 /= len(dec_output)

    # KL divergence between N(mu, exp(logvar)) and N(0, I), summed.
    loss2 = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    return loss1, loss2, dec_output

  def indices_to_string(self, indices, i_to_c):
    """Map an iterable of token indices to a string using `i_to_c`."""
    return ''.join([i_to_c[i] for i in indices])

  def sample(self, max_len=120):
    """Generate one token sequence from a latent vector drawn from N(0, I).

    Args:
      max_len: number of tokens to generate (generation does not stop at
        the terminator; callers truncate the decoded string themselves).

    Returns:
      list of `max_len` Python ints (token indices).
    """
    device = next(self.parameters()).device
    gen_smiles = []
    # Sampling never needs gradients; avoid building an autograd graph.
    with torch.no_grad():
      sampled_z = torch.randn(1, self.n_feature, device=device)
      token = torch.ones((1, self.n_feature), device=device)
      state = torch.zeros((self.decoder.num_layers, 1, self.n_feature),
                          device=device)

      for _ in range(max_len):
        dec_input = torch.cat([token, sampled_z], -1).unsqueeze(0)
        output, state = self.decoder(dec_input, state)
        probs = F.softmax(self.fc(output), -1).reshape(-1)
        sampled_idx = torch.distributions.categorical.Categorical(probs).sample()
        gen_smiles.append(int(sampled_idx))
        # Feed the sampled token back in as the next decoder input.
        token = self.embedding(sampled_idx.view(1))
    return gen_smiles

In [9]:
#Train model
import os
import time
lr = 1e-4
# Fall back to CPU when no GPU is attached to the runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Vocabulary size comes from the character table built when the SMILES
# file was read (46 for the data shown in this notebook).
model = RNNRegressor(512, 1, len(chars)).to(device)

#Dataset
# First 100000 strings are used for training, the remainder is held out.
train_smiles = smiles[:100000]
test_smiles = smiles[100000:]
train_dataset = MolDataset(train_smiles,  c_to_i)
test_dataset = MolDataset(test_smiles, c_to_i)

#Dataloader
train_dataloader = DataLoader(train_dataset, batch_size=128, num_workers=1, \
                              collate_fn=my_collate, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, num_workers=1, \
                             collate_fn=my_collate)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# Not used by the loop below (the model returns its own loss terms); kept
# so the name stays defined for any other cell that refers to it.
loss_fn = nn.MSELoss()
loss_list = []
st = time.time()

# Make sure the checkpoint directory exists before the first torch.save.
save_dir = '/content/gdrive/My Drive/Colab Notebooks/save'
os.makedirs(save_dir, exist_ok=True)

for epoch in range(100):
  epoch_loss = []
  for i_batch, batch in enumerate(train_dataloader):
    optimizer.zero_grad()
    x, l = batch['X'].to(device).long(), batch['L'].to(device).long()
    loss1, loss2, dec_output = model(x, l)
    loss = loss1 + loss2*1.0
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    # Store plain Python floats: [reconstruction loss, KL loss].
    rec, kl = loss1.item(), loss2.item()
    loss_list.append([rec, kl])
    epoch_loss.append([rec, kl])

  # Qualitative check on the first sample of the last batch of the epoch:
  # the input, its greedy reconstruction, and one unconditional sample.
  indices = dec_output[:,0,:].argmax(-1)
  print ('input:\t', model.indices_to_string(x[0].tolist(), i_to_c))
  print ('output:\t', model.indices_to_string(indices.tolist(), i_to_c))
  print ('gen:\t', model.indices_to_string(model.sample(), i_to_c).split('X')[0])
  fn = os.path.join(save_dir, f'save_{epoch}.pt')
  torch.save(model.state_dict(), fn)
  # Epoch index, mean reconstruction loss, mean KL loss.
  epoch_mean = np.mean(np.array(epoch_loss), axis=0)
  print (epoch, epoch_mean[0], epoch_mean[1])
end = time.time()
print ('Time:', end-st)

input:	 Cc1ccc(NCc2nc3ccccc3n2Cc2ccccc2)cc1CXXXXXXXXXXXXXXXXXXXXXXXXXXXX
output:	 CC1ccc(CC(2cc(ccccc3c2C)2ccccc2)cc1XXCCccccccccccccccccccccccccc
gen:	 O=C1/C(=N\N=CCN2C(C)C1CCc2cccc2C1c1ccc(OC)cc1
0 2.3963323 16.047918
input:	 CCc1ccc(/C=C2\SC(=S)N(CCCC(=O)NNC(=O)c3ccccc3Cl)C2=O)cc1XXXXXXXX
output:	 CCO1ccc(NC=C2\SC(=S)N(CCCCC=O)Nc3(=O)c3ccccc3)l)C2=O)cc1XXXXcccc
gen:	 Cc1ccccc1Cl.Cl
1 1.5784793 0.104390524
input:	 COc1ccc(N2C(=O)C3C(C2=O)N2CCCN2C3c2ccc(Cl)cc2)cc1XXXXXXXXXXXX
output:	 CCc1ccc(CCC(=O)C(C(C()O)CCCCCC2C(c2ccccCl)cc2)cc1XCXccccccccc
gen:	 N#CC1=C(N)OC2=CC(CO(C)C2)C1c1ccc(O)cc1
2 1.412535 0.06638413
input:	 CCOc1ccc(-n2c(C)c(C(=O)OC3CCCCC3)c3cc(OC)ccc32)cc1XXXXXXXXXXXXXX
output:	 CCCC1ccc(Cc2c(C)ccC)=O)NCCCCCCC3)c3cccCC)ccc3n)cccXCXccccccccccc
gen:	 N#CCS/C=C(\C#N)c1c(C(F)(F)F)oc2ccc(OCc3ccccc3F)cc21
3 1.3219619 0.079605386
input:	 C/C(=N\NC(=O)CSc1ccccn1)c1ccc(Br)cc1XXXXXXXXXXXXXXXXXXXXXXXXXX
output:	 CCC(=N\NC(=O)CNc1ncccc1)c1ccccCr)cc1XCcccccccccccccccccccccccc
gen:	 O

In [10]:
# generate molecules
# For a selection of saved epochs, draw 100 samples and report how many
# parse as molecules and how many of those are distinct.
from rdkit import Chem
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNRegressor(512, 1, len(chars)).to(device)

for i in [0,1,2,5,10,20,40,60,80,99]:
  fn = f'/content/gdrive/My Drive/Colab Notebooks/save/save_{i}.pt'
  # map_location lets checkpoints written on a GPU load on any runtime.
  model.load_state_dict(torch.load(fn, map_location=device))

  # Everything from the first 'X' (end-of-sequence) onwards is discarded.
  generated = [model.indices_to_string(model.sample(), i_to_c).split('X')[0]
               for _ in range(100)]

  # Use a dedicated name here: rebinding `smiles` would overwrite the
  # training corpus that the dataset/training cells read.
  # The `g and` guard rejects empty strings, which RDKit parses into an
  # empty molecule instead of returning None.
  valid_smiles = [g for g in generated
                  if g and Chem.MolFromSmiles(g) is not None]

  # epoch, number of valid samples, number of unique valid samples
  print (i, len(valid_smiles), len(set(valid_smiles)))

0 8 8
1 31 31
2 27 27
5 46 46
10 61 61
20 73 73
40 86 86
60 89 89
80 88 88
99 89 89


In [11]:
# Smiles samples
# Load the final checkpoint and print every sample (out of 100 draws) that
# RDKit can parse.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNRegressor(512, 1, len(chars)).to(device)

fn = '/content/gdrive/My Drive/Colab Notebooks/save/save_99.pt'
# map_location lets a checkpoint written on a GPU load on any runtime.
model.load_state_dict(torch.load(fn, map_location=device))

for _ in range(100):
  # Keep only the part before the first 'X' (end-of-sequence).
  gen = model.indices_to_string(model.sample(), i_to_c).split('X')[0]
  # Skip empty strings: RDKit parses '' into an empty molecule, not None.
  if gen and Chem.MolFromSmiles(gen) is not None: print (gen)

COc1ccc(/N=C/c2c(O)ccc3oc4c(c23)CCCC4)cc1.Cl
Cc1c(Br)c([N+](=O)[O-])nn1CC(=O)Nc1sc2c(c1C#N)CCCCC2
COC(=O)N/N=C/c1cn(CCOc2cccc(Cl)c2)c2ccccc12
C/C(Cl)=C\CSc1nc2c(c(=O)[nH]c(=O)n2C)n1CCc1ccccc1
COc1ccc(/C=N/Nc2nc(C)cc(C)c2C#N)cc1
CCCCCCCSc1nc2c(c(=O)[nH]1)C(C)(C1CCCCC1)Cc1ccccc1-2
C/C(=N\NC(=O)CSc1n[nH]c(N)c1C#N)c1ccccc1
O=C1OC(c2ccco2)=N/C1=C\c1ccc(Br)cc1
Cc1ccc(OCC(=O)NC(=S)Nc2cccc(NC(=O)c3ccccc3)c2)cc1
Cc1ccc(Nc2cc(C)nc(-c3ccccc3)n2)cc1
COc1cc(/C=C/C(=O)c2ccc(C(C)(C)C)cc2)cc(OC)c1OC
Cc1cccc(N2C(=O)C3C(C2=O)N2CCCN2C3c2ccccc2)c1
O=C(COc1ccc(Cl)cc1Cl)N/N=C/c1ccccc1OCc1ccccc1
CCCCn1c(N/N=C(/C)c2ccc(OC)cc2)nc2c1c(=O)[nH]c(=O)n2C
CCc1ccccc1NC(=O)c1cc2cc(Br)ccc2oc1=O
CN(C)c1ccc(/C=N/Nc2nnc3c(nc2=O)[nH]c2ccc(Br)cc23)cc1
CCC(C)c1ccc(OCCOc2ccc(-n3ncc(C(=O)OCC(=O)O)c3)s2)cc1
O=C1C=CC(=O)N1CCP(=O)(O)CN1CCOCC1
CCCCCCCCCSc1nc2ccccc2c(=O)n1-c1ccccc1C
CCCCOc1ccc(-c2c(C#N)c(N)nc(SCC(=O)c3ccc(OC)cc3)c2C#N)cc1
COc1ccccc1NC(=O)C1Sc2ccccc2N1
CC(O)c1ccc(Oc2ccccc2-c2ccc(/C=N/Nc3cc(C)c(C)cc3C)o2)cc1
COc1ccc(