In [236]:
import torch
from torch import nn
from rdkit.Chem import AllChem as Chem
import numpy as np

In [266]:
smiles = ["CCN=C=O","NC(=O)CC(=O)O"]


In [267]:
def create_dict(smiles, add_tokens=False):
    vocabulary = []    
    if add_tokens: 
        vocabulary=["<sos>", "<eos>", "<pad>"]
    for smile in smiles:
        atoms = []
        for i in range(len(smile)):
            atoms.append(smile[i])
            
        vocabulary += list(set(atoms)-set(vocabulary))
    try:
        i = vocabulary.index("r")
        k = vocabulary.index("B")
        del vocabulary[i], vocabulary[k]
        vocabulary +=["Br"]
    except:
        print("No `Br` in SMILES")

    try:
        i = vocabulary.index("l")
        del vocabulary[i]
        vocabulary +=["Cl"]
    except:
        print("No `Cl` in SMILES")
    
    return {vocabulary[i]: i for i in range(len(vocabulary))}   


def tokenize(smiles, dictionary):
    token_smiles = []
    for smile in smiles:
        token_smile= []
        i = 0
        while i < len(smile):
            if (smile[i:i+2]=="Cl"):
                token_smile.append(dictionary["Cl"])
                i+=2
            elif (smile[i:i+2]=="Br"):
                token_smile.append(dictionary["Br"])
                i+=2
            else:
                token_smile.append(dictionary[smile[i]])
                i+=1
        token_smiles.append(token_smile)   
    return token_smiles


def token_to_onehot(tokenized_smiles, vocabulary_length):
    one_hot_ll = list()
    for smile in tokenized_smiles:
        one_hot_matrix=np.zeros([len(smile),vocabulary_length])
        for i, token in enumerate(smile):
            one_hot_matrix[i,token]=1
        one_hot_ll.append(one_hot_matrix)
    return np.stack(one_hot_ll)


In [289]:
dictionary = create_dict(smiles)
tokenized_smiles = tokenize(smiles,dictionary)
tokenized_smiles

No `Br` in SMILES
No `Cl` in SMILES


[[2, 2, 1, 0, 2, 0, 3], [1, 2, 5, 0, 3, 4, 2, 2, 5, 0, 3, 4, 3]]

In [290]:
[len(x) for x in tokenized_smiles]

[7, 13]

In [291]:
dictionary["<pad>"] = len(dictionary)
dictionary

{'=': 0, 'N': 1, 'C': 2, 'O': 3, ')': 4, '(': 5, '<pad>': 6}

In [292]:
tokenized_smiles[0] += [dictionary["<pad>"]]*6
tokenized_smiles[0]

[2, 2, 1, 0, 2, 0, 3, 6, 6, 6, 6, 6, 6]

In [293]:
vocabulary_length = len(dictionary)
vocabulary_length

7

In [299]:
onehot_tokens = token_to_onehot(tokenized_smiles, vocabulary_length)
print(onehot_tokens, onehot_tokens.shape)

[[[0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 1.]]

 [[0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]
  [1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]]] (2, 13, 7)


In [296]:
np.random.seed(1234)
embedding_layer=np.random.rand(7,4)
embedding_layer

array([[0.19151945, 0.62210877, 0.43772774, 0.78535858],
       [0.77997581, 0.27259261, 0.27646426, 0.80187218],
       [0.95813935, 0.87593263, 0.35781727, 0.50099513],
       [0.68346294, 0.71270203, 0.37025075, 0.56119619],
       [0.50308317, 0.01376845, 0.77282662, 0.88264119],
       [0.36488598, 0.61539618, 0.07538124, 0.36882401],
       [0.9331401 , 0.65137814, 0.39720258, 0.78873014]])

In [301]:
token_embeddings = np.matmul(onehot_tokens,embedding_layer)
print(token_embeddings[0])

[[0.95813935 0.87593263 0.35781727 0.50099513]
 [0.95813935 0.87593263 0.35781727 0.50099513]
 [0.77997581 0.27259261 0.27646426 0.80187218]
 [0.19151945 0.62210877 0.43772774 0.78535858]
 [0.95813935 0.87593263 0.35781727 0.50099513]
 [0.19151945 0.62210877 0.43772774 0.78535858]
 [0.68346294 0.71270203 0.37025075 0.56119619]
 [0.9331401  0.65137814 0.39720258 0.78873014]
 [0.9331401  0.65137814 0.39720258 0.78873014]
 [0.9331401  0.65137814 0.39720258 0.78873014]
 [0.9331401  0.65137814 0.39720258 0.78873014]
 [0.9331401  0.65137814 0.39720258 0.78873014]
 [0.9331401  0.65137814 0.39720258 0.78873014]]


In [303]:
onehot_tokens[0,0,:]

array([0., 0., 1., 0., 0., 0., 0.])

In [307]:
embedding_layer[2,:]

array([0.95813935, 0.87593263, 0.35781727, 0.50099513])

In [308]:
token_embeddings[0,0,:]

array([0.95813935, 0.87593263, 0.35781727, 0.50099513])

In [311]:
token_embeddings = torch.tensor(token_embeddings)

In [310]:
torch.manual_seed(1234)
rnn = nn.RNN(5,10)
rnn.eval()

RNN(5, 10)

In [312]:
token_embeddings.transpose(1,0,2)

TypeError: transpose() received an invalid combination of arguments - got (int, int, int), but expected one of:
 * (int dim0, int dim1)
 * (name dim0, name dim1)


In [122]:
(test(input_, h0))[0]

tensor([[[-0.2495,  0.1411,  0.3408, -0.1384, -0.1114, -0.1310, -0.4749,
           0.2967,  0.0385,  0.7428],
         [-0.1933,  0.6212,  0.6955,  0.3132, -0.6334,  0.1274,  0.2464,
           0.0062,  0.6489, -0.0681]],

        [[ 0.4559,  0.3059,  0.0520, -0.0938, -0.5694, -0.5282, -0.3077,
           0.0982,  0.5768,  0.1572],
         [ 0.3727,  0.4563,  0.1463, -0.2034, -0.5932, -0.2727, -0.5871,
           0.5612,  0.5828,  0.4849]],

        [[ 0.4340,  0.3374,  0.4522, -0.1372, -0.6896, -0.4655, -0.2838,
           0.1896,  0.4805, -0.2464],
         [-0.3124, -0.3364, -0.5678,  0.3728,  0.3359, -0.9287, -0.3578,
          -0.7565, -0.1925,  0.5648]]], grad_fn=<StackBackward>)

In [123]:
w_1=list(test.parameters())[0]
w_2=list(test.parameters())[1]
b_1=list(test.parameters())[2]
b_2=list(test.parameters())[3]

In [124]:
b_1 + b_2

tensor([ 0.1491,  0.3882,  0.2524,  0.1594, -0.4827, -0.1760, -0.1404, -0.1496,
         0.4425,  0.2729], grad_fn=<AddBackward0>)

In [125]:
h0 = torch.zeros(1,2,10)
for i in range(3):
    h0=torch.tanh((input_[i]@w_1.t()+b_1)+(h0@w_2.t()+b_2))
h0

tensor([[[ 0.4340,  0.3374,  0.4522, -0.1372, -0.6896, -0.4655, -0.2838,
           0.1896,  0.4805, -0.2464],
         [-0.3124, -0.3364, -0.5678,  0.3728,  0.3359, -0.9287, -0.3578,
          -0.7565, -0.1925,  0.5648]]], grad_fn=<TanhBackward>)

In [117]:
h0 = torch.zeros(1,2,10)
torch.tanh((input_[i]@w_1.t()+b_1)+(h0@w_2.t()+b_2))

tensor([[[ 0.1480,  0.3698,  0.2472,  0.1580, -0.4484, -0.1742, -0.1395,
          -0.1485,  0.4157,  0.2664],
         [ 0.1480,  0.3698,  0.2472,  0.1580, -0.4484, -0.1742, -0.1395,
          -0.1485,  0.4157,  0.2664]]], grad_fn=<TanhBackward>)