In [1]:
import torch
import numpy as np
from torch.utils.data import DataLoader

In [16]:
import torch
import re


# Two-letter element symbols that the default tokenizer regex treats as a
# single token (they are tried before the single-character fallback).
# Symbols that collide with aromatic/organic one-letter atoms are not listed
# here (e.g. 'Sn' is absent, so "Sn" is split into 'S' and 'n'); pass a custom
# `atoms` list to `smiles_tokenizer` to change that.
_atoms = ['He', 'Li', 'Be', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'Cl', 'Ar',
          'Ca', 'Ti', 'Cr', 'Fe', 'Ni', 'Cu', 'Ga', 'Ge', 'As', 'Se',
          'Br', 'Kr', 'Rb', 'Sr', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh',
          'Pd', 'Ag', 'Cd', 'Sb', 'Te', 'Xe', 'Ba', 'La', 'Ce', 'Pr',
          'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Er', 'Tm', 'Yb',
          'Lu', 'Hf', 'Ta', 'Re', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb',
          'Bi', 'At', 'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'Pu', 'Am', 'Cm',
          'Bk', 'Cf', 'Es', 'Fm', 'Md', 'Lr', 'Rf', 'Db', 'Sg', 'Mt',
          'Ds', 'Rg', 'Fl', 'Mc', 'Lv', 'Ts', 'Og']


def get_tokenizer_re(atoms):
    """
    Builds the compiled regular expression used to split a SMILES string.

    The pattern is one capturing group that matches, in order of preference:
    any of the given multi-letter atoms, a two-digit ring-closure label
    (``%`` followed by two digits), or any single character.

    Args:
        atoms (iterable of str) : atom symbols to match as whole tokens; they
                                  are inserted into the pattern verbatim

    Returns:
        re.Pattern : compiled tokenizer pattern
    """
    atom_alternatives = '|'.join(atoms)
    return re.compile(rf'({atom_alternatives}|\%\d\d|.)')


# Default tokenizer pattern, compiled once at import time from `_atoms` and
# reused by `smiles_tokenizer` whenever no custom atom list is supplied.
_atoms_re = get_tokenizer_re(_atoms)


# Index -> token vocabulary.
#   0 is a reserved/unused slot,
#   1 ('>') is the start marker that `encode` prepends to every sequence,
#   2 ('<') is the end/padding marker that `encode` pads with and at which
#     `decode` stops.
# Indices 3..27 are the actual SMILES tokens.
__i2t = {
    0: 'unused', 1: '>', 2: '<', 3: '2', 4: 'F', 5: 'Cl', 6: 'N',
    7: '[', 8: '6', 9: 'O', 10: 'c', 11: ']', 12: '#',
    13: '=', 14: '3', 15: ')', 16: '4', 17: '-', 18: 'n',
    19: 'o', 20: '5', 21: 'H', 22: '(', 23: 'C',
    24: '1', 25: 'S', 26: 's', 27: 'Br'
}


# Token -> index vocabulary: the inverse of `__i2t`, minus the reserved
# index 0 ('unused'), which must never be produced by the encoder.
__t2i = {token: index for index, token in __i2t.items() if index != 0}


def smiles_tokenizer(line, atoms=None):
    """
    Splits a SMILES string into atom-level tokens with a regular expression.

    This is fast but purely lexical, so it can mis-read ambiguous character
    pairs: "Sn" could be Tin, or Sulfur followed by an aromatic Nitrogen.
    To control how such pairs are read, pass the set of two-letter atoms
    explicitly.

    Parameters:
         line: SMILES string to tokenize
         atoms: set of two-letter atoms for tokenization; when omitted the
                module-level default pattern is used

    Returns:
         list of token strings, in the order they appear in `line`
    """
    pattern = _atoms_re if atoms is None else get_tokenizer_re(atoms)
    # The pattern is a single capturing group, so `split` yields the matched
    # tokens at the odd positions, interleaved with the text between matches.
    pieces = pattern.split(line)
    return pieces[1::2]


def encode(sm_list, pad_size=50, device=None):
    """
    Encodes list of smiles to tensor of tokens

    Every sequence starts with the start marker (index 1), is truncated to at
    most ``pad_size - 1`` entries (so at least one end marker always fits) and
    is right-padded with the end marker (index 2) up to ``pad_size``.

    Args:
         sm_list (list) : List of SMILEs strings read from dataset
         pad_size (int) : An integer denoting the pad size; must be >= 1
         device (str or torch.device, optional) : Where to allocate the result.
                          Defaults to 'cuda' when a GPU is available and to
                          'cpu' otherwise.

    Returns:
        tokens (Tensor) : Integer tensor containing indices of tokenized SMILE strings
                          of size [len(sm_list), pad_size]
        smiles_lens (Tensor) : Integer tensor containing the lengths of tokenized SMILE strings i.e.
                        the actual lengths before padding (the leading start
                        marker is counted). Size is [len(sm_list)]

    Raises:
        ValueError : if ``pad_size`` is smaller than 1
        KeyError : if a SMILES string contains a token outside the vocabulary
    """
    if pad_size < 1:
        raise ValueError(f"pad_size must be >= 1, got {pad_size}")
    if device is None:
        # Keep the GPU-first behaviour, but do not crash on CPU-only hosts.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    res = []
    lens = []
    for s in sm_list:
        try:
            token_ids = [__t2i[tok] for tok in smiles_tokenizer(s)]
        except KeyError as err:
            # Same exception type as before, but say which input caused it.
            raise KeyError(
                f"token {err.args[0]!r} in SMILES {s!r} is not in the "
                f"vocabulary") from err
        tokens = ([1] + token_ids)[:pad_size - 1]
        lens.append(len(tokens))
        tokens += (pad_size - len(tokens)) * [2]
        res.append(tokens)

    # The explicit reshape keeps the documented [N, pad_size] shape when
    # sm_list is empty (torch.tensor([]) alone would have shape [0]).
    tokens = torch.tensor(res, dtype=torch.int,
                          device=device).reshape(len(res), pad_size)
    smiles_lens = torch.tensor(lens, dtype=torch.int, device=device)
    return tokens, smiles_lens


def decode(tokens_tensor):
    """
    Decodes from tensor of tokens to list of smiles

    Each row is read left to right: the end/padding marker (index 2) stops
    the row, indices 0 and 1 are skipped, and every other index is replaced
    by its vocabulary token.

    Args:
        tokens_tensor (Tensor) : token indices, one sequence per row

    Returns:
        list of str : one decoded SMILES string per row
    """
    decoded = []
    num_rows = tokens_tensor.shape[0]

    for row_idx in range(num_rows):
        row = tokens_tensor[row_idx].detach().cpu().numpy()
        pieces = []
        for token_id in row:
            if token_id == 2:
                # End marker: everything after it is padding.
                break
            if token_id > 2:
                pieces.append(__i2t[token_id])
        decoded.append(''.join(pieces))

    return decoded


def get_vocab_size():
    """
    Returns the number of entries in the index-to-token vocabulary,
    including the reserved index 0 and the start/end markers.
    """
    vocab_size = len(__i2t)
    return vocab_size


In [20]:
import torch
from torch.utils.data import TensorDataset
from gentrl import tokenizer
import pandas as pd
import numpy as np


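# Plan: load all SMILES strings and property values once, then expose them as a single pre-tokenized TensorDataset whose tensors already live on the GPU.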
class NewMolecularDataset:
    """
    Loads SMILES strings and their properties from CSV sources and turns one
    source into a ``TensorDataset`` whose tensors live on the target device.

    Args:
        sources: sequence of dicts, one per CSV file, with the keys
            'path' (CSV file), 'smiles' (name of the SMILES column),
            'prob' (relative sampling weight of the source) and, for each
            available property, the property name mapped to either a column
            name (str) or something accepted by ``Series.map`` that is
            applied to the SMILES column.
        props: names of the properties to collect; a property that a source
            does not describe is flagged as missing for that source.
        with_missings: when true, the missing-flags are appended to the
            property columns of the labels.
        device: where property/missing tensors are allocated. Defaults to
            'cuda' when a GPU is available and to 'cpu' otherwise.
    """

    def __init__(self, sources=(), props=('logIC50', 'BFL', 'pipeline'),
                 with_missings=False, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        self.num_sources = len(sources)
        self.source_smiles = []
        self.source_props = []
        self.source_missings = []
        self.source_probs = []

        self.with_missings = with_missings

        self.len = 0
        num_props = len(props)
        for source_descr in sources:
            cur_df = pd.read_csv(source_descr['path'])
            cur_smiles = list(cur_df[source_descr['smiles']].values)
            num_smiles = len(cur_smiles)

            # Labels (float32) and per-property "missing" flags, allocated
            # directly on the target device.
            cur_props = torch.zeros(num_smiles, num_props, device=device)
            cur_missings = torch.zeros(num_smiles, num_props,
                                       dtype=torch.int64, device=device)

            for i, prop in enumerate(props):
                if prop not in source_descr:
                    cur_missings[:, i] = 1
                    continue

                prop_source = source_descr[prop]
                if isinstance(prop_source, str):
                    # The property is stored in a CSV column.
                    values = cur_df[prop_source].values
                else:
                    # The property is computed from each SMILES string.
                    values = cur_df[source_descr['smiles']].map(
                        prop_source).values
                # Copy into a float32 array first; the slice assignment then
                # moves the data onto the device of `cur_props`.
                cur_props[:, i] = torch.from_numpy(
                    np.array(values, dtype=np.float32))

            self.source_smiles.append(cur_smiles)
            self.source_props.append(cur_props)
            self.source_missings.append(cur_missings)
            self.source_probs.append(source_descr['prob'])

            self.len = max(self.len, int(num_smiles / source_descr['prob']))

        # `np.float` was removed in NumPy 1.24; the builtin float is the
        # same float64 dtype.
        self.source_probs = np.array(self.source_probs, dtype=float)

        self.source_probs /= self.source_probs.sum()

    def create_gpu_dataset(self):
        """
        Picks one source at random (weighted by the normalised 'prob' values)
        and returns all of its molecules as a ``TensorDataset``.

        Returns:
            TensorDataset of (tokens, smile_lens, y): ``tokens`` and
            ``smile_lens`` come from ``encode``; ``y`` holds the property
            values, with the missing-flags appended as extra columns when
            ``with_missings`` is set.

        Raises:
            ValueError: if the dataset was built without any source.
        """
        if self.num_sources == 0:
            raise ValueError("no sources were provided, cannot build a dataset")

        trial = np.random.random()

        # Walk the cumulative probabilities. Start from the last source so
        # that rounding in the running sum can never leave nothing selected.
        chosen = self.num_sources - 1
        upper = 0
        for i in range(self.num_sources):
            upper += self.source_probs[i]
            if trial <= upper:
                chosen = i
                break

        sm_list = self.source_smiles[chosen]
        props = self.source_props[chosen]
        y = props
        if self.with_missings:
            miss = self.source_missings[chosen]
            # Append the flags as extra columns (dim=1) so that y keeps one
            # row per molecule; stacking along dim 0 would double the rows
            # and no longer line up with the tokens.
            y = torch.cat([props, miss.to(props.dtype)], dim=1)

        # Tokenize the SMILES strings and convert the tokens to indices.
        tokens, smile_lens = encode(sm_list)

        # Keep all three tensors on the same device as the labels.
        tokens = tokens.to(y.device)
        smile_lens = smile_lens.to(y.device)

        print("tokens device", tokens.device)
        print("smile_lens device", smile_lens.device)
        print("y device", y.device)

        dataset = TensorDataset(tokens, smile_lens, y)

        return dataset

In [21]:
# Single CSV source: SMILES strings come from the 'SMILES' column and the
# only property, 'plogP', is read from the column of the same name.
# 'prob' is the relative sampling weight of the source (normalised to 1.0
# since there is only one).
nmd = NewMolecularDataset(sources=[
    {'path':'train_subset_100_000.csv',
     'smiles': 'SMILES',
     'prob': 1,
     'plogP' : 'plogP',
    }], 
    props=['plogP'])

In [25]:
# Tokenize every SMILES string once and bundle tokens, lengths and labels
# into a TensorDataset; the method prints the device of each tensor.
gpu_dataset = nmd.create_gpu_dataset()

tokens device cuda:0
smile_lens device cuda:0
y device cuda:0


In [26]:
# Training / loading configuration.
# LR and NUM_EPOCHS are not used by any cell visible in this part of the
# notebook.
# NOTE(review): NUM_WORKERS = 0 and PIN_MEMORY = False look deliberate,
# because the dataset tensors already live on the GPU (see the device
# printout above), so neither worker processes nor pinned host memory should
# be needed — confirm before changing.
BATCH_SIZE = 50
LR = 1e-4
NUM_EPOCHS = 1
NUM_WORKERS = 0
PIN_MEMORY= False

In [27]:
# Shuffled mini-batches over the pre-built dataset; drop_last=True discards
# a final incomplete batch so every batch has exactly BATCH_SIZE rows.
train_loader = DataLoader(gpu_dataset, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=NUM_WORKERS,
                          pin_memory=PIN_MEMORY, drop_last=True)

In [28]:
# Sanity check: fetch only the first batch and show value, shape and device
# of the token indices (x), the unpadded lengths (x_lens) and the labels (y).
for x, x_lens, y in train_loader:
    print(f"x is {x}")
    print(f"x shape is {x.shape}")
    print(f"x device is {x.device}")
    print()
    print(f"x_lens is {x_lens}")
    print(f"x_lens shape is {x_lens.shape}")
    print(f"x_lens device is {x_lens.device}")
    print()
    print(f"y is {y}")
    print(f"y shape is {y.shape}")
    print(f"y device is {y.device}")
    # One batch is enough for the check.
    break

x is tensor([[ 1, 23, 23,  ...,  2,  2,  2],
        [ 1, 23, 23,  ...,  2,  2,  2],
        [ 1, 23, 23,  ...,  2,  2,  2],
        ...,
        [ 1,  6, 12,  ...,  2,  2,  2],
        [ 1,  9, 13,  ...,  2,  2,  2],
        [ 1,  9, 13,  ...,  2,  2,  2]], device='cuda:0', dtype=torch.int32)
x shape is torch.Size([50, 50])
x device is cuda:0

x_lens is tensor([43, 39, 39, 43, 39, 34, 35, 32, 39, 30, 40, 37, 28, 41, 31, 34, 44, 36,
        43, 41, 38, 42, 39, 41, 37, 39, 39, 36, 41, 37, 42, 39, 28, 42, 40, 42,
        44, 40, 43, 43, 33, 41, 39, 44, 35, 33, 31, 43, 35, 35],
       device='cuda:0', dtype=torch.int32)
x_lens shape is torch.Size([50])
x_lens device is cuda:0

y is tensor([[ 0.7073],
        [ 1.5643],
        [ 0.4647],
        [ 0.2846],
        [ 1.3440],
        [ 1.2395],
        [-0.2849],
        [ 0.9831],
        [ 1.6079],
        [ 1.1284],
        [ 0.7045],
        [ 1.0118],
        [ 0.2000],
        [-2.8763],
        [ 1.3348],
        [ 0.3631],
        

In [35]:
class TrainStats():
    """
    Accumulates named training statistics and prints their running means.

    ``stats`` maps each key to the list of values recorded for it since the
    last ``reset``.
    """

    def __init__(self):
        self.stats = dict()

    def update(self, delta):
        """
        Appends every value of ``delta`` to the series stored under its key,
        creating the series on first use.

        Args:
            delta (dict) : mapping from statistic name to its latest value
        """
        for key, value in delta.items():
            self.stats.setdefault(key, []).append(value)

    def reset(self):
        """Empties every recorded series while keeping the known keys."""
        for key in self.stats:
            self.stats[key] = []

    def print(self):
        """
        Prints ``key: mean;`` for every key on a single line.

        A key without recorded values (e.g. right after ``reset``) is
        reported as ``nan`` instead of raising ZeroDivisionError.
        """
        for key, values in self.stats.items():
            avg = sum(values) / len(values) if values else float('nan')
            print(str(key) + ": {:4.4};".format(avg), end='')
        print()

In [41]:
# Dummy values standing in for real loss terms, used to exercise TrainStats.
# The first three are 0-dim integer tensors holding 31; the last two are
# vectors of 31 uniform random numbers that are reduced with .mean() below.
# All of them are allocated on the GPU, so this cell needs CUDA.
elbo = torch.tensor(31, device='cuda')
rec_part = torch.tensor(31, device='cuda')
kldiv_part = torch.tensor(31, device='cuda')
log_p_y_by_z = torch.rand(31, device='cuda')
log_p_z_by_y = torch.rand(31, device='cuda')
cur_stats = {
            'loss': -elbo,
            'rec': rec_part,
            'kl': kldiv_part,
            'log_p_y_by_z': log_p_y_by_z.mean(),
            'log_p_z_by_y': log_p_z_by_y.mean()
        }

In [37]:
# Fresh accumulator for the statistics built in the previous cell.
# NOTE(review): this cell's execution count (37) is lower than that of the
# cell above it (41), i.e. the notebook was run out of order.
local_stats = TrainStats()

In [42]:
# Record one set of statistics.
# NOTE(review): the recorded output below holds two entries per key, so this
# cell was executed more than once with different `cur_stats`; a fresh
# "Restart & Run All" would yield a single entry per key.
local_stats.update(cur_stats)

In [43]:
# Show the raw per-key lists of recorded values.
local_stats.stats

{'loss': [tensor(-3, device='cuda:0'), tensor(-31, device='cuda:0')],
 'rec': [tensor(3, device='cuda:0'), tensor(31, device='cuda:0')],
 'kl': [tensor(3, device='cuda:0'), tensor(31, device='cuda:0')],
 'log_p_y_by_z': [tensor(0.5874, device='cuda:0'),
  tensor(0.4797, device='cuda:0')],
 'log_p_z_by_y': [tensor(0.2277, device='cuda:0'),
  tensor(0.5117, device='cuda:0')]}

In [44]:
# Print the mean of every recorded series on a single line.
local_stats.print()

loss: -17.0;rec: 17.0;kl: 17.0;log_p_y_by_z: 0.5335;log_p_z_by_y: 0.3697;
