In [2]:
import pandas as pd
import torch
import pytorch_lightning as pl
import numpy as np

from torch.utils.data import DataLoader, Dataset
from tqdm import trange

from rxitect.models.pchembl_val_predictor import PChEMBLValueRegressor, get_tokens, identity

In [3]:
df_train = pd.read_csv("../data/processed/ligand_CHEMBL240_train_splityear=2015.csv")
df_test = pd.read_csv("../data/processed/ligand_CHEMBL240_test_splityear=2015.csv")

# df_full = pd.concat([df_train, df_test])
smiles = df_train.smiles
labels = df_train.pchembl_value.astype('float32')

In [4]:
tokens, _, _ = get_tokens(smiles)
tokens = ''.join(tokens) + ' '

In [5]:
reg = PChEMBLValueRegressor(tokens, 0.005)

In [6]:
reg

PChEMBLValueRegressor(
  (criterion): MSELoss()
  (embedding): Embedding(
    (embedding): Embedding(42, 128, padding_idx=41)
  )
  (encoder): LSTMEncoder(
    (rnn): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.8)
  )
  (mlp): MLP(
    (input_layer): Linear(in_features=128, out_features=128, bias=True)
    (out_layer): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [7]:
def seq2tensor(seqs, tokens, flip=True):
    tensor = np.zeros((len(seqs), len(seqs[0])))
    for i in trange(len(seqs), desc="Transforming sequences to tensors"):
        for j in range(len(seqs[i])):
            if seqs[i][j] in tokens:
                tensor[i, j] = tokens.index(seqs[i][j])
            else:
                tokens = tokens + seqs[i][j]
                tensor[i, j] = tokens.index(seqs[i][j])
    if flip:
        tensor = np.flip(tensor, axis=1).copy()
    return tensor, tokens


def pad_sequences(seqs, max_length=None, pad_symbol=' '):
    if max_length is None:
        max_length = -1
        for seq in seqs:
            max_length = max(max_length, len(seq))
    lengths = []
    for i in trange(len(seqs), desc="Padding sequences"):
        cur_len = len(seqs[i])
        lengths.append(cur_len)
        seqs[i] = seqs[i] + pad_symbol * (max_length - cur_len)
    return seqs, lengths


def process_smiles(smiles,
                   sanitized=True,
                   target=None,
                   augment=False,
                   pad=True,
                   tokenize=True,
                   tokens=None,
                   flip=False,
                   allowed_tokens=None):
    if not sanitized:
        # clean_smiles, clean_idx = sanitize_smiles(smiles, allowed_tokens=allowed_tokens)
        # clean_smiles = [clean_smiles[i] for i in clean_idx]
        # if target is not None:
        #     target = target[clean_idx]
        pass
    else:
        clean_smiles = smiles

    length = None
    if augment and target is not None:
        # clean_smiles, target = augment_smiles(clean_smiles, target)
        pass
    if pad:
        clean_smiles, length = pad_sequences(clean_smiles)
    tokens, token2idx, num_tokens = get_tokens(clean_smiles, tokens)
    if tokenize:
        clean_smiles, tokens = seq2tensor(clean_smiles, tokens, flip)

    return clean_smiles, target, length, tokens, token2idx, num_tokens

In [14]:
smi = process_smiles(smiles[:100], sanitized=True, target=labels, augment=False, pad=True,
            tokenize=True, tokens=tokens, flip=False)

Padding sequences: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 114880.96it/s]
Transforming sequences to tensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 706.84it/s]


In [19]:
smi[0].shape

(100, 407)

In [7]:
class SmilesDataset(Dataset):
    def __init__(self, smiles, labels, tokens=None, tokenize=True, sanitized=True, return_smiles=False):
        super(SmilesDataset, self).__init__()
        self.tokenize = tokenize
        self.return_smiles = return_smiles
        self.data, self.target, self.length, self.tokens, self.token2idx, self.num_tokens = process_smiles(
            smiles, sanitized=True, target=labels, augment=False, pad=True,
            tokenize=tokenize, tokens=tokens, flip=True)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        sample = {}
        if self.return_smiles:
            sample['object'] = np.array([ord(self.tokens[int(i)]) for i in self.data[index]])
        sample['tokenized_smiles'] = self.data[index]
        sample['length'] = self.length[index]
        if self.target is not None:
            sample['labels'] = self.target[index]
        return sample

In [8]:
train_data = SmilesDataset(smiles, labels)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seqs[i] = seqs[i] + pad_symbol * (max_length - cur_len)
Padding sequences: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 7223/7223 [02:26<00:00, 49.20it/s]
Transforming sequences to tensors: 100%|█████████████████████████████████████████████████████████████████████████████████████| 7223/7223 [00:09<00:00, 726.18it/s]


In [9]:
train_loader = DataLoader(train_data,
                         batch_size=128,
                         shuffle=True,
                         num_workers=4,
                         pin_memory=True,
                         sampler=None)

In [10]:
trainer = pl.Trainer(max_epochs=26, accelerator='gpu', devices=1, log_every_n_steps=2)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
trainer.fit(reg, train_dataloaders=train_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params
------------------------------------------
0 | criterion | MSELoss     | 0     
1 | embedding | Embedding   | 5.4 K 
2 | encoder   | LSTMEncoder | 264 K 
3 | mlp       | MLP         | 16.6 K
------------------------------------------
286 K     Trainable params
0         Non-trainable params
286 K     Total params
1.145     Total estimated model params size (MB)


Epoch 25: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 57/57 [00:03<00:00, 15.53it/s, loss=1.06, v_num=3]


In [13]:
reg.eval()

PChEMBLValueRegressor(
  (criterion): MSELoss()
  (embedding): Embedding(
    (embedding): Embedding(42, 128, padding_idx=41)
  )
  (encoder): LSTMEncoder(
    (rnn): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.8)
  )
  (mlp): MLP(
    (input_layer): Linear(in_features=128, out_features=128, bias=True)
    (out_layer): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [38]:
inp = torch.tensor(train_data[0]['tokenized_smiles']), torch.tensor(train_data[0]['length'])

In [48]:
inp = np.array([train_data[0]['tokenized_smiles'], train_data[0]['length']], dtype="float32")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [60]:
np.array(train_data[0]['length']).reshape(-1)

array([46])

In [46]:
torch.from_numpy(inp)

TypeError: expected np.ndarray (got list)

In [83]:
a = torch.from_numpy(train_data[7:8]['tokenized_smiles'].astype('float32'))
b = torch.tensor(train_data[7:8]['length'])

In [84]:
reg([a, b])

tensor([[5.4522]], grad_fn=<AddmmBackward0>)

In [73]:
train_data[:3]['labels']

0    6.040
1    3.990
2    6.465
Name: pchembl_value, dtype: float32

In [80]:
train_data[:]['labels'].mean()

5.3020287

In [85]:
train_data[7:8]

{'tokenized_smiles': array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,