In [1]:
import pandas as pd
import numpy as np

In [2]:
"""
This file is to encode SMILES into one-hot encodings
"""

import numpy as np

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader


class SMILESWithPropertiesDataset(Dataset):
    def __init__(self, smiles_encodings, properties):
        self.smiles_encodings = smiles_encodings
        self.properties = properties

    def __len__(self):
        return len(self.smiles_encodings)

    def __getitem__(self, idx):
        return self.smiles_encodings[idx], self.properties[idx]
    

def smile_to_hot(smile, largest_smile_len, alphabet):
    """Go from a single smile string to a one-hot encoding.
    """

    char_to_int = dict((c, i) for i, c in enumerate(alphabet))

    # pad with ' '
    smile += ' ' * (largest_smile_len - len(smile))

    # integer encode input smile
    integer_encoded = [char_to_int[char] for char in smile]

    # one hot-encode input smile
    onehot_encoded = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(alphabet))]
        letter[value] = 1
        onehot_encoded.append(letter)
    return integer_encoded, np.array(onehot_encoded)


def multiple_smile_to_hot(smiles_list, largest_molecule_len, alphabet):
    """Convert a list of smile strings to a one-hot encoding

    Returned shape (num_smiles x len_of_largest_smile x len_smile_encoding)
    """

    hot_list = []
    for s in tqdm(smiles_list, desc="Encoding SMILES"):  
        _, onehot_encoded = smile_to_hot(s, largest_molecule_len, alphabet)
        hot_list.append(onehot_encoded)
    return np.array(hot_list)

In [3]:
def get_smiles_encodings_for_dataset(file_path):
    """
    Returns encoding, alphabet and length of largest molecule in SMILES, 
    given a file containing SMILES molecules.

    input:
        csv file with molecules. Column's name must be 'smiles'.
    output:
        - smiles encoding (equivalent to file content)
        - smiles alphabet (character based)
        - longest smiles string
    """

    df = pd.read_csv(file_path)

    smiles_list = np.asanyarray(df.smiles)

    smiles_alphabet = list(set(''.join(smiles_list)))
    smiles_alphabet.append(' ')  # for padding

    largest_smiles_len = len(max(smiles_list, key=len))

    return smiles_list, smiles_alphabet, largest_smiles_len

In [4]:
file_path = './dataJ_250k_rndm_zinc_drugs_clean.txt'

encoding_list, encoding_alphabet, largest_molecule_len = get_smiles_encodings_for_dataset(file_path)

In [5]:
data = multiple_smile_to_hot(encoding_list, largest_molecule_len, encoding_alphabet)

Encoding SMILES: 100%|██████████| 249456/249456 [01:11<00:00, 3489.72it/s]


In [6]:
data.shape

(249456, 120, 35)

In [10]:
data[:10,:,:][0]

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])