In [3]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence, text

import os
import glob
import json
import rdkit
from tqdm import tqdm
import logging

In [18]:
# Folder path
DATASET_DIR = './datasets/zinc-preprocess/'
MAPPING_DIR = './mapping/'

# SMILES path
SMILES_FILENAME = '250k_rndm_zinc_drugs_clean' # do not include '.smi' extension

# MAPPING TABLE PATH
MAPPING_TABLE_DIR = MAPPING_DIR + SMILES_FILENAME
WORD_IDX_FILENAME = 'word_to_idx.json'
IDX_WORD_FILENAME = 'idx_to_word.json'

In [19]:
# Create mapping folder
if not os.path.isdir(MAPPING_TABLE_DIR):
    try:
        os.makedirs(MAPPING_TABLE_DIR)
    except OSError:
        logging.error("Creation of the directory failed")
    else:
        print("Successfully created the directory")
else:
    logging.warning("Directory alread exists")



In [4]:
# Tokenization
# note: when char_tokenizerl=True, then filters not working! so we need split \n
tokenizer = text.Tokenizer(char_level=True, lower=False)

# Load data
with open(DATASET_DIR + SMILES_FILENAME + '.smi', 'r') as f:
    for smiles in tqdm(f.readlines()):
        smiles = smiles.split('\n')[0]
        tokenizer.fit_on_texts(smiles)
        
    word_index = tokenizer.word_index
    print('Total number of SMILES: {}'.format(len(word_index)))

100%|██████████| 319616985/319616985 [5:27:12<00:00, 16279.95it/s]  

Total number of SMILES: 39





In [6]:
# Add prefix
SMILES_DIM = len(word_index)
word_index['<START>'] = SMILES_DIM + 1 
word_index['<PAD>'] = SMILES_DIM + 2
word_index['<EOL>'] = SMILES_DIM + 3 

word_index

{'C': 1,
 '@': 2,
 '[': 3,
 ']': 4,
 'H': 5,
 '1': 6,
 '2': 7,
 'O': 8,
 '(': 9,
 ')': 10,
 'N': 11,
 '=': 12,
 '3': 13,
 'l': 14,
 'S': 15,
 '#': 16,
 'B': 17,
 'r': 18,
 'F': 19,
 '4': 20,
 '/': 21,
 'c': 22,
 '+': 23,
 '\\': 24,
 'P': 25,
 '-': 26,
 'I': 27,
 'n': 28,
 'o': 29,
 '<START>': 30,
 '<PAD>': 31,
 '<EOL>': 32}

In [8]:
# index to word
index_word = {index:word for word, index in word_index.items()}
index_word

{1: 'C',
 2: '@',
 3: '[',
 4: ']',
 5: 'H',
 6: '1',
 7: '2',
 8: 'O',
 9: '(',
 10: ')',
 11: 'N',
 12: '=',
 13: '3',
 14: 'l',
 15: 'S',
 16: '#',
 17: 'B',
 18: 'r',
 19: 'F',
 20: '4',
 21: '/',
 22: 'c',
 23: '+',
 24: '\\',
 25: 'P',
 26: '-',
 27: 'I',
 28: 'n',
 29: 'o',
 30: '<START>',
 31: '<PAD>',
 32: '<EOL>'}

In [9]:
# Save
## using tokenizer class method
tokenizer_json = tokenizer.to_json()
with open(MAPPING_TABLE_DIR + '/tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

## old school|
with open(MAPPING_TABLE_DIR + '/' + WORD_IDX_FILENAME, 'w') as wi, \
    open(MAPPING_TABLE_DIR + '/' + IDX_WORD_FILENAME, 'w') as iw:
        wi.write(json.dumps(word_index))
        iw.write(json.dumps(index_word))