# SUMMARY
This notebook preprocesses the data and exports a tokenizer used by the deep learning models. The code is heavily based on [this Kaggle notebook](https://www.kaggle.com/yasufuminakama/inchi-preprocess-2).

In [1]:
##### PACKAGES

import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import torch

In [None]:
##### MODULES

# `sys` is not among the packages imported earlier in this notebook, so it is
# imported here; without it `sys.path.append` raises NameError on a fresh kernel.
import sys

# Put the project's ../codes directory on the module search path so the local
# helper modules below can be imported.
sys.path.append('../codes')
from tokenizer import Tokenizer
from preprocessing import split_form, split_form2

In [2]:
##### DATA IMPORT

# Read the training labels table and report its (rows, columns) shape.
train = pd.read_csv(filepath_or_buffer='../input/train_labels.csv')
print(f'train.shape: {train.shape}')

train.shape: (2424186, 2)


In [5]:
##### PREPARE INCHI FORMULAS

# Second '/'-separated field of each InChI string (the part right after the
# 'InChI=1S' prefix).
train['InChI_1'] = train['InChI'].progress_apply(lambda inchi: inchi.split('/')[1])

# Space-separated text = split_form(second field) + ' ' + split_form2(rest),
# where "rest" is every field from the third onwards, re-joined with '/'.
# `.values` makes the final addition positional rather than index-aligned.
train['InChI_text'] = (
    train['InChI_1'].progress_apply(split_form)
    + ' '
    + train['InChI']
        .apply(lambda inchi: '/'.join(inchi.split('/')[2:]))
        .progress_apply(split_form2)
        .values
)

  0%|          | 0/2424186 [00:00<?, ?it/s]

  0%|          | 0/2424186 [00:00<?, ?it/s]

  0%|          | 0/2424186 [00:00<?, ?it/s]

In [6]:
##### FIT TOKENIZER

# Fit the project tokenizer on every preprocessed InChI text, then serialise
# the fitted object to disk with torch.save.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['InChI_text'].values)
torch.save(obj=tokenizer, f='../input/tokenizer2.pth')
print('Saved tokenizer')

Saved tokenizer


In [10]:
##### DISPLAY DICTIONARY

# Bare last expression so the notebook renders the tokenizer's `stoi`
# mapping (token string -> integer index, as the displayed output shows).
tokenizer.stoi

{'(': 0,
 ')': 1,
 '+': 2,
 ',': 3,
 '-': 4,
 '/b': 5,
 '/c': 6,
 '/h': 7,
 '/i': 8,
 '/m': 9,
 '/s': 10,
 '/t': 11,
 '0': 12,
 '1': 13,
 '10': 14,
 '100': 15,
 '101': 16,
 '102': 17,
 '103': 18,
 '104': 19,
 '105': 20,
 '106': 21,
 '107': 22,
 '108': 23,
 '109': 24,
 '11': 25,
 '110': 26,
 '111': 27,
 '112': 28,
 '113': 29,
 '114': 30,
 '115': 31,
 '116': 32,
 '117': 33,
 '118': 34,
 '119': 35,
 '12': 36,
 '120': 37,
 '121': 38,
 '122': 39,
 '123': 40,
 '124': 41,
 '125': 42,
 '126': 43,
 '127': 44,
 '128': 45,
 '129': 46,
 '13': 47,
 '130': 48,
 '131': 49,
 '132': 50,
 '133': 51,
 '134': 52,
 '135': 53,
 '136': 54,
 '137': 55,
 '138': 56,
 '139': 57,
 '14': 58,
 '140': 59,
 '141': 60,
 '142': 61,
 '143': 62,
 '144': 63,
 '145': 64,
 '146': 65,
 '147': 66,
 '148': 67,
 '149': 68,
 '15': 69,
 '150': 70,
 '151': 71,
 '152': 72,
 '153': 73,
 '154': 74,
 '155': 75,
 '156': 76,
 '157': 77,
 '158': 78,
 '159': 79,
 '16': 80,
 '161': 81,
 '163': 82,
 '165': 83,
 '167': 84,
 '17': 85,
 '18': 

In [11]:
##### CONVERT TO TOKENS

# Tokenised length of every InChI text. The `- 2` presumably discounts the
# start/end marker tokens added by `text_to_sequence` — TODO confirm against
# the tokenizer module.
lengths = [
    len(tokenizer.text_to_sequence(text)) - 2
    for text in tqdm(train['InChI_text'].values, total = len(train))
]
train['InChI_length'] = lengths

# Persist the preprocessed frame; the log message names the file actually
# written (it previously said train.pkl while writing train2.pkl).
train.to_pickle('../input/train2.pkl')
print('Saved preprocessed train2.pkl')

  0%|          | 0/2424186 [00:00<?, ?it/s]

Saved preprocessed train.pkl


In [18]:
##### CHECK SAMPLE MOLECULES

# Print the raw InChI and its tokenised text for the first five rows, each
# pair followed by a 50-dash rule.
for idx in range(5):
    print(
        train['InChI'].values[idx],
        train['InChI_text'].values[idx],
        '-' * 50,
        sep='\n',
    )

InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12(13)11(4)14/h5-7,9,11,14H,8H2,1-4H3
C 13 H 20 O S /c 1 - 9 ( 2 ) 8 - 15 - 13 - 6 - 5 - 10 ( 3 ) 7 - 12 ( 13 ) 11 ( 4 ) 14 /h 5 - 7 , 9 , 11 , 14 H , 8 H 2 , 1 - 4 H 3
--------------------------------------------------
InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(10-14)11-17(23)19-15-4-5-18(24)21(15,3)9-7-16(19)20/h13-16,19H,4-11H2,1-3H3/t13-,14+,15+,16-,19-,20+,21+/m1/s1
C 21 H 30 O 4 /c 1 - 12 ( 22 ) 25 - 14 - 6 - 8 - 20 ( 2 ) 13 ( 10 - 14 ) 11 - 17 ( 23 ) 19 - 15 - 4 - 5 - 18 ( 24 ) 21 ( 15 , 3 ) 9 - 7 - 16 ( 19 ) 20 /h 13 - 16 , 19 H , 4 - 11 H 2 , 1 - 3 H 3 /t 13 - , 14 + , 15 + , 16 - , 19 - , 20 + , 21 + /m 1 /s 1
--------------------------------------------------
InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-12-10-20(28)30)27-11-9-16-21(23(25)31)26-29(22(16)24(27)32)18-5-3-4-6-19(18)33-2/h3-8,13H,9-12H2,1-2H3,(H2,25,31)
C 24 H 23 N 5 O 4 /c 1 - 14 - 13 - 15 ( 7 - 8 - 17 ( 14 ) 28 - 12 - 10 - 20 ( 28 ) 30 ) 27 - 11 - 9 - 16 - 21 ( 23 (