
Created on  June 18th

@author: hanshanley

This file contains preprocessing methods for creating processed SMILES that 
are fit for machine learning. As such, vocabularies and one-hot encodings
of the SMILES are retrieved using these methods. In addition, this file 
contains methods for converting SMILES encodings into their respective DEEP 
SMILES and SELFIES equivalents. Note that these three different encodings have 
vastly different vocab sizes, so machine learning methods training on them 
may need to have different encodings so that they are able to model the vocabs. 

Information on SMILES: https://www.daylight.com/dayhtml/doc/theory/theory.smiles.html

Information on DEEP SMILES: https://github.com/baoilleach/deepsmiles

Information  on SELFIES: https://github.com/aspuru-guzik-group/selfies

In [None]:
# Install the SELFIES and DeepSMILES packages with pip.
!pip install selfies
!pip install deepsmiles
# Download the Miniconda installer, make it executable and install it
# non-interactively (-b), forcing (-f), into the /usr/local prefix (-p).
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit from the conda-forge channel (quiet, auto-confirm).
!time conda install -q -y -c conda-forge rdkit

Collecting selfies
  Downloading https://files.pythonhosted.org/packages/01/67/f8f04deb5bc03b0ff8c7c994802620e62257373c56169485c7465c82235e/selfies-0.2.4-py3-none-any.whl
Installing collected packages: selfies
Successfully installed selfies-0.2.4
Collecting deepsmiles
  Downloading https://files.pythonhosted.org/packages/c4/aa/c043624e7cdac49811725dfc139423b5092bbf7cccb5a346d63ea0f364c1/deepsmiles-1.0.1-py2.py3-none-any.whl
Installing collected packages: deepsmiles
Successfully installed deepsmiles-1.0.1
--2020-07-24 09:41:05--  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.continuum.io (repo.continuum.io)... 104.18.201.79, 104.18.200.79, 2606:4700::6812:c94f, ...
Connecting to repo.continuum.io (repo.continuum.io)|104.18.201.79|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh [following]
--2020-07-24 09:41:05--  https://repo.anaconda.co

In [None]:
import sys
import os
# Add the site-packages directory under /usr/local to the import path;
# presumably this is where the conda install step puts RDKit - confirm the
# Python version in the path matches the installed Miniconda.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
## RDKIT import 
import rdkit
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import RDConfig
from rdkit import rdBase
from rdkit.Chem.Draw import IPythonConsole

## Machine Learning Imports
import numpy as np
import tensorflow as tf
import os, re, time 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
from itertools import chain 
from sklearn.preprocessing import LabelEncoder


## Deep SMILES Imports 
import deepsmiles


## SELFIES imports
from selfies import encoder, decoder, selfies_alphabet  

Using TensorFlow backend.


In [None]:
## CONSTANTS
## MAX_LEN: strings with length >= MAX_LEN are dropped by smile_drop_inds;
## MAX_LEN + 2 is also the padded token length (room for <BOS> and <EOS>).
MAX_LEN = 100
## MAX_HEAVY_ATOMS: strings with more than this many CHEMICALS_HEAVY matches
## are dropped by smile_drop_inds.
MAX_HEAVY_ATOMS = 50

## CHEMICALS: alternation of element symbols; a single match anywhere in a
## string causes it to be dropped.
## CHEMICALS_HEAVY: same alternation, with optional second letters so that the
## single-letter symbols B, C, F, I, N, O, P, S (and Br, Cl, Si) also match;
## matches of this pattern are counted, not rejected.
## NOTE(review): both patterns are applied to the raw text, so letter pairs
## such as 'Cn' or 'Sc' match CHEMICALS even when they might be two separate
## atoms (e.g. in 'CSc1ccccc1') - confirm this filtering is intended.
CHEMICALS = 'A[cglmrstu]|B[aehik]|C[adefmnorsu]|D[bsy]|E[rsu]|F[elmr]|G[ade]|H[efgos]|I[nr]|Kr?|L[airuv]|M[dgnot]|N[abdeiop]|Os|P[abdmortu]|R[abefghnu]|S[bcegmnr]|T[abcehilm]|U(u[opst])?|V|W|Xe|Yb?|Z[nr]'
CHEMICALS_HEAVY = 'A[cglmrstu]|B[aehikr]?|C[adeflmnorsu]?|D[bsy]|E[rsu]|F[elmr]?|G[ade]|H[efgos]|I[nr]?|Kr?|L[airuv]|M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilm]|U(u[opst])?|V|W|Xe|Yb?|Z[nr]'

## Mapping used by replace_percentages: every integer 10..99 maps to its
## bracketed form, e.g. 12 -> '[12]'.
MAPPING = dict()
for i in range(10,100):
  MAPPING[i] = '['+str(i)+']'

In [None]:
## Private helper: locate every occurrence of an element in a sequence
def find(string, ch):
  """Return the indices (ascending) of the items of `string` equal to `ch`."""
  positions = []
  for position, symbol in enumerate(string):
    if symbol == ch:
      positions.append(position)
  return positions

In [None]:
## Pad a token list with the <PAD> token until it has the requested length
def pad_out(string, max_len):
  """Return a new list: `string` followed by enough '<PAD>' tokens to reach `max_len`.

  `string` is a list of tokens. When it already has `max_len` or more items
  no padding is added and nothing is truncated.
  """
  missing = max_len - len(string)
  padding = ['<PAD>'] * missing  # a non-positive count yields an empty list
  return string + padding

In [None]:
def is_correct_smiles(smiles):
  """Return True when RDKit parses and sanitizes `smiles`, otherwise False.

  The empty string is always rejected. Any exception raised by the parser
  itself is treated as "not a valid SMILES".
  """
  if smiles == "":
    return False
  # Look the parser up outside the try block: the previous bare
  # `MolFromSmiles` name was never imported, and the resulting NameError was
  # swallowed below, so every input was reported as invalid.
  mol_from_smiles = Chem.MolFromSmiles
  try:
    return mol_from_smiles(smiles, sanitize=True) is not None
  except Exception:
    # Best-effort validity check: a parser failure means "invalid".
    return False

In [None]:
## Lengths of the SMILES strings
def get_smiles_lens(smiles):
  """Return a list with `len(item)` for every item of `smiles`, in order."""
  return list(map(len, smiles))

In [None]:
## Return the indices that need to be removed because the entry is too
## long, matches an unwanted element symbol, or has too many heavy atoms
def smile_drop_inds(smiles):
  """Return the positions in `smiles['smiles']` that should be dropped.

  Position i is reported when `smiles['smiles'][i]` matches CHEMICALS,
  has length >= MAX_LEN, or contains more than MAX_HEAVY_ATOMS matches of
  CHEMICALS_HEAVY. Positions run over `range(len(smiles))`.
  """
  unwanted_element = re.compile(CHEMICALS)
  heavy_atom = re.compile(CHEMICALS_HEAVY)
  rejected = []
  for row in range(len(smiles)):
    candidate = smiles['smiles'][row]
    heavy_count = len(heavy_atom.findall(candidate))
    if unwanted_element.search(candidate) is not None:
      rejected.append(row)
    elif len(candidate) >= MAX_LEN:
      rejected.append(row)
    elif heavy_count > MAX_HEAVY_ATOMS:
      rejected.append(row)
  return rejected

In [None]:
## Same filter as smile_drop_inds, but for a plain indexable sequence of
## strings instead of an object with a 'smiles' column
def smile_drop_inds2(smiles):
  """Return the positions of the entries of `smiles` that should be dropped.

  Entry i is reported when `smiles[i]` matches CHEMICALS, has length
  >= MAX_LEN, or contains more than MAX_HEAVY_ATOMS matches of
  CHEMICALS_HEAVY.
  """
  banned = re.compile(CHEMICALS)
  heavy = re.compile(CHEMICALS_HEAVY)

  def should_drop(entry):
    heavy_total = len(heavy.findall(entry))
    return (banned.search(entry) is not None
            or len(entry) >= MAX_LEN
            or heavy_total > MAX_HEAVY_ATOMS)

  return [pos for pos in range(len(smiles)) if should_drop(smiles[pos])]

In [None]:
## Replace the two-letter halogens Br and Cl with single letters
def replace_halogens(string):
  """Return `string` with every 'Br' replaced by 'R' and every 'Cl' by 'L'."""
  without_bromine = re.sub('Br', 'R', string)
  return re.sub('Cl', 'L', without_bromine)

## Replace % followed by a double digit with the bracketed form from MAPPING
def replace_percentages(string):
  """Return `string` with every '%nn' rewritten through MAPPING (e.g. '%12' -> '[12]').

  All occurrences are replaced in a single pass. The previous implementation
  rebuilt the result from the original string on every iteration, so only the
  last '%nn' was converted.

  A '%' that is not followed by two ASCII digits is left untouched.
  Raises KeyError when the two digits form a number that MAPPING does not
  contain.
  """
  def bracketed(match):
    return MAPPING[int(match.group(1))]

  return re.sub(r'%([0-9]{2})', bracketed, string)

In [None]:
## Converts SMILES to DeepSMILES
def get_deep_from_smiles(smiles_list):
  """Encode every entry of `smiles_list['Smiles']` with a DeepSMILES converter.

  Prints the deepsmiles version, the converter options and a progress counter
  every 10000 entries. Returns the encoded strings as a list, in input order.
  """
  print("DeepSMILES version: %s" % deepsmiles.__version__)
  converter = deepsmiles.Converter(rings = True, branches = True)
  print(converter) # record the options used
  encoded = []
  for count, smile in enumerate(smiles_list['Smiles']):
    if count % 10000 == 0:
      print(count)
    encoded.append(converter.encode(smile))
  return encoded


In [None]:
## Converts DeepSMILES back to SMILES
def get_smiles_from_deep(deep_list):
  """Decode every DeepSMILES string in `deep_list` back to SMILES.

  Prints the deepsmiles version, the converter options and a progress counter
  every 100000 entries. An entry that raises deepsmiles.DecodeError is
  reported on stdout and stored as None, so the returned list keeps the same
  length and order as `deep_list`.
  """
  print("DeepSMILES version: %s" % deepsmiles.__version__)
  converter = deepsmiles.Converter(rings = True, branches = True)
  print(converter) # record the options used
  decoded = []
  for count, deep in enumerate(deep_list):
    try:
      smile = converter.decode(deep)
    except deepsmiles.DecodeError as err:
      smile = None
      print("DecodeError! Error message was '%s'" % err.message)
    if count % 100000 == 0:
      print(count)
    decoded.append(smile)
  return decoded


In [None]:
## Splits the SELFIES string <molecule> into a list of symbol strings.
def split_selfie(molecule):
  """Return the bracketed symbols ('[...]') and '.' separators of `molecule`, in order.

  Text outside brackets other than '.' is ignored.
  """
  symbol_pattern = re.compile(r'\[.*?\]|\.')
  return [hit.group(0) for hit in symbol_pattern.finditer(molecule)]

In [None]:
## Length of the SELFIES string <molecule>, measured in symbols.
def get_selfie_lens(molecule):
  """Return the number of '[' characters plus the number of '.' characters in `molecule`."""
  return sum(molecule.count(marker) for marker in ('[', '.'))

In [None]:
## Converts SMILES to SELFIES
def get_selfies_from_smiles(smiles_list):
  """Encode every SMILES string in `smiles_list` as SELFIES.

  Trailing whitespace (such as a newline left on each entry) is stripped
  before encoding. The previous `smile[:-1]` removed the last character
  unconditionally, which cut off the final atom of any SMILES that did not
  end with a newline.

  Returns a list with one encoder result per input, in input order.
  """
  return [encoder(smile.rstrip()) for smile in smiles_list]

In [None]:
def get_selfie_alphabet(selfies_list):
  """Return the sorted list of distinct symbols in `selfies_list`, plus '<PAD>'.

  Symbols are extracted with split_selfie from the concatenation of all
  strings. The result is sorted so that the alphabet, and any vocabulary
  built from it, is the same on every run; iterating a set of strings
  directly is not reproducible across interpreter sessions.

  An empty `selfies_list` yields ['<PAD>'] (the unused maximum-length
  computation that used to raise on empty input has been removed).
  """
  symbols = set(split_selfie(''.join(selfies_list)))
  symbols.add('<PAD>')
  return sorted(symbols)

In [None]:
## Tokenizes a processed SMILES / DeepSMILES string.
## Note: run replace_halogens and replace_percentages on the string
## before calling this method
def tokenize_smiles(smiles):
  """Return ['<BOS>', item_1, ..., item_n, '<EOS>'] for the items of `smiles`.

  For a string every single character becomes one token.
  """
  return ['<BOS>', *smiles, '<EOS>']

In [None]:
## Tokenizes a SELFIES string
def tokenize_selfies(selfies):
  """Return the split_selfie symbols of `selfies` wrapped in '<BOS>' ... '<EOS>'."""
  return ['<BOS>'] + list(split_selfie(selfies)) + ['<EOS>']

In [None]:
def tokenize_and_pad_selfies(selfies):
  """Tokenize every SELFIES string and pad each token list to MAX_LEN + 2.

  The two extra positions hold the '<BOS>' and '<EOS>' tokens added by
  tokenize_selfies. Returns a list of token lists, in input order.
  """
  target_len = MAX_LEN+2
  return [pad_out(tokenize_selfies(entry), max_len=target_len) for entry in selfies]

In [None]:
def tokenize_selfies2(selfies):
  """Tokenize every SELFIES string and return ONE flat list of all tokens.

  Each string contributes its '<BOS>' ... '<EOS>' token run; no padding is
  added and the runs are concatenated in input order.
  """
  flat_tokens = []
  for entry in selfies:
    flat_tokens.extend(tokenize_selfies(entry))
  return flat_tokens

In [None]:
def tokenize_smiles2(smiles_list):
  """Tokenize every SMILES string and return ONE flat list of all tokens.

  Each string contributes its '<BOS>' ... '<EOS>' token run; no padding is
  added and the runs are concatenated in input order.
  """
  return [token for entry in smiles_list for token in tokenize_smiles(entry)]

In [None]:
def get_vocab_dict(smiles_list):
  """Build token -> index and index -> token dictionaries for `smiles_list`.

  `smiles_list` is an iterable of token sequences (for plain strings every
  character is a token). '<PAD>', '<BOS>' and '<EOS>' always receive the
  indices 0, 1 and 2; the remaining distinct tokens are numbered from 3 in
  sorted order. Sorting makes the vocabulary reproducible: indices used to
  follow set iteration order, which for strings changes between interpreter
  runs.

  Returns (vocab_dict, inv_vocab_dict).
  """
  vocab_dict = {'<PAD>':0,'<BOS>':1,'<EOS>':2}
  for char in sorted(set(chain.from_iterable(smiles_list))):
    if char not in vocab_dict:
      # The dict holds exactly the indices 0..len-1, so len is the next free one.
      vocab_dict[char] = len(vocab_dict)
  inv_vocab_dict = {index: char for char, index in vocab_dict.items()}
  return vocab_dict, inv_vocab_dict

In [None]:
def get_selfies_vocab_dict(alphabet):
  """Build token -> index and index -> token dictionaries from `alphabet`.

  '<PAD>', '<BOS>' and '<EOS>' are fixed at 0, 1 and 2. Every other symbol
  gets the next free index in the order it first appears in `alphabet`;
  repeated symbols are ignored.

  Returns (vocab_dict, inv_vocab_dict).
  """
  vocab_dict = {'<PAD>':0,'<BOS>':1,'<EOS>':2}
  for symbol in alphabet:
    # len(vocab_dict) is the next unused index; setdefault skips known symbols.
    vocab_dict.setdefault(symbol, len(vocab_dict))
  inv_vocab_dict = dict((position, symbol) for symbol, position in vocab_dict.items())
  return vocab_dict, inv_vocab_dict

In [None]:
def tokenize_and_pad_smiles(smiles_list):
  """Tokenize every SMILES string and pad each token list to MAX_LEN + 2.

  The two extra positions hold the '<BOS>' and '<EOS>' tokens added by
  tokenize_smiles. Returns a list of token lists, in input order.
  """
  target_len = MAX_LEN+2
  padded = []
  for entry in smiles_list:
    tokens = tokenize_smiles(entry)
    padded.append(pad_out(tokens, max_len=target_len))
  return padded

In [None]:
from sklearn.preprocessing import LabelEncoder
def integer_encode(smiles,vocab_dict):
  """Map a list of token sequences to a list of index lists via `vocab_dict`.

  Raises KeyError for a token that is missing from a plain dict vocabulary.
  """
  return [[vocab_dict[token] for token in sequence] for sequence in smiles]

In [None]:
from sklearn.preprocessing import LabelEncoder
def integer_encode2(smiles,vocab_dict):
  """Map one flat sequence of tokens to the list of their `vocab_dict` indices.

  Raises KeyError for a token that is missing from a plain dict vocabulary.
  """
  return [vocab_dict[token] for token in smiles]

In [None]:
from sklearn.preprocessing import LabelEncoder
def integer_encode_selfies(selfies,vocab_dict):
  """Map a list of SELFIES token sequences to a list of index lists via `vocab_dict`.

  Raises KeyError for a token that is missing from a plain dict vocabulary.
  """
  encoded = []
  for token_run in selfies:
    encoded.append([vocab_dict[symbol] for symbol in token_run])
  return encoded

In [None]:
from sklearn.preprocessing import LabelEncoder
def integer_encode_selfies2(selfies,vocab_dict):
  """Map one flat sequence of SELFIES tokens to the list of their `vocab_dict` indices.

  Raises KeyError for a token that is missing from a plain dict vocabulary.
  """
  indices = []
  indices.extend(vocab_dict[symbol] for symbol in selfies)
  return indices

In [None]:
from sklearn.preprocessing import OneHotEncoder
def ohe_encode(smiles_int_enc, vocab_dict):
  """One-hot encode integer-encoded tokens against the indices of `vocab_dict`.

  `smiles_int_enc` is flattened, so the result has one row per token and one
  column per vocabulary index (columns follow ascending index order). The
  encoder's default sparse output is returned.
  Assumes `smiles_int_enc` converts to a regular numeric array (e.g. padded
  sequences of equal length) - TODO confirm for ragged input.

  Fixes over the previous version, which always returned None because it
  assigned the result of list.append:
    * the encoder is fit on the vocabulary INDICES (the values), not on the
      token strings, so integer input is recognised;
    * the data is passed as a 2-D column, as the encoder requires;
    * the `sparse=` keyword, removed in newer scikit-learn, is no longer used
      (sparse output is the default).
  """
  token_ids = np.array(sorted(vocab_dict.values())).reshape(-1, 1)
  onehot_encoder = OneHotEncoder()
  onehot_encoder.fit(token_ids)
  flat_tokens = np.array(smiles_int_enc).reshape(-1, 1)
  return onehot_encoder.transform(flat_tokens)

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True
# remounts even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Change the working directory to the project folder on the mounted drive;
# all relative './Datasets/...' and './vocab/...' paths below resolve from here.
%cd drive
%cd 'My Drive'
%cd 'MSc Stats Dissertation'

/content/drive
/content/drive/My Drive
/content/drive/My Drive/MSc Stats Dissertation


In [None]:
import sys
import os
# Add the site-packages directory under /usr/local to the import path;
# presumably this is where the Miniconda/conda install cell earlier in this
# notebook puts RDKit - confirm the Python version in the path.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
## IMPORTS
## RDKIT import 
import rdkit
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import RDConfig
from rdkit import rdBase
from rdkit.Chem.Draw import IPythonConsole

## Machine Learning Imports
import numpy as np
import tensorflow as tf
import os, re, time 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re

from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Bidirectional,Dropout, Embedding, \
LSTM, Multiply, Lambda, Permute, Reshape, Masking, Input, Softmax, Subtract, \
Concatenate,Dropout,MaxPooling1D,AveragePooling1D,BatchNormalization, Maximum, \
TimeDistributed, Activation

from selfies import encoder, decoder, selfies_alphabet  


In [None]:
import pandas as pd
# Paths of the train/test CSV splits, relative to the project folder.
train_path = './Datasets/splitted_data/tcga_rnaseq_train_fraction_0.9.csv'
test_path = './Datasets/splitted_data/tcga_rnaseq_test_fraction_0.1.csv'

train_gdc_path = './Datasets/splitted_data/gdsc_cell_line_ic50_train.csv'
test_gdc_path = './Datasets/splitted_data/gdsc_cell_line_ic50_test.csv'

# Load the four CSV files; the first row of each file is the header.
rna_seq_train =  pd.read_csv(train_path,header=0)
rna_seq_test =  pd.read_csv(test_path,header=0)
gdsc_train =  pd.read_csv(train_gdc_path,header=0)
gdsc_test =  pd.read_csv(test_gdc_path,header=0)

In [None]:
# Paths of the train/test .smi files, relative to the project folder.
train_smiles_path = './Datasets/splitted_data/train_chembl_22_clean_1576904_sorted_std_final.smi'
test_smiles_path = './Datasets/splitted_data/test_chembl_22_clean_1576904_sorted_std_final.smi'

In [None]:
# Read the training .smi file as tab-separated values without a header row,
# so the columns are labelled 0, 1, ...
smiles_train =  pd.read_csv(train_smiles_path,delimiter='\t',header=None)


In [None]:
# Load the pickled gene subset.
# NOTE(review): unpickling executes code from the file - only load pickles
# from a trusted source.
gene_subset = pd.read_pickle('./Datasets/2128_genes.pkl')

In [None]:
# Read the drug table as tab-separated values without a header row; the
# pairing cell below takes the SMILES from column 0 and matches the drug
# name against column 1.
drug_smiles = pd.read_csv('./Datasets/gdsc.smi',delimiter='\t',header=None)

In [None]:
# Load the pickled per-cell-line table; the pairing cell below reads its
# 'cell_line', 'gene_expression', 'site' and 'histology' entries.
# NOTE(review): unpickling executes code from the file - only load pickles
# from a trusted source.
something = pd.read_pickle('./Datasets/gdsc_transcriptomics_for_conditional_generation.pkl')

In [None]:
# Pair every (cell line, drug) row of gdsc_train with the gene expression,
# site and histology of the cell line and the SMILES of the drug. Rows whose
# cell line or drug is unknown are skipped, so the six lists stay aligned.
gene_expressions = []
smiles_pairs = []
ic50  = []
cell_lines =[]
site = []
histology = []

# Materialise the lookup columns once. They used to be rebuilt with list(...)
# and scanned with .index() for every row, which made the loop quadratic.
known_cell_lines = list(something['cell_line'])
known_expressions = list(something['gene_expression'])
known_sites = list(something['site'])
known_histologies = list(something['histology'])
known_drugs = list(drug_smiles[1])

# Map each name to the position of its FIRST occurrence, matching list.index().
cell_line_positions = {}
for position, entry_name in enumerate(known_cell_lines):
  cell_line_positions.setdefault(entry_name, position)
drug_positions = {}
for position, entry_name in enumerate(known_drugs):
  drug_positions.setdefault(entry_name, position)

for index, drug_gene in gdsc_train.iterrows():
  cell_line = drug_gene['cell_line']
  drug = drug_gene['drug']
  if cell_line in cell_line_positions and drug in drug_positions:
    ic50.append(drug_gene['IC50'])
    cell_line_index = cell_line_positions[cell_line]
    gene_expressions.append(known_expressions[cell_line_index])
    drug_index = drug_positions[drug]
    smiles_pairs.append(drug_smiles[0][drug_index])
    cell_lines.append(cell_line)
    site.append(known_sites[cell_line_index])
    histology.append(known_histologies[cell_line_index])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
196782
196783
196784
196785
196786
196787
196788
196789
196790
196791
196792
196793
196794
196795
196796
196797
196798
196799
196800
196801
196802
196803
196804
196805
196806
196807
196808
196809
196810
196811
196812
196813
196814
196815
196816
196817
196818
196819
196820
196821
196822
196823
196824
196825
196826
196827
196828
196829
196830
196831
196832
196833
196834
196835
196836
196837
196838
196839
196840
196841
196842
196843
196844
196845
196846
196847
196848
196849
196850
196851
196852
196853
196854
196855
196856
196857
196858
196859
196860
196861
196862
196863
196864
196865
196866
196867
196868
196869
196870
196871
196872
196873
196874
196875
196876
196877
196878
196879
196880
196881
196882
196883
196884
196885
196886
196887
196888
196889
196890
196891
196892
196893
196894
196895
196896
196897
196898
196899
196900
196901
196902
196903
196904
196905
196906
196907
196908
196909
196910
196911
196912
196913
196914
1969

In [None]:
import numpy as np
# Persist the aligned lists built by the pairing cell; np.save appends the
# '.npy' extension and writes into the current working directory.
np.save('gene_expressions',gene_expressions)
np.save('cell_lines',cell_lines)
np.save('ic50',ic50)
np.save('sites',site)
np.save('histologies',histology)
np.save('smiles_pairs',smiles_pairs)

In [None]:
# Load the SMILES table (first row is the header) and display its 'Smiles'
# column. The trailing comment keeps the alternative dataset path.
train_path = './Datasets/fChEMBL_Smiles.csv' #'./Datasets/250k_rndm_zinc_drugs_clean_3.csv' #
smiles =  pd.read_csv(train_path,header=0)
smiles['Smiles']

In [None]:
# Encode the SMILES as DeepSMILES, then decode them back to SMILES.
# The empty-list initialisation is overwritten immediately by the next line.
deep_smiles = []
deep_smiles = get_deep_from_smiles(smiles)
# smiles_n holds None for every entry that failed to decode.
smiles_n = get_smiles_from_deep(deep_smiles)

DeepSMILES version: 1.0.1
Converter(rings=True, branches=True)
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
131

In [None]:
# Replace Br/Cl with single letters in the round-tripped SMILES, then collect
# the positions that fail the element / length / heavy-atom filter.
# NOTE(review): smiles_n contains None for entries that failed to decode;
# replace_halogens looks like it would raise on None - confirm that no
# decode errors occur for this dataset.
smiles_p = []
for smile in smiles_n:
  smile = replace_halogens(smile)
  smiles_p.append(smile)
drop_inds = smile_drop_inds2(smiles_p)

In [None]:
# Remove the filtered rows from the original table and renumber the
# remaining rows from 0.
smiles_fn = smiles.drop(drop_inds)
smiles_fn =smiles_fn.reset_index(drop=True)

In [None]:
# Wrap the processed SMILES (d) and the DeepSMILES (e) in single-column
# DataFrames (column label 0) so the same rows can be dropped from both.
d = pd.DataFrame(data=smiles_p)
e = pd.DataFrame(data=deep_smiles)

In [None]:
# Processed SMILES with the filtered rows removed and the index renumbered.
smiles_f = d.drop(drop_inds)
smiles_f = smiles_f.reset_index(drop=True)

In [None]:
# DeepSMILES with the same filtered rows removed and the index renumbered.
deep_smiles_f = e.drop(drop_inds)
deep_smiles_f= deep_smiles_f.reset_index(drop=True)

In [None]:
import rdkit 
import rdkit.Chem.Descriptors as Descriptors
import Utils.sascorer as sascorer

In [None]:
# Compute logP, QED and the synthetic-accessibility score for every molecule
# in smiles_fn['smiles'] that RDKit can parse. Unparsable entries are printed
# together with their position, which is also recorded in index_remove.
# NOTE(review): sas/qed/logp only receive entries for parsable molecules and
# index_remove is not applied to smiles_fn in this cell, so the property
# lists are shorter than smiles_fn whenever index_remove is non-empty -
# confirm they are re-aligned before being used together.
# NOTE(review): this reads the lowercase 'smiles' column, while
# get_deep_from_smiles reads 'Smiles' - confirm the column name of the
# loaded CSV.
sas  = []
qed = []
logp = []
index_remove = []
i = 0
for chem in smiles_fn['smiles']:
  m = rdkit.Chem.MolFromSmiles(chem)
  if m is None:
    print(chem)
    index_remove.append(i)
    print(i)
  else:
    logp.append(Descriptors.MolLogP(m))
    qed.append(rdkit.Chem.QED.qed(m))
    sas.append(sascorer.calculateScore(m))
  # Progress indicator.
  if i % 1000 == 0:
    print(i)
  i+=1

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000


In [None]:
# Persist the computed property lists under ./vocab (np.save appends '.npy').
np.save('./vocab/zinc_sas',sas)
np.save('./vocab/zinc_qeds',qed)
np.save('./vocab/zinc_logp',logp)

In [None]:
# Replace Br/Cl with single letters in the filtered DeepSMILES (column 0 of
# deep_smiles_f).
deep_smiles_p = []
for smile in deep_smiles_f[0]:
  smile = replace_halogens(smile)
  deep_smiles_p.append(smile)

In [None]:
## Generate SELFIES from the filtered SMILES table.
## NOTE(review): this reads the lowercase 'smiles' column, while
## get_deep_from_smiles reads 'Smiles' - confirm the column name of the
## loaded CSV.
selfies = get_selfies_from_smiles(smiles_fn['smiles'])

In [None]:
# Build the SELFIES alphabet and the token <-> index dictionaries.
# Note: this rebinds `selfies_alphabet`, shadowing the name imported from the
# selfies package in the import cell.
selfies_alphabet = get_selfie_alphabet(selfies)
selfies_vocab, selfies_indexes = get_selfies_vocab_dict(selfies_alphabet)

In [None]:
## Get the DeepSMILES vocabulary (token <-> index dictionaries)
deep_smiles_vocab, deep_smiles_indexes = get_vocab_dict(deep_smiles_p)


In [None]:
# Get the SMILES vocabulary from column 0 of the filtered, processed SMILES.
smiles_vocab, smiles_indexes = get_vocab_dict(smiles_f[0])

In [None]:
# Flat token stream of all SMILES, each wrapped in <BOS> ... <EOS>.
processed_smiles = tokenize_smiles2(smiles_f[0])

In [None]:
# Flat token stream of all DeepSMILES, each wrapped in <BOS> ... <EOS>.
processed_deep_smiles = tokenize_smiles2(deep_smiles_p)

In [None]:
# Flat token stream of all SELFIES, each wrapped in <BOS> ... <EOS>.
processed_selfies = tokenize_selfies2(selfies)

In [None]:
# Replace every SMILES token by its vocabulary index.
int_enc_smiles = integer_encode2(processed_smiles,smiles_vocab)

In [None]:
# Persist the encoded SMILES and both vocabulary dictionaries.
# NOTE(review): the vocabularies are dicts, so np.save stores them as pickled
# object arrays; loading them back presumably needs
# np.load(..., allow_pickle=True).item() - confirm in the consumer.
np.save('./vocab/zinc_train_smiles_X',int_enc_smiles)
#np.save('./vocab/train_smiles_X_ohe',ohe_encode_smiles)
np.save('./vocab/zinc_vocab',smiles_vocab)
np.save('./vocab/zinc_vocab_index',smiles_indexes)

In [None]:
# Drop the references to the saved SMILES objects so their memory can be
# reclaimed before the next encoding is built.
del int_enc_smiles
del smiles_vocab
del smiles_indexes

In [None]:
# Replace every DeepSMILES token by its vocabulary index.
int_enc_deep_smiles = integer_encode2(processed_deep_smiles,deep_smiles_vocab)

In [None]:
# Persist the encoded DeepSMILES and both vocabulary dictionaries.
# NOTE(review): the vocabularies are dicts, so np.save stores them as pickled
# object arrays; loading them back presumably needs
# np.load(..., allow_pickle=True).item() - confirm in the consumer.
np.save('./vocab/train_deep_smiles_X',int_enc_deep_smiles)
np.save('./vocab/deep_vocab',deep_smiles_vocab)
np.save('./vocab/deep_vocab_index',deep_smiles_indexes)

In [None]:
# Drop the references to the saved DeepSMILES objects so their memory can be
# reclaimed before the next encoding is built.
del int_enc_deep_smiles
del deep_smiles_vocab
del deep_smiles_indexes

In [None]:
# Replace every SELFIES token by its vocabulary index.
int_enc_selfies = integer_encode_selfies2(processed_selfies,selfies_vocab)


In [None]:
# Persist the encoded SELFIES and both vocabulary dictionaries.
# NOTE(review): the vocabularies are dicts, so np.save stores them as pickled
# object arrays; loading them back presumably needs
# np.load(..., allow_pickle=True).item() - confirm in the consumer.
np.save('./vocab/zinc_train_selfies_X',int_enc_selfies)
np.save('./vocab/zinc_selfies_vocab',selfies_vocab)
np.save('./vocab/zinc_selfies_vocab_index',selfies_indexes)

In [None]:
# Drop the references to the saved SELFIES objects so their memory can be
# reclaimed.
del int_enc_selfies
del selfies_vocab
del selfies_indexes