In [1]:
import hashlib
import numpy as np
from numpy import array

completed_lines_hash = set()

sorted_smiles = []
lengths = []

#Read in data file line by line
for line in open("data.txt", "r"):
  
    #Ensure all smiles in original data file are unique
    hashValue = hashlib.md5(line.rstrip().encode('utf-8')).hexdigest()
  
    if hashValue not in completed_lines_hash:
        completed_lines_hash.add(hashValue)
        
        #Ensure all SMILES are between 35 and 75 characters in length
        if 33 < len(line) < 75:
            #Add start and end character
            line = line.rjust(len(line)+1, "G")
            line = line.rstrip('\n') + "E"
            
            length = len(line)
            sorted_smiles.append(line)
            lengths.append(length)

#Sort list of SMILES in order of decreasing length
sorted_smiles.sort(key=len, reverse=True)

#List of SMILES sequence lengths
lengths = sorted(lengths, key=int, reverse=True)
lengths = array(lengths)
np.save("smileslengths.npy", lengths)

In [2]:
#Save processed data to SMILES.txt
new = open("smiles.txt", "w")

for smiles in sorted_smiles:

    smiles = smiles.ljust(75, "0")
    #Copy over SMILES
    new.write(smiles)
    
#Close files
new.close()

In [3]:
#Read in processed data file
data = open("smiles.txt", "r").read()

#Create a list of the unique characters in the dataset
chars = list(set(data))

#Get size (in characters) of dataset
data_size = len(data) 

#Get number of unique characters in dataset
vocab_size = len(chars)

#Print dataset properties
print("Vocab size: " + str(vocab_size))
print("Data size: " + str(data_size))
print("Characters in data: " + str(chars))

Vocab size: 54
Data size: 37015800
Characters in data: ['s', 'A', 'G', 'e', '8', 'a', '@', 'V', 'g', 'O', 'C', '1', '6', 'p', '(', '.', '0', 'N', 'T', 'Z', 'K', 'c', '9', '5', 'L', ')', 'r', 'E', '/', 'n', 'I', 'F', 'M', 'R', '[', '3', '#', 'H', '\\', 'B', 'P', '-', '2', '4', 'b', 'o', 't', 'l', '7', 'S', '+', '=', ']', 'i']


In [4]:
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Create array from characters in the dataset
values = array(chars)
print("Array of unique characters:")
print(values)

#Create unique, numerical labels for each character between 0 and n-1, where n is the number of unique characters
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Array of labels for each character:")
print(integer_encoded)

#Encode characters into a one-hot encoding, resulting in an array of size [num unique chars, num unique chars]
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print("Array of one-hot encoded characters:")
print(onehot_encoded)
print("Size of array of one-hot encoded characters: " + str(onehot_encoded.shape))

Array of unique characters:
['s' 'A' 'G' 'e' '8' 'a' '@' 'V' 'g' 'O' 'C' '1' '6' 'p' '(' '.' '0' 'N'
 'T' 'Z' 'K' 'c' '9' '5' 'L' ')' 'r' 'E' '/' 'n' 'I' 'F' 'M' 'R' '[' '3'
 '#' 'H' '\\' 'B' 'P' '-' '2' '4' 'b' 'o' 't' 'l' '7' 'S' '+' '=' ']' 'i']
Array of labels for each character:
[52 19 24 44 15 41 18 36 45 31 21  8 13 50  1  5  7 30 35 37 27 43 16 12
 28  2 51 22  6 48 26 23 29 33 38 10  0 25 39 20 32  4  9 11 42 49 53 47
 14 34  3 17 40 46]
Array of one-hot encoded characters:
[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Size of array of one-hot encoded characters: (54, 54)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
#Read in processed data file
data = open("smiles.txt", "r").read()
#Create a list of the dataset
datalist = list(data)
#Create an array of the dataset
dataarray = array(datalist)
#Fit one-hot encoding to dataarray
dataarray = dataarray.reshape(len(dataarray), 1)
OHESMILES = onehot_encoder.fit_transform(dataarray).astype(int)
print("Size of one-hot encoded array of data: " + str(OHESMILES.shape))
print("One-hot encoded array of data:")
print(OHESMILES)

Size of one-hot encoded array of data: (37015800, 54)
One-hot encoded array of data:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [7]:
#Save OHESMILES as a (compressed) file
np.savez_compressed("ohesmiles.npz", OHESMILES)

In [8]:
#Create integer SMILES data
INTSMILES = [np.where(r==1)[0][0] for r in OHESMILES]

In [9]:
#Save INTSMILES as a (compressed) file
np.savez_compressed("intsmiles.npz", INTSMILES)

In [10]:
#Save array with SMILES character, integer encoding, and one hot encoding (vocabulary)
values = np.reshape(values, (np.shape(values)[0], 1))
vocab = np.concatenate((values, integer_encoded.astype(object)), axis = 1)
vocab = vocab[vocab[:,1].argsort()]
vocabvalues = np.reshape(vocab[:,1], (-1,1))
vocabohe = onehot_encoder.fit_transform(vocabvalues)
vocabencodings = np.concatenate((vocab, vocabohe.astype(object)), axis = 1)
print(np.shape(vocabencodings))

np.save("vocab.npy", vocabencodings)

(54, 56)


In [11]:
print(vocabencodings)

[['#' 0 1.0 ... 0.0 0.0 0.0]
 ['(' 1 0.0 ... 0.0 0.0 0.0]
 [')' 2 0.0 ... 0.0 0.0 0.0]
 ...
 ['r' 51 0.0 ... 1.0 0.0 0.0]
 ['s' 52 0.0 ... 0.0 1.0 0.0]
 ['t' 53 0.0 ... 0.0 0.0 1.0]]
