In [1]:
import hashlib
import numpy as np
from numpy import array

#Hashes of the (right-stripped) lines seen so far, used to drop duplicate SMILES
completed_lines_hash = set()

#Read the raw data file line by line and save the processed data to smiles.txt.
#Both files are managed by the with-block, so they are closed even if an error
#occurs, and smiles.txt is only truncated once data.txt has been opened successfully.
with open("data.txt", "r") as source, open("smiles.txt", "w") as new:
    for line in source:

        #Ensure all smiles in original data file are unique
        hashValue = hashlib.md5(line.rstrip().encode('utf-8')).hexdigest()

        #Skip lines whose content has already been processed
        if hashValue in completed_lines_hash:
            continue
        completed_lines_hash.add(hashValue)

        #Keep only lines whose length is between 35 and 74 characters.
        #NOTE(review): len(line) counts the trailing newline when present, so the
        #SMILES string itself is 34-73 characters long - confirm the intended range.
        if 34 < len(line) < 75:
            #Add start token and copy over SMILES satisfying requirements
            new.write("G" + line)

In [2]:
#Read in processed data file (the with-block closes the handle once the text is loaded)
with open("smiles.txt", "r") as smiles_file:
    data = smiles_file.read()

#Create a list of the unique characters in the dataset
#NOTE: a set has no defined order, so the order of chars may differ between runs
chars = list(set(data))

#Get size (in characters) of dataset
data_size = len(data)

#Get number of unique characters in dataset
vocab_size = len(chars)

#Print dataset properties
print("Vocab size: " + str(vocab_size))
print("Data size: " + str(data_size))
print("Characters in data: " + str(chars))

Vocab size: 53
Data size: 24825178
Characters in data: ['s', '6', '=', '4', 'g', 'T', 'c', ']', 'H', '[', 'A', '3', 'o', 'r', 'b', '.', 'p', 'G', '(', '\n', '1', 'S', '-', 'C', 'i', 'R', 'N', '7', 'B', '9', '+', 'V', ')', '/', '#', 'K', 'P', 'F', 'e', 'O', '@', 'L', 'l', '\\', 'n', '5', '8', '2', 'Z', 'I', 'M', 'a', 't']


In [3]:
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Create array from characters in the dataset
values = array(chars)
print("Array of unique characters:")
print(values)

#Create unique, numerical labels for each character between 0 and n-1, where n is the number of unique characters
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Array of labels for each character:")
print(integer_encoded)

#Build a one-hot encoder that returns dense arrays.
#Newer scikit-learn releases renamed the `sparse` keyword to `sparse_output` and
#later removed the old name; older releases only know `sparse`. Try the new
#spelling first and fall back, so the cell runs on either.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

#Encode characters into a one-hot encoding, resulting in an array of size [num unique chars, num unique chars]
#The labels are reshaped into a single column because the encoder is given 2-D input
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print("Array of one-hot encoded characters:")
print(onehot_encoded)
print("Size of array of one-hot encoded characters: " + str(onehot_encoded.shape))

Array of unique characters:
['s' '6' '=' '4' 'g' 'T' 'c' ']' 'H' '[' 'A' '3' 'o' 'r' 'b' '.' 'p' 'G'
 '(' '\n' '1' 'S' '-' 'C' 'i' 'R' 'N' '7' 'B' '9' '+' 'V' ')' '/' '#' 'K'
 'P' 'F' 'e' 'O' '@' 'L' 'l' '\\' 'n' '5' '8' '2' 'Z' 'I' 'M' 'a' 't']
Array of labels for each character:
[51 13 17 11 44 34 42 39 24 37 19 10 48 50 41  6 49 23  2  0  8 33  5 21
 45 32 29 14 20 16  4 35  3  7  1 26 31 22 43 30 18 27 46 38 47 12 15  9
 36 25 28 40 52]
Array of one-hot encoded characters:
[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
Size of array of one-hot encoded characters: (53, 53)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [4]:
#Read in processed data file (the with-block closes the handle once the text is loaded)
with open("smiles.txt", "r") as smiles_file:
    data = smiles_file.read()
#Create a list of the dataset (one entry per character)
datalist = list(data)
#Create an array of the dataset
dataarray = array(datalist)
#Fit one-hot encoding to dataarray
#The characters are reshaped into a single column; fit_transform re-fits the shared
#onehot_encoder on the characters themselves, and the result is cast to integers
dataarray = dataarray.reshape(len(dataarray), 1)
OHESMILES = onehot_encoder.fit_transform(dataarray).astype(int)
print("Size of one-hot encoded array of data: " + str(OHESMILES.shape))
print("One-hot encoded array of data:")
print(OHESMILES)

Size of one-hot encoded array of data: (24825178, 53)
One-hot encoded array of data:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [5]:
#Save OHESMILES as a (compressed) file
#The array is passed positionally, so per NumPy's savez convention it is stored
#in ohesmiles.npz under the default key "arr_0"
np.savez_compressed("ohesmiles.npz", OHESMILES)

In [6]:
#Create integer SMILES data: for every one-hot row, the column index that holds the 1.
#A single vectorised argmax over the rows replaces one np.where call per row;
#argmax returns the first position of the row maximum, which for a one-hot row is
#the position of its 1. list(...) keeps INTSMILES a list of NumPy integers.
#NOTE: assumes every row of OHESMILES contains exactly one 1; a row without any 1
#would yield index 0 here instead of raising an IndexError.
INTSMILES = list(OHESMILES.argmax(axis=1))

In [7]:
#Save INTSMILES as a (compressed) file
#INTSMILES is passed positionally, so per NumPy's savez convention it is stored
#in intsmiles.npz under the default key "arr_0"
np.savez_compressed("intsmiles.npz", INTSMILES)

In [8]:
#Build and save the vocabulary table: SMILES character, integer encoding, and one hot encoding
#Turn the character array into a single column
values = values.reshape(-1, 1)
#Put each character next to its integer label (labels cast to object)
vocab = np.hstack((values, integer_encoded.astype(object)))
#Order the rows by ascending integer label
vocab = vocab[np.argsort(vocab[:, 1])]
#One-hot encode the sorted label column; this re-fits the shared onehot_encoder on the labels
vocabvalues = vocab[:, 1].reshape(-1, 1)
vocabohe = onehot_encoder.fit_transform(vocabvalues)
#Append the one-hot columns to the (character, label) columns
vocabencodings = np.hstack((vocab, vocabohe.astype(object)))
print(vocabencodings.shape)

np.save("vocab.npy", vocabencodings)

(53, 55)


In [9]:
#Show the vocabulary table: character, integer label, then the one-hot columns
print(vocabencodings)

[['\n' 0 1.0 ... 0.0 0.0 0.0]
 ['#' 1 0.0 ... 0.0 0.0 0.0]
 ['(' 2 0.0 ... 0.0 0.0 0.0]
 ...
 ['r' 50 0.0 ... 1.0 0.0 0.0]
 ['s' 51 0.0 ... 0.0 1.0 0.0]
 ['t' 52 0.0 ... 0.0 0.0 1.0]]
