In [2]:
#Ensure all SMILES are unique, at most 74 characters in length before padding, and pad with start character "{" 

import hashlib

completed_lines_hash = set()

#Save processed data to SMILES.txt
new = open("SMILES.txt", "w")

#Read in data file line by line
for line in open("data.txt", "r"):
  
    #Ensure all smiles in original data file are unique
    hashValue = hashlib.md5(line.rstrip().encode('utf-8')).hexdigest()
  
    if hashValue not in completed_lines_hash:
        completed_lines_hash.add(hashValue)
        
        #Only copy SMILES less than 75 characters in length to new file
        if len(line) < 75:
            #Pad each SMILES with start character "{"
            line = line.rjust(len(line)+1, "{")
            #Copy over SMILES satisfying requirements
            new.write(line)
    
#Close files
new.close()

In [26]:
#Read in processed data file
data = open("SMILES.txt", "r").read()

#Create a list of the unique characters in the dataset
chars = list(set(data))

#Get size (in characters) of dataset
data_size = len(data) 

#Get number of unique characters in dataset
vocab_size = len(chars)

#Print dataset properties
print("Vocab size: " + str(vocab_size))
print("Data size: " + str(data_size))
print("Characters in data: " + str(chars))

Vocab size: 54
Data size: 21981771
Characters in data: ['7', 'l', '/', 'M', '2', 'p', ']', 's', '{', '+', '-', 'o', 'K', '@', 'H', '.', 'A', 'T', 't', 'Z', '3', 'c', '\n', 'P', 'R', ')', '6', 'C', 'I', 'r', 'L', 'B', '5', '4', 'b', 'O', 'g', 'S', 'N', '8', 'e', 'V', 'a', 'u', '1', '(', '=', 'n', '[', '9', '\\', '#', 'i', 'F']


In [27]:
import numpy as np
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Create array from characters in the dataset
values = array(chars)
print("Array of unique characters:")
print(values)

#Create unique, numerical labels for each character between 0 and n-1, where n is the number of unique characters
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Array of labels for each character:")
print(integer_encoded)

#Encode characters into a one-hot encoding, resulting in an array of size [num unique chars, num unique chars]
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print("Array of one-hot encoded characters:")
print(onehot_encoded)
print("Size of array of one-hot encoded characters: " + str(onehot_encoded.shape))

Array of unique characters:
['7' 'l' '/' 'M' '2' 'p' ']' 's' '{' '+' '-' 'o' 'K' '@' 'H' '.' 'A' 'T'
 't' 'Z' '3' 'c' '\n' 'P' 'R' ')' '6' 'C' 'I' 'r' 'L' 'B' '5' '4' 'b' 'O'
 'g' 'S' 'N' '8' 'e' 'V' 'a' 'u' '1' '(' '=' 'n' '[' '9' '\\' '#' 'i' 'F']
Array of labels for each character:
[14 45  7 27  9 48 38 50 53  4  5 47 25 18 23  6 19 33 51 35 10 41  0 30
 31  3 13 21 24 49 26 20 12 11 40 29 43 32 28 15 42 34 39 52  8  2 17 46
 36 16 37  1 44 22]
Array of one-hot encoded characters:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Size of array of one-hot encoded characters: (54, 54)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [38]:
#Read in processed data file
data = open("SMILES.txt", "r").read()

#Create a list of the dataset
datalist = list(data)

#Create an array of the dataset
dataarray = array(datalist)

#Fit one-hot encoding to dataarray
dataarray = dataarray.reshape(len(dataarray), 1)
OHESMILES = onehot_encoder.fit_transform(dataarray).astype(int)
print("Size of one-hot encoded array of data: " + str(OHESMILES.shape))
print("One-hot encoded array of data:")
print(OHESMILES)

Size of one-hot encoded array of data: (21981771, 54)
One-hot encoded array of data:
[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [41]:
#Save OHESMILES as a (compressed) file
np.savez_compressed("OHESMILES.npz", OHESMILES)