In [4]:
#Ensure all SMILES are unique, at most 74 characters in length before padding, and pad with start character "{" 

import hashlib

completed_lines_hash = set()

#Save processed data to SMILES.txt
new = open("smiles.txt", "w")

#Read in data file line by line
for line in open("data.txt", "r"):
  
    #Ensure all smiles in original data file are unique
    hashValue = hashlib.md5(line.rstrip().encode('utf-8')).hexdigest()
  
    if hashValue not in completed_lines_hash:
        completed_lines_hash.add(hashValue)
        
        #Only copy SMILES less than 74 characters in length to new file
        if len(line) < 74:
            #Pad each SMILES with start character "{"
            line = line.rjust(len(line)+1, "{")
            #Pad each SMILES with end characters "}" to 75 characters
            line = line.rstrip('\n')
            line = line.ljust(75, "}")
            line = line + "\n"
            #Copy over SMILES satisfying requirements
            new.write(line)
    
#Close files
new.close()

In [5]:
#Read in processed data file
data = open("smiles.txt", "r").read()

#Create a list of the unique characters in the dataset
chars = list(set(data))

#Get size (in characters) of dataset
data_size = len(data) 

#Get number of unique characters in dataset
vocab_size = len(chars)

#Print dataset properties
print("Vocab size: " + str(vocab_size))
print("Data size: " + str(data_size))
print("Characters in data: " + str(chars))

Vocab size: 55
Data size: 34131372
Characters in data: ['=', 'Z', 'e', '3', '2', '@', '6', ')', 'O', 'N', '/', '\\', 'I', 'p', 'B', 's', '\n', '#', '-', '(', 'L', '4', '.', '9', '8', 'V', 'M', 'g', '7', 'i', '1', 'b', 'P', 'A', 'K', 'H', '{', 'o', 'T', ']', 'S', 'F', 't', '}', 'r', 'u', 'l', 'a', 'C', '+', '5', 'R', '[', 'c', 'n']


In [6]:
import numpy as np
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Create array from characters in the dataset
values = array(chars)
print("Array of unique characters:")
print(values)

#Create unique, numerical labels for each character between 0 and n-1, where n is the number of unique characters
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Array of labels for each character:")
print(integer_encoded)

#Encode characters into a one-hot encoding, resulting in an array of size [num unique chars, num unique chars]
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print("Array of one-hot encoded characters:")
print(onehot_encoded)
print("Size of array of one-hot encoded characters: " + str(onehot_encoded.shape))

Array of unique characters:
['=' 'Z' 'e' '3' '2' '@' '6' ')' 'O' 'N' '/' '\\' 'I' 'p' 'B' 's' '\n' '#'
 '-' '(' 'L' '4' '.' '9' '8' 'V' 'M' 'g' '7' 'i' '1' 'b' 'P' 'A' 'K' 'H'
 '{' 'o' 'T' ']' 'S' 'F' 't' '}' 'r' 'u' 'l' 'a' 'C' '+' '5' 'R' '[' 'c'
 'n']
Array of labels for each character:
[17 35 42 10  9 18 13  3 29 28  7 37 24 48 20 50  0  1  5  2 26 11  6 16
 15 34 27 43 14 44  8 40 30 19 25 23 53 47 33 38 32 22 51 54 49 52 45 39
 21  4 12 31 36 41 46]
Array of one-hot encoded characters:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Size of array of one-hot encoded characters: (55, 55)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
#Read in processed data file
data = open("smiles.txt", "r").read()

#Create a list of the dataset
datalist = list(data)

#Create an array of the dataset
dataarray = array(datalist)

#Fit one-hot encoding to dataarray
dataarray = dataarray.reshape(len(dataarray), 1)
OHESMILES = onehot_encoder.fit_transform(dataarray).astype(int)
print("Size of one-hot encoded array of data: " + str(OHESMILES.shape))
print("One-hot encoded array of data:")
print(OHESMILES)

Size of one-hot encoded array of data: (34131372, 55)
One-hot encoded array of data:
[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [1 0 0 ... 0 0 0]]


In [8]:
#Save OHESMILES as a (compressed) file
np.savez_compressed("ohesmiles.npz", OHESMILES)