# [HBL] It seems the size of the hidden layer is a critical parameter to get the non-identical numbers.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as Calculator




In [2]:
#Data Cleaning
data = pd.read_excel("inputdata.xlsx")
data['EC_value'], data['EC_error'] = zip(*data['ELE_COD'].map(lambda x: x.split('±')))
data.head()

Unnamed: 0,NUM,A,B,MOLFRC(for A and B),T,P,ELE_COD,EC_value,EC_error
0,1,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,298.15,101,1.166 ± 0.058,1.166,0.058
1,2,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,299.15,101,1.203 ± 0.06,1.203,0.06
2,3,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,300.15,101,1.242 ± 0.062,1.242,0.062
3,4,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,301.15,101,1.271 ± 0.064,1.271,0.064
4,5,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,302.15,101,1.289 ± 0.064,1.289,0.064


In [3]:
#Setting up for molecular descriptors
n = data.shape[0]
list_of_descriptors = ['NumHeteroatoms','MolWt','ExactMolWt','NOCount','NumHDonors','RingCount','NumAromaticRings','NumSaturatedRings','NumAliphaticRings']
calc = Calculator(list_of_descriptors)
D = len(list_of_descriptors)
d = len(list_of_descriptors)*2 + 3
print(n,d)

2523 21


In [4]:
#setting up the x and y matrices
X = np.zeros((n,d))
X[:,-3] = data['T']
X[:,-2] = data['P']
for i in range(n):
    A = Chem.MolFromSmiles(data['A'][i])
    B = Chem.MolFromSmiles(data['B'][i])
    X[i][:D]    = calc.CalcDescriptors(A)
    X[i][D:2*D] = calc.CalcDescriptors(B)

y = data['EC_value']



In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)


In [27]:
#Neural Network
mlp = MLPClassifier(hidden_layer_sizes=(100,),activation='logistic',solver='sgd',learning_rate_init=0.01,max_iter=500)

mlp.fit(X_train, y_train)

#print mlp.score(X_test,y_test)



MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [28]:
mlp.predict(X_test)

array(['0.001 ', '6.12 ', '0.0002334 ', '6.12 ', '0.384 ', '0.002 ',
       '6.12 ', '6.12 ', '0.002 ', '0.0002334 ', '0.001 ', '0.001 ',
       '0.001 ', '0.002 ', '0.001136 ', '0.0002334 ', '0.001136 ',
       '1.437 ', '0.002 ', '6.12 ', '0.001 ', '0.001 ', '0.0002334 ',
       '0.001318 ', '0.002 ', '6.12 ', '0.384 ', '0.001 ', '6.12 ',
       '0.384 ', '0.002 ', '0.002 ', '0.001136 ', '6.12 ', '0.002 ',
       '0.384 ', '6.12 ', '6.12 ', '0.001318 ', '0.001 ', '1.437 ',
       '0.001 ', '6.12 ', '6.12 ', '6.12 ', '0.002 ', '6.12 ', '6.12 ',
       '0.001 ', '6.12 ', '0.001 ', '0.384 ', '1.437 ', '0.001318 ',
       '0.002 ', '6.12 ', '0.001136 ', '0.001 ', '0.001 ', '6.12 ',
       '0.001 ', '0.001318 ', '0.001318 ', '0.001136 ', '0.001318 ',
       '0.001 ', '0.001 ', '6.12 ', '6.12 ', '0.001136 ', '0.001 ',
       '6.12 ', '0.001 ', '0.001 ', '0.001 ', '1.437 ', '0.001 ', '6.12 ',
       '0.001 ', '6.12 ', '1.437 ', '0.001 ', '0.384 ', '0.001 ', '6.12 ',
       '6.12 ', '0.001 '

In [23]:
y_test

714         0.193 
1048        1.128 
2345     0.001182 
89          3.427 
1726        0.483 
1328        0.384 
143         5.066 
2197         4.61 
1009        0.962 
1940    0.0002459 
52          2.666 
284         9.153 
1200        0.102 
1005        0.816 
2038     0.008471 
1956    0.0003212 
2039      0.00833 
1813        0.832 
1359        0.997 
430          3.04 
1107        0.759 
1478        0.552 
1955    0.0003521 
2422     0.002776 
1353        0.328 
192         4.781 
1560        0.251 
1221         0.14 
2176         3.79 
1703        0.322 
           ...    
1983     0.007001 
2236         4.62 
987         0.462 
316         5.152 
755         0.356 
1244        0.143 
956         0.634 
2502      0.01283 
1123        0.489 
1797         1.04 
1600        0.434 
578         3.422 
1830        1.129 
1270        0.109 
2214         3.95 
202         5.825 
2133          6.3 
1656        0.504 
1641        0.641 
592         4.677 
257         5.647 
6           