# [HBL] It seems the size of the hidden layer is a critical parameter to get the non-identical numbers.

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as Calculator




In [8]:
#Data Cleaning
data = pd.read_excel("inputdata.xlsx")
data['EC_value'], data['EC_error'] = zip(*data['ELE_COD'].map(lambda x: x.split('±')))
data.head()

Unnamed: 0,NUM,A,B,MOLFRC(for A and B),T,P,ELE_COD,EC_value,EC_error
0,1,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,298.15,101,1.166 ± 0.058,1.166,0.058
1,2,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,299.15,101,1.203 ± 0.06,1.203,0.06
2,3,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,300.15,101,1.242 ± 0.062,1.242,0.062
3,4,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,301.15,101,1.271 ± 0.064,1.271,0.064
4,5,[O-]S(=O)(=O)C.c1c[n+](cn1CCCC)C,O,0.004,302.15,101,1.289 ± 0.064,1.289,0.064


In [9]:
#Setting up for molecular descriptors
n = data.shape[0]
list_of_descriptors = ['NumHeteroatoms','MolWt','ExactMolWt','NOCount','NumHDonors','RingCount','NumAromaticRings','NumSaturatedRings','NumAliphaticRings']
calc = Calculator(list_of_descriptors)
D = len(list_of_descriptors)
d = len(list_of_descriptors)*2 + 3
print(n,d)

2523 21


In [10]:
#setting up the x and y matrices
X = np.zeros((n,d))
X[:,-3] = data['T']
X[:,-2] = data['P']
for i in range(n):
    A = Chem.MolFromSmiles(data['A'][i])
    B = Chem.MolFromSmiles(data['B'][i])
    X[i][:D]    = calc.CalcDescriptors(A)
    X[i][D:2*D] = calc.CalcDescriptors(B)

y = data['EC_value']



In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)


In [46]:
#Neural Network
mlp = MLPRegressor(hidden_layer_sizes=(20,),activation='tanh',solver='sgd',learning_rate_init=0.0001,max_iter=5000)

mlp.fit(X_train, y_train)

#print mlp.score(X_test,y_test)

MLPRegressor(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.0001, max_iter=5000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [47]:
mlp.predict(X_test)

array([ 0.84254373,  0.84254373,  2.31312827,  2.31312827,  0.84254373,
        2.31312827,  2.31312827,  2.31312827,  0.84254373,  2.31312827,
        2.35385911,  0.63546052,  2.31312827,  2.31312827,  2.31312827,
        2.31312827,  2.31312827,  0.63546052,  0.84254373,  2.31312827,
        2.31312827,  2.35385911,  2.31312827,  2.31312827,  2.31312827,
        2.31312827,  0.84254373,  2.31312827,  2.31312827,  0.84254373,
        2.31312827,  2.31312827,  2.31312827,  0.84254373,  2.31312827,
        2.31312827,  2.31312827,  2.31312827,  2.31312827,  2.31312827,
        0.4077224 ,  0.84254373,  2.31312827,  2.31312827,  2.31312827,
        0.84254373,  2.31312827,  2.31312827,  2.31312827,  2.31312827,
        0.84254373,  2.31312827,  2.31312827,  2.31312827,  2.31312827,
        0.84254373,  2.31312827,  0.84254373,  2.31312827,  2.31312827,
        2.31312827,  0.84254373,  2.31312827,  0.84254373,  0.84254373,
        2.31312827,  2.31312827,  0.84254373,  2.31312827,  2.31

In [37]:

y_test

741        0.108 
1069       0.848 
166        6.932 
233        9.587 
748        0.559 
1387       0.828 
1047       1.066 
1293       0.458 
756        0.316 
241        10.61 
1828        5.97 
1642       0.733 
2385    0.001038 
955        0.608 
450        5.018 
945        0.375 
2349    0.000896 
1644       0.773 
1124       0.203 
331        7.053 
1042       0.851 
1790       0.562 
2489     0.01098 
455        5.612 
151        5.554 
2459     0.00851 
921        1.211 
229        8.985 
244         4.12 
823        0.384 
          ...    
515          3.2 
113        4.849 
1497       1.417 
1025       0.396 
961        0.772 
904        0.872 
272        7.825 
2436     0.00847 
818        0.595 
2069    0.011357 
833        0.908 
1637        0.25 
2454     0.00926 
264        6.727 
1253       0.559 
1836        2.06 
2035    0.008999 
1306       0.346 
383        5.116 
578        3.422 
234        9.734 
79         2.966 
574        3.133 
1323       0.595 
2044    0.