In [1]:
import numpy as np
import pandas as pd
import rdkit
import os
import math


from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

In [2]:
from sklearn.ensemble import RandomForestRegressor

## Clean Input Training DF

In [4]:
os.getcwd()

'/hpc/group/rekerlab/jrl78/CoAggregators/simulations'

In [4]:
os.chdir("KNIME")

In [7]:
trainDF = pd.read_csv('SolubilityTraining.csv')
trainDF.head()
trainDF = trainDF[ ['SMILES', 'Solubility']]
trainDF.head()

Unnamed: 0,SMILES,Solubility
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127
1,O=C1Nc2cccc3cccc1c23,-3.254767
2,Clc1ccc(C=O)cc1,-2.177078
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065


In [8]:
trainDF.to_csv('CLEANSolTrain.csv')

In [5]:
cleanTrainDF = pd.read_csv('CLEANSolTrain.csv', index_col = 'Unnamed: 0')
cleanTrainDF.head()

Unnamed: 0,SMILES,Solubility
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127
1,O=C1Nc2cccc3cccc1c23,-3.254767
2,Clc1ccc(C=O)cc1,-2.177078
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065


In [6]:
len(cleanTrainDF)

9982

## Convert Smiles to rdkit mols and calculate props

In [10]:
trainMols = []
for x in cleanTrainDF['SMILES']:
    trainMols.append(Chem.MolFromSmiles(x))
    
cleanTrainDF['Mol'] = trainMols

cleanTrainDF.head()



Unnamed: 0,SMILES,Solubility,Mol
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab580>
1,O=C1Nc2cccc3cccc1c23,-3.254767,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab530>
2,Clc1ccc(C=O)cc1,-2.177078,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab4e0>
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab5d0>
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab620>


In [11]:
descr = Descriptors._descList
calc = [x[1] for x in descr]

def describe_mol(mol):
    ds_n = []
    for d in calc:
        v = d(mol)
        if v > np.finfo(np.float32).max: 	# postprocess descriptors for freak large values
            ds_n.append(np.finfo(np.float32).max)
        elif math.isnan(v):
            ds_n.append(np.float32(0.0))
        else:
            ds_n.append(np.float32(v))
    
    return list(ds_n)

In [12]:
fps = []

In [13]:
fps = [describe_mol(x) for x in cleanTrainDF['Mol']]





In [62]:
fps[0]

[2.2968316,
 0.0,
 2.2968316,
 0.0,
 0.25967622,
 392.51,
 346.142,
 391.28137,
 142.0,
 0.0,
 0.07799283,
 -1.0,
 1.0,
 0.07799283,
 0.4347826,
 0.6956522,
 0.95652175,
 79.904,
 10.037414,
 2.044071,
 -2.2309818,
 2.1340172,
 -2.9960048,
 4.4945207,
 -0.8701305,
 0.0,
 210.37733,
 16.520815,
 16.845993,
 18.43199,
 10.56066,
 10.364975,
 10.364975,
 8.317232,
 8.317232,
 4.6824875,
 4.6824875,
 3.134242,
 3.134242,
 0.44,
 47399.242,
 25.67756,
 18.774017,
 24.808022,
 158.5206,
 21.464481,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 96.81524,
 12.841643,
 0.0,
 27.687773,
 21.464481,
 0.0,
 0.0,
 0.0,
 0.0,
 109.65688,
 27.687773,
 0.0,
 0.0,
 0.0,
 16.981451,
 0.0,
 0.0,
 0.0,
 32.170803,
 0.0,
 0.0,
 109.65688,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 16.981451,
 0.0,
 4.483031,
 109.2779,
 0.0,
 0.0,
 28.066753,
 0.0,
 1.1236149,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 23.35727,
 3.6286218,
 6.890495,
 1.0,
 23.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0

In [14]:
temp = pd.DataFrame(data = fps)
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,198,199,200,201,202,203,204,205,206,207
0,2.296832,0.000000,2.296832,0.000000,0.259676,392.510010,346.141998,391.281372,142.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0
1,11.435093,0.006296,11.435093,0.006296,0.644215,169.182999,162.126999,169.052765,62.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.074697,0.652222,10.074697,0.652222,0.546650,140.569000,135.529007,140.002899,46.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.573312,-1.369330,11.573312,0.000000,0.136407,756.226013,713.890015,754.227295,264.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.440876,0.383760,5.440876,0.383760,0.490100,422.524994,392.285004,422.220551,164.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9977,11.736681,-0.262076,11.736681,0.262076,0.578688,264.368988,240.177002,264.183777,106.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9978,13.313695,-2.745034,13.313695,0.134831,0.336110,444.440002,420.247986,444.153259,170.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9979,9.461760,0.398704,9.461760,0.398704,0.652274,150.220993,136.108994,150.104462,60.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9980,10.242105,-0.579622,10.242105,0.164510,0.419905,454.610992,416.307007,454.283173,180.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
bigTrainDF = pd.merge(cleanTrainDF, temp, left_index = True, right_index = True)
#bigTrainDF.drop(columns = 'FP', inplace = True)
bigTrainDF

Unnamed: 0,SMILES,Solubility,Mol,0,1,2,3,4,5,6,...,198,199,200,201,202,203,204,205,206,207
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab580>,2.296832,0.000000,2.296832,0.000000,0.259676,392.510010,346.141998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0
1,O=C1Nc2cccc3cccc1c23,-3.254767,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab530>,11.435093,0.006296,11.435093,0.006296,0.644215,169.182999,162.126999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Clc1ccc(C=O)cc1,-2.177078,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab4e0>,10.074697,0.652222,10.074697,0.652222,0.546650,140.569000,135.529007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab5d0>,11.573312,-1.369330,11.573312,0.000000,0.136407,756.226013,713.890015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd1ab620>,5.440876,0.383760,5.440876,0.383760,0.490100,422.524994,392.285004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9977,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C,-3.010000,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd162e40>,11.736681,-0.262076,11.736681,0.262076,0.578688,264.368988,240.177002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9978,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...,-2.930000,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd162e90>,13.313695,-2.745034,13.313695,0.134831,0.336110,444.440002,420.247986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9979,c1(cc(ccc1C(C)C)C)O,-2.190000,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd162ee0>,9.461760,0.398704,9.461760,0.398704,0.652274,150.220993,136.108994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9980,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,-3.980000,<rdkit.Chem.rdchem.Mol object at 0x7fa7dd162f30>,10.242105,-0.579622,10.242105,0.164510,0.419905,454.610992,416.307007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
bigTrainDF.to_csv("FULL_SOL_training_set.csv")

## Make the RF Model

In [7]:
import sklearn
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold, KFold

In [8]:
bigTrainDF = pd.read_csv("FULL_SOL_training_set.csv")

In [18]:
regr = RandomForestRegressor(n_estimators = 150, max_depth=10, min_samples_split = 5, random_state=0)


In [9]:
X_train = bigTrainDF.iloc[:, 4:]
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,198,199,200,201,202,203,204,205,206,207
0,2.296832,0.0,2.296832,0.0,0.259676,392.51001,346.141998,391.281372,142.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0
1,11.435093,0.006296,11.435093,0.006296,0.644215,169.182999,162.126999,169.052765,62.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.074697,0.652222,10.074697,0.652222,0.54665,140.569,135.529007,140.002899,46.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.573312,-1.36933,11.573312,0.0,0.136407,756.226013,713.890015,754.227295,264.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.440876,0.38376,5.440876,0.38376,0.4901,422.524994,392.285004,422.220551,164.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
Y_train = bigTrainDF['Solubility']

In [11]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,198,199,200,201,202,203,204,205,206,207
0,2.296832,0.000000,2.296832,0.000000,0.259676,392.510010,346.141998,391.281372,142.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0
1,11.435093,0.006296,11.435093,0.006296,0.644215,169.182999,162.126999,169.052765,62.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.074697,0.652222,10.074697,0.652222,0.546650,140.569000,135.529007,140.002899,46.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.573312,-1.369330,11.573312,0.000000,0.136407,756.226013,713.890015,754.227295,264.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.440876,0.383760,5.440876,0.383760,0.490100,422.524994,392.285004,422.220551,164.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9977,11.736681,-0.262076,11.736681,0.262076,0.578688,264.368988,240.177002,264.183777,106.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9978,13.313695,-2.745034,13.313695,0.134831,0.336110,444.440002,420.247986,444.153259,170.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9979,9.461760,0.398704,9.461760,0.398704,0.652274,150.220993,136.108994,150.104462,60.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9980,10.242105,-0.579622,10.242105,0.164510,0.419905,454.610992,416.307007,454.283173,180.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
a = []

b= [1.12,2.413,3.1234]
c= [3,4,5]


In [32]:
a.append(np.array(b))
a.append(c)
a

[array([1.12  , 2.413 , 3.1234]), [3, 4, 5]]

In [33]:
flat_list = [item for sublist in a for item in sublist]
flat_list

[1.12, 2.413, 3.1234, 3, 4, 5]

In [63]:
def modelRF(X,y, n_splits = 3):
    
    
    skf = KFold(n_splits = n_splits)
    i = 1
    
    preds = []
    actuals = []
    allR2 = []
    
    # Define arrays to store metrics so we can average at the end
    #reg_MCC = np.array([])
    
    #reg_bal_acc = np.array([])
    
    #reg_f1 = np.array([])
    
    r2_array = np.array([])
    
    for train,test in skf.split(X,y):
        
        #print(train)
        X1_train = np.array(X.iloc[train])
        y1_train = np.array(np.ravel(y.iloc[train]))
        
        
        X1_test = X.iloc[test]
        y1_test = y.iloc[test]
    
        print("Now training Random Forest Model {}".format(i))
        print("-------------------------------------")
        clf = RandomForestRegressor(n_estimators = 150, max_depth=10, min_samples_split = 5)
        print("Making Predictions...")
        clf.fit(X1_train, y1_train)

        y_pred = clf.predict(X1_test)
        y_true = y1_test
        
        preds.append(y_pred)
        actuals.append(y_true)
        print("Predictions Done!")
        #balanced_accuracy_results = balanced_accuracy_score(y_true, y_pred)
        #mcc_results = mcc(y_true, y_pred)
        #f1_results = f1_score(y_true, y_pred, average='micro')
        
        #R2 = r2_score(y_true, y_pred)
        
        #roc = roc_auc_score(y_true, y_pred)
        
        ########################################################################
        
        # Adding calculated metrics to the running total arrays
        
        #reg_MCC = np.append(reg_MCC, mcc_results )
        #reg_bal_acc = np.append(reg_bal_acc, balanced_accuracy_results )
        #reg_f1 = np.append(reg_f1, f1_results)
        
        #r2_array = np.append(r2_array, R2)
        
        
        #R2 = r2_score(y_true, y_pred)
        
        #print("Metrics:")
        #print('balanced_accuracy_result is: ' + str(balanced_accuracy_results))
        #print('mcc result is: ' + str(mcc_results))
        #print('f1 score is: ' + str(f1_results))
        #print("roc auc is: " + str(roc))
        #print("R2 Score is: {}".format(R2))
        print('-------------------------------------')
        print("")
        i = i+1
    
    
    all_preds = [item for sublist in preds for item in sublist]
    all_actual = [item for sublist in actuals for item in sublist]
    R2 = r2_score(all_actual, all_preds)
    print('-------------------------------------')
    print("Metrics:")
    print("")
    #print("Mean MCC: {}".format(reg_MCC.mean()))
    #print("Mean Balanced Accuracy: {}".format(reg_bal_acc.mean()))
    #print("Mean F1 Score: {}".format(reg_f1.mean()))
    print("Mean R2 Score is: {}".format(R2))
    print("")
    print('-------------------------------------')
    return R2

In [64]:
run_1 = modelRF(X_train, Y_train, 10)

Now training Random Forest Model 1
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 2
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 3
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 4
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 5
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 6
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 7
-------------------------------------
Making Predicti

Try:
Gaussian NB
Decision Tree
SVM
MLP
DeepDrug from ChemicalX

In [65]:
run_1

0.7560345914187081

In [50]:
runs = []
runs.append(run_1)
runs

[0.7556723445424945]

In [51]:
for i in range(9):
    runs.append(modelRF(X_train, Y_train, 10))

Now training Random Forest Model 1
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 2
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 3
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 4
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 5
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 6
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 7
-------------------------------------
Making Predicti

Predictions Done!
-------------------------------------

Now training Random Forest Model 2
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 3
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 4
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 5
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 6
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 7
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Mode

In [52]:
runs

[0.7556723445424945,
 0.7568248831616972,
 0.7562319111888272,
 0.7576074832187057,
 0.7569885069076489,
 0.7575390824211432,
 0.7570523544790009,
 0.7571385337673762,
 0.7566197650799678,
 0.7563410326174622]

In [57]:
sum(runs)/10

0.7568015897384324

In [75]:
realModel = RandomForestRegressor(n_estimators = 150, max_depth=10, min_samples_split = 5)
realModel.fit(X_train, Y_train)

RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=150)

In [80]:
proDF = pd.read_excel("SolubilityProdrugsApprovedAndClinicalTrials.xlsx")
proDF.head()

Unnamed: 0,Prodrug Name,PubChem ID,Canonical SMILES,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Ceftaroline fosamil,9852981,CCON=C(C1=NSC(=N1)NP(=O)(O)O)C(=O)NC2C3N(C2=O)...,,,,
1,Ceftobiprole medocaril,135413544,CC1=C(OC(=O)O1)COC(=O)N2CCC(C2)N3CCC(=CC4=C(N5...,,,,
2,Fludarabine phosphate,30751,C1=NC2=C(N=C(N=C2N1[C@H]3[C@H]([C@@H]([C@H](O3...,,,,
3,Fosamprenavir,131536,CC(C)CN(C[C@H]([C@H](CC1=CC=CC=C1)NC(=O)O[C@H]...,,,,
4,Fosaprepitant,135413538,C[C@H](C1=CC(=CC(=C1)C(F)(F)F)C(F)(F)F)O[C@@H]...,,,,


In [81]:
apiDF = pd.read_excel("SolubilityAPIApprovedAndClinicalTrials.xlsx")
apiDF.head()

Unnamed: 0,API Name,PubChem ID,Canonical SMILES,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Ceftaroline,59451342,CCON=C(C1=NSC(=N1)N)C(=O)NC2C3N(C2=O)C(=C(CS3)...,,,,
1,Ceftobiprole,135413542,C1CNCC1N2CCC(=CC3=C(N4C(C(C4=O)NC(=O)C(=NO)C5=...,,,,
2,Fludarabine,657237,C1=NC2=C(N=C(N=C2N1C3C(C(C(O3)CO)O)O)F)N,,,,
3,Amprenavir,65016,CC(C)CN(C[C@H]([C@H](CC1=CC=CC=C1)NC(=O)O[C@H]...,,,,
4,Aprepitant,135413536,C[C@H](C1=CC(=CC(=C1)C(F)(F)F)C(F)(F)F)O[C@@H]...,,,,


In [79]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 4.6 MB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9
Note: you may need to restart the kernel to use updated packages.


In [12]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [27]:
def modelSVM(X,y, n_splits = 3):
    
    
    skf = KFold(n_splits = n_splits)
    i = 1
    
    # Define arrays to store metrics so we can average at the end
    preds = []
    actuals = []
    allR2 = []
    

    r2_array = np.array([])
    
    for train,test in skf.split(X,y):

        X1_train = np.array(X.iloc[train])
        y1_train = np.array(np.ravel(y.iloc[train]))
        
        
        X1_test = X.iloc[test]
        y1_test = y.iloc[test]
    
        print("Now training Random Forest Model {}".format(i))
        print("-------------------------------------")
        clf = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.3))
        print("Making Predictions...")
        clf.fit(X1_train, y1_train)

        y_pred = clf.predict(X1_test)
        y_true = y1_test
        
        preds.append(y_pred)
        actuals.append(y_true)
        print("Predictions Done!")
        
        ########################################################################
        
        # Adding calculated metrics to the running total arrays
        
        #reg_MCC = np.append(reg_MCC, mcc_results )
        #reg_bal_acc = np.append(reg_bal_acc, balanced_accuracy_results )
        #reg_f1 = np.append(reg_f1, f1_results)
        
        #r2_array = np.append(r2_array, R2)
        
        
        #R2 = r2_score(y_true, y_pred)
        
        #print("Metrics:")
        #print('balanced_accuracy_result is: ' + str(balanced_accuracy_results))
        #print('mcc result is: ' + str(mcc_results))
        #print('f1 score is: ' + str(f1_results))
        #print("roc auc is: " + str(roc))
        #print("R2 Score is: {}".format(R2))
        print('-------------------------------------')
        print("")
        i = i+1
    
    
    all_preds = [item for sublist in preds for item in sublist]
    all_actual = [item for sublist in actuals for item in sublist]
    R2 = r2_score(all_actual, all_preds)
    print('-------------------------------------')
    print("Metrics:")
    print("")
    #print("Mean MCC: {}".format(reg_MCC.mean()))
    #print("Mean Balanced Accuracy: {}".format(reg_bal_acc.mean()))
    #print("Mean F1 Score: {}".format(reg_f1.mean()))
    print("Mean R2 Score is: {}".format(R2))
    print("")
    print('-------------------------------------')
    return R2

In [39]:
SVM_1 = modelSVM(X_train, Y_train, 3)

Now training Random Forest Model 1
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 2
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 3
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

-------------------------------------
Metrics:

Mean R2 Score is: 0.6850786149334125

-------------------------------------


In [40]:
from sklearn.naive_bayes import GaussianNB

In [41]:
def modelGNB(X,y, n_splits = 3):
    
    
    skf = KFold(n_splits = n_splits)
    i = 1
    
    # Define arrays to store metrics so we can average at the end
    preds = []
    actuals = []
    allR2 = []
    

    r2_array = np.array([])
    
    for train,test in skf.split(X,y):

        X1_train = np.array(X.iloc[train])
        y1_train = np.array(np.ravel(y.iloc[train]))
        
        
        X1_test = X.iloc[test]
        y1_test = y.iloc[test]
    
        print("Now training Random Forest Model {}".format(i))
        print("-------------------------------------")
        clf = GaussianNB()
        print("Making Predictions...")
        clf.fit(X1_train, y1_train)

        y_pred = clf.predict(X1_test)
        y_true = y1_test
        
        preds.append(y_pred)
        actuals.append(y_true)
        print("Predictions Done!")
        
        ########################################################################
        
        # Adding calculated metrics to the running total arrays
        
        #reg_MCC = np.append(reg_MCC, mcc_results )
        #reg_bal_acc = np.append(reg_bal_acc, balanced_accuracy_results )
        #reg_f1 = np.append(reg_f1, f1_results)
        
        #r2_array = np.append(r2_array, R2)
        
        
        #R2 = r2_score(y_true, y_pred)
        
        #print("Metrics:")
        #print('balanced_accuracy_result is: ' + str(balanced_accuracy_results))
        #print('mcc result is: ' + str(mcc_results))
        #print('f1 score is: ' + str(f1_results))
        #print("roc auc is: " + str(roc))
        #print("R2 Score is: {}".format(R2))
        print('-------------------------------------')
        print("")
        i = i+1
    
    
    all_preds = [item for sublist in preds for item in sublist]
    all_actual = [item for sublist in actuals for item in sublist]
    R2 = r2_score(all_actual, all_preds)
    print('-------------------------------------')
    print("Metrics:")
    print("")
    #print("Mean MCC: {}".format(reg_MCC.mean()))
    #print("Mean Balanced Accuracy: {}".format(reg_bal_acc.mean()))
    #print("Mean F1 Score: {}".format(reg_f1.mean()))
    print("Mean R2 Score is: {}".format(R2))
    print("")
    print('-------------------------------------')
    return R2

In [44]:
GNB_1 = modelGNB(X_train, Y_train, 3)

Now training Random Forest Model 1
-------------------------------------
Making Predictions...


ValueError: Unknown label type: (array([-13.1719, -12.95  , -12.0605, ...,   1.5682,   1.5808,   1.6988]),)

In [45]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

In [73]:
def modelMLP(X,y, n_splits = 3):
     
    skf = KFold(n_splits = n_splits)
    i = 1
    
    # Define arrays to store metrics so we can average at the end
    preds = []
    actuals = []
    allR2 = []
    
    r2_array = np.array([])
    
    for train,test in skf.split(X,y):

        X1_train = np.array(X.iloc[train])
        y1_train = np.array(np.ravel(y.iloc[train]))
               
        X1_test = X.iloc[test]
        y1_test = y.iloc[test]
    
        print("Now training Random Forest Model {}".format(i))
        print("-------------------------------------")
        clf = MLPRegressor(hidden_layer_sizes = (500,),activation = 'logistic',
                           learning_rate = 'adaptive', alpha = 0.001, 
                           max_iter=1000)
        print("Making Predictions...")
        clf.fit(X1_train, y1_train)

        y_pred = clf.predict(X1_test)
        y_true = y1_test
        
        preds.append(y_pred)
        actuals.append(y_true)
        print("Predictions Done!")
        
        ########################################################################
        
        # Adding calculated metrics to the running total arrays

        print('-------------------------------------')
        print("")
        i = i+1
    
    
    all_preds = [item for sublist in preds for item in sublist]
    all_actual = [item for sublist in actuals for item in sublist]
    R2 = r2_score(all_actual, all_preds)
    print('-------------------------------------')
    print("Metrics:")
    print("")
    print("Mean R2 Score is: {}".format(R2))
    print("")
    print('-------------------------------------')
    print(all_actual[0:5])
    print(all_preds[0:5])
    return R2

In [74]:
MLP_1 = modelMLP(X_train, Y_train, 3)

Now training Random Forest Model 1
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 2
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

Now training Random Forest Model 3
-------------------------------------
Making Predictions...
Predictions Done!
-------------------------------------

-------------------------------------
Metrics:

Mean R2 Score is: 0.4637477905216385

-------------------------------------
[-3.616127121, -3.254767098, -2.177077944, -3.924409095, -4.662064583]
[-4.441930509289405, -3.015461027011558, -2.02987521158854, -4.441930508997143, -4.441930508997143]


In [20]:
regr = RandomForestRegressor(n_estimators = 150, max_depth=10, min_samples_split = 5, random_state=0)

In [21]:
regr.fit(X_train, Y_train)

RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=150,
                      random_state=0)

In [22]:
regr.score(X_train, Y_train)

0.8911637119045437