### Imports

In [252]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics.pairwise import laplacian_kernel
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from matplotlib import pyplot as plt


### Training Data

In [253]:
training_data = pd.read_csv("../training data.csv")
training_data.head()

Unnamed: 0,PDB codes,VDWAALS 1,EEL 1,EGB 1,ESURF 1,GGAS 1,GSOLV 1,TOTAL 1,hbdist_mean1,hbdist_std1,...,GGAS 2,GSOLV 2,TOTAL 2,hbdist_mean2,hbdist_std2,hbnum_mean2,hbnum_std2,hbnum_mean_<.35_2,hbnum_std_<.35_2,Output
0,"4hj8 , 2la5",2.857144,6.34685,15.364635,2.727722,14.53047,15.364635,-113.49,-720.46,739.96,...,20.68032,18.0,-89.04,-3797.32,3785.39,-12.5,-3886.36,3772.9,-113.46,0
1,"1osb , 2mki",2.857144,6.319618,87.585415,4.537219,65.985015,87.585415,-563.32,-11193.39,11154.77,...,11.741259,6.296703,-85.56,-260.76,305.67,-10.57,-346.32,295.11,-51.21,1
2,"5wzj , 1je8",2.857143,5.946841,31.663337,2.959525,40.330669,31.663337,-230.34,130.79,-82.34,...,26.929071,49.412587,-309.02,-25367.65,25386.51,-44.66,-25676.67,25341.85,-334.82,0
3,"3moj , 1wwd",2.857143,6.500905,13.814186,2.719199,18.559441,13.814186,-118.26,-7867.57,7908.33,...,9.180819,9.595405,-69.9,-995.71,1011.87,-8.73,-1065.6,1003.13,-62.47,1
4,"3rn2 , 2kx5",2.857141,7.035113,16.63037,2.416485,7.082917,16.63037,-79.85,-13297.79,13326.09,...,20.0999,13.93007,-90.79,-5001.44,5006.58,-12.27,-5092.23,4994.32,-97.91,0


### Useful functions

In [254]:
def cross_val(model, x, y, cv = 5, fit_params = None):
    """
    It prints the mean of cross_val_score

    """
    from sklearn.model_selection import cross_val_score
    print(type(model))
    score = cross_val_score(model, x, y, cv = cv, n_jobs = -1, fit_params = fit_params)
    print(score)
    print("mean of cross val score:")
    print(np.mean(score))
    print()
    print()

def simple_test(model, x, y):
    """
    For a model already trained
    """

    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
    from sklearn.metrics import confusion_matrix


    y_pred = model.predict(x)

    print("--------------results-----------------------------")
    print("accuracy\t", accuracy_score(y, y_pred))
    print("f1 score\t", f1_score(y, y_pred))
    print("precision\t", precision_score(y, y_pred))
    print("recall\t\t", recall_score(y, y_pred))
    print("confusion matrix:\n", confusion_matrix(y, y_pred))
    print()

In [255]:
x_train = np.array(training_data.iloc[:,1:-1])
y_train = np.array(training_data.iloc[:,-1])

### SVM

In [256]:
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

svm_params = [{
    "svm__C" : [0.1, 0.8, 1],
    "svm__kernel": ["poly", "rbf", "sigmoid", laplacian_kernel],
    "svm__gamma" : [0.25, 0.3, 0.5, 'auto']
}]

grid_search = GridSearchCV(svm_pipeline, svm_params, cv = 5, n_jobs= -1)

grid_search.fit(x_train,y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'svm__C': 1, 'svm__gamma': 0.25, 'svm__kernel': <function laplacian_kernel at 0x000002D17A8AECB0>}
0.8372808161609135


In [257]:
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(C=1, gamma=0.25, kernel=laplacian_kernel))
])

In [258]:
cross_val(svm_clf, x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.84408602 0.82749326 0.83827493 0.8490566  0.82749326]
mean of cross val score:
0.8372808161609135




### Random Forest

In [259]:
rf_pipeline = Pipeline([
    ("Scaler", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=22))
])

rf_params = [{
    "rf__max_features" : [0.25, 0.3, 0.4, 0.5, 0.6, 0.8, "sqrt"]
}]

grid_search = GridSearchCV(rf_pipeline, rf_params, cv = 5, n_jobs= -1)

grid_search.fit(x_train,y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'rf__max_features': 0.6}
0.9202576587543112


In [260]:
rf_clf = Pipeline([
    ("Scaler", StandardScaler()),
    ("rf_clf", RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=22))
])

In [261]:
cross_val(rf_clf, x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.91666667 0.92183288 0.90566038 0.92991914 0.90026954]
mean of cross val score:
0.914869721473495




### AdaBoosting

In [262]:
ada_pipeline = Pipeline([
    ("Scaler", StandardScaler()),
    ("ada_clf", AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME.R", learning_rate=0.75))
])

ada_params = [{
    "ada_clf__n_estimators": [1000, 4000, 10000]
}]

grid_search = GridSearchCV(ada_pipeline, ada_params, cv = 5, n_jobs= -1 )

grid_search.fit(x_train,y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)


{'ada_clf__n_estimators': 10000}
0.973596498855172


In [263]:
ada_clf = Pipeline([
    ("Scaler", StandardScaler()),
    ("ada_clf", AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=10000, algorithm="SAMME.R", learning_rate=0.75))
])

In [264]:
cross_val(ada_clf, x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.97849462 0.96495957 0.97574124 0.97574124 0.97304582]
mean of cross val score:
0.973596498855172




### GradientBoosting

In [265]:
gra_pipeline = Pipeline([
    ("Scaler", StandardScaler()),
    ("gra_clf", GradientBoostingClassifier(learning_rate=1.0, random_state=0))
])

gra_params = [{
    "gra_clf__n_estimators" : [1000, 1480, 1550, 3000],
    "gra_clf__max_depth": [1,4]
}]

grid_search = GridSearchCV(gra_pipeline, gra_params, cv = 5, n_jobs= -1 )
grid_search.fit(x_train,y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'gra_clf__max_depth': 1, 'gra_clf__n_estimators': 1480}
0.9725168825899198


In [266]:
gra_clf = Pipeline([
    ("Scaler", StandardScaler()),
    ("gra_clf", GradientBoostingClassifier(n_estimators = 1480, learning_rate=1.0, max_depth=1, random_state=0))
])

In [267]:
cross_val(gra_clf, x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.9811828  0.96765499 0.97574124 0.97843666 0.95956873]
mean of cross val score:
0.9725168825899198




### Voting

In [268]:
voting_clf0 = VotingClassifier(
 estimators=[('rf', rf_clf), ('ada', ada_clf), ('gra', gra_clf)],
 voting='hard'
 )

In [269]:
cross_val(voting_clf0, x_train, y_train)

<class 'sklearn.ensemble._voting.VotingClassifier'>
[0.98387097 0.96495957 0.97304582 0.98382749 0.9703504 ]
mean of cross val score:
0.9752108512303279




In [270]:
voting_clf1 = VotingClassifier(
 estimators=[('svm', svm_clf), ('rf', rf_clf), ('ada', ada_clf)],
 voting='hard'
 )

In [271]:
cross_val(voting_clf1, x_train, y_train)

<class 'sklearn.ensemble._voting.VotingClassifier'>
[0.94354839 0.92722372 0.94339623 0.95687332 0.94339623]
mean of cross val score:
0.942887574993479




In [272]:
voting_clf2 = VotingClassifier(
 estimators=[('svm', svm_clf), ('rf', rf_clf), ('gra', gra_clf)],
 voting='hard'
 )

In [273]:
cross_val(voting_clf2, x_train, y_train)

<class 'sklearn.ensemble._voting.VotingClassifier'>
[0.94086022 0.93800539 0.93261456 0.9541779  0.93261456]
mean of cross val score:
0.9396545227951194




In [274]:
voting_clf3 = VotingClassifier(
 estimators=[('svm', svm_clf), ('ada', ada_clf), ('gra', gra_clf)],
 voting='hard'
 )

In [275]:
cross_val(voting_clf3, x_train, y_train)

<class 'sklearn.ensemble._voting.VotingClassifier'>
[0.9811828  0.96226415 0.98113208 0.97574124 0.9703504 ]
mean of cross val score:
0.9741341332637742




In [276]:
voting_clf4 = VotingClassifier(
 estimators=[('svm', svm_clf), ('rf', rf_clf), ('ada', ada_clf), ('gra', gra_clf)],
 voting='hard'
 )

In [277]:
cross_val(voting_clf4, x_train, y_train)

<class 'sklearn.ensemble._voting.VotingClassifier'>
[0.94623656 0.94070081 0.95148248 0.96226415 0.94878706]
mean of cross val score:
0.9498942120974988




Let us try the best classifiers on the training data whose features were selected according to Fisher information.

In [278]:
fisher_training_data = pd.read_csv("fisher_trainingdata.csv")
fisher_training_data.head()

Unnamed: 0,EEL 1,EGB 1,ESURF 1,GGAS 1,GSOLV 1,TOTAL 1,hbdist_mean1,hbdist_std1,hbnum_mean1,hbnum_std1,...,GGAS 2,GSOLV 2,TOTAL 2,hbdist_mean2,hbdist_std2,hbnum_mean2,hbnum_std2,hbnum_mean_<.35_2,hbnum_std_<.35_2,Output
0,6.34685,15.364635,2.727722,14.53047,15.364635,-113.49,-720.46,739.96,-13.55,-833.96,...,20.68032,18.0,-89.04,-3797.32,3785.39,-12.5,-3886.36,3772.9,-113.46,0
1,6.319618,87.585415,4.537219,65.985015,87.585415,-563.32,-11193.39,11154.77,-72.18,-11756.71,...,11.741259,6.296703,-85.56,-260.76,305.67,-10.57,-346.32,295.11,-51.21,1
2,5.946841,31.663337,2.959525,40.330669,31.663337,-230.34,130.79,-82.34,-28.29,-99.55,...,26.929071,49.412587,-309.02,-25367.65,25386.51,-44.66,-25676.67,25341.85,-334.82,0
3,6.500905,13.814186,2.719199,18.559441,13.814186,-118.26,-7867.57,7908.33,-15.61,-7985.83,...,9.180819,9.595405,-69.9,-995.71,1011.87,-8.73,-1065.6,1003.13,-62.47,1
4,7.035113,16.63037,2.416485,7.082917,16.63037,-79.85,-13297.79,13326.09,-14.63,-13377.64,...,20.0999,13.93007,-90.79,-5001.44,5006.58,-12.27,-5092.23,4994.32,-97.91,0


In [279]:
fisher_x_train = np.array(fisher_training_data.iloc[:,:-1])

In [280]:
cross_val(voting_clf0, fisher_x_train, y_train)

<class 'sklearn.ensemble._voting.VotingClassifier'>
[0.97849462 0.96765499 0.96765499 0.9703504  0.9703504 ]
mean of cross val score:
0.9709010810654146




In [281]:
cross_val(ada_clf, fisher_x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.9811828  0.96495957 0.96765499 0.97574124 0.96765499]
mean of cross val score:
0.9714387154740167




In [282]:
cross_val(gra_clf, fisher_x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.97311828 0.96226415 0.96765499 0.96765499 0.96765499]
mean of cross val score:
0.9676694780164043




Let us try it on the less correlated training data.

In [283]:
less_correlated_train = pd.read_csv("LessCorrelatedFeatures.csv")
lc_x_train = np.array(less_correlated_train.iloc[:,:-1])

In [284]:
cross_val(voting_clf0, lc_x_train, y_train)

<class 'sklearn.ensemble._voting.VotingClassifier'>
[0.9811828  0.96495957 0.97304582 0.9703504  0.97574124]
mean of cross val score:
0.9730559661478713




In [285]:
cross_val(ada_clf, lc_x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.97849462 0.96226415 0.97574124 0.97304582 0.9703504 ]
mean of cross val score:
0.9719792481813176




In [286]:
cross_val(gra_clf, lc_x_train, y_train)

<class 'sklearn.pipeline.Pipeline'>
[0.97849462 0.96495957 0.97843666 0.97304582 0.97304582]
mean of cross val score:
0.9735964988551722




## Recap: Best Classifiers

1. Voting Classifier with Random Forest, Ada Boost and Gradient Boosting.  
[0.9811828  0.96765499 0.96495957 0.97304582 0.9703504 ]  
mean of cross val score:  
0.9714387154740167

2. Voting  Classifier with SVM, Ada Boost and Gradient Boosting.  
    [0.9811828  0.96226415 0.98113208 0.97574124 0.9703504 ]  
    mean of cross val score:  
    0.9741341332637742  


3. Separately Gradient Boosting on less correlated data and Ada Boost.  

    Gradient Boosting on less correlated data:  
    [0.97849462 0.96495957 0.97843666 0.97304582 0.97304582]  
    mean of cross val score:  
    0.9735964988551722  


    Ada Boost:  
    [0.97849462 0.96495957 0.97574124 0.97574124 0.97304582]  
    mean of cross val score:  
    0.973596498855172  
