In [10]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

# Matches any single "[", "]" or "<" character; used further down to replace
# those characters in feature column names with "_".
# NOTE(review): re.IGNORECASE is a no-op here because the pattern contains no letters.
regex = re.compile(r"[\[\]<]", flags=re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

# IPython magics: draw matplotlib figures inline in the notebook and request the
# high-resolution "retina" figure format from the inline backend.
%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

# Global plot styling: seaborn's "darkgrid" theme, then a 15 x 9 default figure
# size written into matplotlib's rcParams (reached through seaborn's `mpl` attribute).
sns.set_style(style="darkgrid")
sns.mpl.rcParams.update({"figure.figsize": (15.0, 9.0)})

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings

filterwarnings("ignore")

# Random seed reused by the train/test split, the cross-validation splitters and
# the seeded estimators below.
seed = 0

In [2]:
# Load the cleaned training table and derive the integer-encoded target from it.
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

# Encode the three text labels as 1/2/3. `Series.map` yields NaN for any value
# that is not one of these three keys.
data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 2, "least likely": 3}
)
Y = data["BPlabel_encoded"]

# Drop the raw text label by keyword. The previous positional form
# `data.drop(["BPlabel"], 1)` relied on a positional `axis` argument that was
# deprecated in pandas 1.3 and removed in pandas 2.0.
data = data.drop(columns=["BPlabel"])

In [3]:
# Load the pre-selected feature matrix, then sanitise its column names: in any
# name containing "[", "]" or "<", each of those characters is replaced by "_".
# Names without them are kept exactly as read.
X = pd.read_csv("selected_features_training_data.csv", header=0)

sanitised_columns = []
for column in X.columns.values:
    if any(char in str(column) for char in ("[", "]", "<")):
        column = regex.sub("_", column)
    sanitised_columns.append(column)
X.columns = sanitised_columns

In [4]:
# Hold out 20% of the samples as a test set; the shuffle is seeded so the split
# is reproducible.
# NOTE(review): X and Y are read from two different CSV files, so this assumes
# their rows are in the same order - confirm.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=0.2,
)

In [5]:
# Base learners used by the stacking, bagging and voting ensembles below.
# Every seeded estimator takes its seed from the shared `seed` constant, so
# changing `seed` reseeds the whole pipeline instead of only part of it.
# (Previously several estimators hard-coded `random_state=0`.)
# NOTE(review): the hyper-parameter values presumably come from a tuning run
# that is not part of this notebook - confirm their provenance.
xgb = xgboost.XGBClassifier(
    learning_rate=0.2,
    n_estimators=49,
    max_depth=4,
    random_state=seed,
    reg_alpha=1,
    reg_lambda=1,
    eval_metric='mlogloss',
)

lgbm = LGBMClassifier(
    learning_rate=0.038035006480658606,
    max_depth=3,
    n_estimators=45,
    random_state=seed,
    reg_alpha=1,
    reg_lambda=1,
)

cb = CatBoostClassifier(
    depth=4,
    iterations=50,
    learning_rate=0.1400545927979144,
    random_seed=seed,
    verbose=False,
)

# NOTE(review): criterion='mse' was deprecated in scikit-learn 1.0 in favour of
# the equivalent 'squared_error' and is rejected from 1.2 onwards. It is kept
# here because 'squared_error' is not accepted before 1.0 - switch once the
# environment's scikit-learn version is confirmed.
gbm = GradientBoostingClassifier(
    learning_rate=0.1281808322184159,
    max_depth=4,
    criterion='mse',
    max_features='sqrt',
    n_estimators=36,
    random_state=seed,
)

rf = RandomForestClassifier(
    max_depth=3,
    max_features='log2',
    n_estimators=50,
    random_state=seed,
)

dt = DecisionTreeClassifier(max_depth=4, max_features='sqrt', random_state=seed)

et = ExtraTreesClassifier(
    criterion='entropy',
    max_depth=4,
    max_features='log2',
    n_estimators=35,
    random_state=seed,
)

knn = KNeighborsClassifier(n_neighbors=7)

svc = SVC(C=293.53707592157997)

lr = LogisticRegression(C=0.5, max_iter=2500, random_state=seed, solver='liblinear')

# Accumulators; not filled anywhere in the cells visible here.
results = []
names = []

# Metrics computed for every cross-validation fold.
scoring = [
    'accuracy',
    'balanced_accuracy',
    'f1_weighted',
    'precision_weighted',
    'recall_weighted',
]

# Two identically configured 10-fold stratified splitters. Only `outer_cv` is
# used by the cells visible here.
inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [6]:
# Stacking ensemble: all ten base learners feed a CatBoost meta-learner that is
# configured with the same hyper-parameters as `cb`.
estimators = [
    ("XGBR", xgb),
    ("GBR", gbm),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    ("KNN", knn),
    ("SVM", svc),
    ("LR", lr),
]

stacker = StackingClassifier(
    estimators=estimators,
    final_estimator=CatBoostClassifier(
        depth=4,
        iterations=50,
        learning_rate=0.1400545927979144,
        random_seed=seed,
        verbose=False,
    ),
)

# 10-fold stratified cross-validation on the full feature matrix.
cv_results = model_selection.cross_validate(
    stacker, X, Y, cv=outer_cv, scoring=scoring
)
print("Stacking CV results for all scores:", cv_results)

# The per-metric summary is the median over the folds, so the labels say "Median"
# (they previously said "Average" while printing np.median).
print('Accuracy CV Median', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Median', np.median(cv_results['test_balanced_accuracy']))
print('F1 CV Median', np.median(cv_results['test_f1_weighted']))
print('Precision CV Median', np.median(cv_results['test_precision_weighted']))
print('Recall CV Median', np.median(cv_results['test_recall_weighted']))


Stacking CV results for all scores: {'fit_time': array([1.94617128, 1.74862695, 1.77864289, 1.79806399, 1.78339314,
       1.79791689, 1.7261939 , 1.98247504, 1.86981392, 1.7487731 ]), 'score_time': array([0.02770472, 0.03203082, 0.02699804, 0.03013301, 0.02835011,
       0.02613306, 0.02711916, 0.02672791, 0.02885413, 0.02848268]), 'test_accuracy': array([0.76666667, 0.73333333, 0.8       , 0.72413793, 0.72413793,
       0.72413793, 0.68965517, 0.72413793, 0.68965517, 0.82758621]), 'test_balanced_accuracy': array([0.64814815, 0.6       , 0.63333333, 0.5952381 , 0.64444444,
       0.64444444, 0.59259259, 0.64444444, 0.56296296, 0.66666667]), 'test_f1_weighted': array([0.72504826, 0.69625508, 0.73245614, 0.653587  , 0.6907001 ,
       0.70172414, 0.67485709, 0.70172414, 0.63751856, 0.75040034]), 'test_precision_weighted': array([0.84090909, 0.79329004, 0.69047619, 0.60032841, 0.76657825,
       0.6907001 , 0.67385057, 0.6907001 , 0.59310345, 0.6876588 ]), 'test_recall_weighted': array([

In [7]:
# Bagging ensemble of 10 CatBoost models. The base estimator is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 (and the old keyword removed in 1.4); the
# positional form works on both sides of that rename.
bagging_cb = BaggingClassifier(
    cb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1
)

# 10-fold stratified cross-validation on the full feature matrix.
# error_score="raise" surfaces any fold failure instead of recording NaN.
cv_results = model_selection.cross_validate(
    bagging_cb, X, Y, cv=outer_cv, scoring=scoring, error_score="raise"
)
print('Bagging Classifier CV results for all scores:', '\n', cv_results, '\n')

# The per-metric summary is the median over the folds, so the labels say "Median"
# (they previously said "Average" while printing np.median).
print('Bagging Accuracy CV Median', np.median(cv_results['test_accuracy']))
print('Bagging Balanced Accuracy CV Median', np.median(cv_results['test_balanced_accuracy']))
print('Bagging F1 CV Median', np.median(cv_results['test_f1_weighted']))
print('Bagging Precision CV Median', np.median(cv_results['test_precision_weighted']))
print('Bagging Recall CV Median', np.median(cv_results['test_recall_weighted']))


Bagging Classifier CV results for all scores: 
 {'fit_time': array([2.06861997, 0.10271096, 0.0917201 , 0.09286308, 0.10095525,
       0.09271884, 0.08946514, 0.08996797, 0.09125805, 0.09165788]), 'score_time': array([0.02079105, 0.01474595, 0.01579785, 0.02039599, 0.01656079,
       0.01457405, 0.01393199, 0.0147841 , 0.01440692, 0.01490211]), 'test_accuracy': array([0.73333333, 0.73333333, 0.76666667, 0.79310345, 0.79310345,
       0.72413793, 0.68965517, 0.75862069, 0.79310345, 0.82758621]), 'test_balanced_accuracy': array([0.67407407, 0.62222222, 0.61111111, 0.68571429, 0.68888889,
       0.67407407, 0.57777778, 0.62222222, 0.67407407, 0.71111111]), 'test_f1_weighted': array([0.72166667, 0.71630781, 0.7       , 0.75684572, 0.75704023,
       0.71728171, 0.66625616, 0.71786834, 0.76731382, 0.79066688]), 'test_precision_weighted': array([0.71960784, 0.71296296, 0.65      , 0.82387268, 0.82186981,
       0.7137931 , 0.68842365, 0.68390805, 0.77767695, 0.85775862]), 'test_recall_weight

In [8]:
# Fit every base learner on the training split.
# NOTE(review): these pre-fits do not influence the voting cross-validation
# below - cross_validate and VotingClassifier refit clones of the estimators.
# They are kept so that model1..model10 remain available as fitted models.
model1 = xgb.fit(X_train, Y_train)
model2 = gbm.fit(X_train, Y_train)
model3 = lgbm.fit(X_train, Y_train)
# CatBoost is fitted but left out of the voting ensemble: its predictions do not
# have the same dimensions as the other models', which makes VotingClassifier fail.
model4 = cb.fit(X_train, Y_train)
model5 = rf.fit(X_train, Y_train)
model6 = et.fit(X_train, Y_train)
model7 = dt.fit(X_train, Y_train)
model8 = knn.fit(X_train, Y_train)
model9 = svc.fit(X_train, Y_train)
model10 = lr.fit(X_train, Y_train)

# Hard (majority-label) voting over the nine remaining learners.
vote = VotingClassifier(
    [
        ("xgbr", model1),
        ("gbr", model2),
        ("lgbm", model3),
        ("rf", model5),
        ("et", model6),
        ("dt", model7),
        ("knn", model8),
        ("svm", model9),
        ("lr", model10),
    ],
    voting='hard',
)

# 10-fold stratified cross-validation on the full feature matrix.
cv_results = model_selection.cross_validate(
    vote, X, Y, cv=outer_cv, scoring=scoring, error_score="raise"
)

print("Voting CV results for all scores", cv_results)

# The per-metric summary is the median over the folds, so the labels say "Median"
# (they previously said "Average" while printing np.median).
print('Accuracy CV Median', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Median', np.median(cv_results['test_balanced_accuracy']))
print('F1 CV Median', np.median(cv_results['test_f1_weighted']))
print('Precision CV Median', np.median(cv_results['test_precision_weighted']))
print('Recall CV Median', np.median(cv_results['test_recall_weighted']))


Voting CV results for all scores {'fit_time': array([0.28574324, 0.28074217, 0.2882719 , 0.31050992, 0.33951497,
       0.33150816, 0.30249882, 0.31926131, 0.28318   , 0.30078006]), 'score_time': array([0.02752995, 0.02706313, 0.02568102, 0.03376102, 0.03119493,
       0.02898026, 0.03142118, 0.0269351 , 0.02729607, 0.02639604]), 'test_accuracy': array([0.66666667, 0.7       , 0.8       , 0.75862069, 0.72413793,
       0.72413793, 0.68965517, 0.72413793, 0.75862069, 0.82758621]), 'test_balanced_accuracy': array([0.56666667, 0.53333333, 0.63333333, 0.65238095, 0.58518519,
       0.62962963, 0.57777778, 0.6       , 0.60740741, 0.66666667]), 'test_f1_weighted': array([0.64555556, 0.63003096, 0.73245614, 0.72580982, 0.65688054,
       0.69853513, 0.66625616, 0.68194192, 0.70588235, 0.74952978]), 'test_precision_weighted': array([0.67619048, 0.61180124, 0.69047619, 0.79310345, 0.60217786,
       0.69959432, 0.68842365, 0.64442191, 0.67241379, 0.68495298]), 'test_recall_weighted': array([0.6

In [12]:
# Hold-out evaluation of the stacking ensemble: fit on the training split and
# report per-class metrics on the test split.
target_names = ['Most likely', 'Probable', 'Least likely']
stacker.fit(X_train, Y_train)
predictions = list(stacker.predict(X_test))
# `labels` pins each display name to its encoded class (1 = most likely,
# 2 = probable, 3 = least likely). Without it, classification_report infers the
# classes from the data and raises if one of them is missing from both the
# hold-out labels and the predictions.
print(
    classification_report(
        Y_test, predictions, labels=[1, 2, 3], target_names=target_names
    )
)

              precision    recall  f1-score   support

 Most likely       0.00      0.00      0.00         7
    Probable       0.67      0.75      0.71        32
Least likely       0.94      0.75      0.83        20

    accuracy                           0.66        59
   macro avg       0.53      0.50      0.51        59
weighted avg       0.68      0.66      0.67        59



In [13]:
# Hold-out evaluation of the hard-voting ensemble: fit on the training split and
# report per-class metrics on the test split.
target_names = ['Most likely', 'Probable', 'Least likely']
vote.fit(X_train, Y_train)
predictions = list(vote.predict(X_test))
# `labels` pins each display name to its encoded class (1 = most likely,
# 2 = probable, 3 = least likely). Without it, classification_report infers the
# classes from the data and raises if one of them is missing from both the
# hold-out labels and the predictions.
print(
    classification_report(
        Y_test, predictions, labels=[1, 2, 3], target_names=target_names
    )
)

              precision    recall  f1-score   support

 Most likely       0.00      0.00      0.00         7
    Probable       0.69      0.91      0.78        32
Least likely       1.00      0.70      0.82        20

    accuracy                           0.73        59
   macro avg       0.56      0.54      0.54        59
weighted avg       0.71      0.73      0.70        59



In [15]:
# Hold-out evaluation of the bagged CatBoost ensemble: fit on the training split
# and report per-class metrics on the test split.
target_names = ['Most likely', 'Probable', 'Least likely']
bagging_cb.fit(X_train, Y_train)
predictions = list(bagging_cb.predict(X_test))
# `labels` pins each display name to its encoded class (1 = most likely,
# 2 = probable, 3 = least likely). Without it, classification_report infers the
# classes from the data and raises if one of them is missing from both the
# hold-out labels and the predictions.
print(
    classification_report(
        Y_test, predictions, labels=[1, 2, 3], target_names=target_names
    )
)

              precision    recall  f1-score   support

 Most likely       0.00      0.00      0.00         7
    Probable       0.71      0.84      0.77        32
Least likely       1.00      0.80      0.89        20

    accuracy                           0.73        59
   macro avg       0.57      0.55      0.55        59
weighted avg       0.72      0.73      0.72        59

