In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

# Matches any single "[", "]" or "<"; used further down to replace those
# characters in feature column names with "_".
regex = re.compile(r"[\[\]<]")

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import *
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, RandomOverSampler
from imblearn.base import BaseSampler
from imblearn.datasets import make_imbalance

# Plot styling: seaborn's "darkgrid" style plus a larger default figure size,
# written into matplotlib's global rcParams through seaborn's `mpl` handle.
sns.set_style("darkgrid")
sns.mpl.rcParams.update({"figure.figsize": (15.0, 9.0)})

import warnings

import matplotlib
import matplotlib.pyplot as plt

from warnings import filterwarnings

# FutureWarning is silenced explicitly; the blanket filter that follows then
# hides every other warning category as well.
warnings.simplefilter(action="ignore", category=FutureWarning)
filterwarnings("ignore")

# Single random seed reused by the sampler, the splitters and the models.
seed = 0

In [2]:
# Load the cleaned training table and derive the integer target from the
# textual "BPlabel" column.
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

# Ordinal encoding of the three label classes.
label_codes = {"most likely": 1, "probable": 2, "least likely": 3}
data["BPlabel_encoded"] = data["BPlabel"].map(label_codes)

# `Series.map` turns any label missing from `label_codes` into NaN; fail here
# with a clear message instead of letting NaN targets reach the models.
if data["BPlabel_encoded"].isna().any():
    unexpected = sorted(
        data.loc[data["BPlabel_encoded"].isna(), "BPlabel"].astype(str).unique()
    )
    raise ValueError("Unexpected BPlabel values: {}".format(unexpected))

Y = data["BPlabel_encoded"]

# `columns=` replaces the positional `axis=1`, which pandas deprecated in 1.3
# and no longer accepts in 2.x.
data = data.drop(columns=["BPlabel"])

In [3]:
def _sanitize_column(col):
    """Return `col` with every "[", "]" and "<" replaced by "_".

    Labels that contain none of those characters are returned unchanged.
    """
    text = str(col)
    if "[" in text or "]" in text or "<" in text:
        return regex.sub("_", col)
    return col


# Feature matrix restricted to the previously selected columns.
X = pd.read_csv("selected_features_training_data.csv", header=0)
X.columns = [_sanitize_column(col) for col in X.columns.values]

In [4]:
# Balance the three classes with SMOTE and report shapes before and after.
print('Before OverSampling, the shape of X: {}'.format(X.shape))
print('Before OverSampling, the shape of y: {} \n'.format(Y.shape))

# NOTE(review): the whole dataset is oversampled here, before the train/test
# split and before cross-validation, so synthetic points derived from rows that
# later land in a test fold can also appear in training data. Consider moving
# SMOTE into an imblearn pipeline so it is fitted on training folds only.
#
# The sampler is named `smote` rather than `sm` so that it does not overwrite
# the `statsmodels.api as sm` alias created in the import cell.
smote = SMOTE(random_state=seed)
X, Y = smote.fit_resample(X, Y)

print('After OverSampling, the shape of X: {}'.format(X.shape))
print('After OverSampling, the shape of y: {} \n'.format(Y.shape))

# One line per encoded class label.
for label in (1, 2, 3):
    print("After OverSampling, counts of label '{}': {}".format(label, (Y == label).sum()))


Before OverSampling, the shape of X: (293, 6)
Before OverSampling, the shape of y: (293,) 

After OverSampling, the shape of X: (447, 6)
After OverSampling, the shape of y: (447,) 

After OverSampling, counts of label '1': 149
After OverSampling, counts of label '2': 149
After OverSampling, counts of label '3': 149


In [5]:
# Hold out 20% of the (already oversampled) data for the final reports;
# the split is reproducible through the shared seed.
split_options = {"test_size": 0.2, "random_state": seed}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [15]:
# --- Base learners ------------------------------------------------------------
# Hyper-parameters are fixed values (presumably from an earlier tuning run -
# TODO confirm). Every stochastic model takes the notebook-level `seed` instead
# of a hard-coded 0 so that one constant controls reproducibility.

# Gradient-boosting libraries.
xgb = xgboost.XGBClassifier(
    learning_rate=0.2,
    n_estimators=40,
    max_depth=4,
    random_state=seed,
    reg_alpha=2,
    reg_lambda=3,
    eval_metric='mlogloss',
)

lgbm = LGBMClassifier(
    learning_rate=0.2,
    max_depth=4,
    n_estimators=33,
    random_state=seed,
    reg_alpha=1,
    reg_lambda=10,
)

cb = CatBoostClassifier(
    depth=4,
    iterations=50,
    learning_rate=0.18265036304577847,
    random_seed=seed,
    verbose=False,
)

gb = GradientBoostingClassifier(
    learning_rate=0.1872026709317995,
    max_depth=4,
    max_features='sqrt',
    n_estimators=50,
    random_state=seed,
)

# Bagged / single tree models.
rf = RandomForestClassifier(
    criterion='entropy', max_depth=4, n_estimators=25, random_state=seed
)

dt = DecisionTreeClassifier(max_depth=4, max_features='sqrt', random_state=seed)

et = ExtraTreesClassifier(
    max_depth=4, max_features='log2', n_estimators=50, random_state=seed
)

# Distance-, margin- and linear-based models.
knn = KNeighborsClassifier(metric='manhattan', n_neighbors=7, weights='distance')

svc = SVC(C=196.72280894954662)

lr = LogisticRegression(C=0.5, max_iter=1000, random_state=seed, solver='liblinear')

# --- Evaluation setup -----------------------------------------------------------
results = []
names = []

# Metrics computed by every cross_validate call below.
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted',
          'precision_weighted','recall_weighted']

# Two identically configured stratified 10-fold splitters.
inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [16]:
# Base learners of the stacked ensemble, keyed by a short label.
# "GBR" refers to `gb`, the GradientBoostingClassifier configured above; the
# previous name `gbm` is not defined in any visible cell and raised NameError
# on a fresh kernel.
estimators = [
    ("XGBR", xgb),
    ("GBR", gb),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    ("KNN", knn),
    ("SVM", svc),
    ("LR", lr),
]

# The meta-learner is a gradient-boosting model trained on the base learners'
# predictions.
stacker = StackingClassifier(
    estimators=estimators,
    final_estimator=GradientBoostingClassifier(
        learning_rate=0.1872026709317995,
        max_depth=4,
        max_features='sqrt',
        n_estimators=50,
        random_state=0,
    ),
)

cv_results = model_selection.cross_validate(
    stacker, X, Y, cv=outer_cv, scoring=scoring
)
print("Stacking CV results for all scores:", cv_results)

# The summary statistic is the median across folds, so it is labelled as such
# (the earlier "Average" label did not match np.median).
for label, key in (
    ("Accuracy", "test_accuracy"),
    ("Balanced Accuracy", "test_balanced_accuracy"),
    ("F1", "test_f1_weighted"),
    ("Precision", "test_precision_weighted"),
    ("Recall", "test_recall_weighted"),
):
    print(label + " CV Median", np.median(cv_results[key]))


Stacking CV results for all scores: {'fit_time': array([2.06740689, 1.90029907, 1.89683795, 1.97785902, 1.98129892,
       2.01210189, 2.10046005, 2.03830218, 2.00066495, 1.89941096]), 'score_time': array([0.02955985, 0.02902102, 0.02932978, 0.0288558 , 0.03001976,
       0.02939415, 0.03406501, 0.02835894, 0.02941418, 0.027807  ]), 'test_accuracy': array([0.77777778, 0.77777778, 0.82222222, 0.8       , 0.86666667,
       0.86666667, 0.82222222, 0.75      , 0.84090909, 0.88636364]), 'test_balanced_accuracy': array([0.77777778, 0.77777778, 0.82222222, 0.8       , 0.86666667,
       0.86666667, 0.82222222, 0.74920635, 0.84285714, 0.88412698]), 'test_f1_weighted': array([0.76762821, 0.77723885, 0.82365462, 0.80235856, 0.86488095,
       0.8697479 , 0.81900452, 0.7512225 , 0.83904735, 0.88428759]), 'test_precision_weighted': array([0.77183601, 0.79891775, 0.83918129, 0.81196581, 0.86847662,
       0.88663968, 0.83413078, 0.75626457, 0.84120775, 0.88792335]), 'test_recall_weighted': array([

In [17]:
# Bagging ensemble of ten gradient-boosting models. The base learner is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 (and the old name removed in 1.4); the
# positional form works with both.
bagging = BaggingClassifier(
    gb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1
)

# error_score="raise" surfaces any fold failure instead of recording NaN.
cv_results = model_selection.cross_validate(
    bagging, X, Y, cv=outer_cv, scoring=scoring, error_score="raise"
)
print('Bagging Classifier CV results for all scores:', '\n', cv_results, '\n')

# The summary statistic is the median across folds, so it is labelled as such
# (the earlier "Average" label did not match np.median).
for label, key in (
    ("Accuracy", "test_accuracy"),
    ("Balanced Accuracy", "test_balanced_accuracy"),
    ("F1", "test_f1_weighted"),
    ("Precision", "test_precision_weighted"),
    ("Recall", "test_recall_weighted"),
):
    print("Bagging " + label + " CV Median", np.median(cv_results[key]))


Bagging Classifier CV results for all scores: 
 {'fit_time': array([1.26468492, 0.20484591, 0.20970702, 0.20502996, 0.20488691,
       0.20482302, 0.20526004, 0.20272493, 0.20372391, 0.2029767 ]), 'score_time': array([0.05916619, 0.05281806, 0.05088878, 0.05072713, 0.05406904,
       0.05058694, 0.04927516, 0.05077505, 0.04990911, 0.04994106]), 'test_accuracy': array([0.73333333, 0.8       , 0.82222222, 0.8       , 0.91111111,
       0.88888889, 0.84444444, 0.81818182, 0.84090909, 0.84090909]), 'test_balanced_accuracy': array([0.73333333, 0.8       , 0.82222222, 0.8       , 0.91111111,
       0.88888889, 0.84444444, 0.81587302, 0.84285714, 0.83809524]), 'test_f1_weighted': array([0.71212121, 0.80135954, 0.82471264, 0.80172414, 0.91020115,
       0.88804845, 0.84554598, 0.81818182, 0.83676042, 0.83708479]), 'test_precision_weighted': array([0.72222222, 0.80396825, 0.83053221, 0.80672269, 0.91316527,
       0.88849206, 0.85014006, 0.81818182, 0.85082645, 0.83868093]), 'test_recall_weight

In [18]:
# Fit each base learner on the training split.
# `gb` is the GradientBoostingClassifier configured above; the previous name
# `gbm` is not defined in any visible cell and raised NameError on a fresh kernel.
# NOTE(review): recent XGBoost releases expect class labels starting at 0 when
# the classifier is fitted directly - confirm the installed version accepts 1..3.
model1 = xgb.fit(X_train, Y_train)
model2 = gb.fit(X_train, Y_train)
model3 = lgbm.fit(X_train, Y_train)
# CatBoost is fitted but left out of the voting ensemble: its predictions do
# not have the same dimensions as the other models', which makes
# VotingClassifier raise an error.
model4 = cb.fit(X_train, Y_train)
model5 = rf.fit(X_train, Y_train)
model6 = et.fit(X_train, Y_train)
model7 = dt.fit(X_train, Y_train)
model8 = knn.fit(X_train, Y_train)
model9 = svc.fit(X_train, Y_train)
model10 = lr.fit(X_train, Y_train)

# Majority ("hard") vote over the nine remaining models.
vote = VotingClassifier(
    [
        ("xgbr", model1),
        ("gbr", model2),
        ("lgbm", model3),
        ("rf", model5),
        ("et", model6),
        ("dt", model7),
        ("knn", model8),
        ("svm", model9),
        ("lr", model10),
    ],
    voting='hard',
)

# error_score="raise" surfaces any fold failure instead of recording NaN.
cv_results = model_selection.cross_validate(
    vote, X, Y, cv=outer_cv, scoring=scoring, error_score="raise"
)

print("Voting CV results for all scores", cv_results)

# The summary statistic is the median across folds, so it is labelled as such
# (the earlier "Average" label did not match np.median).
for label, key in (
    ("Accuracy", "test_accuracy"),
    ("Balanced Accuracy", "test_balanced_accuracy"),
    ("F1", "test_f1_weighted"),
    ("Precision", "test_precision_weighted"),
    ("Recall", "test_recall_weighted"),
):
    print(label + " CV Median", np.median(cv_results[key]))


Voting CV results for all scores {'fit_time': array([0.29156089, 0.29241323, 0.29292393, 0.29046011, 0.29306602,
       0.29485393, 0.29837918, 0.29136086, 0.29524589, 0.29566813]), 'score_time': array([0.02731419, 0.02718401, 0.02736592, 0.02706385, 0.02765584,
       0.02978611, 0.0273447 , 0.02774334, 0.02724791, 0.02839971]), 'test_accuracy': array([0.75555556, 0.82222222, 0.82222222, 0.86666667, 0.88888889,
       0.95555556, 0.8       , 0.77272727, 0.79545455, 0.81818182]), 'test_balanced_accuracy': array([0.75555556, 0.82222222, 0.82222222, 0.86666667, 0.88888889,
       0.95555556, 0.8       , 0.77142857, 0.8       , 0.81587302]), 'test_f1_weighted': array([0.74500561, 0.81882218, 0.82301587, 0.86666667, 0.88721264,
       0.95553084, 0.8       , 0.77429467, 0.7832578 , 0.8139132 ]), 'test_precision_weighted': array([0.74444444, 0.82434641, 0.82845651, 0.86666667, 0.88935574,
       0.95694444, 0.8       , 0.77705628, 0.81127451, 0.81341453]), 'test_recall_weighted': array([0.7

In [19]:
# Hold-out evaluation of the stacked ensemble.
# Display names are listed in the order of the encoded labels 1, 2, 3.
target_names = ['most likely', 'probable','least likely']
predictions = list(stacker.fit(X_train, Y_train).predict(X_test))
stacking_report = classification_report(
    Y_test, predictions, target_names=target_names
)
print(stacking_report)

              precision    recall  f1-score   support

 most likely       0.79      0.87      0.83        31
    probable       0.75      0.67      0.71        27
least likely       0.91      0.91      0.91        32

    accuracy                           0.82        90
   macro avg       0.82      0.81      0.81        90
weighted avg       0.82      0.82      0.82        90



In [20]:
# Hold-out evaluation of the hard-voting ensemble.
predictions = list(vote.fit(X_train, Y_train).predict(X_test))
voting_report = classification_report(
    Y_test, predictions, target_names=target_names
)
print(voting_report)

              precision    recall  f1-score   support

 most likely       0.87      0.84      0.85        31
    probable       0.70      0.78      0.74        27
least likely       0.93      0.88      0.90        32

    accuracy                           0.83        90
   macro avg       0.83      0.83      0.83        90
weighted avg       0.84      0.83      0.84        90



In [21]:
# Hold-out evaluation of the bagging ensemble.
predictions = list(bagging.fit(X_train, Y_train).predict(X_test))
bagging_report = classification_report(
    Y_test, predictions, target_names=target_names
)
print(bagging_report)

              precision    recall  f1-score   support

 most likely       0.83      0.81      0.82        31
    probable       0.76      0.81      0.79        27
least likely       0.94      0.91      0.92        32

    accuracy                           0.84        90
   macro avg       0.84      0.84      0.84        90
weighted avg       0.85      0.84      0.85        90

