In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression

from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings

filterwarnings("ignore")

seed = 0

In [2]:
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 2, "possible": 3, 'least likely':4}
)
Y = data["BPlabel_encoded"]
data = data.drop(["BPlabel"], 1)

In [3]:
X = pd.read_csv("selected_features_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

In [18]:
xgb =  xgboost.XGBClassifier(learning_rate=0.15120340125789705, n_estimators=50, max_depth=4, random_state=0, reg_alpha=3, reg_lambda=10,
                            eval_metric='mlogloss')

lgbm =  LGBMClassifier(learning_rate=0.2, max_depth=1, n_estimators=50, random_state=0,
               reg_alpha=1, reg_lambda=1)

cb = CatBoostClassifier(depth=4, iterations=50, learning_rate=0.1964188294579477, random_seed=seed, verbose=False)

gb = GradientBoostingClassifier(learning_rate=0.19101624063861414, max_depth=1,
                           max_features='auto', n_estimators=47,
                           random_state=0)
                       
rf = RandomForestClassifier(criterion='entropy', max_depth=4, max_features='log2',
                       n_estimators=24, random_state=0)

dt = DecisionTreeClassifier(max_depth=3, max_features='sqrt', random_state=0)

et = ExtraTreesClassifier(criterion='entropy', max_depth=4, max_features='log2',
                     n_estimators=10, random_state=0)

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=7)

svc = SVC(C=999.9984985741586)

lr = LogisticRegression(max_iter=1000, penalty='l1', random_state=0,
                   solver='liblinear')

results = []
names = []
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted']

inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True,  random_state=seed)

In [19]:
estimators = [
    ("XGBR", xgb),
    ("GBR", gb),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    ("KNN", knn),
    ("SVM", svc),
    ("LR", lr),

]

stacker = StackingClassifier(
    estimators=estimators,
    final_estimator= xgboost.XGBClassifier(learning_rate=0.15120340125789705, n_estimators=50, max_depth=4, random_state=0, reg_alpha=3, reg_lambda=10,
                            eval_metric='mlogloss')
)
cv_results = model_selection.cross_validate(
        stacker, X, Y, cv=outer_cv, scoring=scoring
)
print("Stacking CV results for all scores:", cv_results)
print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Stacking CV results for all scores: {'fit_time': array([1.88520694, 1.76581812, 1.76031995, 1.71832895, 1.75551915,
       1.76247597, 1.79440689, 1.77660227, 1.74836898, 1.74558115]), 'score_time': array([0.02221894, 0.02179098, 0.02321601, 0.02212787, 0.02232099,
       0.02153993, 0.02228284, 0.02208471, 0.02176499, 0.02138591]), 'test_accuracy': array([0.47368421, 0.60526316, 0.57894737, 0.55263158, 0.76315789,
       0.57894737, 0.44736842, 0.51351351, 0.54054054, 0.64864865]), 'test_balanced_accuracy': array([0.44444444, 0.56111111, 0.48888889, 0.47222222, 0.68333333,
       0.52361111, 0.37916667, 0.42142857, 0.50694444, 0.58888889]), 'test_f1_weighted': array([0.43101504, 0.55927533, 0.53192228, 0.5018797 , 0.7296201 ,
       0.57703295, 0.40633843, 0.48616729, 0.52934363, 0.58357041]), 'test_precision_weighted': array([0.43157895, 0.59811213, 0.60328947, 0.47009569, 0.83901602,
       0.65225564, 0.37570489, 0.51880141, 0.52416052, 0.55887192]), 'test_recall_weighted': array([

In [20]:
bagging_xgbr = BaggingClassifier(base_estimator=xgb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1)

cv_results = model_selection.cross_validate(bagging_xgbr, X , Y, cv=outer_cv, scoring=scoring)
print('Bagging Classifier CV results for all scores:', '\n', cv_results, '\n')
print('Bagging Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Bagging Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('Bagging F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Bagging Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Bagging Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Bagging Classifier CV results for all scores: 
 {'fit_time': array([3.97414875, 2.59140897, 2.57272887, 2.61888814, 2.59214211,
       2.60070086, 2.59987307, 2.58918405, 2.63562202, 2.5896709 ]), 'score_time': array([0.17132521, 0.17053294, 0.16788125, 0.16725397, 0.16960907,
       0.17054725, 0.16903996, 0.17251992, 0.17005181, 0.16960478]), 'test_accuracy': array([0.55263158, 0.63157895, 0.60526316, 0.47368421, 0.60526316,
       0.57894737, 0.42105263, 0.59459459, 0.56756757, 0.62162162]), 'test_balanced_accuracy': array([0.48333333, 0.58888889, 0.51666667, 0.38888889, 0.55      ,
       0.50902778, 0.34791667, 0.47678571, 0.49444444, 0.57222222]), 'test_f1_weighted': array([0.46790271, 0.59245152, 0.56835953, 0.41559799, 0.54601416,
       0.54641437, 0.3876504 , 0.54516055, 0.5097651 , 0.56235183]), 'test_precision_weighted': array([0.41459418, 0.61784897, 0.6625    , 0.38935407, 0.50451128,
       0.58004386, 0.36095648, 0.56      , 0.48149058, 0.52908343]), 'test_recall_weight

In [21]:
model1 = xgb.fit(X_train, Y_train)
model2 = gb.fit(X_train, Y_train)
model3 = lgbm.fit(X_train, Y_train)
model4 = cb.fit(X_train, Y_train)
model5 = rf.fit(X_train, Y_train)
model6 = et.fit(X_train, Y_train)
model7 = dt.fit(X_train, Y_train)
model8 = knn.fit(X_train, Y_train)
model9 = svc.fit(X_train, Y_train)
model10 = lr.fit(X_train, Y_train)

vote = VotingClassifier([("xgbr", model1), ("gbr", model2), ("lgbm", model3),
                      ("rf", model5), ("et", model6),
                       ("dt", model7), ("knn", model8), ("svm", model9),
                       ("lr", model10)])

cv_results = model_selection.cross_validate(
        vote, X, Y, cv=outer_cv, scoring=scoring
)
print("Voting CV results for all scores", cv_results)

print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Voting CV results for all scores {'fit_time': array([0.32234502, 0.32751107, 0.32775593, 0.31215286, 0.35000181,
       0.34168601, 0.35109115, 0.34852195, 0.32956123, 0.30857491]), 'score_time': array([0.02325201, 0.02260399, 0.022264  , 0.02459121, 0.02359724,
       0.02434802, 0.02451491, 0.02357602, 0.02246666, 0.0222621 ]), 'test_accuracy': array([0.55263158, 0.60526316, 0.60526316, 0.52631579, 0.68421053,
       0.63157895, 0.5       , 0.56756757, 0.59459459, 0.62162162]), 'test_balanced_accuracy': array([0.49444444, 0.56111111, 0.50555556, 0.43333333, 0.61111111,
       0.54236111, 0.40416667, 0.46428571, 0.51111111, 0.53888889]), 'test_f1_weighted': array([0.49416102, 0.52977839, 0.54795615, 0.4546332 , 0.64276969,
       0.58858711, 0.4621422 , 0.49966809, 0.51667457, 0.54654655]), 'test_precision_weighted': array([0.62974871, 0.50263158, 0.63031377, 0.43421053, 0.74330144,
       0.65635965, 0.4610984 , 0.55001155, 0.46698002, 0.50337838]), 'test_recall_weighted': array([0.5

In [22]:
target_names = ['most likely', 'probable', 'possible','least likely']
stacker.fit(X_train, Y_train)
predictions = list(stacker.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.33      0.10      0.15        10
    probable       0.46      0.70      0.56        27
    possible       0.25      0.12      0.17        16
least likely       0.83      0.87      0.85        23

    accuracy                           0.55        76
   macro avg       0.47      0.45      0.43        76
weighted avg       0.51      0.55      0.51        76



In [23]:
vote.fit(X_train, Y_train)
predictions = list(vote.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.17      0.10      0.12        10
    probable       0.47      0.81      0.59        27
    possible       0.50      0.06      0.11        16
least likely       0.81      0.74      0.77        23

    accuracy                           0.54        76
   macro avg       0.49      0.43      0.40        76
weighted avg       0.54      0.54      0.48        76



In [24]:
bagging_xgbr.fit(X_train, Y_train)
predictions = list(bagging_xgbr.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.33      0.10      0.15        10
    probable       0.52      0.81      0.64        27
    possible       0.43      0.19      0.26        16
least likely       0.88      0.91      0.89        23

    accuracy                           0.62        76
   macro avg       0.54      0.50      0.49        76
weighted avg       0.58      0.62      0.57        76

