In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression

from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings

filterwarnings("ignore")

seed = 0

In [2]:
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 2, "possible": 3, 'least likely':4}
)
Y = data["BPlabel_encoded"]
data = data.drop(["BPlabel"], 1)

In [3]:
X = pd.read_csv("selected_features_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

In [8]:
xgb =  xgboost.XGBClassifier(learning_rate=0.15120340125789705, n_estimators=50, max_depth=4, random_state=0, reg_alpha=3, reg_lambda=10,
                            eval_metric='mlogloss')

lgbm =  LGBMClassifier(learning_rate=0.2, max_depth=1, n_estimators=50, random_state=0,
               reg_alpha=1, reg_lambda=1)

cb = CatBoostClassifier(depth=4, iterations=50, learning_rate=0.1964188294579477, random_seed=seed, verbose=False)

gb = GradientBoostingClassifier(learning_rate=0.19101624063861414, max_depth=1,
                           max_features='auto', n_estimators=47,
                           random_state=0)
                       
rf = RandomForestClassifier(criterion='entropy', max_depth=4, max_features='log2',
                       n_estimators=24, random_state=0)

dt = DecisionTreeClassifier(max_depth=3, max_features='sqrt', random_state=0)

et = ExtraTreesClassifier(criterion='entropy', max_depth=4, max_features='log2',
                     n_estimators=10, random_state=0)

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=7)

svc = SVC(C=999.9984985741586)

lr = LogisticRegression(max_iter=1000, penalty='l1', random_state=0,
                   solver='liblinear')

results = []
names = []
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted']

inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True,  random_state=seed)

In [9]:
estimators = [
    ("XGBR", xgb),
    ("GBR", gb),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    ("KNN", knn),
    ("SVM", svc),
    ("LR", lr),

]

stacker = StackingClassifier(
    estimators=estimators,
    final_estimator= xgboost.XGBClassifier(learning_rate=0.15120340125789705, n_estimators=50, max_depth=4, random_state=0, reg_alpha=3, reg_lambda=10,
                            eval_metric='mlogloss')
)
cv_results = model_selection.cross_validate(
        stacker, X, Y, cv=outer_cv, scoring=scoring
)
print("Stacking CV results for all scores:", cv_results)
print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Stacking CV results for all scores: {'fit_time': array([1.77260208, 1.66073322, 1.75240588, 1.72378087, 1.73308182]), 'score_time': array([0.02458811, 0.02298594, 0.02501702, 0.02559614, 0.02425313]), 'test_accuracy': array([0.53947368, 0.55263158, 0.56      , 0.52      , 0.50666667]), 'test_balanced_accuracy': array([0.48174391, 0.46934985, 0.49176364, 0.44656863, 0.49607843]), 'test_f1_weighted': array([0.51086651, 0.52821096, 0.54187141, 0.49101215, 0.48976589]), 'test_precision_weighted': array([0.51907895, 0.5421123 , 0.56617544, 0.46571429, 0.48450956]), 'test_recall_weighted': array([0.53947368, 0.55263158, 0.56      , 0.52      , 0.50666667])}
Accuracy CV Average 0.5394736842105263
Balanced Accuracy CV Average 0.4817439096850862
F1 CV Average 0.5108665092217725
Precision CV Average 0.5190789473684211
Recall CV Average 0.5394736842105263


In [10]:
bagging_xgbr = BaggingClassifier(base_estimator=xgb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1)

cv_results = model_selection.cross_validate(bagging_xgbr, X , Y, cv=outer_cv, scoring=scoring)
print('Bagging Classifier CV results for all scores:', '\n', cv_results, '\n')
print('Bagging Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Bagging Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('Bagging F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Bagging Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Bagging Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Bagging Classifier CV results for all scores: 
 {'fit_time': array([4.05002213, 2.54948711, 2.65910196, 2.61676216, 2.64616489]), 'score_time': array([0.17299795, 0.16878319, 0.17158389, 0.172261  , 0.16837621]), 'test_accuracy': array([0.59210526, 0.52631579, 0.65333333, 0.42666667, 0.6       ]), 'test_balanced_accuracy': array([0.53894831, 0.42874097, 0.5951612 , 0.38022876, 0.53137255]), 'test_f1_weighted': array([0.53232678, 0.46869468, 0.62400825, 0.41022916, 0.54285917]), 'test_precision_weighted': array([0.53474861, 0.47841345, 0.66181762, 0.43244444, 0.59276596]), 'test_recall_weighted': array([0.59210526, 0.52631579, 0.65333333, 0.42666667, 0.6       ])} 

Bagging Accuracy CV Average 0.5921052631578947
Bagging Balanced Accuracy CV Average 0.5313725490196078
Bagging F1 CV Average 0.5323267755910267
Bagging Precision CV Average 0.5347486093281986
Bagging Recall CV Average 0.5921052631578947


In [12]:
model1 = xgb.fit(X_train, Y_train)
model2 = gb.fit(X_train, Y_train)
model3 = lgbm.fit(X_train, Y_train)
model4 = cb.fit(X_train, Y_train)
model5 = rf.fit(X_train, Y_train)
model6 = et.fit(X_train, Y_train)
model7 = dt.fit(X_train, Y_train)
model8 = knn.fit(X_train, Y_train)
model9 = svc.fit(X_train, Y_train)
model10 = lr.fit(X_train, Y_train)

vote = VotingClassifier([("xgbr", model1), ("gbr", model2), ("lgbm", model3),
                      ("rf", model5), ("et", model6),
                       ("dt", model7), ("knn", model8), ("svm", model9),
                       ("lr", model10)])

cv_results = model_selection.cross_validate(
        vote, X, Y, cv=outer_cv, scoring=scoring
)
print("Voting CV results for all scores", cv_results)

print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Voting CV results for all scores {'fit_time': array([0.3238709 , 0.48452425, 0.32647204, 0.29443717, 0.34964395]), 'score_time': array([0.02710986, 0.02485299, 0.02542996, 0.02642775, 0.02511096]), 'test_accuracy': array([0.57894737, 0.57894737, 0.64      , 0.53333333, 0.56      ]), 'test_balanced_accuracy': array([0.49627154, 0.46689886, 0.5476967 , 0.43970588, 0.48333333]), 'test_f1_weighted': array([0.49852902, 0.51189228, 0.58457811, 0.4681994 , 0.48605051]), 'test_precision_weighted': array([0.68098339, 0.53398693, 0.62879433, 0.48366667, 0.43817345]), 'test_recall_weighted': array([0.57894737, 0.57894737, 0.64      , 0.53333333, 0.56      ])}
Accuracy CV Average 0.5789473684210527
Balanced Accuracy CV Average 0.4833333333333334
F1 CV Average 0.4985290179195997
Precision CV Average 0.5339869281045753
Recall CV Average 0.5789473684210527


In [13]:
target_names = ['most likely', 'probable', 'possible','least likely']
stacker.fit(X_train, Y_train)
predictions = list(stacker.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.33      0.10      0.15        10
    probable       0.46      0.70      0.56        27
    possible       0.25      0.12      0.17        16
least likely       0.83      0.87      0.85        23

    accuracy                           0.55        76
   macro avg       0.47      0.45      0.43        76
weighted avg       0.51      0.55      0.51        76



In [15]:
vote.fit(X_train, Y_train)
predictions = list(vote.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.17      0.10      0.12        10
    probable       0.47      0.81      0.59        27
    possible       0.50      0.06      0.11        16
least likely       0.81      0.74      0.77        23

    accuracy                           0.54        76
   macro avg       0.49      0.43      0.40        76
weighted avg       0.54      0.54      0.48        76



In [None]:
bagging_xgbr.fit(X_train, Y_train)
predictions = list(bagging_xgbr.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))