In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings

filterwarnings("ignore")

seed = 0

In [2]:
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

data['BPlabel_encoded'] = data['BPlabel'].map( {'most likely':1,'probable':2, 'least likely':3})
Y = data["BPlabel_encoded"]
classes_weights_all = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=Y
)
data = data.drop(["BPlabel"], 1)

In [3]:
X = pd.read_csv("selected_features_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

In [5]:
xgb =  xgboost.XGBClassifier(learning_rate=0.2, n_estimators=49, max_depth=4, random_state=0, reg_alpha=1, reg_lambda=1, eval_metric='mlogloss',scale_pos_weight=0.21)

lgbm =  LGBMClassifier(learning_rate=0.038035006480658606, max_depth=3, n_estimators=45,
               random_state=0, reg_alpha=1, reg_lambda=1)

cb = CatBoostClassifier(depth=4, iterations=50, learning_rate=0.1400545927979144, random_seed=seed, verbose=False)
                       
gbm = GradientBoostingClassifier(learning_rate=0.1281808322184159, max_depth=4, criterion='mse',
                          max_features='sqrt', n_estimators=36, random_state=seed)
                       
rf =  RandomForestClassifier(max_depth=3, max_features='log2', n_estimators=50,
                       random_state=0)

dt = DecisionTreeClassifier(max_depth=4, max_features='sqrt', random_state=0)

et = ExtraTreesClassifier(criterion='entropy', max_depth=4, max_features='log2',
                     n_estimators=35, random_state=0)

knn = KNeighborsClassifier(n_neighbors=7)

svc = SVC(C=293.53707592157997)

lr = LogisticRegression(C=0.5, max_iter=2500, random_state=0, solver='liblinear')

results = []
names = []
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted']

inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [6]:
estimators = [
    ("XGBR", xgb),
    ("GBR", gbm),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    #("KNN", knn),
    ("SVM", svc),
    ("LR", lr),

]

stacker = StackingClassifier(
    estimators=estimators,
    final_estimator= CatBoostClassifier(depth=4, iterations=50, learning_rate=0.1400545927979144, random_seed=seed, verbose=False)
)
cv_results = model_selection.cross_validate(
        stacker, X, Y, cv=outer_cv, scoring=scoring, fit_params={'sample_weight': classes_weights_all}
)
print("Stacking CV results for all scores:", cv_results)
print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

In [7]:
bagging_cb = BaggingClassifier(base_estimator=cb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1)

cv_results = model_selection.cross_validate(bagging_cb, X , Y, cv=outer_cv, scoring=scoring, error_score="raise", fit_params={'sample_weight': classes_weights_all})
print('Bagging Classifier CV results for all scores:', '\n', cv_results, '\n')
print('Bagging Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Bagging Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy'] ))
print('Bagging F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Bagging Precision CV Average', np.median(cv_results['test_precision_weighted'] ))
print('Bagging Recall CV Average', np.median(cv_results['test_recall_weighted'] ))


Bagging Classifier CV results for all scores: 
 {'fit_time': array([2.28603792, 0.10756707, 0.10463572, 0.11188912, 0.1036911 ,
       0.09913802, 0.09700489, 0.10543203, 0.11677694, 0.09764194]), 'score_time': array([0.02085209, 0.01583385, 0.01692629, 0.01525784, 0.01502204,
       0.01457405, 0.018893  , 0.01550293, 0.0140729 , 0.01535106]), 'test_accuracy': array([0.73333333, 0.6       , 0.76666667, 0.72413793, 0.79310345,
       0.68965517, 0.55172414, 0.75862069, 0.75862069, 0.79310345]), 'test_balanced_accuracy': array([0.74074074, 0.53333333, 0.71111111, 0.63809524, 0.82222222,
       0.66666667, 0.48888889, 0.66666667, 0.74074074, 0.73333333]), 'test_f1_weighted': array([0.73809524, 0.61882436, 0.75641472, 0.69093231, 0.78927969,
       0.68683386, 0.56444029, 0.74910394, 0.75812808, 0.78400053]), 'test_precision_weighted': array([0.75128205, 0.64153439, 0.75396825, 0.77365164, 0.81792059,
       0.69932079, 0.58569376, 0.74137931, 0.76681939, 0.78347515]), 'test_recall_weight

In [8]:
model1 = xgb.fit(X_train, Y_train)
model2 = gbm.fit(X_train, Y_train)
model3 = lgbm.fit(X_train, Y_train)
model4 = cb.fit(X_train, Y_train) #output predictions are not in the same dimensions as all others so needed to be removed (voting classifier error otherwise)
model5 = rf.fit(X_train, Y_train)
model6 = et.fit(X_train, Y_train)
model7 = dt.fit(X_train, Y_train)
#model8 = knn.fit(X_train, Y_train)
model9 = svc.fit(X_train, Y_train)
model10 = lr.fit(X_train, Y_train)


vote = VotingClassifier([("xgbr", model1), ("gbr", model2), ("lgbm", model3),
                      ("rf", model5), ("et", model6),
                       ("dt", model7), ("svm", model9),
                       ("lr", model10)], voting='hard')

cv_results = model_selection.cross_validate(vote, X, Y, cv=outer_cv, scoring=scoring, error_score="raise", fit_params={'sample_weight': classes_weights_all})

print("Voting CV results for all scores", cv_results)
print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy']))
print('F1 CV Average', np.median(cv_results['test_f1_weighted'] ))
print('Precision CV Average', np.median(cv_results['test_precision_weighted']))
print('Recall CV Average', np.median(cv_results['test_recall_weighted']))


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

In [9]:
classes_weights_all = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=Y_train
)
target_names = ['Most likely', 'Probable', 'Least likely']
stacker.fit(X_train, Y_train,sample_weight=classes_weights_all)
predictions = list(stacker.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in lang

In [10]:
vote.fit(X_train, Y_train,sample_weight=classes_weights_all)
predictions = list(vote.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


              precision    recall  f1-score   support

 Most likely       0.22      0.57      0.32         7
    Probable       0.74      0.53      0.62        32
Least likely       0.94      0.85      0.89        20

    accuracy                           0.64        59
   macro avg       0.64      0.65      0.61        59
weighted avg       0.75      0.64      0.68        59



In [11]:
bagging_cb.fit(X_train, Y_train,sample_weight=classes_weights_all)
predictions = list(bagging_cb.predict(X_test))
print(classification_report(Y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 Most likely       0.23      0.43      0.30         7
    Probable       0.72      0.66      0.69        32
Least likely       0.94      0.80      0.86        20

    accuracy                           0.68        59
   macro avg       0.63      0.63      0.62        59
weighted avg       0.74      0.68      0.70        59

