In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

# Matches any single "[", "]" or "<" character; used later to sanitise
# feature column names. The IGNORECASE flag has no effect on this
# punctuation-only pattern but is kept as-is.
regex = re.compile(r"\[|\]|<", flags=re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import *
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, RandomOverSampler
from imblearn.base import BaseSampler
from imblearn.datasets import make_imbalance

# Notebook-wide plotting defaults: seaborn "darkgrid" theme and a
# 15 x 9 inch default figure size (set through seaborn's matplotlib handle).
sns.set_style("darkgrid")
sns.mpl.rcParams.update({"figure.figsize": (15.0, 9.0)})

import warnings

import matplotlib
import matplotlib.pyplot as plt

# Silence FutureWarning messages emitted by the imported libraries.
warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings

# Blanket filter: ignores every warning category, which also makes the
# FutureWarning-specific filter above redundant.
filterwarnings("ignore")

# Shared random seed reused below for oversampling, the train/test split,
# CatBoost, the CV splitters and the bagging ensemble.
seed = 0

In [2]:
# Load the cleaned training table; this cell only uses its "BPlabel" column.
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

# Encode the text label as integers 1..3 ("most likely" -> 1, ..., "least likely" -> 3).
# Labels outside this mapping become NaN.
data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 2, "least likely": 3}
)
Y = data["BPlabel_encoded"]

# Use `columns=` rather than a positional axis: the positional `axis`
# argument of DataFrame.drop was removed in pandas 2.0 and raises TypeError.
data = data.drop(columns=["BPlabel"])

In [3]:
# Load the pre-selected feature matrix.
X = pd.read_csv("selected_features_training_data.csv", header=0)

# Replace "[", "]" and "<" in column names with "_"; names without any of
# these characters are kept untouched.
sanitized_columns = []
for column in X.columns.values:
    has_special_char = any(ch in str(column) for ch in set(("[", "]", "<")))
    sanitized_columns.append(regex.sub("_", column) if has_special_char else column)
X.columns = sanitized_columns

In [4]:
print('Before OverSampling, the shape of X: {}'.format(X.shape))
print('Before OverSampling, the shape of y: {} \n'.format(Y.shape))

# Balance the classes by synthesising minority-class samples.
# The sampler is named `smote` (not `sm`) so it does not overwrite the
# `statsmodels.api as sm` alias imported at the top of the notebook.
# NOTE(review): X and Y are oversampled here, before the train/test split and
# before cross-validation, so synthetic samples derived from held-out rows can
# end up in the training folds. Consider resampling inside an
# imblearn pipeline so that only training folds are oversampled.
smote = SMOTE(random_state=seed)
X, Y = smote.fit_resample(X, Y)

print('After OverSampling, the shape of X: {}'.format(X.shape))
print('After OverSampling, the shape of y: {} \n'.format(Y.shape))

# Per-class counts after resampling (labels are encoded as 1, 2, 3).
for label in (1, 2, 3):
    print("After OverSampling, counts of label '{}': {}".format(label, (Y == label).sum()))


Before OverSampling, the shape of X: (293, 6)
Before OverSampling, the shape of y: (293,) 

After OverSampling, the shape of X: (447, 6)
After OverSampling, the shape of y: (447,) 

After OverSampling, counts of label '1': 149
After OverSampling, counts of label '2': 149
After OverSampling, counts of label '3': 149


In [5]:
# Hold out 20% of the (already oversampled) data as a test split; the split
# is made reproducible through the shared seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=seed, test_size=0.2)

In [15]:
# Base learners with fixed hyperparameters (presumably taken from an earlier
# tuning run that is not shown here -- TODO confirm).
# Every stochastic estimator takes its random state from the shared `seed`
# instead of a repeated literal 0, so changing `seed` affects all of them.
xgb = xgboost.XGBClassifier(
    learning_rate=0.2,
    n_estimators=40,
    max_depth=4,
    random_state=seed,
    reg_alpha=2,
    reg_lambda=3,
    eval_metric='mlogloss',
)

lgbm = LGBMClassifier(
    learning_rate=0.2,
    max_depth=4,
    n_estimators=33,
    random_state=seed,
    reg_alpha=1,
    reg_lambda=10,
)

cb = CatBoostClassifier(
    depth=4,
    iterations=50,
    learning_rate=0.18265036304577847,
    random_seed=seed,
    verbose=False,
)

gb = GradientBoostingClassifier(
    learning_rate=0.1872026709317995,
    max_depth=4,
    max_features='sqrt',
    n_estimators=50,
    random_state=seed,
)

rf = RandomForestClassifier(
    criterion='entropy',
    max_depth=4,
    n_estimators=25,
    random_state=seed,
)

dt = DecisionTreeClassifier(max_depth=4, max_features='sqrt', random_state=seed)

et = ExtraTreesClassifier(
    max_depth=4,
    max_features='log2',
    n_estimators=50,
    random_state=seed,
)

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=7, weights='distance')

svc = SVC(C=196.72280894954662)

lr = LogisticRegression(C=0.5, max_iter=1000, random_state=seed, solver='liblinear')

# Containers kept for downstream cells; they are not filled in this cell.
results = []
names = []

# Metrics evaluated by every cross_validate call below.
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted',
           'precision_weighted', 'recall_weighted']

# Two identically configured 10-fold stratified splitters.
inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [None]:
# Level-0 learners for the stacking ensemble.
# The gradient-boosting model is defined as `gb`; the previous reference to
# `gbm` was an undefined name and raised NameError.
estimators = [
    ("XGBR", xgb),
    ("GBR", gb),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    ("KNN", knn),
    ("SVM", svc),
    ("LR", lr),
]

# The meta-learner is a gradient-boosting classifier with the same
# hyperparameters as `gb`.
stacker = StackingClassifier(
    estimators=estimators,
    final_estimator=GradientBoostingClassifier(
        learning_rate=0.1872026709317995,
        max_depth=4,
        max_features='sqrt',
        n_estimators=50,
        random_state=seed,
    ),
)

# 10-fold stratified cross-validation over all metrics listed in `scoring`.
cv_results = model_selection.cross_validate(
    stacker, X, Y, cv=outer_cv, scoring=scoring
)

# NOTE(review): the lines labelled "Average" report the median of the fold scores.
print("Stacking CV results for all scores:", cv_results)
print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy']))
print('F1 CV Average', np.median(cv_results['test_f1_weighted']))
print('Precision CV Average', np.median(cv_results['test_precision_weighted']))
print('Recall CV Average', np.median(cv_results['test_recall_weighted']))


In [None]:
# Bagging ensemble of 10 gradient-boosting models.
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on both old and new releases.
bagging = BaggingClassifier(gb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1)

# 10-fold stratified cross-validation; any fold failure is raised immediately.
cv_results = model_selection.cross_validate(bagging, X, Y, cv=outer_cv, scoring=scoring, error_score="raise")

# NOTE(review): the lines labelled "Average" report the median of the fold scores.
print('Bagging Classifier CV results for all scores:', '\n', cv_results, '\n')
print('Bagging Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Bagging Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy']))
print('Bagging F1 CV Average', np.median(cv_results['test_f1_weighted']))
print('Bagging Precision CV Average', np.median(cv_results['test_precision_weighted']))
print('Bagging Recall CV Average', np.median(cv_results['test_recall_weighted']))


In [None]:
# Fit every base learner on the training split. `fit` returns the estimator
# itself, so each `modelN` is the same object as the corresponding estimator.
# The gradient-boosting model is `gb`; the previous reference to `gbm` was an
# undefined name and raised NameError.
# NOTE(review): VotingClassifier.fit clones and refits its estimators, so
# these standalone fits look redundant for the ensemble -- confirm nothing
# else relies on them.
# NOTE(review): Y is encoded as 1..3; newer XGBoost releases may expect class
# labels starting at 0 when fitted directly -- verify against the installed version.
model1 = xgb.fit(X_train, Y_train)
model2 = gb.fit(X_train, Y_train)
model3 = lgbm.fit(X_train, Y_train)
# CatBoost is fitted but left out of the voting ensemble: its predictions do
# not have the same dimensions as the other models', which made the
# VotingClassifier fail.
model4 = cb.fit(X_train, Y_train)
model5 = rf.fit(X_train, Y_train)
model6 = et.fit(X_train, Y_train)
model7 = dt.fit(X_train, Y_train)
model8 = knn.fit(X_train, Y_train)
model9 = svc.fit(X_train, Y_train)
model10 = lr.fit(X_train, Y_train)

# Hard (majority) voting over all base learners except CatBoost.
vote = VotingClassifier(
    [
        ("xgbr", model1),
        ("gbr", model2),
        ("lgbm", model3),
        ("rf", model5),
        ("et", model6),
        ("dt", model7),
        ("knn", model8),
        ("svm", model9),
        ("lr", model10),
    ],
    voting='hard',
)

# 10-fold stratified cross-validation; any fold failure is raised immediately.
cv_results = model_selection.cross_validate(vote, X, Y, cv=outer_cv, scoring=scoring, error_score="raise")

# NOTE(review): the lines labelled "Average" report the median of the fold scores.
print("Voting CV results for all scores", cv_results)
print('Accuracy CV Average', np.median(cv_results['test_accuracy']))
print('Balanced Accuracy CV Average', np.median(cv_results['test_balanced_accuracy']))
print('F1 CV Average', np.median(cv_results['test_f1_weighted']))
print('Precision CV Average', np.median(cv_results['test_precision_weighted']))
print('Recall CV Average', np.median(cv_results['test_recall_weighted']))


In [None]:
# Display names for the encoded classes, listed in label order 1, 2, 3.
target_names = ['most likely', 'probable', 'least likely']

# Refit the stacking ensemble on the training split, predict the held-out
# split and print the per-class precision / recall / F1 report.
predictions = list(stacker.fit(X_train, Y_train).predict(X_test))
stacking_report = classification_report(Y_test, predictions, target_names=target_names)
print(stacking_report)

In [None]:
# Refit the hard-voting ensemble on the training split, predict the held-out
# split and print the per-class precision / recall / F1 report.
predictions = list(vote.fit(X_train, Y_train).predict(X_test))
voting_report = classification_report(Y_test, predictions, target_names=target_names)
print(voting_report)

In [None]:
# Refit the bagging ensemble on the training split, predict the held-out
# split and print the per-class precision / recall / F1 report.
predictions = list(bagging.fit(X_train, Y_train).predict(X_test))
bagging_report = classification_report(Y_test, predictions, target_names=target_names)
print(bagging_report)