In [1]:
import sys
import pandas as pd
import numpy as np
from numpy import sort
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
import scipy.cluster
from numpy import absolute, mean, sort, std
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

from sklearn import datasets, metrics, preprocessing, model_selection
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split, KFold,StratifiedKFold,RepeatedKFold, cross_val_score, cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV, validation_curve, learning_curve
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_error, max_error

import skopt
from skopt import BayesSearchCV 

from missingpy import MissForest

import shap
from BorutaShap import BorutaShap

import xgboost
import lightgbm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import *
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, RandomOverSampler
from imblearn.base import BaseSampler
from imblearn.datasets import make_imbalance

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from warnings import filterwarnings
filterwarnings("ignore")

import matplotlib.pyplot as plt
import missingno as msno

# Single seed shared by the estimators, the sampler and the CV splitters below.
seed = 0

# Training table; the "BPlabel" column holds the class label as text.
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

# Integer code for each textual label (1 = most likely ... 3 = least likely).
data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 2, "least likely": 3}
)

# Series.map() turns every label that is missing from the dict into NaN.
# Fail here with a readable message instead of letting NaN targets reach
# the sampler / estimators further down.
unmapped = data.loc[data["BPlabel_encoded"].isna(), "BPlabel"].unique()
if len(unmapped) > 0:
    raise ValueError(
        "Unexpected BPlabel value(s) with no integer encoding: {}".format(list(unmapped))
    )

Y = data["BPlabel_encoded"]

# Name the axis explicitly: the positional form `drop(labels, 1)` is not
# accepted by current pandas releases.
data = data.drop(columns=["BPlabel"])


# --- Base estimators and their hyper-parameter search spaces -----------------
# Each `*_params` dict is handed to BayesSearchCV together with the estimator
# defined right above it. Tuples are (low, high[, prior]) ranges, lists are
# categorical choices.

# The target has three classes, so the classifier is declared with a
# multiclass objective; `reg:squarederror` is a regression loss and does not
# describe what this model is trained for.
xgbr = xgboost.XGBClassifier(
    random_state=seed,
    objective='multi:softprob',
    verbosity=0,
    eval_metric='mlogloss',
)
xgbr_params = {
    'max_depth': (1, 4),
    'learning_rate': (0.01, 0.2, 'log-uniform'),
    'n_estimators': (10, 50),
    'reg_alpha': (1, 10, 'log-uniform'),
    'reg_lambda': (1, 10, 'log-uniform'),
}

lgbm = LGBMClassifier(random_state=seed)
lgbm_params = {
    "max_depth": (1, 4),
    "learning_rate": (0.01, 0.2, "log-uniform"),
    "n_estimators": (10, 50),
    "reg_alpha": (1, 10, "log-uniform"),
    "reg_lambda": (1, 10, "log-uniform"),
}

catboost = CatBoostClassifier(random_seed=seed, verbose=False)
cat_params = {
    "iterations": (10, 50),
    'learning_rate': (0.01, 0.2, 'log-uniform'),
    'depth': (1, 4),
}

gbr = GradientBoostingClassifier(random_state=seed)
# NOTE(review): the "auto" and "mse" choices below are only accepted by older
# scikit-learn releases ("mse" was later renamed "squared_error"); confirm the
# pinned scikit-learn version before upgrading the environment.
# Unlike the other boosters, learning_rate has no 'log-uniform' prior here.
gbr_params = {
    'learning_rate': (0.01, 0.2),
    'max_depth': (1, 4),
    "max_features": ["log2", "sqrt", "auto"],
    "criterion": ["friedman_mse", "mse"],
    'n_estimators': (10, 50),
}

rfr = RandomForestClassifier(random_state=seed)
rfr_params = {
    'n_estimators': (10, 50),
    'max_features': ['sqrt', 'log2'],
    'max_depth': (1, 4),
    'criterion': ['gini', 'entropy'],
}

dt = DecisionTreeClassifier(random_state=seed)
dt_params = {
    "criterion": ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'max_depth': (1, 4),
}

extra = ExtraTreesClassifier(random_state=seed)
extra_params = {
    'n_estimators': (10, 50),
    'max_features': ['sqrt', 'log2'],
    'max_depth': (1, 4),
    'criterion': ['gini', 'entropy'],
}

knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [7, 9, 11, 13, 15, 17],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}

# liblinear is kept as the solver because the search toggles between the
# 'l1' and 'l2' penalties.
lr = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto', random_state=seed)
lr_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.5, 1, 5, 10],
    'max_iter': [500, 1000, 2500],
}

svc = SVC()
svc_params = {
    'kernel': ['rbf'],
    'C': (1e0, 1e3),
    'gamma': ['scale', 'auto'],
}

# Two independent 10-fold stratified splitters, both shuffled with the shared
# seed: `inner_cv` is passed to the hyper-parameter searches, `outer_cv` to the
# cross-validation calls that score those searches.
inner_cv, outer_cv = (
    StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) for _ in range(2)
)

# Accumulators filled by the evaluation loops in the following cells.
results, names, medians, models_list_balancedac = [], [], [], []

# Metrics requested from cross_validate for every model.
scoring = [
    'accuracy',
    'balanced_accuracy',
    'f1_weighted',
    'precision_weighted',
    'recall_weighted',
]



# Pre-selected feature matrix. Its rows are expected to line up one-to-one
# with the label vector `Y` read from the other CSV.
X = pd.read_csv("selected_features_training_data.csv", header=0)

# Replace "[", "]" and "<" in column names with "_"
# (presumably because some of the boosters reject them in feature names -- TODO confirm).
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

# X and Y come from two different files, so make a length mismatch an explicit,
# readable error before anything is resampled.
if len(X) != len(Y):
    raise ValueError(
        "Feature matrix has {} rows but the label vector has {}; "
        "the two input CSV files are not aligned.".format(len(X), len(Y))
    )

print('Before OverSampling, the shape of X: {}'.format(X.shape))
print('Before OverSampling, the shape of y: {} \n'.format(Y.shape))

# NOTE(review): SMOTE is applied to the whole dataset *before* the hold-out
# split below and before the cross-validation in the following cells, so
# synthetic samples interpolated from evaluation rows can end up in the
# training folds. Reported scores are therefore likely optimistic; consider
# moving the sampler into an imblearn pipeline so it only sees training folds.
sm = SMOTE(random_state=seed)
X, Y = sm.fit_resample(X, Y)

print('After OverSampling, the shape of X: {}'.format(X.shape))
print('After OverSampling, the shape of y: {} \n'.format(Y.shape))

# Vectorised per-class counts (one line of output per encoded label).
for label_id in (1, 2, 3):
    print("After OverSampling, counts of label '{}': {}".format(label_id, (Y == label_id).sum()))


# 80/20 hold-out split, seeded like everything else in the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

# Separate frame built from the resampled features. The column names were
# already sanitised above, so they are reused as-is.
df3 = pd.DataFrame(data=X, columns=X.columns)

# (name, search) pairs for the tree-based learners. Every search wraps its
# base estimator and search space in a BayesSearchCV that uses the inner CV
# splitter and the shared seed.
models = [
    (
        label,
        BayesSearchCV(
            estimator,
            space,
            cv=inner_cv,
            iid=False,
            n_jobs=1,
            random_state=seed,
        ),
    )
    for label, estimator, space in (
        ('XGBR', xgbr, xgbr_params),
        ("LGBM", lgbm, lgbm_params),
        ("CB", catboost, cat_params),
        ('GBR', gbr, gbr_params),
        ('RFR', rfr, rfr_params),
        ('DT', dt, dt_params),
        ('ExtraTrees', extra, extra_params),
    )
]


Before OverSampling, the shape of X: (293, 6)
Before OverSampling, the shape of y: (293,) 

After OverSampling, the shape of X: (447, 6)
After OverSampling, the shape of y: (447,) 

After OverSampling, counts of label '1': 149
After OverSampling, counts of label '2': 149
After OverSampling, counts of label '3': 149


In [2]:
# Nested cross-validation of every search in `models`: the outer CV scores the
# search, the search itself tunes hyper-parameters with its own inner CV.
for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    # Reuse the balanced-accuracy scores from the run above. Calling
    # cross_val_score separately repeated the entire nested CV (minutes per
    # fold) only to recompute a metric that is already in `nested_cv_results`.
    nested_cv_results2 = nested_cv_results['test_balanced_accuracy']
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    # The summary statistic is the median over the outer folds, so label it as such.
    for metric_label, metric_key in (
        ('Accuracy', 'test_accuracy'),
        ('Balanced Accuracy', 'test_balanced_accuracy'),
        ('F1', 'test_f1_weighted'),
        ('Precision', 'test_precision_weighted'),
        ('Recall', 'test_recall_weighted'),
    ):
        print(name, metric_label + ' Nested CV Median', np.median(nested_cv_results[metric_key]))
    # Final hyper-parameter search on the full data to obtain the model to keep.
    model.fit(X, Y)
    print('\n')
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best Estimator:", model.best_estimator_)
    best_model = model.best_estimator_
    print('\n')
    # Hold-out predictions with the tuned configuration ...
    best_model.fit(X_train, Y_train)
    y_pred_train = best_model.predict(X_train)
    y_pred = best_model.predict(X_test)
    # ... then refit on everything so the stored model has seen all rows.
    best_model.fit(X, Y)
    median_balancedac = np.median(nested_cv_results['test_balanced_accuracy'])
    models_list_balancedac.append((best_model, median_balancedac))


print('All results:', results)

# First entry with the highest median balanced accuracy.
best_model1, best_balancedac = max(models_list_balancedac, key=lambda x: x[1])
print('Best model by median balanced accuracy:',best_model1)

XGBR Nested CV results for all scores: 
 {'fit_time': array([214.40943527, 209.28818798, 218.61170673, 233.73691201,
       208.13124204, 235.27923417, 229.29567194, 247.09558296,
       270.28088284, 234.30987597]), 'score_time': array([0.00602794, 0.00555897, 0.00691819, 0.00613403, 0.00609827,
       0.00604987, 0.0065949 , 0.00602388, 0.00608015, 0.00592303]), 'test_accuracy': array([0.77777778, 0.88888889, 0.86666667, 0.84444444, 0.86666667,
       0.88888889, 0.77777778, 0.79545455, 0.79545455, 0.79545455]), 'test_balanced_accuracy': array([0.77777778, 0.88888889, 0.86666667, 0.84444444, 0.86666667,
       0.88888889, 0.77777778, 0.79365079, 0.7968254 , 0.79365079]), 'test_f1_weighted': array([0.76622302, 0.88849206, 0.86853448, 0.84287318, 0.86057692,
       0.89029786, 0.77755531, 0.79545455, 0.78948992, 0.78911408]), 'test_precision_weighted': array([0.77020202, 0.89331322, 0.87394958, 0.85185185, 0.87165775,
       0.89305556, 0.77837302, 0.79545455, 0.80284906, 0.79108392]),

In [3]:
# Second batch of learners (distance / margin / linear based), evaluated on
# min-max scaled features.
# Note: `results` and `names` are reset here, while `models_list_balancedac`
# keeps accumulating entries from the previous cell.
results = []
names = []
othermodels = []

# Seed the searches like the tree-model searches so the run is reproducible.
othermodels.append(('KNN', BayesSearchCV(knn, knn_params, cv=inner_cv, iid=False, n_jobs=1, random_state=seed)))
othermodels.append(('SVC', BayesSearchCV(svc, svc_params, cv=inner_cv, iid=False, n_jobs=1, random_state=seed)))
othermodels.append(('LR', BayesSearchCV(lr, lr_params, cv=inner_cv, iid=False, n_jobs=1, random_state=seed)))

# NOTE(review): the scaler is fitted on all rows before any split, so the
# evaluation folds contribute to the min/max used for training.
X2 = MinMaxScaler().fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(
    X2, Y, test_size=0.2, random_state=seed)

# Scaled features as a frame. X's column names were already stripped of
# "[", "]" and "<" when X was loaded, so they are reused unchanged.
df3 = pd.DataFrame(data= X2, columns= X.columns)
X_importance = df3

for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(model, X2 , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    # Reuse the balanced-accuracy scores from the run above. A second,
    # separate cross_val_score call repeated the whole nested CV and could
    # also disagree with the numbers printed below.
    nested_cv_results2 = nested_cv_results['test_balanced_accuracy']
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    # The summary statistic is the median over the outer folds, so label it as such.
    for metric_label, metric_key in (
        ('Accuracy', 'test_accuracy'),
        ('Balanced Accuracy', 'test_balanced_accuracy'),
        ('F1', 'test_f1_weighted'),
        ('Precision', 'test_precision_weighted'),
        ('Recall', 'test_recall_weighted'),
    ):
        print(name, metric_label + ' Nested CV Median', np.median(nested_cv_results[metric_key]))
    # Final hyper-parameter search on the full (scaled) data.
    model.fit(X2, Y)
    print('\n')
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best Estimator:", model.best_estimator_)
    best_model = model.best_estimator_
    print('\n')
    # Hold-out predictions with the tuned configuration, then refit on everything.
    best_model.fit(X_train, Y_train)
    y_pred_train = best_model.predict(X_train)
    y_pred = best_model.predict(X_test)
    best_model.fit(X2, Y)
    median_balancedac = np.median(nested_cv_results['test_balanced_accuracy'])
    models_list_balancedac.append((best_model, median_balancedac))


print('All results:', results)


# First entry with the highest median balanced accuracy across both batches.
best_model1, best_balancedac = max(models_list_balancedac, key=lambda x: x[1])
print('Best model by median balanced accuracy:', best_model1)


KNN Nested CV results for all scores: 
 {'fit_time': array([59.874928  , 64.31847095, 57.32496619, 63.03482509, 59.57117295,
       68.18453884, 65.91971922, 67.21196914, 57.24764514, 71.15620303]), 'score_time': array([0.00643706, 0.00511813, 0.00524879, 0.00521803, 0.00620484,
       0.00531411, 0.00467587, 0.00660682, 0.00364971, 0.0045321 ]), 'test_accuracy': array([0.75555556, 0.62222222, 0.75555556, 0.77777778, 0.8       ,
       0.82222222, 0.71111111, 0.72727273, 0.63636364, 0.72727273]), 'test_balanced_accuracy': array([0.75555556, 0.62222222, 0.75555556, 0.77777778, 0.8       ,
       0.82222222, 0.71111111, 0.72857143, 0.64285714, 0.72380952]), 'test_f1_weighted': array([0.74928161, 0.61253561, 0.73723443, 0.77030812, 0.79278674,
       0.81410256, 0.70660522, 0.72011218, 0.61909046, 0.71677397]), 'test_precision_weighted': array([0.7464986 , 0.62826063, 0.76884921, 0.77597841, 0.79411765,
       0.82174688, 0.70512821, 0.72737907, 0.63673744, 0.73041163]), 'test_recall_weig

In [4]:
# Classifiers with fixed hyper-parameters, one per algorithm
# (presumably the best settings reported by the searches above -- TODO confirm).
# Note: `lgbm`, `dt`, `knn`, `svc` and `lr` rebind names that earlier held the
# untuned estimators used as search templates.

xgb = xgboost.XGBClassifier(
    learning_rate=0.2,
    n_estimators=40,
    max_depth=4,
    random_state=0,
    reg_alpha=2,
    reg_lambda=3,
)

lgbm = LGBMClassifier(
    learning_rate=0.2,
    max_depth=4,
    n_estimators=33,
    random_state=0,
    reg_alpha=1,
    reg_lambda=10,
)

cb = CatBoostClassifier(
    depth=4,
    iterations=50,
    learning_rate=0.18265036304577847,
    random_seed=seed,
    verbose=False,
)

gb = GradientBoostingClassifier(
    learning_rate=0.1872026709317995,
    max_depth=4,
    max_features='sqrt',
    n_estimators=50,
    random_state=0,
)

rf = RandomForestClassifier(
    criterion='entropy',
    max_depth=4,
    n_estimators=25,
    random_state=0,
)

dt = DecisionTreeClassifier(
    max_depth=4,
    max_features='sqrt',
    random_state=0,
)

et = ExtraTreesClassifier(
    max_depth=4,
    max_features='log2',
    n_estimators=50,
    random_state=0,
)

knn = KNeighborsClassifier(
    metric='manhattan',
    n_neighbors=7,
    weights='distance',
)

svc = SVC(C=196.72280894954662)

lr = LogisticRegression(
    C=0.5,
    max_iter=1000,
    random_state=0,
    solver='liblinear',
)

In [6]:
# Display names listed in the order of the integer codes 1, 2, 3 used in Y.
target_names = ['most likely', 'probable', 'least likely']
xgb.fit(X_train, Y_train)
predictions = list(xgb.predict(X_test))
# `labels` pins each display name to its integer code, so the report stays
# correct (and does not fail) even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.81      0.81      0.81        31
    probable       0.69      0.74      0.71        27
least likely       0.90      0.84      0.87        32

    accuracy                           0.80        90
   macro avg       0.80      0.80      0.80        90
weighted avg       0.80      0.80      0.80        90



In [7]:
# Hold-out report for the fixed LightGBM model.
lgbm.fit(X_train, Y_train)
predictions = list(lgbm.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.79      0.84      0.81        31
    probable       0.71      0.74      0.73        27
least likely       0.93      0.84      0.89        32

    accuracy                           0.81        90
   macro avg       0.81      0.81      0.81        90
weighted avg       0.82      0.81      0.81        90



In [8]:
# Hold-out report for the fixed CatBoost model.
cb.fit(X_train, Y_train)
# Flatten first: CatBoost may return class predictions as an (n, 1) column,
# which would otherwise become a list of one-element arrays. np.ravel is a
# no-op on an already one-dimensional result.
predictions = list(np.ravel(cb.predict(X_test)))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.79      0.84      0.81        31
    probable       0.72      0.67      0.69        27
least likely       0.91      0.91      0.91        32

    accuracy                           0.81        90
   macro avg       0.80      0.80      0.80        90
weighted avg       0.81      0.81      0.81        90



In [9]:
# Hold-out report for the fixed gradient-boosting model.
gb.fit(X_train, Y_train)
predictions = list(gb.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.84      0.87      0.86        31
    probable       0.79      0.81      0.80        27
least likely       0.97      0.91      0.94        32

    accuracy                           0.87        90
   macro avg       0.87      0.86      0.86        90
weighted avg       0.87      0.87      0.87        90



In [10]:
# Hold-out report for the fixed random-forest model.
rf.fit(X_train, Y_train)
predictions = list(rf.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.76      0.71      0.73        31
    probable       0.64      0.67      0.65        27
least likely       0.88      0.91      0.89        32

    accuracy                           0.77        90
   macro avg       0.76      0.76      0.76        90
weighted avg       0.77      0.77      0.77        90



In [11]:
# Hold-out report for the fixed decision-tree model.
dt.fit(X_train, Y_train)
predictions = list(dt.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.83      0.77      0.80        31
    probable       0.65      0.56      0.60        27
least likely       0.76      0.91      0.83        32

    accuracy                           0.76        90
   macro avg       0.75      0.75      0.74        90
weighted avg       0.75      0.76      0.75        90



In [12]:
# Hold-out report for the fixed extra-trees model.
et.fit(X_train, Y_train)
predictions = list(et.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.85      0.74      0.79        31
    probable       0.81      0.63      0.71        27
least likely       0.76      1.00      0.86        32

    accuracy                           0.80        90
   macro avg       0.81      0.79      0.79        90
weighted avg       0.81      0.80      0.79        90



In [13]:
# Hold-out report for the fixed k-nearest-neighbours model.
knn.fit(X_train, Y_train)
predictions = list(knn.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.81      0.84      0.83        31
    probable       0.62      0.48      0.54        27
least likely       0.73      0.84      0.78        32

    accuracy                           0.73        90
   macro avg       0.72      0.72      0.72        90
weighted avg       0.73      0.73      0.73        90



In [14]:
# Hold-out report for the fixed support-vector classifier.
svc.fit(X_train, Y_train)
predictions = list(svc.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.60      0.29      0.39        31
    probable       0.35      0.74      0.48        27
least likely       0.72      0.41      0.52        32

    accuracy                           0.47        90
   macro avg       0.56      0.48      0.46        90
weighted avg       0.57      0.47      0.46        90



In [15]:
# Hold-out report for the fixed logistic-regression model.
lr.fit(X_train, Y_train)
predictions = list(lr.predict(X_test))
# `labels` pins each display name to its integer code (1, 2, 3), so the report
# stays correct even if a class is absent from the test fold.
print(classification_report(Y_test, predictions, labels=[1, 2, 3], target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.81      0.68      0.74        31
    probable       0.36      0.44      0.40        27
least likely       0.55      0.53      0.54        32

    accuracy                           0.56        90
   macro avg       0.57      0.55      0.56        90
weighted avg       0.58      0.56      0.57        90

