In [2]:
import sys
import pandas as pd
import numpy as np
from numpy import absolute, mean, sort, std

import re
# Matches any of the characters "[", "]" or "<". It is used further down to
# replace those characters with "_" in the feature column names
# (presumably because some of the boosting libraries reject them in feature
# names -- TODO confirm).
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

from sklearn import datasets, metrics, preprocessing, model_selection
import sklearn.neighbors._base
# Register the private ``_base`` module under the name
# ``sklearn.neighbors.base`` so that an import of that name resolves to it
# (presumably required by ``missingpy``, imported below -- TODO confirm).
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split, KFold,RepeatedKFold, cross_val_score, cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV, validation_curve, learning_curve
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_error, max_error

import skopt
from skopt import BayesSearchCV 

from missingpy import MissForest

import shap
from BorutaShap import BorutaShap

import xgboost
import lightgbm
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from warnings import filterwarnings
filterwarnings("ignore")

import matplotlib.pyplot as plt

# Single seed shared by the estimators, CV splitters and the hold-out split.
seed = 0


# Raw training table; only the label column is read from it below.
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

# Encode the ordinal text label as a numeric regression target.
# NOTE(review): labels missing from this mapping become NaN -- confirm that
# "BPlabel" only ever holds these three values.
data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 0.5, "least likely": 0.1}
)
Y = data["BPlabel_encoded"]
# Use the explicit ``columns=`` keyword: passing the axis positionally
# (``drop([...], 1)``) is deprecated in pandas 1.x and rejected by pandas 2.x.
data = data.drop(columns=["BPlabel"])

# Feature matrix (already imputed).
# NOTE(review): X and Y come from two different files and are paired purely by
# row position -- confirm both files have the same rows in the same order.
X = pd.read_csv("imputed_training_data.csv", header=0)
# Replace "[", "]" and "<" in column names with "_". ``regex.sub`` returns the
# name unchanged when nothing matches, so no separate membership test is needed.
X.columns = [
    regex.sub("_", col) if isinstance(col, str) else col
    for col in X.columns.values
]

# 80/20 hold-out split used for the train/test metrics printed per model.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

# --- Tree-based estimators and their hyper-parameter search spaces ---------
# Each ``*_params`` dict maps a hyper-parameter name to a (low, high) range,
# optionally followed by a prior name; the dicts are handed to BayesSearchCV
# further down.

xgbr = xgboost.XGBRegressor(objective="reg:squarederror", random_state=seed)
xgbr_params = {
    "max_depth": (1, 4),
    "learning_rate": (0.01, 0.2, "log-uniform"),
    "n_estimators": (10, 50),
    "reg_alpha": (1, 10, "log-uniform"),
    "reg_lambda": (1, 10, "log-uniform"),
}

lgbm = LGBMRegressor(random_state=seed)
lgbm_params = {
    "max_depth": (1, 4),
    "learning_rate": (0.01, 0.2, "log-uniform"),
    "n_estimators": (10, 50),
    "reg_alpha": (1, 10, "log-uniform"),
    "reg_lambda": (1, 10, "log-uniform"),
}

catboost = CatBoostRegressor(verbose=False, random_seed=seed)
cat_params = {
    "iterations": (10, 50),
    "learning_rate": (0.01, 0.2, "log-uniform"),
    "depth": (1, 4),
}

gbr = GradientBoostingRegressor(random_state=seed)
gbr_params = {
    "learning_rate": (0.01, 0.2),
    "max_depth": (1, 4),
    "n_estimators": (10, 50),
}

rfr = RandomForestRegressor(random_state=seed)
rfr_params = {
    "n_estimators": (10, 50),
    "max_depth": (1, 4),
}

dt = DecisionTreeRegressor(random_state=seed)
dt_params = {"max_depth": (1, 4)}

extra = ExtraTreesRegressor(random_state=seed)
extra_params = {
    "n_estimators": (10, 50),
    "max_depth": (1, 4),
}

# --- Non-tree estimators and their hyper-parameter search spaces -----------
# None of these are added to ``models`` in the visible part of this file.

# k-nearest neighbours: explicit candidate lists for every hyper-parameter.
knr = KNeighborsRegressor()
knr_params = {
    "n_neighbors": [7, 9, 11, 13, 15, 17],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}


lasso = Lasso(random_state=seed)
lasso_params = {
    "alpha": (0.001, 0.01, 0.1),
    "max_iter": (500, 1000, 5000),
}

# NOTE(review): tol=1 looks very loose for a convergence tolerance -- confirm
# it is intentional.
elastic = ElasticNet(random_state=seed, tol=1)
elastic_params = {
    "max_iter": (500, 1000, 5000),
    "alpha": (0.001, 0.01, 0.1),
    # An explicit step is required: ``np.arange(0.0, 1.0)`` uses the default
    # step of 1 and therefore yields the single value [0.0], leaving nothing to
    # search over. This gives the candidates 0.0, 0.1, ..., 0.9.
    "l1_ratio": np.arange(0.0, 1.0, 0.1),
}

svr = SVR()
svr_params = {
    "kernel": ["rbf"],
    "C": (1e0, 1e3),
    "gamma": (1e-4, 1e-3),
}

# Nested cross-validation: ``inner_cv`` drives the hyper-parameter search,
# ``outer_cv`` scores the tuned search object. Both are 5-fold, repeated 3
# times, with the same seed.
inner_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=seed)
outer_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=seed)

# (display name, Bayesian search wrapper) pairs, evaluated in this order.
models = [
    (
        label,
        BayesSearchCV(
            estimator,
            search_space,
            cv=inner_cv,
            iid=False,
            n_jobs=-1,
            random_state=seed,
        ),
    )
    for label, estimator, search_space in [
        ("XGBR", xgbr, xgbr_params),
        ("LGBM", lgbm, lgbm_params),
        ("CB", catboost, cat_params),
        ("GBR", gbr, gbr_params),
        ("RFR", rfr, rfr_params),
        ("DT", dt, dt_params),
        ("ExtraTrees", extra, extra_params),
    ]
]


# Accumulators filled by the evaluation loop.
results = []
names = []
medians = []

# Scorer names passed to ``cross_validate``.
scoring = [
    "r2",
    "neg_mean_squared_error",
    "max_error",
    "neg_mean_absolute_error",
    "explained_variance",
    "neg_root_mean_squared_error",
    "neg_median_absolute_error",
]

# (fitted best estimator, score) pairs used to pick the overall winner.
models_list_r2 = []
models_list_predr2 = []

def press_statistic(Y, y_pred2, xs):
    """Return the PRESS (predicted residual error sum of squares) statistic.

    Each residual ``y_pred2 - Y`` is divided by ``1 - h_ii``, where ``h_ii``
    is the i-th diagonal entry of the hat matrix ``xs @ pinv(xs)``, and the
    squared results are summed.

    Parameters
    ----------
    Y : array-like of shape (n,)
        Observed target values.
    y_pred2 : array-like of shape (n,)
        Predicted target values, in the same row order as ``Y``.
    xs : array-like of shape (n, p)
        Feature matrix the predictions were made from.

    Returns
    -------
    float
        Sum of squared leverage-scaled residuals. An observation whose
        leverage is exactly 1 makes the result infinite or NaN.
    """
    design = np.asarray(xs, dtype=float)
    # Work on plain arrays so residuals are paired by position rather than
    # being re-aligned on a pandas index.
    res = np.asarray(y_pred2, dtype=float) - np.asarray(Y, dtype=float)
    # Only the diagonal of the hat matrix is needed; computing it directly
    # avoids materialising the full n x n product.
    leverage = np.einsum("ij,ji->i", design, np.linalg.pinv(design))
    den = 1 - leverage
    sqr = np.square(res / den)
    return sqr.sum()


def predicted_r2(Y, y_pred2, xs):
    """Return the predicted R^2, i.e. ``1 - PRESS / SST``.

    ``PRESS`` comes from ``press_statistic`` called with the same arguments;
    ``SST`` is the sum of squared deviations of ``Y`` from its mean.
    """
    press_total = press_statistic(Y=Y, y_pred2=y_pred2, xs=xs)
    centred = Y - Y.mean()
    total_ss = np.square(centred).sum()
    ratio = press_total / total_ss
    return 1 - ratio


def r2(Y, y_pred2):
    """Return the coefficient of determination, ``1 - SSE / SST``.

    ``SSE`` is the sum of squared differences between ``y_pred2`` and ``Y``;
    ``SST`` is the sum of squared deviations of ``Y`` from its mean.
    """
    residuals = y_pred2 - Y
    unexplained = np.square(residuals).sum()
    deviations = Y - Y.mean()
    total = np.square(deviations).sum()
    return 1 - unexplained / total


# Frame built from X whose column names get "[", "]" and "<" replaced by "_".
# NOTE(review): in the visible part of this file, df3 and X_importance are
# referenced only by the commented-out SHAP code inside the evaluation loop.
df3 = pd.DataFrame(data=X, columns=X.columns)
df3.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]
# Alias for the hold-out feature matrix (no copy is made).
X_importance = X_test

# Evaluate every search wrapper with nested CV, then refit its best estimator
# for hold-out metrics and the predicted-r2 score.
for name, model in models:
    # Outer loop of the nested CV: ``model`` runs its own inner search on each
    # outer training fold.
    nested_cv_results = model_selection.cross_validate(
        model, X, Y, cv=outer_cv, scoring=scoring, error_score="raise"
    )
    test_r2 = nested_cv_results['test_r2']
    median_r2 = np.median(test_r2)
    names.append(name)
    # Record the per-fold r2 scores so the summary printed after the loop is
    # populated (``results`` was previously never filled).
    results.append(test_r2)
    medians.append(median_r2)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'r2 Nested CV Median', median_r2)
    print(name, 'MSE Nested CV Median', np.median(nested_cv_results['test_neg_mean_squared_error'] ))
    print(name, 'RMSE Nested CV Median', np.median(nested_cv_results['test_neg_root_mean_squared_error'] ))
    print(name, 'Explained Variance Nested CV Median', np.median(nested_cv_results['test_explained_variance'] ))
    print(name, 'MAE Nested CV Median', np.median(nested_cv_results['test_neg_mean_absolute_error'] ))

    # NOTE(review): the search is run on the full data set, which contains the
    # X_test rows, so the "Test" metrics printed below are obtained with
    # hyper-parameters that have already seen the hold-out data. Consider
    # tuning on (X_train, Y_train) instead.
    model.fit(X, Y)
    print('\n')
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best Estimator:", model.best_estimator_)
    best_model = model.best_estimator_
    print('\n')
    print('Non-nested CV Results:')

    # Hold-out evaluation: refit the tuned estimator on the training split.
    best_model.fit(X_train, Y_train)
    y_pred_train = best_model.predict(X_train)
    y_pred = best_model.predict(X_test)
    print(name, 'Train MSE:', mean_squared_error(Y_train, y_pred_train), 'Test MSE:', mean_squared_error(Y_test, y_pred))
    print(name, 'Train Explained Variance Score:', explained_variance_score(Y_train, y_pred_train), 'Test Explained Variance Score:', explained_variance_score(Y_test, y_pred))
    print(name, 'Train MAE:', mean_absolute_error(Y_train, y_pred_train),'Test MAE:', mean_absolute_error(Y_test, y_pred))
    print(name, 'Train Max Error:', max_error(Y_train, y_pred_train), 'Test Max Error:', max_error(Y_test, y_pred))
    print(name, 'Train r2:', r2_score(Y_train, y_pred_train), 'Test r2:', r2_score(Y_test, y_pred))
    print('\n')

    # Final refit on all data; predicted r2 is computed once and reused
    # because it involves a pseudo-inverse of the feature matrix.
    best_model.fit(X, Y)
    y_pred2 = best_model.predict(X)
    predr2_score = predicted_r2(Y, y_pred2, X)
    print(name, "Best model predicted r2:", predr2_score)
    #explainer = shap.TreeExplainer(best_model)
    #shap_values = explainer.shap_values(X)
    #X_importance = pd.DataFrame(data=X, columns=df3.columns)
    #print(name,'ALL FEATURES Ranked SHAP Importance:', X.columns[np.argsort(np.abs(shap_values).mean(0))[::-1]])
    #fig, ax = plt.subplots()
    #shap.summary_plot(shap_values, X)
    #fig.savefig("shap_summary_all_features" + name +".svg", format='svg', dpi=1200, bbox_inches = "tight")
    models_list_r2.append((best_model, median_r2))
    models_list_predr2.append((best_model, predr2_score))

print('All r2 results:', results)

# Pick the winner under each criterion; on ties ``max`` keeps the first entry,
# the same one a stable descending sort would put first.
best_model1, best_r2 = max(models_list_r2, key=lambda x: x[1])
best_model2, best_pred_r2 = max(models_list_predr2, key=lambda x: x[1])
print('Best model by median r2:',best_model1)
print('Best model by predicted r2:',best_model2)


XGBR Nested CV results for all scores: 
 {'fit_time': array([68.122051  , 69.29945993, 66.472054  , 61.73464584, 77.94836974,
       70.76323199, 67.33586287, 64.97866011, 71.87181067, 63.99226904,
       66.33580422, 67.78286719, 64.30247092, 71.86601901, 73.15075302]), 'score_time': array([0.00450206, 0.00434494, 0.00426197, 0.00471807, 0.00524402,
       0.00380683, 0.00420904, 0.00406098, 0.00423121, 0.00444508,
       0.00418091, 0.00423694, 0.00419402, 0.00416899, 0.00477791]), 'test_r2': array([0.45634346, 0.45634309, 0.46720414, 0.53580059, 0.57091602,
       0.46280731, 0.4935618 , 0.53677778, 0.382604  , 0.56331327,
       0.51347667, 0.51085961, 0.48929966, 0.536598  , 0.45951789]), 'test_neg_mean_squared_error': array([-0.04244925, -0.05463119, -0.05009597, -0.04321857, -0.03660862,
       -0.04670382, -0.03503022, -0.03695132, -0.06444117, -0.05112117,
       -0.05048046, -0.04733181, -0.03709436, -0.04430145, -0.04983393]), 'test_max_error': array([-0.55527312, -0.5959481

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

CB Nested CV results for all scores: 
 {'fit_time': array([56.40243506, 60.3034749 , 58.40363288, 57.87120008, 59.22897696,
       60.82355905, 62.2489841 , 59.26476312, 60.643538  , 61.69011688,
       63.33669496, 58.96419001, 58.24007607, 62.91285706, 60.36637402]), 'score_time': array([0.00313091, 0.0031848 , 0.00416827, 0.0034399 , 0.00320983,
       0.00377011, 0.00416589, 0.00335383, 0.002918  , 0.00411105,
       0.00428915, 0.00450706, 0.00415111, 0.00432181, 0.00388503]), 'test_r2': array([0.45188422, 0.43159639, 0.57269041, 0.51794558, 0.57185913,
       0.5049758 , 0.47628526, 0.55569704, 0.4335117 , 0.61077326,
       0.53481301, 0.49782161, 0.56923489, 0.60087321, 0.39051467]), 'test_neg_mean_squared_error': array([-0.04279743, -0.05711795, -0.04017766, -0.04488093, -0.03652815,
       -0.04303767, -0.03622523, -0.03544213, -0.05912764, -0.04556522,
       -0.04826666, -0.04859344, -0.03128832, -0.03815671, -0.05619621]), 'test_max_error': array([-0.4887055 , -0.58719773,

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu



Best Parameters: 
OrderedDict([('depth', 4), ('iterations', 50), ('learning_rate', 0.14116463818396946)])

Best Estimator: <catboost.core.CatBoostRegressor object at 0x7fb771183c70>


Non-nested CV Results:
CB Train MSE: 0.015927370485606825 Test MSE: 0.04532170744038869
CB Train Explained Variance Score: 0.8341452606553582 Test Explained Variance Score: 0.4514824857398617
CB Train MAE: 0.09566126430846826 Test MAE: 0.16682793838042506
CB Train Max Error: 0.40119607810430546 Test Max Error: 0.49908953502992537
CB Train r2: 0.834048975828141 Test r2: 0.4195553215599963


CB Best model predicted r2: 0.5257829726475238
GBR Nested CV results for all scores: 
 {'fit_time': array([60.07768893, 57.83799887, 57.86588383, 58.24393487, 58.91827726,
       61.59390306, 61.47258687, 64.12149191, 64.29345202, 53.49569988,
       59.98653603, 55.21280098, 54.314394  , 58.50508094, 57.10587192]), 'score_time': array([0.00266099, 0.00273991, 0.00351214, 0.00345302, 0.00273466,
       0.00339198, 0.0