In [2]:
import sys
import pandas as pd
import numpy as np
from numpy import absolute, mean, sort, std

import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

from sklearn import datasets, metrics, preprocessing, model_selection
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split, KFold,RepeatedKFold, cross_val_score, cross_validate, cross_val_predict, GridSearchCV, RandomizedSearchCV, validation_curve, learning_curve
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_error, max_error

import skopt
from skopt import BayesSearchCV 

from missingpy import MissForest

import shap
from BorutaShap import BorutaShap

import xgboost
import lightgbm
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from warnings import filterwarnings
filterwarnings("ignore")

import matplotlib.pyplot as plt
import missingno as msno

seed = 0

data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 0.5, "least likely": 0}
)
Y = data["BPlabel_encoded"]
data = data.drop(["BPlabel"], 1)

X = pd.read_csv("imputed_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

xgbr = xgboost.XGBRegressor(random_state=seed, objective='reg:squarederror') 
xgbr_params = {
    'max_depth':  (1, 4), 
    'learning_rate': (0.01, 0.2, 'log-uniform'),  
    'n_estimators':  (10, 50), 
    'reg_alpha':  (1, 10, 'log-uniform'), 
    'reg_lambda':  (1, 10, 'log-uniform')} 

lgbm = LGBMRegressor(random_state=seed)
lgbm_params = {
    "max_depth": (1, 4),
    "learning_rate": (0.01, 0.2, "log-uniform"),
    "n_estimators": (10, 50),
    "reg_alpha": (1, 10, "log-uniform"),
    "reg_lambda": (1, 10, "log-uniform"),
}

catboost = CatBoostRegressor(random_seed=seed, verbose=False)
cat_params = {
     "iterations": (10, 50),
     'learning_rate': (0.01, 0.2, 'log-uniform'), 
     'depth':  (1, 4), 
}


gbr = GradientBoostingRegressor(random_state=seed)
gbr_params = {
    'learning_rate': (0.01, 0.2),
    'max_depth': (1, 4),
    'n_estimators': (10, 50)
    }

rfr = RandomForestRegressor(random_state=seed)
rfr_params={'n_estimators': (10, 50), 
             'max_depth' : (1, 4)} 

dt = DecisionTreeRegressor(random_state=seed)
dt_params= {
            'max_depth' : (1, 4)}

extra = ExtraTreesRegressor(random_state=seed)
extra_params ={'n_estimators': (10, 50), 
             'max_depth' : (1, 4),}

knr = KNeighborsRegressor()
knr_params = {
    'n_neighbors':[7,9,11,13,15,17],
    'weights' : ['uniform','distance'],
    'metric' : ['euclidean','manhattan']}


lasso = Lasso(random_state=seed)
lasso_params =  {"alpha": (0.001, 0.01, 0.1),
                "max_iter": (500, 1000, 5000),}

elastic = ElasticNet(random_state=seed, tol=1)
elastic_params = {
    "max_iter": (500, 1000, 5000),
    "alpha": (0.001, 0.01, 0.1),
    "l1_ratio": np.arange(0.0, 1.0)}

svr = SVR()
svr_params = {
    'kernel': ['rbf'],
   'C': (1e0, 1e3),
   'gamma': (1e-4, 1e-3)}

inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

models = []

models.append(('XGBR', BayesSearchCV(xgbr, xgbr_params, cv=inner_cv,iid=False,n_jobs=-1, random_state=seed))) 
models.append(("LGBM", BayesSearchCV(lgbm, lgbm_params, cv=inner_cv, iid=False, n_jobs=-1, random_state=seed)))
models.append(("CB", BayesSearchCV(catboost, cat_params, cv=inner_cv, iid=False, n_jobs=-1, random_state=seed)))
models.append(('GBR', BayesSearchCV(gbr, gbr_params, cv=inner_cv,iid=False, n_jobs=-1, random_state=seed)))
models.append(('RFR', BayesSearchCV(rfr, rfr_params, cv=inner_cv,iid=False, n_jobs=-1, random_state=seed)))
models.append(('DT', BayesSearchCV(dt, dt_params, cv=inner_cv, iid=False, n_jobs=-1, random_state=seed)))
models.append(('ExtraTrees', BayesSearchCV(extra, extra_params, cv=inner_cv, iid=False, n_jobs=-1, random_state=seed)))


results = []
names = []
medians =[]
scoring = ['r2', 'neg_mean_squared_error', 'max_error', 'neg_mean_absolute_error',
          'explained_variance','neg_root_mean_squared_error',
           'neg_median_absolute_error'] 

models_list_r2 = []
models_list_predr2 = []

def press_statistic(Y, y_pred2, xs):
    res = y_pred2 - Y
    hat = xs.dot(np.linalg.pinv(xs))
    den = 1 - np.diagonal(hat)
    sqr = np.square(res / den)
    return sqr.sum()


def predicted_r2(Y, y_pred2, xs):
    press = press_statistic(Y=Y, y_pred2=y_pred2, xs=xs)
    sst = np.square(Y - Y.mean()).sum()
    return 1 - press / sst


def r2(Y, y_pred2):
    sse = np.square(y_pred2 - Y).sum()
    sst = np.square(Y - Y.mean()).sum()
    return 1 - sse / sst


df3 = pd.DataFrame(data=X, columns=X.columns)
df3.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]
X_importance = X_test

for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X, Y, cv=outer_cv, scoring=scoring, error_score="raise")
    names.append(name)
    medians.append(np.median(nested_cv_results['test_r2']))
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'r2 Nested CV Median', np.median(nested_cv_results['test_r2']))
    print(name, 'MSE Nested CV Median', np.median(nested_cv_results['test_neg_mean_squared_error'] ))
    print(name, 'RMSE Nested CV Median', np.median(nested_cv_results['test_neg_root_mean_squared_error'] ))
    print(name, 'Explained Variance Nested CV Median', np.median(nested_cv_results['test_explained_variance'] ))
    print(name, 'MAE Nested CV Median', np.median(nested_cv_results['test_neg_mean_absolute_error'] ))
    model.fit(X, Y)
    print('\n')
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print("Best Estimator:", model.best_estimator_)
    best_model = model.best_estimator_
    print('\n')
    print('Non-nested CV Results:')
    best_model.fit(X_train, Y_train)
    y_pred_train = best_model.predict(X_train)
    y_pred = best_model.predict(X_test)
    print(name, 'Train MSE:', mean_squared_error(Y_train, y_pred_train), 'Test MSE:', mean_squared_error(Y_test, y_pred))
    print(name, 'Train Explained Variance Score:', explained_variance_score(Y_train, y_pred_train), 'Test Explained Variance Score:', explained_variance_score(Y_test, y_pred))
    print(name, 'Train MAE:', mean_absolute_error(Y_train, y_pred_train),'Test MAE:', mean_absolute_error(Y_test, y_pred))
    print(name, 'Train Max Error:', max_error(Y_train, y_pred_train), 'Test Max Error:', max_error(Y_test, y_pred))
    print(name, 'Train r2:', r2_score(Y_train, y_pred_train), 'Test r2:', r2_score(Y_test, y_pred))
    print('\n')
    best_model.fit(X, Y)
    y_pred2 = best_model.predict(X)
    print(name, "Best model predicted r2:", predicted_r2(Y, y_pred2, X))
    #explainer = shap.TreeExplainer(best_model)
    #shap_values = explainer.shap_values(X)
    #X_importance = pd.DataFrame(data=X, columns=df3.columns)
    #print(name,'ALL FEATURES Ranked SHAP Importance:', X.columns[np.argsort(np.abs(shap_values).mean(0))[::-1]])
    #fig, ax = plt.subplots()
    #shap.summary_plot(shap_values, X)
    #fig.savefig("shap_summary_all_features" + name +".svg", format='svg', dpi=1200, bbox_inches = "tight")
    median_r2 = np.median(nested_cv_results['test_r2'])
    models_list_r2.append((best_model,  median_r2))
    predr2_score = predicted_r2(Y, y_pred2, X)
    models_list_predr2.append((best_model, predr2_score))

print('All r2 results:', results)         

best_model1, best_r2 = sorted(models_list_r2, key = lambda x: x[1], reverse=True)[0]
best_model2, best_pred_r2 = sorted(models_list_predr2, key = lambda x: x[1], reverse=True)[0]
print('Best model by median r2:',best_model1)
print('Best model by predicted r2:',best_model2)


XGBR Nested CV results for all scores: 
 {'fit_time': array([131.37645626, 118.03296185, 121.62633896, 102.46035099,
       106.42817378]), 'score_time': array([0.00790191, 0.00741005, 0.00794721, 0.00775599, 0.0070312 ]), 'test_r2': array([0.54500729, 0.4832784 , 0.55739276, 0.57029592, 0.61691988]), 'test_neg_mean_squared_error': array([-0.04653186, -0.06345834, -0.0530849 , -0.04972126, -0.04295986]), 'test_max_error': array([-0.56671327, -0.67441583, -0.5748857 , -0.78292634, -0.55186349]), 'test_neg_mean_absolute_error': array([-0.1689924 , -0.18415861, -0.17216633, -0.16469418, -0.16107942]), 'test_explained_variance': array([0.55504154, 0.52142943, 0.5605567 , 0.57124796, 0.63921105]), 'test_neg_root_mean_squared_error': array([-0.21571245, -0.25190939, -0.2304016 , -0.22298264, -0.20726761]), 'test_neg_median_absolute_error': array([-0.14218479, -0.14303009, -0.11648941, -0.14920637, -0.1520506 ])} 

XGBR r2 Nested CV Median 0.5573927573998524
XGBR MSE Nested CV Median -0.04972