# __Meta-estimator Benchmarking__ 

In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

# Matches any single "[", "]" or "<" character; applied to the feature column
# names after they are loaded.  re.IGNORECASE has no effect here because the
# pattern contains no letters.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
from missingpy import MissForest
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
import lightgbm
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

# Global plotting defaults: seaborn "darkgrid" theme and a default
# figure.figsize of (15.0, 9.0).
sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

# Ignore FutureWarning explicitly ...
warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings

# ... and then ignore every warning category; this blanket filter also covers
# the FutureWarning filter installed just above.
filterwarnings("ignore")

# Shared seed handed to the estimators, CV splitters and train/test split.
seed = 0

In [2]:
# Load the cleaned training table (comma-separated, first row holds the
# column names).
data = pd.read_csv(
    "training_cleaned.csv",
    sep=",",
    header=0,
)

In [3]:
# Encode the three BPlabel categories as numeric targets for regression.
# NOTE: Series.map leaves NaN for any label that is not a key of this dict.
data["BPlabel_encoded"] = data["BPlabel"].map(
    {"most likely": 1, "probable": 0.75, "least likely": 0.1}
)
Y = data["BPlabel_encoded"]
# Drop the raw string label.  `columns=` replaces the positional axis argument
# of `drop(labels, 1)`, which was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
data = data.drop(columns=["BPlabel"])
data.shape  # Data has IPA and ensembl features without possible label

(293, 22)

In [4]:
# Load the pre-selected feature matrix, then replace every "[", "]" and "<"
# in its column names with "_" (presumably because one of the boosting
# libraries used below rejects those characters in feature names -- confirm).
X = pd.read_csv("selected_features_training_data.csv", header=0)
X.columns = [
    regex.sub("_", name) if {"[", "]", "<"} & set(str(name)) else name
    for name in X.columns.values
]

In [5]:
# Hold out 20% of the rows as a test set; the shared seed keeps the partition
# reproducible.  Assumes X and Y are row-aligned even though they come from
# different CSV files -- TODO confirm.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=0.2,
)

In [6]:
def press_statistic(Y, y_pred2, xs):
    """Return the PRESS (predicted residual error sum of squares) statistic.

    Each residual ``y_pred2 - Y`` is divided by ``1 - h_ii``, where ``h_ii``
    is the i-th diagonal entry of the hat matrix ``xs . pinv(xs)``; the
    scaled residuals are then squared and summed.  The hat matrix is built
    from ``xs`` exactly as given (no intercept column is added).

    Args:
        Y: Observed target values.
        y_pred2: Predicted values, element-wise aligned with ``Y``.
        xs: Design matrix with one row per observation; it must expose a
            ``.dot`` method and be accepted by ``np.linalg.pinv``.

    Returns:
        The sum of the squared, leverage-scaled residuals.
    """
    projection = xs.dot(np.linalg.pinv(xs))
    one_minus_leverage = 1 - np.diagonal(projection)
    scaled_residuals = (y_pred2 - Y) / one_minus_leverage
    return np.square(scaled_residuals).sum()


def predicted_r2(Y, y_pred2, xs):
    """Return the predicted R^2, computed as ``1 - PRESS / SST``.

    Args:
        Y: Observed target values; must provide ``.mean()``.
        y_pred2: Predicted values, element-wise aligned with ``Y``.
        xs: Design matrix forwarded to ``press_statistic``.

    Returns:
        One minus the ratio of the PRESS statistic to the total sum of
        squares of ``Y`` around its mean.
    """
    total_ss = np.square(Y - Y.mean()).sum()
    press = press_statistic(Y=Y, y_pred2=y_pred2, xs=xs)
    return 1 - press / total_ss


def r2(Y, y_pred2):
    """Return the coefficient of determination ``1 - SSE / SST``.

    Args:
        Y: Observed target values; must provide ``.mean()``.
        y_pred2: Predicted values, element-wise aligned with ``Y``.

    Returns:
        One minus the ratio of the residual sum of squares to the total sum
        of squares of ``Y`` around its mean.
    """
    residual_ss = np.square(y_pred2 - Y).sum()
    centered = Y - Y.mean()
    total_ss = np.square(centered).sum()
    return 1 - residual_ss / total_ss

# Building Models:
- The models' hyperparameters were previously tuned with Bayesian optimization.

In [7]:
# Base regressors with fixed hyper-parameters.  Every seeded model takes the
# shared `seed` (instead of a literal 0) so the whole notebook can be
# re-seeded from one place; with seed == 0 the models are unchanged.
xgb = xgboost.XGBRegressor(
    learning_rate=0.2, n_estimators=40, random_state=seed, reg_alpha=1, reg_lambda=1
)

lgbm = LGBMRegressor(
    learning_rate=0.17393749944393758,
    max_depth=4,
    n_estimators=50,
    random_state=seed,
    reg_alpha=1,
    reg_lambda=1,
)

cb = CatBoostRegressor(
    depth=4, iterations=50, learning_rate=0.2, random_seed=seed, verbose=False
)

gbm = GradientBoostingRegressor(
    learning_rate=0.12770293035652075,
    max_depth=4,
    max_features="sqrt",
    n_estimators=50,
    random_state=seed,
)

rf = RandomForestRegressor(max_depth=4, n_estimators=50, random_state=seed)

# NOTE(review): newer scikit-learn releases spell this criterion
# "absolute_error" and no longer accept "mae" -- confirm against the
# installed version before upgrading.
dt = DecisionTreeRegressor(
    criterion="mae", max_depth=3, max_features="log2", random_state=seed
)

et = ExtraTreesRegressor(max_depth=4, n_estimators=37, random_state=seed)

knn = KNeighborsRegressor(metric="manhattan", n_neighbors=9)

svr = SVR(C=398.23342579215785, gamma=0.0001)

lasso = Lasso(alpha=0.01, random_state=seed)

elastic = ElasticNet(alpha=0.01, l1_ratio=0.0, random_state=seed, tol=1)

# Accumulators; nothing is appended to them in this cell.
results = []
names = []

# Scorer names evaluated by every cross_validate call that receives `scoring`.
scoring = [
    "r2",
    "neg_mean_squared_error",
    "max_error",
    "neg_mean_absolute_error",
    "explained_variance",
    "neg_root_mean_squared_error",
    "neg_median_absolute_error",
]

# 5-fold CV repeated 3 times (15 fits per model).  Both splitters are built
# with identical arguments.
inner_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=seed)
outer_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=seed)

## Stacking Regressor:

In [8]:
# Base learners of the stack as (name, estimator) pairs.
estimators = [
    ("XGBR", xgb),
    ("GBR", gbm),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    ("KNN", knn),
    ("SVR", svr),
    ("LASSO", lasso),
    ("ElasticNet", elastic),
]

# Stack the base learners under an XGBoost meta-learner.
stacker = StackingRegressor(
    final_estimator=xgboost.XGBRegressor(random_state=seed),
    estimators=estimators,
)

# Cross-validate the whole stack on the full data set with `outer_cv`.
# NOTE: the printed labels say "Nested CV", but no inner hyper-parameter
# search is wrapped around the model in this cell.
cv_results = model_selection.cross_validate(
    stacker, X, Y, scoring=scoring, cv=outer_cv
)
print("Stacking r2 CV", cv_results)
print('Nested CV results for all scores:', '\n', cv_results, '\n')
for _label, _key in (
    ('r2 Nested CV Median', 'test_r2'),
    ('MSE Nested CV Median', 'test_neg_mean_squared_error'),
    ('RMSE Nested CV Median', 'test_neg_root_mean_squared_error'),
    ('Explained Variance Nested CV Median', 'test_explained_variance'),
    ('MAE Nested CV Median', 'test_neg_mean_absolute_error'),
):
    print(_label, np.median(cv_results[_key]))

# Refit on all rows and score the in-sample predictions with the PRESS-based
# predicted R^2.
stacker.fit(X, Y)
print("Stacking predicted r2:", predicted_r2(Y, stacker.predict(X), X))

# Refit on the training split only and report hold-out metrics.
stacker.fit(X_train, Y_train)
y_pred = stacker.predict(X_test)
for _label, _metric in (
    ("Stacking Test r2:", r2_score),
    ("Stacking Test MSE:", mean_squared_error),
    ("Stacking Test Explained Variance Score:", explained_variance_score),
    ("Stacking Test MAE:", mean_absolute_error),
    ("Stacking Test Max Error:", max_error),
):
    print(_label, _metric(Y_test, y_pred))

Stacking r2 CV {'fit_time': array([ 5.80990005,  6.60314202,  7.23765206,  6.17439389,  9.21567798,
        5.58885098,  5.79104114,  8.63454914,  9.20638275, 10.04597282,
        9.92733908,  9.70114613,  8.16205001,  9.7871418 ,  8.58075333]), 'score_time': array([0.03440714, 0.03863382, 0.03429914, 0.03949404, 0.03433013,
       0.03262115, 0.03744006, 0.03765082, 0.03802323, 0.03869009,
       0.03887272, 0.0348618 , 0.03491902, 0.0399909 , 0.03453183]), 'test_r2': array([0.68282881, 0.41290954, 0.69807246, 0.68050223, 0.70760932,
       0.65866211, 0.60176329, 0.61904991, 0.56848964, 0.65998973,
       0.61415221, 0.65111132, 0.76648915, 0.62712288, 0.699978  ]), 'test_neg_mean_squared_error': array([-0.03624553, -0.06335973, -0.03687792, -0.03471475, -0.03665075,
       -0.04183277, -0.04196072, -0.04203529, -0.05069605, -0.04507966,
       -0.04713417, -0.04041286, -0.02435664, -0.04612193, -0.03721468]), 'test_max_error': array([-0.72801944, -0.7091889 , -0.65056828, -0.6491589

## Bagging Regressor:

In [11]:
# Bag ten copies of the XGBoost regressor.  The wrapped estimator is passed
# positionally on purpose: its keyword was `base_estimator` up to
# scikit-learn 1.1 and is `estimator` from 1.2 on (the old name was removed
# in 1.4), whereas the first positional slot works with both.
bagging_xgbr = BaggingRegressor(
    xgb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1
)

# Cross-validate on the full data set with `outer_cv`.  Despite the variable
# name, no inner hyper-parameter search is wrapped around the model here.
nested_cv_results = model_selection.cross_validate(bagging_xgbr, X , Y, cv=outer_cv, scoring=scoring)
print('Bagging Regressor Nested CV results for all scores:', '\n', nested_cv_results, '\n')
print("Bagging r2 CV", nested_cv_results)
print('r2 CV Median', np.median(nested_cv_results['test_r2']))
print('MSE CV Median', np.median(nested_cv_results['test_neg_mean_squared_error'] ))
print('RMSE CV Median', np.median(nested_cv_results['test_neg_root_mean_squared_error'] ))
print('Explained Variance CV Median', np.median(nested_cv_results['test_explained_variance'] ))
print('MAE CV Median', np.median(nested_cv_results['test_neg_mean_absolute_error'] ))

# Refit on all rows and score the in-sample predictions with the PRESS-based
# predicted R^2.
bagging_xgbr.fit(X, Y)
y_pred = bagging_xgbr.predict(X)
print("Bagging Regressor predicted r2:", predicted_r2(Y, y_pred, X))

# Refit on the training split only and report hold-out metrics.
bagging_xgbr.fit(X_train, Y_train)
y_pred = bagging_xgbr.predict(X_test)
print("Bagging Test r2:", r2_score(Y_test, y_pred))
print("Bagging Test MSE:", mean_squared_error(Y_test, y_pred))
print("Bagging Test Explained Variance Score:", explained_variance_score(Y_test, y_pred))
print("Bagging Test MAE:", mean_absolute_error(Y_test, y_pred))
print("Bagging Test Max Error:", max_error(Y_test, y_pred))

Bagging Regressor Nested CV results for all scores: 
 {'fit_time': array([1.28085613, 1.09997797, 1.09225678, 1.10022616, 1.08557892,
       1.13048387, 1.12751102, 1.11857891, 1.16129827, 1.13942814,
       1.1302669 , 1.20021582, 1.16203809, 1.08901787, 1.15605402]), 'score_time': array([0.06026697, 0.061867  , 0.06198406, 0.06540179, 0.06171608,
       0.0661869 , 0.06500196, 0.06594896, 0.06250978, 0.06503582,
       0.06870818, 0.06300902, 0.06288981, 0.05941916, 0.05963111]), 'test_r2': array([0.74683232, 0.58699324, 0.73814156, 0.70990122, 0.6851367 ,
       0.6585794 , 0.75699302, 0.77840955, 0.61124277, 0.80838092,
       0.71116926, 0.66727974, 0.80556945, 0.70090607, 0.60393554]), 'test_neg_mean_squared_error': array([-0.02893137, -0.04457234, -0.03198382, -0.03152043, -0.03946766,
       -0.0418429 , -0.02560474, -0.02445102, -0.0456732 , -0.02540548,
       -0.03528282, -0.03854002, -0.02028032, -0.03699554, -0.04912777]), 'test_max_error': array([-0.45231804, -0.65224724,

## Voting Regressor:

In [28]:
# Fit every base regressor on the training split, in the same order as
# before; each `fit` returns the estimator itself, so model1..model11 are
# aliases of the objects defined earlier.
# NOTE(review): VotingRegressor.fit presumably clones and refits its
# estimators, in which case this pre-fitting does not influence `vote` --
# confirm whether later cells rely on model1..model11 being fitted.
(
    model1,
    model2,
    model3,
    model4,
    model5,
    model6,
    model7,
    model8,
    model9,
    model10,
    model11,
) = (
    est.fit(X_train, Y_train)
    for est in (xgb, gbm, lgbm, cb, rf, et, dt, knn, svr, lasso, elastic)
)

# Combine all eleven regressors into one voting ensemble.
vote = VotingRegressor(
    [
        ("xgbr", model1),
        ("gbr", model2),
        ("lgbm", model3),
        ("cb", model4),
        ("rf", model5),
        ("et", model6),
        ("dt", model7),
        ("knn", model8),
        ("svr", model9),
        ("lasso", model10),
        ("elasticnet", model11),
    ]
)

# Cross-validate the ensemble on the full data set with `outer_cv`.
# NOTE: the printed labels say "Nested CV", but no inner hyper-parameter
# search is wrapped around the model in this cell.
cv_results = model_selection.cross_validate(
    vote, X, Y, scoring=scoring, cv=outer_cv
)
print("Voting r2 CV", cv_results)
print('Nested CV results for all scores:', '\n', cv_results, '\n')
for _label, _key in (
    ('r2 Nested CV Median', 'test_r2'),
    ('MSE Nested CV Median', 'test_neg_mean_squared_error'),
    ('RMSE Nested CV Median', 'test_neg_root_mean_squared_error'),
    ('Explained Variance Nested CV Median', 'test_explained_variance'),
    ('MAE Nested CV Median', 'test_neg_mean_absolute_error'),
):
    print(_label, np.median(cv_results[_key]))

# Refit on all rows and score the in-sample predictions with the PRESS-based
# predicted R^2.
vote.fit(X, Y)
print("Voting predicted r2:", predicted_r2(Y, vote.predict(X), X))

# Refit on the training split only and report hold-out metrics.
vote.fit(X_train, Y_train)
y_pred = vote.predict(X_test)
for _label, _metric in (
    ("Voting Test r2:", r2_score),
    ("Voting Test MSE:", mean_squared_error),
    ("Voting Test Explained Variance Score:", explained_variance_score),
    ("Voting Test MAE:", mean_absolute_error),
    ("Voting Test Max Error:", max_error),
):
    print(_label, _metric(Y_test, y_pred))

Voting r2 CV {'fit_time': array([0.23519111, 0.22753406, 0.229774  , 0.24351406, 0.23790503,
       0.26208591, 0.22808099, 0.24748683, 0.25187111, 0.23081923,
       0.24530315, 0.26126075, 0.23825121, 0.27137494, 0.2507298 ]), 'score_time': array([0.02515101, 0.02559114, 0.02501106, 0.0255599 , 0.02683282,
       0.0272882 , 0.02749896, 0.02587128, 0.02500987, 0.02616692,
       0.02472591, 0.02501297, 0.02484393, 0.02502108, 0.02409506]), 'test_r2': array([0.62150654, 0.53880398, 0.75173528, 0.47230843, 0.56415267,
       0.64327366, 0.74435826, 0.64649949, 0.58258718, 0.74462804,
       0.67253632, 0.67396169, 0.74614403, 0.71253989, 0.54963344]), 'test_neg_mean_squared_error': array([-0.04325329, -0.049773  , -0.03032346, -0.05733587, -0.05463283,
       -0.0437187 , -0.02693602, -0.03900641, -0.0490398 , -0.03385804,
       -0.04000212, -0.03776603, -0.02647876, -0.03555653, -0.05586339]), 'test_max_error': array([-0.49166201, -0.62360164, -0.47263015, -1.00142125, -0.96325929,
 