# __Meta-estimator Benchmarking__ 

In [3]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from BorutaShap import BorutaShap
import sys
import sklearn.neighbors._base  
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
import lightgbm
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings

filterwarnings("ignore")

seed = 0

In [4]:
data = pd.read_csv("training_cleaned.csv", header=0, sep=",")

In [5]:
data["label_encoded"] = data["label"].map(
    {"most likely": 1, "probable": 0.75, "least likely": 0.1}
)
Y = data["label_encoded"]
data = data.drop(["label"], 1)
data.shape  # Data has IPA and ensembl features without possible label

(804, 29)

In [6]:
X = pd.read_csv("selected_features_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

In [8]:
def press_statistic(Y, y_pred2, xs):
    res = y_pred2 - Y
    hat = xs.dot(np.linalg.pinv(xs))
    den = 1 - np.diagonal(hat)
    sqr = np.square(res / den)
    return sqr.sum()


def predicted_r2(Y, y_pred2, xs):
    press = press_statistic(Y=Y, y_pred2=y_pred2, xs=xs)
    sst = np.square(Y - Y.mean()).sum()
    return 1 - press / sst


def r2(Y, y_pred2):
    sse = np.square(y_pred2 - Y).sum()
    sst = np.square(Y - Y.mean()).sum()
    return 1 - sse / sst

# Building Models:
- Models' hyperparameters previously tuned with Bayesian optimization 

In [10]:
xgb =  xgboost.XGBRegressor(learning_rate=0.2, max_depth=3, n_estimators=50, random_state=seed, reg_alpha=1, reg_lambda=6)

lgbm = LGBMRegressor(learning_rate=0.1478572369480306, max_depth=3, n_estimators=50,
              random_state=0, reg_alpha=1, reg_lambda=1)

cb = CatBoostRegressor(depth=4, iterations=50, learning_rate=0.11772633618138172, random_seed=seed, verbose=False)


gbm = GradientBoostingRegressor(learning_rate=0.1093170713412033, max_depth=4,
                          max_features='sqrt', n_estimators=50, random_state=0)

rf = RandomForestRegressor(criterion='mse', max_depth=4, max_features='log2',
                      n_estimators=50, random_state=0)

dt = DecisionTreeRegressor(criterion='mse', max_depth=3, max_features='auto',
                      random_state=0)

et = ExtraTreesRegressor(criterion='mse', max_depth=4, n_estimators=44,
                    random_state=0)

knn =  KNeighborsRegressor(metric='manhattan', n_neighbors=17, weights='distance')

svr = SVR(C=1000.0, gamma=0.001)

lasso = Lasso(alpha=0.001, max_iter=5000, random_state=0)

elastic = ElasticNet(alpha=0.001, l1_ratio=0.0, random_state=0, tol=1)

results = []
names = []
scoring = [
    "r2",
    "neg_mean_squared_error",
    "max_error",
    "neg_mean_absolute_error",
    "explained_variance",
    "neg_root_mean_squared_error",
    "neg_median_absolute_error",
]

inner_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=seed)
outer_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=seed)

## Stacking Regressor:

In [11]:
estimators = [
    ("XGBR", xgb),
    ("GBR", gbm),
    ("RFR", rf),
    ("LGBM", lgbm),
    ("CB", cb),
    ("ET", et),
    ("DT", dt),
    ("KNN", knn),
    ("SVR", svr),
    ("LASSO", lasso),
    ("ElasticNet", elastic),

]

stacker = StackingRegressor(
    estimators=estimators,
    final_estimator = xgboost.XGBRegressor(learning_rate=0.2, max_depth=3, n_estimators=50, random_state=seed, reg_alpha=1, reg_lambda=6)
)
cv_results = model_selection.cross_validate(
        stacker, X, Y, cv=outer_cv, scoring=scoring
)
print("Stacking r2 CV", cv_results)
print('Nested CV results for all scores:', '\n', cv_results, '\n')
print('r2 Nested CV Median', np.median(cv_results['test_r2']))
print('MSE Nested CV Median', np.median(cv_results['test_neg_mean_squared_error'] ))
print('RMSE Nested CV Median', np.median(cv_results['test_neg_root_mean_squared_error'] ))
print('Explained Variance Nested CV Median', np.median(cv_results['test_explained_variance'] ))
print('MAE Nested CV Median', np.median(cv_results['test_neg_mean_absolute_error'] ))
stacker.fit(X, Y)
y_pred = stacker.predict(X)
print("Stacking predicted r2:", predicted_r2(Y, y_pred, X))

stacker.fit(X_train, Y_train)

y_pred = stacker.predict(X_test)
print("Stacking Test r2:", r2_score(Y_test, y_pred))
print("Stacking Test MSE:", mean_squared_error(Y_test, y_pred))
print(
    "Stacking Test Explained Variance Score:",
    explained_variance_score(Y_test, y_pred),
)
print("Stacking Test MAE:", mean_absolute_error(Y_test, y_pred))
print("Stacking Test Max Error:", max_error(Y_test, y_pred))

Stacking r2 CV {'fit_time': array([2.65209007, 2.42366576, 2.40244198, 2.42792916, 2.56486678,
       2.41675091, 2.73693299, 2.56943989, 2.35900593, 2.3772459 ,
       2.39188719, 2.46338296, 2.36363101, 2.63083601, 2.46666718]), 'score_time': array([0.05829978, 0.05995011, 0.04520392, 0.05731797, 0.05903506,
       0.04872704, 0.05615401, 0.0554769 , 0.05630016, 0.05641222,
       0.05739188, 0.05654597, 0.05771899, 0.05475307, 0.05623484]), 'test_r2': array([0.55969372, 0.63066885, 0.78601661, 0.72936449, 0.70386497,
       0.87151283, 0.68248474, 0.62473318, 0.65096193, 0.64939503,
       0.53214258, 0.79728651, 0.85243652, 0.52808112, 0.74607075]), 'test_neg_mean_squared_error': array([-0.01150579, -0.00946588, -0.00824201, -0.00859986, -0.01030107,
       -0.00438215, -0.01237766, -0.00946318, -0.01045862, -0.00973751,
       -0.0148669 , -0.0058706 , -0.00568371, -0.01233186, -0.00798449]), 'test_max_error': array([-0.66299042, -0.58961811, -0.52844723, -0.61833307, -0.63856312,

In [13]:
stacker.fit(X, Y)
y_true = Y
y_pred = stacker.predict(X)
xs = X


print("stacking r2:", r2(y_true, y_pred))
print("stacking Predicted r2:", predicted_r2(y_true, y_pred, xs))

stacking r2: 0.8402496856611482
stacking Predicted r2: 0.7934068882532678


## Bagging Regressor:

In [10]:
bagging_xgb = BaggingRegressor(base_estimator=xgb, n_estimators=10, oob_score=True, random_state=seed, n_jobs=-1)

nested_cv_results = model_selection.cross_validate(bagging_xgb, X , Y, cv=outer_cv, scoring=scoring)
print('Bagging Regressor Nested CV results for all scores:', '\n', nested_cv_results, '\n')
print("Bagging r2 CV", nested_cv_results)
print('r2 CV Median', np.median(nested_cv_results['test_r2']))
print('MSE CV Median', np.median(nested_cv_results['test_neg_mean_squared_error'] ))
print('RMSE CV Median', np.median(nested_cv_results['test_neg_root_mean_squared_error'] ))
print('Explained Variance CV Median', np.median(nested_cv_results['test_explained_variance'] ))
print('MAE CV Median', np.median(nested_cv_results['test_neg_mean_absolute_error'] ))
bagging_xgb.fit(X, Y)
y_pred = bagging_xgb.predict(X)
print("Bagging Regressor predicted r2:", predicted_r2(Y, y_pred, X))

bagging_xgb.fit(X_train, Y_train)
y_pred = bagging_xgb.predict(X_test)
print("Bagging Test r2:", r2_score(Y_test, y_pred))
print("Bagging Test MSE:", mean_squared_error(Y_test, y_pred))
print("Bagging Test Explained Variance Score:", explained_variance_score(Y_test, y_pred))
print("Bagging Test MAE:", mean_absolute_error(Y_test, y_pred))
print("Bagging Test Max Error:", max_error(Y_test, y_pred))

Bagging Regressor Nested CV results for all scores: 
 {'fit_time': array([2.31845212, 0.192976  , 0.19178963, 0.19616604, 0.19088697,
       0.19059181, 0.18550706, 0.19001389, 0.19048119, 0.18465018,
       0.18966722, 0.19518471, 0.19500971, 0.18712425, 0.19362521]), 'score_time': array([0.03024483, 0.02873802, 0.02778506, 0.02869105, 0.02792406,
       0.02719998, 0.02740097, 0.02936792, 0.02747393, 0.02673173,
       0.0303731 , 0.02942538, 0.02866602, 0.02942181, 0.02879477]), 'test_r2': array([0.43581198, 0.57823051, 0.78753025, 0.75034049, 0.71568838,
       0.78875216, 0.621498  , 0.72647138, 0.58557929, 0.58580317,
       0.55561634, 0.78672034, 0.82978715, 0.44827513, 0.73747122]), 'test_neg_mean_squared_error': array([-0.01474298, -0.01080987, -0.0081837 , -0.00793332, -0.0098898 ,
       -0.00720476, -0.01475509, -0.00689763, -0.01241775, -0.01150367,
       -0.01412098, -0.0061766 , -0.00655609, -0.0144173 , -0.00825489]), 'test_max_error': array([-0.62039842, -0.52358367,

In [13]:
bagging_xgb.fit(X, Y)
y_true = Y
y_pred = bagging_xgb.predict(X)
xs = X


print("bagging_xgb r2:", r2(y_true, y_pred))
print("bagging_xgb Predicted r2:", predicted_r2(y_true, y_pred, xs))

bagging_xgb r2: 0.8441371443782102
bagging_xgb Predicted r2: 0.7953440336042631


## Voting Regressor:

In [14]:
model1 = xgb.fit(X_train, Y_train)
model2 = gbm.fit(X_train, Y_train)
model3 = xgb.fit(X_train, Y_train)
model4 = cb.fit(X_train, Y_train)
model5 = rf.fit(X_train, Y_train)
model6 = et.fit(X_train, Y_train)
model7 = dt.fit(X_train, Y_train)
model8 = knn.fit(X_train, Y_train)
model9 = svr.fit(X_train, Y_train)
model10 = lasso.fit(X_train, Y_train)
model11 = elastic.fit(X_train, Y_train)

vote = VotingRegressor([("xgbr", model1), ("gbr", model2), ("xgb", model3),
                       ("cb", model4), ("rf", model5), ("et", model6),
                       ("dt", model7), ("knn", model8), ("svr", model9),
                       ("lasso", model10), ("elasticnet", model11)])

cv_results = model_selection.cross_validate(
        vote, X, Y, cv=outer_cv, scoring=scoring
)
print("Voting r2 CV", cv_results)
print('Nested CV results for all scores:', '\n', cv_results, '\n')
print('r2 Nested CV Median', np.median(cv_results['test_r2']))
print('MSE Nested CV Median', np.median(cv_results['test_neg_mean_squared_error'] ))
print('RMSE Nested CV Median', np.median(cv_results['test_neg_root_mean_squared_error'] ))
print('Explained Variance Nested CV Median', np.median(cv_results['test_explained_variance'] ))
print('MAE Nested CV Median', np.median(cv_results['test_neg_mean_absolute_error'] ))
vote.fit(X, Y)
y_pred = vote.predict(X)
print("Voting predicted r2:", predicted_r2(Y, y_pred, X))

vote.fit(X_train, Y_train)

y_pred = vote.predict(X_test)
print("Voting Test r2:", r2_score(Y_test, y_pred))
print("Voting Test MSE:", mean_squared_error(Y_test, y_pred))
print(
    "Voting Test Explained Variance Score:",
    explained_variance_score(Y_test, y_pred),
)
print("Voting Test MAE:", mean_absolute_error(Y_test, y_pred))
print("Voting Test Max Error:", max_error(Y_test, y_pred))

Voting r2 CV {'fit_time': array([0.50081778, 0.49336433, 0.46647906, 0.54679585, 0.561939  ,
       0.48570895, 0.52429891, 0.52254391, 0.51401329, 0.53150582,
       0.5437808 , 0.5410049 , 0.52506018, 0.48585916, 0.5036931 ]), 'score_time': array([0.05802226, 0.0583148 , 0.04484701, 0.05766296, 0.05896592,
       0.04995704, 0.05564117, 0.05574393, 0.05608296, 0.05684423,
       0.05784822, 0.05789804, 0.05604362, 0.05338168, 0.05659294]), 'test_r2': array([0.55558443, 0.55065817, 0.69837423, 0.65987795, 0.5832834 ,
       0.69089938, 0.54915155, 0.62880703, 0.55555697, 0.65027657,
       0.55025446, 0.64796962, 0.76680165, 0.37386461, 0.68490204]), 'test_neg_mean_squared_error': array([-0.01161317, -0.01151654, -0.01161773, -0.01080791, -0.01449551,
       -0.01054211, -0.01757537, -0.00936045, -0.01331735, -0.00971302,
       -0.01429136, -0.01019484, -0.00898211, -0.01636175, -0.00990786]), 'test_max_error': array([-0.48681138, -0.54223947, -0.53681975, -0.59099351, -0.58835135,
 

In [15]:
vote.fit(X, Y)
y_true = Y
y_pred = vote.predict(X)
xs = X


print("voting r2:", r2(y_true, y_pred))
print("voting Predicted r2:", predicted_r2(y_true, y_pred, xs))

voting r2: 0.808123048996449
voting Predicted r2: 0.7900296027450284
