# Refactored notebook for modelling

## imports

In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import warnings
import re
import plotly.express as px
import plotly.graph_objects as go

from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import (
    downcast,
    map_labels,
    fillna_categories,
    pd_fit_resample,
    infer_categories_fit,
    KindSelector,
    get_feature_names,
    remove_categories,
)
from NHS_PROMs.data_dictionary import meta_dict, methods

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector

# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import (
    ColumnTransformer,
    make_column_transformer,
    make_column_selector,
)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingRegressor,
    BaggingClassifier,
)
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.inspection import permutation_importance
from sklearn import set_config

set_config(display="diagram")

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler 

# pandas patches: a fillna that can cope with non-existing categories for
# CategoricalDtype, plus a remove_categories method directly on Series
pd.DataFrame.fillna = fillna_categories
pd.Series.remove_categories = remove_categories

# enable autodetect of categories from CategoricalDtype by using "infer" for SMOTENC
SMOTENC.fit_resample = pd_fit_resample(SMOTENC.fit_resample)

# enable inference of categories from CategoricalDtype for both encoders
for _encoder in (OneHotEncoder, OrdinalEncoder):
    _encoder.fit = infer_categories_fit(_encoder.fit)

## load data
The approach is deliberately not DRY: the knee and hip data frames are handled separately so that both are always at hand and the notebook stays readable as a script.

In [2]:
# load data + rename columns with structured name
# df_knee_raw = load_proms(part="knee").apply(downcast).rename(structure_name, axis=1)
df_hip_raw = (
    load_proms(part="hip")
    .apply(downcast)
    .rename(structure_name, axis=1)
)

# meta data: every meta_dict entry is registered under a "t0_" and a "t1_" prefixed name
full_meta = {
    prefix + name: meta
    for name, meta in meta_dict.items()
    for prefix in ("t0_", "t1_")
}
# keep only the entries that belong to a column of the hip set
hip_meta = {
    name: meta for name, meta in full_meta.items() if name in df_hip_raw.columns
}

df_hip_raw.sample(3)

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
20620,RFR,Hip Replacement,0,2016/17,60 to 69,2.0,2,0,1,2,...,4,4,4,4,4,4,4,4,48.0,43.452309
5405,NT316,Hip Replacement,0,2016/17,60 to 69,2.0,2,0,2,2,...,2,1,2,2,2,2,1,1,18.0,35.313557
5523,NT302,Hip Replacement,0,2018/19,60 to 69,1.0,2,0,2,2,...,4,3,4,4,4,4,4,4,47.0,42.789566


## basic cleaning

In [3]:
%%time

# column-name endings that mark a column for removal
endings = (
    "code", # is a coded score and not of interest for the case
    "procedure", # is the same for the hip or knee set
    "revision_flag", # revisions are out of scope, filtered away, so same for all rows after that
    "assisted_by", # is the same for all records
    "profile", # is a coded score and not of interest for the case
    "predicted", # are predictions of other models that are not supposed to be used
)
cols2drop = list(filter(lambda c: c.endswith(endings), df_hip_raw.columns))


def _with_meta(func):
    """Return a per-column callable that passes the column's hip_meta entry to ``func`` as keyword arguments."""
    return lambda s: func(s, **hip_meta[s.name])


df_hip_clean = (
    df_hip_raw.apply(_with_meta(filter_in_range))  # filter in range numeric features
    .apply(_with_meta(filter_in_labels))  # filter in labels categorical features + ordinal if ordered
    .apply(_with_meta(map_labels))  # map the labels as values for readability
    .query("t0_revision_flag == 'no revision'")  # drop revision cases
    .drop(columns=cols2drop)  # drop not needed columns
    .reset_index(drop=True)  # make index unique (prevent blow ups when joining)
)

# remove NaNs/missing/unknown from numerical and ordinal features:
# first strip the "missing" / "not known" categories from every column ...
df_hip_known = df_hip_clean.apply(
    pd.Series.remove_categories, args=(["missing", "not known"],)
)
# ... then drop the rows that have a NaN in a numerical or ordinal column
# (the column names are selected on the frame as it was before the stripping)
required_cols = (
    KindSelector(kind="numerical")(df_hip_clean)
    + KindSelector(kind="ordinal")(df_hip_clean)
)
df_hip_clean = df_hip_known.dropna(subset=required_cols)

df_hip_clean.sample(3)

CPU times: user 631 ms, sys: 64.9 ms, total: 696 ms
Wall time: 696 ms


Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
10970,April 2016 - April 2017,80 to 89,female,yes,more than 10 years,no,alone,no,,yes,...,rarely/never,rarely/never,sometimes or just at first,"often, not just at first",rarely/never,rarely/never,sometimes or just at first,"often, not just at first",rarely/never,38.0
97069,April 2018 - April 2019,70 to 79,male,no,1 to 5 years,no,with partner / spouse / family / friends,no,yes,yes,...,sometimes or just at first,sometimes or just at first,"often, not just at first",rarely/never,sometimes or just at first,sometimes or just at first,rarely/never,sometimes or just at first,sometimes or just at first,35.0
32559,April 2016 - April 2017,80 to 89,male,yes,1 to 5 years,no,with partner / spouse / family / friends,no,yes,,...,rarely/never,sometimes or just at first,sometimes or just at first,sometimes or just at first,rarely/never,rarely/never,sometimes or just at first,rarely/never,rarely/never,44.0


### Explanation why we can drop years

In [4]:
# def plot_year_histograms(t=0, method="eq5d"):
    
#     facet_cols = ["_".join([f"t{t}", method, dim]) for dim in methods[method]["dims"]["names"]]

#     df_plot = (
#         df_hip_clean[["t0_year"] + facet_cols]
#         .set_index("t0_year")
#         .stack()
#         .reset_index()
#         .set_axis(["year", "dimension", "value"], axis=1)
#     )

#     fig = px.histogram(
#         df_plot,
#         title=f"Distributions of values over the years for method {method} at t{t}",
#         x="value",
#         color="year",
#         barmode="group",
#         histnorm="percent",
#         facet_col="dimension",
#         facet_col_wrap=3,
#         category_orders={"value":list(methods[method]["dims"]["labels"].values())},
#     )

#     fig.update_xaxes(col=3, showticklabels=True, visible=True)
#     fig.update_layout(legend=dict(xanchor="right", x=1, yanchor="bottom", y=0))

#     fig.show()
    
# [plot_year_histograms(t=t) for t in [0, 1]];

## split data

In [11]:
# split train + test set
# Rows whose t0_year equals '2019/20' are set apart as unseen data; all other rows
# are used for modelling. t0_year itself is dropped from both frames.
# NOTE(review): the cleaned sample printed above shows t0_year as mapped labels
# (e.g. "April 2016 - April 2017"), so the literal '2019/20' may never match and
# df_hip_unseen would then be empty — confirm the label used for that year.
df_hip = df_hip_clean.query("t0_year != '2019/20'").drop(columns="t0_year")
df_hip_unseen = df_hip_clean.query("t0_year == '2019/20'").drop(columns="t0_year")

df_hip.sample(3)

Unnamed: 0,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,t0_stroke,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
122994,60 to 69,male,no,more than 10 years,yes,with partner / spouse / family / friends,yes,,,,...,rarely/never,rarely/never,most of the time,rarely/never,sometimes or just at first,sometimes or just at first,"often, not just at first","often, not just at first",sometimes or just at first,36.0
53286,70 to 79,male,no,1 to 5 years,no,with partner / spouse / family / friends,yes,,yes,,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,48.0
111254,50 to 59,female,no,1 to 5 years,no,with partner / spouse / family / friends,yes,,,,...,rarely/never,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,rarely/never,sometimes or just at first,38.0


In [99]:
# features: every column whose name matches "t0"
X = df_hip.filter(regex="t0")
# # regression:
# y = df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"]

# classification target: the category codes of one t1 column,
# with y_labels as the lookup from code back to category label
y_name = "t1_eq5d_discomfort"
target = df_hip[y_name]
y_labels = dict(enumerate(target.cat.categories))
y = target.cat.codes

# # make a smaller selection of our training data to play with
# X = X.iloc[:1000, :] # [0, 1, 2, 3, 4, -4, -3, -2, -1]]
# y = y.iloc[:1000]

# create train, test (a third of the rows is held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## make + train a simple pipeline

In [100]:
# one (name, transformer, column selector) triple per kind of column
numerical_step = ("numerical", StandardScaler(), KindSelector(kind="numerical"))
categorical_step = (
    "categorical",
    OneHotEncoder(categories="categories", handle_unknown="ignore"),
    KindSelector(kind="categorical"),
)
# NOTE(review): scikit-learn documents only "error" and "use_encoded_value" for
# OrdinalEncoder.handle_unknown; "ignore" may be rejected by newer releases — confirm.
ordinal_step = (
    "ordinal",
    OrdinalEncoder(categories="categories", handle_unknown="ignore"),
    KindSelector(kind="ordinal"),
)

# columns that are not picked up by any selector are dropped
ct = ColumnTransformer(
    transformers=(numerical_step, categorical_step, ordinal_step),
    remainder="drop",
)

# the balancer step is a placeholder so that a resampler can be swapped in later
pl = Pipeline(
    steps=(
        ("balancer", "passthrough"),
        ("by_column_kinds", ct),
        ("model", KNeighborsClassifier()),
    )
)

# train the pipeline/model
pl.fit(X_train, y_train)

In [101]:
# list the feature names that come out of the fitted pipeline
get_feature_names(pl)

['t0_eq5d_score',
 't0_eqvas_score',
 't0_ohs_score',
 't0_gender_male',
 't0_gender_female',
 't0_assisted_yes',
 't0_assisted_no',
 't0_previous_surgery_yes',
 't0_previous_surgery_no',
 't0_living_arrangements_with partner / spouse / family / friends',
 't0_living_arrangements_alone',
 't0_living_arrangements_in a nursing home, hospital or other long-term care home',
 't0_living_arrangements_other',
 't0_disability_yes',
 't0_disability_no',
 't0_heart_disease_yes',
 't0_high_bp_yes',
 't0_stroke_yes',
 't0_circulation_yes',
 't0_lung_disease_yes',
 't0_diabetes_yes',
 't0_kidney_disease_yes',
 't0_nervous_system_yes',
 't0_liver_disease_yes',
 't0_cancer_yes',
 't0_depression_yes',
 't0_arthritis_yes',
 't0_age_band',
 't0_symptom_period',
 't0_eq5d_mobility',
 't0_eq5d_self_care',
 't0_eq5d_activity',
 't0_eq5d_discomfort',
 't0_eq5d_anxiety',
 't0_ohs_pain',
 't0_ohs_sudden_pain',
 't0_ohs_night_pain',
 't0_ohs_washing',
 't0_ohs_transport',
 't0_ohs_dressing',
 't0_ohs_shopping'

## Gridsearch

In [102]:
%%time
# create parameter grid to search on 

# # standard same as pipeline
# param_grid = dict()

# # tuning hyper parameters
# (kept for reference only: this grid used to be assigned here and was then
#  overwritten by the list of grids below before it was ever used)
# param_grid = {
#     "balancer": ["passthrough", RandomUnderSampler(replacement=True)],
#     "model": [RandomForestClassifier(), AdaBoostClassifier()],
#     "model__n_estimators": [25, 50, 100],
# }

# # tuning different hyper parameters on different models
# param_grid = [
#     {
#         "balancer": [RandomUnderSampler()],
#         "balancer__replacement": [True, False],
#         "model": [RandomForestClassifier()],
#         "model__n_estimators": [25, 50, 100],
#     },
#     {
#         "balancer": [RandomUnderSampler()],
#         "balancer__replacement": [True, False],
#         "model": [KNeighborsClassifier()],
#         "model__n_neighbors": [2, 5, 10],
#     },
# ]


# tuning different hyper parameters on different models
def _undersampled(model):
    """Grid for ``model`` preceded by a RandomUnderSampler, tried with and without replacement."""
    return {
        "balancer": [RandomUnderSampler()],
        "balancer__replacement": [True, False],
        "model": [model],
        "model__n_estimators": [10, 100],
    }


def _passthrough(model, **extra):
    """Grid for ``model`` without a balancer step; ``replacement`` is tuned on the model itself.

    ``extra`` adds further parameter lists to the grid.
    """
    return {
        "balancer": ["passthrough"],
        "model": [model],
        "model__n_estimators": [10, 100],
        "model__replacement": [True, False],
        **extra,
    }


param_grid = [
    _undersampled(BaggingClassifier()),
    _passthrough(BalancedBaggingClassifier()),
    _undersampled(RandomForestClassifier()),
    _passthrough(
        BalancedRandomForestClassifier(),
        model__class_weight=[None, "balanced", "balanced_subsample"],
    ),
    _undersampled(AdaBoostClassifier()),
    _passthrough(EasyEnsembleClassifier()),
]


# # construct gridsearch

# # standard
# GS = GridSearchCV(pl, param_grid=param_grid)

# # multiple
# GS = GridSearchCV(pl, param_grid=param_grid, scoring=["balanced_accuracy", "f1"], refit=False)

# no scoring is passed, so the default scoring of GridSearchCV applies
GS = GridSearchCV(estimator=pl, param_grid=param_grid)

# train gridsearch
GS.fit(X_train, y_train)

# show results: hide every column whose name contains "split" or "time",
# and order the remaining rows by their rank
cv_results = pd.DataFrame(GS.cv_results_)
(
    cv_results
    .filter(regex=r"^(?!.*(split|time)).*$")
    .set_index("rank_test_score")
    .sort_index()
)

Unnamed: 0_level_0,param_balancer,param_balancer__replacement,param_model,param_model__n_estimators,param_model__replacement,param_model__class_weight,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,passthrough,,BalancedBaggingClassifier(n_estimators=100),100,False,,"{'balancer': 'passthrough', 'model': BalancedB...",0.513209,0.004237
2,passthrough,,BalancedBaggingClassifier(n_estimators=100),100,True,,"{'balancer': 'passthrough', 'model': BalancedB...",0.51193,0.00481
3,passthrough,,BalancedBaggingClassifier(n_estimators=100),10,False,,"{'balancer': 'passthrough', 'model': BalancedB...",0.496538,0.002408
4,passthrough,,BalancedBaggingClassifier(n_estimators=100),10,True,,"{'balancer': 'passthrough', 'model': BalancedB...",0.495479,0.004437
5,passthrough,,BalancedRandomForestClassifier(),100,False,balanced,"{'balancer': 'passthrough', 'model': BalancedR...",0.468914,0.00503
6,passthrough,,BalancedRandomForestClassifier(),100,True,balanced,"{'balancer': 'passthrough', 'model': BalancedR...",0.468738,0.00397
7,passthrough,,EasyEnsembleClassifier(),100,False,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.465136,0.003069
8,passthrough,,EasyEnsembleClassifier(),10,True,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.464562,0.001781
9,passthrough,,EasyEnsembleClassifier(),10,False,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.463842,0.005955
10,passthrough,,EasyEnsembleClassifier(),100,True,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.463636,0.003164


# OLD

## predict + evaluate

In [None]:
# # make prediction
# y_hat = pl.predict(X_test.head(500))

# # evaluate
# print(classification_report(y_test.head(500), y_hat))

## prediction intervals
Last time we talked about confidence intervals.

We assume that for an individual prediction, prediction intervals are what is meant. Is that correct?

## used sources
basic explanation:
* https://machinelearningmastery.com/prediction-intervals-for-machine-learning/
* https://towardsdatascience.com/quantile-regression-from-linear-models-to-trees-to-deep-learning-af3738b527c3

using parallel models with the quantile loss function for gradient boosting model:
* https://towardsdatascience.com/how-confidence-and-prediction-intervals-work-4592019576d8
* https://towardsdatascience.com/how-to-generate-prediction-intervals-with-scikit-learn-and-python-ab3899f992ed
* https://heartbeat.fritz.ai/5-regression-loss-functions-all-machine-learners-should-know-4fb140e9d4b0

linear regression approach:
* https://towardsdatascience.com/prediction-intervals-in-linear-regression-2ea14d419981


Our current worked out example is based on:
* parallel gradient boosting models 
* using different quantile loss function alphas

In [None]:
def fit_predict_quantile(alpha):
    """Refit ``pl`` with a quantile-loss gradient boosting model and predict the test set.

    The model step of ``pl`` is replaced explicitly, because the pipeline defined
    earlier in this notebook carries a KNeighborsClassifier, which accepts neither
    ``loss`` nor ``alpha``.

    Parameters
    ----------
    alpha : float
        Quantile passed to ``GradientBoostingRegressor(loss="quantile")``.

    Returns
    -------
    Predictions of the refitted pipeline for ``X_test``.
    """
    pl.set_params(model=GradientBoostingRegressor(loss="quantile", alpha=alpha))
    pl.fit(X_train, y_train)
    return pl.predict(X_test)


# upper bound, lower bound and median; the median comes last so that `pl`
# is left fitted on alpha=0.5, as before
y_90 = fit_predict_quantile(alpha=0.9)
y_10 = fit_predict_quantile(alpha=0.1)
y_hat = fit_predict_quantile(alpha=0.5)

Plot of prediction intervals on test set

In [None]:
# put the quantile predictions next to the true test values;
# the original index of y_test is discarded afterwards
interval_columns = {
    "10%": y_10,
    "90%": y_90,
    "true": y_test,
    "predicted": y_hat,
}
pd_conf = pd.DataFrame(interval_columns).reset_index(drop=True)

# px.scatter(pd_conf)
px.scatter(pd_conf, x="true", y="predicted")

## now do it smart

In [None]:
from joblib import Parallel
from sklearn.base import clone, is_classifier
from sklearn.multioutput import MultiOutputRegressor, _fit_estimator
from sklearn.utils.fixes import delayed
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import _check_fit_params, has_fit_parameter


class ConfidenceEstimator(MultiOutputRegressor):
    """Fit one copy of a quantile-loss regressor per requested quantile.

    The class reuses the machinery of ``MultiOutputRegressor``, but instead of
    one model per target column it fits one model per entry of ``quantiles``
    on the same single target. ``predict`` is inherited unchanged; it is
    expected to return one column per fitted model, i.e. per quantile, in the
    order of ``quantiles``.

    Parameters
    ----------
    estimator : estimator object
        Base regressor. It must accept ``loss="quantile"`` and ``alpha``
        through ``set_params`` (e.g. ``GradientBoostingRegressor``).
    quantiles : iterable of float
        Quantile levels; one model is fitted for each of them.
    n_jobs : int, default=None
        Number of jobs handed to ``joblib.Parallel`` when fitting.
    """

    def __init__(self, estimator, quantiles, *, n_jobs=None):
        super().__init__(estimator, n_jobs=n_jobs)
        self.quantiles = quantiles

    def fit(self, X, y, sample_weight=None, **fit_params):
        """Fit a separate quantile model for every entry of ``self.quantiles``.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Data.
        y : array-like of shape (n_samples,)
            Single target; it is validated with ``multi_output=False``.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each fitted copy.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If the base estimator has no ``fit`` method, or if sample weights
            are given but not supported by the base estimator.
        """
        if not hasattr(self.estimator, "fit"):
            raise ValueError("The base estimator should implement a fit method")

        X, y = self._validate_data(
            X, y, force_all_finite=False, multi_output=False, accept_sparse=True
        )

        if is_classifier(self):
            check_classification_targets(y)

        if sample_weight is not None and not has_fit_parameter(
            self.estimator, "sample_weight"
        ):
            raise ValueError("Underlying estimator does not support sample weights.")

        fit_params_validated = _check_fit_params(X, fit_params)

        # Every task gets its own clone. Calling set_params on the shared
        # self.estimator would mutate the user's estimator and, when several
        # tasks are created before they run, let them all see the last alpha.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(
                clone(self.estimator).set_params(loss="quantile", alpha=alpha),
                X,
                y,
                sample_weight,
                **fit_params_validated
            )
            for alpha in self.quantiles
        )
        return self

In [None]:
# three quantile levels: lower bound, median and upper bound
quantiles = [0.1, 0.5, 0.9]

# one gradient boosting model per quantile, behind the shared column transformer
quantile_model = ConfidenceEstimator(GradientBoostingRegressor(), quantiles=quantiles)
pl = Pipeline(
    steps=(
        ("by_column_types", ct),
        ("model", quantile_model),
    )
)

# train the pipeline/model
pl.fit(X_train, y_train)

In [None]:
# predict the whole test set; with the ConfidenceEstimator as model each row
# presumably holds one value per quantile — confirm
pl.predict(X_test)

In [None]:
# draw one random row from the test set and look up its true target by index label
X_i = X_test.sample()
row_label = X_i.index[0]
y_i = y_test.loc[row_label]

# show the predicted quantiles next to the true value
display(pl.predict(X_i), y_i)

## now do it over the top

## refactored into a transformer
(still parallel models)

In [None]:
# quantile levels from step_size up to (but excluding) 1, in steps of step_size
step_size = 0.05
quantiles = np.arange(step_size, 1, step_size)

# setup pipeline: one gradient boosting model per quantile level
quantile_model = ConfidenceEstimator(GradientBoostingRegressor(), quantiles=quantiles)
pl = Pipeline(
    steps=(
        ("by_column_types", ct),
        ("model", quantile_model),
    )
)

# train pipeline/model
pl.fit(X_train, y_train)

Questions:
* Is this (in this particular form/model) what was meant in the last expert session?
* We sometimes see strange results (possibly because the quantile models are fitted independently of each other?)
    * e.g. the 65% interval boundary lies below the 50% boundary!
    
    How useful is this approach then?

In [None]:
# pl.predict gives one row per sample and one column per quantile, so the result
# is transposed before pairing: every quantile level maps to its predictions for
# the whole test set (zipping the rows would pair quantile k with sample k)
y_int = pl.predict(X_test)
prediction_intervals = dict(zip(quantiles, y_int.T))

def plot_prediction(intervals, true_value):
    """Plot the predicted quantiles of one sample together with its true value.

    Every quantile value is drawn as a translucent red band between itself and
    the point estimate (the 0.5 quantile), so overlapping bands get darker
    towards the point estimate. The point estimate is drawn as a red vertical
    line, the true value as a blue one.

    Parameters
    ----------
    intervals : dict
        Maps a quantile level (e.g. 0.1, 0.5, 0.9) to the value predicted for
        that level. It must contain the 0.5 level.
    true_value : scalar
        Observed target value of the sample.

    Raises
    ------
    KeyError
        If ``intervals`` has no entry for the 0.5 quantile.
    """
    # find the median level with a tolerance: levels built with np.arange are
    # floats and need not be exactly 0.5
    median_levels = [level for level in intervals if np.isclose(level, 0.5)]
    if not median_levels:
        raise KeyError(0.5)
    median_level = median_levels[0]
    point_estimate = intervals[median_level]

    fig = go.Figure()

    for level, x in intervals.items():
        fig.add_trace(
            go.Scatter(
                x=[x, point_estimate], y=[1, 1],
                fill='tozeroy', mode="none",
                fillcolor='rgba(255,0,0,0.1)',
                showlegend=False,
            )
        )
        # label every band except the point estimate itself
        if level != median_level:
            fig.add_annotation(
                x=x, y=level,
                text=f"{level*100:.0f}%",
                showarrow=False,
            )

    fig.add_trace(
        go.Scatter(
            x=[point_estimate]*2, y=[0, 1],
            mode="lines", line={"color":"red"},
            name="point estimate",
        )
    )
    fig.add_trace(
        go.Scatter(
            x=[true_value]*2, y=[0, 1],
            mode="lines", line={"color":"blue"},
            name="true value",
        )
    )

    # x axis: the span of all quantile values, widened by 10% around its centre
    values = list(intervals.values())
    x_range = np.array([np.min(values), np.max(values)])
    x_range = (x_range - np.mean(x_range)) * 1.1 + np.mean(x_range)
    fig.update_xaxes(range=x_range)
    fig.update_yaxes(visible=False, showticklabels=False)
    fig.update_layout(height=400)

    fig.show()

In [None]:
# take one sample from test set, together with its true target
X_i = X_test.sample()
sample_label = X_i.index[0]
y_i = y_test.loc[sample_label]

# predict including prediction intervals (first and only row of the result)
y_int = pl.predict(X_i)[0]

# plot prediction intervals: quantile level -> predicted value
prediction_intervals = dict(zip(quantiles, y_int))
plot_prediction(intervals=prediction_intervals, true_value=y_i)

In [None]:
# import plotly.graph_objs as go
# fig = go.Figure([
#     go.Scatter(
#         name='Prediction',
#         x=,
#         y=df['10 Min Sampled Avg'],
#         mode='lines',
#         line=dict(color='rgb(31, 119, 180)'),
#     ),
#     go.Scatter(
#         name='Upper Bound',
#         x=df['Time'],
#         y=df['10 Min Sampled Avg']+df['10 Min Std Dev'],
#         mode='lines',
#         marker=dict(color="#444"),
#         line=dict(width=0),
#         showlegend=False
#     ),
#     go.Scatter(
#         name='Lower Bound',
#         x=df['Time'],
#         y=df['10 Min Sampled Avg']-df['10 Min Std Dev'],
#         marker=dict(color="#444"),
#         line=dict(width=0),
#         mode='lines',
#         fillcolor='rgba(68, 68, 68, 0.3)',
#         fill='tonexty',
#         showlegend=False
#     )
# ])
# fig.update_layout(
#     yaxis_title='Wind speed (m/s)',
#     title='Continuous, variable value error bars',
#     hovermode="x"
# )
# fig.show()

## regression

In [None]:
# features: every column whose name matches "t0"
X = df_hip.filter(regex="t0")
# target: change of the OHS score between t0 and t1
y = df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"]

# create train, test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# make a smaller selection of our training data to play with:
# the first 1000 rows and only the last 5 feature columns
n_rows, n_last_cols = 1000, 5
X_train = X_train.iloc[:n_rows, -n_last_cols:]
y_train = y_train.iloc[:n_rows]

In [None]:
# The grid below addresses the steps "balancer" and "model" by name. The `pl`
# defined last in this notebook has no "balancer" step, so the search gets its
# own pipeline that defines both steps.
pl_regression = Pipeline(
    (
        ("balancer", "passthrough"),
        ("by_column_kinds", ct),
        ("model", KNeighborsRegressor()),
    )
)

# make parameter grid
param_grid = {
    "balancer": ["passthrough"],
    "model": [KNeighborsRegressor()],
}

GS = GridSearchCV(pl_regression, param_grid=param_grid)
# train gridsearch
GS.fit(X_train, y_train)

# show results (columns whose name contains "split" or "time" are hidden)
pd.DataFrame(GS.cv_results_)\
    .filter(regex=r"^(?!.*(split|time)).*$")
#     .set_index("rank_test_score").sort_index()

## extract feature names pl

In [None]:
# list the feature names of `pl` as it stands after the cells above
get_feature_names(pl)

In [None]:
# # this is slow ...
# r = permutation_importance(pl, X_train.head(1_000), y_train.head(1_000), n_repeats=2, random_state=0)

# feature_names = get_feature_names(pl)

# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
#         print(f"{feature_names[i]:<8}"
#         f"{r.importances_mean[i]:.3f}"
#         f" +/- {r.importances_std[i]:.3f}")

## a more advanced pipeline

In [None]:
# TO DO ...