# Refactored notebook for modelling

## imports

In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import warnings
import re
import plotly.express as px
import plotly.graph_objects as go

from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import (
    downcast,
    map_labels,
    fillna_categories,
    pd_fit_resample,
    infer_categories_fit,
    KindSelector,
    get_feature_names,
    remove_categories,
)
from NHS_PROMs.data_dictionary import meta_dict, methods

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector
from sklearn.dummy import DummyClassifier
from sklearn.utils.class_weight import compute_sample_weight

# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import (
    ColumnTransformer,
    make_column_transformer,
    make_column_selector,
)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingRegressor,
    BaggingClassifier,
)
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.inspection import permutation_importance
from sklearn import set_config

from xgboost import XGBClassifier, XGBRFClassifier

set_config(display="diagram")

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler 

# use adjusted fillna which can cope with non-existing categories for CategoricalDtype
pd.core.frame.DataFrame.fillna = fillna_categories
# added a remove categories
pd.core.frame.Series.remove_categories = remove_categories
# enable autodetect of categories from CategoricalDtype by using "infer" for SMOTENC
SMOTENC.fit_resample = pd_fit_resample(SMOTENC.fit_resample)
# enable inference of categories for encoders from CategoricalDtype
OneHotEncoder.fit = infer_categories_fit(OneHotEncoder.fit)
OrdinalEncoder.fit = infer_categories_fit(OrdinalEncoder.fit)

## load data
General approach is not DRY for the sake of availability of having knee and hip df's always at hand, but also keep it readable (script-wise).

In [2]:
# load data + rename columns with structired name
# df_knee_raw = load_proms(part="knee").apply(downcast).rename(structure_name, axis=1)
df_hip_raw = load_proms(part="hip").apply(downcast).rename(structure_name, axis=1)

# get meta data for each
full_meta = {t + k: v for k, v in meta_dict.items() for t in ["t0_", "t1_"]}
hip_meta = {k: v for k, v in full_meta.items() if k in df_hip_raw.columns}

df_hip_raw.sample(3)

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
23302,RHU,Hip Replacement,0,2016/17,70 to 79,2.0,2,0,2,2,...,4,4,4,4,4,4,4,4,48.0,31.7971
5249,NT304,Hip Replacement,0,2016/17,60 to 69,2.0,2,0,2,2,...,2,0,2,1,2,1,2,2,19.0,31.988735
16267,RCD,Hip Replacement,0,2017/18,70 to 79,2.0,2,0,4,2,...,3,4,4,2,3,2,3,3,36.0,33.244831


## basic cleaning

In [13]:
%%time

endings = (
    "code", # is a coded score and not of interest for the case
    "procedure", # is the same for the hip or knee set
    "revision_flag", # revisions are out of scope, filtered away, so same for all rows after that
    "assisted_by", # is the same for all records
    "profile", # is a coded score and not of interest for the case
    "predicted", # are predictions of other models that are not supposed to be used
)
cols2drop = [c for c in df_hip_raw.columns if c.endswith(endings)]

df_hip_clean = (
    df_hip_raw.apply(lambda s: filter_in_range(s, **hip_meta[s.name])) # filter in range numeric features
    .apply(lambda s: filter_in_labels(s, **hip_meta[s.name])) # filter in labels categorical features + ordinal if ordered
    .apply(lambda s: map_labels(s, **hip_meta[s.name])) # map the labels as values for readability
    .query("t0_revision_flag == 'no revision'") # drop revision cases
    .drop(columns=cols2drop) # drop not needed columns
    .reset_index(drop=True) # make index unique (prevent blow ups when joining)
)

# remove NaNs/missing/unknown from numerical and ordinal features
df_hip_clean = (
    df_hip_clean.apply(pd.Series.remove_categories, args=(["missing", "not known"],))
    .dropna(subset= KindSelector(kind="numerical")(df_hip_clean) + KindSelector(kind="ordinal")(df_hip_clean))
)

df_hip_clean.sample(3)

CPU times: user 913 ms, sys: 193 ms, total: 1.11 s
Wall time: 1.13 s


Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
71818,April 2017 - April 2018,70 to 79,male,no,1 to 5 years,yes,alone,yes,,,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,48.0
21533,April 2016 - April 2017,70 to 79,female,no,1 to 5 years,no,with partner / spouse / family / friends,no,,yes,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,48.0
22575,April 2016 - April 2017,80 to 89,female,no,less than 1 year,no,alone,yes,,,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,44.0


### Explanation why we can drop years

In [4]:
# def plot_year_histograms(t=0, method="eq5d"):
    
#     facet_cols = ["_".join([f"t{t}", method, dim]) for dim in methods[method]["dims"]["names"]]

#     df_plot = (
#         df_hip_clean[["t0_year"] + facet_cols]
#         .set_index("t0_year")
#         .stack()
#         .reset_index()
#         .set_axis(["year", "dimension", "value"], axis=1)
#     )

#     fig = px.histogram(
#         df_plot,
#         title=f"Distributions of values over the years for method {method} at t{t}",
#         x="value",
#         color="year",
#         barmode="group",
#         histnorm="percent",
#         facet_col="dimension",
#         facet_col_wrap=3,
#         category_orders={"value":list(methods[method]["dims"]["labels"].values())},
#     )

#     fig.update_xaxes(col=3, showticklabels=True, visible=True)
#     fig.update_layout(legend=dict(xanchor="right", x=1, yanchor="bottom", y=0))

#     fig.show()
    
# [plot_year_histograms(t=t) for t in [0, 1]];

## split data

In [5]:
# split train + test set
df_hip = df_hip_clean.query("t0_year != '2019/20'").drop(columns="t0_year")
df_hip_unseen = df_hip_clean.query("t0_year == '2019/20'").drop(columns="t0_year")

df_hip.sample(3)

Unnamed: 0,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,t0_stroke,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
18735,60 to 69,male,no,1 to 5 years,no,with partner / spouse / family / friends,yes,,yes,,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,sometimes or just at first,rarely/never,rarely/never,rarely/never,47.0
122908,50 to 59,male,no,6 to 10 years,no,with partner / spouse / family / friends,yes,,,,...,"often, not just at first",most of the time,most of the time,sometimes or just at first,rarely/never,most of the time,most of the time,most of the time,"often, not just at first",21.0
109370,50 to 59,male,no,6 to 10 years,no,alone,no,,yes,,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,48.0


In [6]:
# create x, y
X = df_hip.filter(regex="t0")
# # regression:
# y = df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"]

# classification
y_name = "t1_eq5d_discomfort"
y_labels = {k:v for k, v, in enumerate(df_hip[y_name].cat.categories)}
y = df_hip[y_name].cat.codes

# # make a smaller selection of our training data to play with
# X = X.iloc[:1000, :] # [0, 1, 2, 3, 4, -4, -3, -2, -1]]
# y = y.iloc[:1000]


# create train, test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [7]:
# # for total / Laurence
# y_temp = pd.cut(
#     df_hip["t1_ohs_score"],
#     bins=[0, 29, 39, 48],
#     labels=["severe-moderate", "mild", "satisfactory"],
#     include_lowest=True,
# )
# y_labels = {k: v for k, v, in enumerate(y_temp.cat.categories)}
# y = y_temp.cat.codes
# y_temp.value_counts() / len(y_temp)

## make + train a simple pipeline

In [8]:
ct = ColumnTransformer(
    (
        ("numerical", StandardScaler(), KindSelector(kind="numerical")),
        (
            "categorical",
            OneHotEncoder(categories="categories", handle_unknown="ignore"),
            KindSelector(kind="categorical"),
        ),
        (
            "ordinal",
            OrdinalEncoder(categories="categories", handle_unknown="ignore"),
            KindSelector(kind="ordinal"),
        ),
    ),
    remainder="drop",
)

pl = Pipeline(
    (
        ("balancer", "passthrough"),
        ("by_column_kinds", ct),
        ("model", KNeighborsClassifier()),
    )
)

# train the pipeline/model
pl.fit(X_train, y_train)

In [9]:
# get_feature_names(pl)

## Gridsearch

In [None]:
XGBRFClassifier()

In [10]:
# implemented automatic weights in pl
class BalancedXGBRFClassifier(XGBRFClassifier):
    
    def fit(self, X, y, **kwargs):
        
        weights = compute_sample_weight(class_weight="balanced", y=y)
        kwargs.update({"sample_weight":weights})
        
        return super().fit(X, y, **kwargs)

In [11]:
%%time
# create parameter grid to search on 

# # standard same as pipeline
# param_grid = dict()

# kills warnings ;)
std_xgb_args = dict(
    n_estimators=100,
    use_label_encoder=False,
    objective="multi:softprob",
    eval_metric="mlogloss", 
)

# tuning different hyper parameters on different models
param_grid = [
    {
        "balancer": ["passthrough"],
        "model": [
            DummyClassifier(strategy="stratified"),
            BalancedBaggingClassifier(n_estimators=100),
            BalancedRandomForestClassifier(n_estimators=100),
            BalancedXGBRFClassifier(**std_xgb_args),
        ],
    },
    {
        "balancer": [RandomUnderSampler()],
        "model": [
            XGBRFClassifier(**std_xgb_args), 
            XGBClassifier(**std_xgb_args)],
    },
]


# # construct gridsearch

# # standard
GS = GridSearchCV(pl, param_grid=param_grid, scoring="roc_auc_ovo_weighted",)

# # multiple
# GS = GridSearchCV(pl, param_grid=param_grid, scoring=["balanced_accuracy", "f1"], refit=False)

# train gridsearch
GS.fit(X_train, y_train)

# show results
(
    pd.DataFrame(GS.cv_results_)
    .filter(regex=r"^(?!.*(split|time)).*$")
    .set_index("rank_test_score").sort_index()
)

CPU times: user 9min 49s, sys: 19.5 s, total: 10min 9s
Wall time: 2min 49s


Unnamed: 0_level_0,param_balancer,param_model,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,passthrough,"BalancedXGBRFClassifier(base_score=None, boost...","{'balancer': 'passthrough', 'model': BalancedX...",0.665197,0.006404
2,passthrough,BalancedBaggingClassifier(n_estimators=100),"{'balancer': 'passthrough', 'model': BalancedB...",0.659628,0.002722
3,RandomUnderSampler(),"XGBRFClassifier(base_score=None, booster=None,...","{'balancer': RandomUnderSampler(), 'model': XG...",0.65954,0.00574
4,passthrough,BalancedRandomForestClassifier(),"{'balancer': 'passthrough', 'model': BalancedR...",0.656112,0.004724
5,RandomUnderSampler(),"XGBClassifier(base_score=None, booster=None, c...","{'balancer': RandomUnderSampler(), 'model': XG...",0.639205,0.001754
6,passthrough,DummyClassifier(strategy='stratified'),"{'balancer': 'passthrough', 'model': DummyClas...",0.499996,0.002411


In [None]:
df = (
    pd.DataFrame(GS.cv_results_)
    .filter(regex=r"^(?!.*(split|time)).*$")
    .set_index("rank_test_score").sort_index()
)