# Refactored notebook for modelling

## imports

In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import warnings
import re
import plotly.express as px
import plotly.graph_objects as go

from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import (
    downcast,
    map_labels,
    fillna_categories,
    pd_fit_resample,
    infer_categories_fit,
    KindSelector,
    get_feature_names,
    remove_categories,
)
from NHS_PROMs.data_dictionary import meta_dict, methods

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector

# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import (
    ColumnTransformer,
    make_column_transformer,
    make_column_selector,
)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingRegressor,
    BaggingClassifier,
)
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.inspection import permutation_importance
from sklearn import set_config

set_config(display="diagram")

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler 

# use adjusted fillna which can cope with non-existing categories for CategoricalDtype
pd.core.frame.DataFrame.fillna = fillna_categories
# added a remove categories
pd.core.frame.Series.remove_categories = remove_categories
# enable autodetect of categories from CategoricalDtype by using "infer" for SMOTENC
SMOTENC.fit_resample = pd_fit_resample(SMOTENC.fit_resample)
# enable inference of categories for encoders from CategoricalDtype
OneHotEncoder.fit = infer_categories_fit(OneHotEncoder.fit)
OrdinalEncoder.fit = infer_categories_fit(OrdinalEncoder.fit)

SEED = 42

## load data
General approach is not DRY for the sake of availability of having knee and hip df's always at hand, but also keep it readable (script-wise).

In [2]:
# load data + rename columns with structired name
# df_knee_raw = load_proms(part="knee").apply(downcast).rename(structure_name, axis=1)
df_hip_raw = load_proms(part="hip").apply(downcast).rename(structure_name, axis=1)

# get meta data for each
full_meta = {t + k: v for k, v in meta_dict.items() for t in ["t0_", "t1_"]}
hip_meta = {k: v for k, v in full_meta.items() if k in df_hip_raw.columns}

df_hip_raw.sample(3)

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
7005,NT457,Hip Replacement,0,2017/18,50 to 59,1.0,2,0,1,2,...,4,4,4,4,4,4,4,4,48.0,43.657093
29121,RNA,Hip Replacement,0,2016/17,70 to 79,1.0,2,0,2,2,...,4,2,0,0,4,4,4,3,36.0,36.46262
6940,NT450,Hip Replacement,0,2017/18,70 to 79,2.0,2,0,2,2,...,4,2,4,4,4,2,4,4,43.0,38.455685


## basic cleaning

In [3]:
%%time

endings = (
    "code", # is a coded score and not of interest for the case
    "procedure", # is the same for the hip or knee set
    "revision_flag", # revisions are out of scope, filtered away, so same for all rows after that
    "assisted_by", # is the same for all records
    "profile", # is a coded score and not of interest for the case
    "predicted", # are predictions of other models that are not supposed to be used
)
cols2drop = [c for c in df_hip_raw.columns if c.endswith(endings)]

df_hip_clean = (
    df_hip_raw.apply(lambda s: filter_in_range(s, **hip_meta[s.name])) # filter in range numeric features
    .apply(lambda s: filter_in_labels(s, **hip_meta[s.name])) # filter in labels categorical features + ordinal if ordered
    .apply(lambda s: map_labels(s, **hip_meta[s.name])) # map the labels as values for readability
    .query("t0_revision_flag == 'no revision'") # drop revision cases
    .drop(columns=cols2drop) # drop not needed columns
    .reset_index(drop=True) # make index unique (prevent blow ups when joining)
)

# remove NaNs/missing/unknown from numerical and ordinal features
df_hip_clean = (
    df_hip_clean.apply(pd.Series.remove_categories, args=(["missing", "not known"],))
    .dropna(subset= KindSelector(kind="numerical")(df_hip_clean) + KindSelector(kind="ordinal")(df_hip_clean))
)

df_hip_clean.sample(3)

CPU times: user 630 ms, sys: 51.8 ms, total: 682 ms
Wall time: 680 ms


Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
89607,April 2018 - April 2019,80 to 89,female,yes,6 to 10 years,no,alone,yes,yes,,...,most of the time,"often, not just at first",most of the time,all of the time,"often, not just at first",most of the time,sometimes or just at first,"often, not just at first",most of the time,20.0
96860,April 2018 - April 2019,80 to 89,female,yes,1 to 5 years,no,with partner / spouse / family / friends,yes,,,...,rarely/never,sometimes or just at first,"often, not just at first",all of the time,sometimes or just at first,sometimes or just at first,"often, not just at first",rarely/never,sometimes or just at first,35.0
53436,April 2017 - April 2018,70 to 79,male,no,6 to 10 years,no,with partner / spouse / family / friends,no,,,...,rarely/never,rarely/never,sometimes or just at first,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,46.0


### Explanation why we can drop years

In [None]:
# def plot_year_histograms(t=0, method="eq5d"):
    
#     facet_cols = ["_".join([f"t{t}", method, dim]) for dim in methods[method]["dims"]["names"]]

#     df_plot = (
#         df_hip_clean[["t0_year"] + facet_cols]
#         .set_index("t0_year")
#         .stack()
#         .reset_index()
#         .set_axis(["year", "dimension", "value"], axis=1)
#     )

#     fig = px.histogram(
#         df_plot,
#         title=f"Distributions of values over the years for method {method} at t{t}",
#         x="value",
#         color="year",
#         barmode="group",
#         histnorm="percent",
#         facet_col="dimension",
#         facet_col_wrap=3,
#         category_orders={"value":list(methods[method]["dims"]["labels"].values())},
#     )

#     fig.update_xaxes(col=3, showticklabels=True, visible=True)
#     fig.update_layout(legend=dict(xanchor="right", x=1, yanchor="bottom", y=0))

#     fig.show()
    
# [plot_year_histograms(t=t) for t in [0, 1]];

## split data

In [4]:
# split train + test set
df_hip = df_hip_clean.query("t0_year != '2019/20'").drop(columns="t0_year")
df_hip_unseen = df_hip_clean.query("t0_year == '2019/20'").drop(columns="t0_year")

df_hip.sample(3)

Unnamed: 0,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,t0_stroke,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
64009,70 to 79,male,no,1 to 5 years,no,with partner / spouse / family / friends,no,,,,...,rarely/never,sometimes or just at first,sometimes or just at first,rarely/never,rarely/never,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,39.0
91360,70 to 79,male,no,1 to 5 years,no,with partner / spouse / family / friends,yes,,,,...,rarely/never,sometimes or just at first,rarely/never,rarely/never,rarely/never,sometimes or just at first,sometimes or just at first,rarely/never,rarely/never,44.0
71968,80 to 89,female,yes,1 to 5 years,no,alone,yes,yes,yes,,...,all of the time,sometimes or just at first,all of the time,all of the time,rarely/never,rarely/never,all of the time,rarely/never,rarely/never,31.0


In [6]:
# create x, y
X = df_hip.filter(regex="t0")
# # regression:
# y = df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"]

# classification
#y_name = "t1_eq5d_discomfort"
#y_labels = {k:v for k, v, in enumerate(df_hip[y_name].cat.categories)}
#y = df_hip[y_name].cat.codes

y_temp = pd.cut(
     df_hip["t1_ohs_score"],
     bins=[0, 29, 39, 48],
     labels=["severe-moderate", "mild", "satisfactory"],
     include_lowest=True,
     )
    
y_labels = {k: v for k, v, in enumerate(y_temp.cat.categories)}
y = y_temp.cat.codes
#y_temp.value_counts() / len(y_temp)

# # make a smaller selection of our training data to play with
# X = X.iloc[:1000, :] # [0, 1, 2, 3, 4, -4, -3, -2, -1]]
# y = y.iloc[:1000]


# create train, test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED)

In [7]:
y_temp.value_counts() / len(y_temp)

satisfactory       0.659223
mild               0.225011
severe-moderate    0.115766
Name: t1_ohs_score, dtype: float64

In [None]:
# # for total / Laurence
# y_temp = pd.cut(
#     df_hip["t1_ohs_score"],
#     bins=[0, 29, 39, 48],
#     labels=["severe-moderate", "mild", "satisfactory"],
#     include_lowest=True,
# )
# y_labels = {k: v for k, v, in enumerate(y_temp.cat.categories)}
# y = y_temp.cat.codes
# y_temp.value_counts() / len(y_temp)

## make + train a simple pipeline

In [8]:
ct = ColumnTransformer(
    (
        ("numerical", StandardScaler(), KindSelector(kind="numerical")),
        (
            "categorical",
            OneHotEncoder(categories="categories", handle_unknown="ignore"),
            KindSelector(kind="categorical"),
        ),
        (
            "ordinal",
            OrdinalEncoder(categories="categories", handle_unknown="ignore"),
            KindSelector(kind="ordinal"),
        ),
    ),
    remainder="drop",
)

pl = Pipeline(
    (
        ("balancer", "passthrough"),
        ("by_column_kinds", ct),
        ("model", KNeighborsClassifier()),
    )
)

# train the pipeline/model
pl.fit(X_train, y_train)

In [9]:
get_feature_names(pl)

['t0_eq5d_score',
 't0_eqvas_score',
 't0_ohs_score',
 't0_gender_male',
 't0_gender_female',
 't0_assisted_yes',
 't0_assisted_no',
 't0_previous_surgery_yes',
 't0_previous_surgery_no',
 't0_living_arrangements_with partner / spouse / family / friends',
 't0_living_arrangements_alone',
 't0_living_arrangements_in a nursing home, hospital or other long-term care home',
 't0_living_arrangements_other',
 't0_disability_yes',
 't0_disability_no',
 't0_heart_disease_yes',
 't0_high_bp_yes',
 't0_stroke_yes',
 't0_circulation_yes',
 't0_lung_disease_yes',
 't0_diabetes_yes',
 't0_kidney_disease_yes',
 't0_nervous_system_yes',
 't0_liver_disease_yes',
 't0_cancer_yes',
 't0_depression_yes',
 't0_arthritis_yes',
 't0_age_band',
 't0_symptom_period',
 't0_eq5d_mobility',
 't0_eq5d_self_care',
 't0_eq5d_activity',
 't0_eq5d_discomfort',
 't0_eq5d_anxiety',
 't0_ohs_pain',
 't0_ohs_sudden_pain',
 't0_ohs_night_pain',
 't0_ohs_washing',
 't0_ohs_transport',
 't0_ohs_dressing',
 't0_ohs_shopping'

## Gridsearch

Test one model

In [10]:
%%time
# create parameter grid to search on 

# # standard same as pipeline
# param_grid = dict()

# tuning different hyper parameters on different models
param_grid = [
    {
        "balancer": [RandomUnderSampler()],
        "balancer__replacement": [True, False],
        "model": [BaggingClassifier()],
        "model__n_estimators": [10, 100],
    }
]

# # construct gridsearch

# # standard
GS = GridSearchCV(pl, param_grid=param_grid, scoring="roc_auc_ovo_weighted")

# # multiple
# GS = GridSearchCV(pl, param_grid=param_grid, scoring=["balanced_accuracy", "f1"], refit=False)

# train gridsearch
GS.fit(X_train, y_train)

# show results
pd.DataFrame(GS.cv_results_)\
    .filter(regex=r"^(?!.*(split|time)).*$")\
    .set_index("rank_test_score").sort_index()

CPU times: user 1min 51s, sys: 1.07 s, total: 1min 52s
Wall time: 1min 52s


Unnamed: 0_level_0,param_balancer,param_balancer__replacement,param_model,param_model__n_estimators,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,RandomUnderSampler(replacement=True),True,BaggingClassifier(n_estimators=100),100,{'balancer': RandomUnderSampler(replacement=Tr...,0.66195,0.00484
2,RandomUnderSampler(replacement=True),False,BaggingClassifier(n_estimators=100),100,{'balancer': RandomUnderSampler(replacement=Tr...,0.661828,0.003961
3,RandomUnderSampler(replacement=True),False,BaggingClassifier(n_estimators=100),10,{'balancer': RandomUnderSampler(replacement=Tr...,0.628779,0.004766
4,RandomUnderSampler(replacement=True),True,BaggingClassifier(n_estimators=100),10,{'balancer': RandomUnderSampler(replacement=Tr...,0.628043,0.004812


In [11]:
%%time
# create parameter grid to search on 

# # standard same as pipeline
# param_grid = dict()

# tuning different hyper parameters on different models
param_grid = [
    {
        "balancer": [RandomUnderSampler()],
        "balancer__replacement": [True, False],
        "model": [BaggingClassifier()],
        "model__n_estimators": [10, 100],
    },
    {
        "balancer": ["passthrough"],
        "model": [BalancedBaggingClassifier()],
        "model__n_estimators": [10, 100],
        "model__replacement": [True, False],
    },
    {
        "balancer": [RandomUnderSampler()],
        "balancer__replacement": [True, False],
        "model": [RandomForestClassifier()],
        "model__n_estimators": [10, 100],
    },
    {
        "balancer": ["passthrough"],
        "model": [BalancedRandomForestClassifier()],
        "model__n_estimators": [10, 100],
        "model__replacement": [True, False],
        "model__class_weight": [None, "balanced", "balanced_subsample"],
    },
    {
        "balancer": [RandomUnderSampler()],
        "balancer__replacement": [True, False],
        "model": [AdaBoostClassifier()],
        "model__n_estimators": [10, 100],
    },
    {
        "balancer": ["passthrough"],
        "model": [EasyEnsembleClassifier()],
        "model__n_estimators": [10, 100],
        "model__replacement": [True, False],
    },
]


# # construct gridsearch

# # standard
GS = GridSearchCV(pl, param_grid=param_grid, scoring="roc_auc_ovo_weighted")

# # multiple
# GS = GridSearchCV(pl, param_grid=param_grid, scoring=["balanced_accuracy", "f1"], refit=False)

# train gridsearch
GS.fit(X_train, y_train)

# show results
pd.DataFrame(GS.cv_results_)\
    .filter(regex=r"^(?!.*(split|time)).*$")\
    .set_index("rank_test_score").sort_index()

CPU times: user 21min 40s, sys: 26.5 s, total: 22min 6s
Wall time: 22min 6s


Unnamed: 0_level_0,param_balancer,param_balancer__replacement,param_model,param_model__n_estimators,param_model__replacement,param_model__class_weight,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",100,True,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692946,0.00348
2,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",100,False,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692751,0.003282
3,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",10,False,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692697,0.003615
4,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",10,True,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692642,0.003023
5,RandomUnderSampler(),False,AdaBoostClassifier(),100,,,"{'balancer': RandomUnderSampler(), 'balancer__...",0.6868,0.003617
6,RandomUnderSampler(),True,AdaBoostClassifier(),100,,,"{'balancer': RandomUnderSampler(), 'balancer__...",0.684443,0.003714
7,passthrough,,BalancedRandomForestClassifier(),100,False,,"{'balancer': 'passthrough', 'model': BalancedR...",0.675617,0.004227
8,passthrough,,BalancedRandomForestClassifier(),100,False,balanced_subsample,"{'balancer': 'passthrough', 'model': BalancedR...",0.675379,0.004683
9,passthrough,,BalancedRandomForestClassifier(),100,True,,"{'balancer': 'passthrough', 'model': BalancedR...",0.674735,0.004383
10,passthrough,,BalancedBaggingClassifier(),100,True,,"{'balancer': 'passthrough', 'model': BalancedB...",0.674332,0.006193


In [13]:
resultsDF = pd.DataFrame(GS.cv_results_)\
    .filter(regex=r"^(?!.*(split|time)).*$")\
    .set_index("rank_test_score").sort_index()

resultsDF

Unnamed: 0_level_0,param_balancer,param_balancer__replacement,param_model,param_model__n_estimators,param_model__replacement,param_model__class_weight,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",100,True,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692946,0.00348
2,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",100,False,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692751,0.003282
3,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",10,False,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692697,0.003615
4,passthrough,,"EasyEnsembleClassifier(n_estimators=100, repla...",10,True,,"{'balancer': 'passthrough', 'model': EasyEnsem...",0.692642,0.003023
5,RandomUnderSampler(),False,AdaBoostClassifier(),100,,,"{'balancer': RandomUnderSampler(), 'balancer__...",0.6868,0.003617
6,RandomUnderSampler(),True,AdaBoostClassifier(),100,,,"{'balancer': RandomUnderSampler(), 'balancer__...",0.684443,0.003714
7,passthrough,,BalancedRandomForestClassifier(),100,False,,"{'balancer': 'passthrough', 'model': BalancedR...",0.675617,0.004227
8,passthrough,,BalancedRandomForestClassifier(),100,False,balanced_subsample,"{'balancer': 'passthrough', 'model': BalancedR...",0.675379,0.004683
9,passthrough,,BalancedRandomForestClassifier(),100,True,,"{'balancer': 'passthrough', 'model': BalancedR...",0.674735,0.004383
10,passthrough,,BalancedBaggingClassifier(),100,True,,"{'balancer': 'passthrough', 'model': BalancedB...",0.674332,0.006193


In [17]:
resultsDF.to_csv("ResultsGridsearchArthritisSeverityClasses.csv", index=False)