# Refactored notebook for modelling

## imports

In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import warnings
import re

from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import downcast, map_labels, fillna_categories, pd_fit_resample
from NHS_PROMs.data_dictionary import meta_dict

# use adjusted fillna which can cope with non-existing categories
pd.core.frame.DataFrame.fillna = fillna_categories
pd.core.frame.DataFrame.fillna = fillna_categories

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector
# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score, 
from sklearn.inspection import permutation_importance
from sklearn import set_config
set_config(display='diagram')

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline, make_pipeline

# enable autodetect by using "infer" + the use of column names
SMOTENC.fit_resample = pd_fit_resample(SMOTENC.fit_resample)

## load data
General approach is not DRY for the sake of availability of having knee and hip df's always at hand, but also keep it readable (script-wise).

In [2]:
# load data + rename columns with structired name
# df_knee_raw = load_proms(part="knee").apply(downcast).rename(structure_name, axis=1)
df_hip_raw = load_proms(part="hip").apply(downcast).rename(structure_name, axis=1)

# get meta data for each
full_meta = {t + k: v for k, v in meta_dict.items() for t in ["t0_", "t1_"]}
hip_meta = {k: v for k, v in full_meta.items() if k in df_hip_raw.columns}

df_hip_raw.sample(3)

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
16570,RBD,Hip Replacement,0,2016/17,70 to 79,1.0,2,0,2,2,...,3,3,4,4,4,3,3,3,40.0,40.748692
34908,RTG,Hip Replacement,0,2016/17,50 to 59,1.0,2,0,2,2,...,4,4,4,4,4,4,4,4,48.0,45.764263
44066,RYR,Hip Replacement,0,2016/17,60 to 69,2.0,2,0,2,2,...,4,4,4,4,4,4,4,4,47.0,43.251167


## basic cleaning

In [3]:
endings = (
    "code",
    "procedure",
    "revision_flag",
    "assisted_by",
    "profile",
    "predicted",
)
cols2drop = [c for c in df_hip_raw.columns if c.endswith(endings)]

In [4]:
%%time
df_hip_clean = (
    df_hip_raw.apply(lambda s: filter_in_range(s, **hip_meta[s.name]))
    .apply(lambda s: filter_in_labels(s, **hip_meta[s.name]))
    .apply(lambda s: map_labels(s, **hip_meta[s.name]))
    .query("t0_revision_flag == 'no revision'")
    .drop(columns=cols2drop)
    #     .replace("missing", np.nan)
)

df_hip_clean.sample(3)

CPU times: user 742 ms, sys: 72.8 ms, total: 815 ms
Wall time: 852 ms


Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
18233,April 2016 - April 2017,60 to 69,male,yes,1 to 5 years,no,with partner / spouse / family / friends,yes,yes,yes,...,"often, not just at first",sometimes or just at first,all of the time,"often, not just at first",sometimes or just at first,"often, not just at first","often, not just at first","often, not just at first",sometimes or just at first,24.0
6086,April 2016 - April 2017,70 to 79,female,no,1 to 5 years,no,alone,no,missing,missing,...,rarely/never,rarely/never,"often, not just at first",rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,43.0
32730,April 2018 - April 2019,70 to 79,male,no,1 to 5 years,no,with partner / spouse / family / friends,no,missing,missing,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,44.0


## split data

In [5]:
# split train + test set
# df_knee_seen = df_knee_clean.query("t0_year != '2019/20'")
# df_knee_unseen = df_knee_clean.query("t0_year == '2019/20'")

df_hip = df_hip_clean.query("t0_year != '2019/20'")
df_hip_unseen = df_hip_clean.query("t0_year == '2019/20'")

df_hip.sample(3)

Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
6266,April 2019 - April 2020,60 to 69,female,no,1 to 5 years,no,alone,no,missing,yes,...,rarely/never,rarely/never,sometimes or just at first,rarely/never,rarely/never,sometimes or just at first,rarely/never,sometimes or just at first,sometimes or just at first,42.0
27324,April 2016 - April 2017,70 to 79,female,no,6 to 10 years,no,alone,yes,missing,yes,...,"often, not just at first","often, not just at first",most of the time,"often, not just at first",most of the time,most of the time,"often, not just at first",most of the time,most of the time,15.0
11521,April 2018 - April 2019,80 to 89,female,no,1 to 5 years,no,with partner / spouse / family / friends,yes,missing,yes,...,sometimes or just at first,sometimes or just at first,"often, not just at first",sometimes or just at first,"often, not just at first",sometimes or just at first,"often, not just at first","often, not just at first",sometimes or just at first,28.0


## create delta dataframes

In [6]:
# df_org = df_hip_seen.apply(
#     lambda s: map_labels(s, backwards=True, **hip_meta[s.name])
# ).apply(np.asarray)

# # df_knee_delta = method_delta(df_knee_train)
# df_hip_delta = method_delta(df_org)

# # now you could join them again with the original df ...
# # eg: df_hip_train.join(df_hip_delta)
# df_hip_delta.sample(5)

## Make feature set

In [7]:
# asses quickly missing
print(len(df_hip), "original")
print(len(df_hip.dropna()), "after possible total dropna")
(df_hip.isna().sum() / len(df_hip)).sort_values(ascending=False).head(10)

139251 original
103346 after possible total dropna


t0_age_band          0.089507
t0_gender            0.089507
t0_eqvas_score       0.089385
t0_eq5d_score        0.056014
t1_eqvas_score       0.047734
t1_eq5d_score        0.042082
t0_ohs_score         0.010959
t1_ohs_score         0.010908
t0_eq5d_mobility     0.000000
t0_eq5d_self_care    0.000000
dtype: float64

In [8]:
# remove NaNs from non categorical/ordinal columns (numerical)
print(len(df_hip), "original")
num_cols = df_hip.select_dtypes(exclude="category").columns
df_hip = df_hip.dropna(subset=num_cols).fillna(value="missing")

print(len(df_hip), "after dropna on numerical + fillna on categories")

139251 original
113818 after dropna on numerical + fillna on categories


In [9]:
# create x, y
X = df_hip.filter(regex="t0")
y = (df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"] <= 20).astype(int) # knee <= 7

# make a smaller selection of our training data to play with
X = X.iloc[:1000, -5:]
y = y.iloc[:1000]


# create train, test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

## make balanced

In [10]:
# print("before:")
# display(y_train.value_counts())

# # cat_cols = X_train.dtypes == "category"
# # cat_cols = X_train.columns[cat_cols] TO DO: Fix with eg column selector

# resampler = SMOTENC(categorical_features="infer")
# X_train_balanced, y_train_balanced = resampler.fit_resample(X_train, y_train)

# print("after:")
# display(y_train_balanced.value_counts())

## make + train a simple pipeline

In [12]:
# make the pipeline
ct = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include="category")),
    (StandardScaler(), make_column_selector(dtype_include="number")),
)

pl = make_pipeline(
#     SMOTENC(categorical_features="infer"), # disabled just for the sake of speed
    ct,
    KNeighborsClassifier(),
)

In [13]:
ct = ColumnTransformer(
    (
        ("categorical", OneHotEncoder(), make_column_selector(dtype_include="category")),
        ("numerical", StandardScaler(), make_column_selector(dtype_include="number")),
    ),
    remainder="drop",
)

pl = Pipeline(
    (
        ("balancer", SMOTENC(categorical_features="infer")),
        ("by_column_types", ct),
        ("model", KNeighborsClassifier()),
    )
)

# train the pipeline/model
pl.fit(X_train, y_train)

## predict + evaluate

In [14]:
# make prediction
y_hat = pl.predict(X_test.head(500))

# evaluate
print(classification_report(y_test.head(500), y_hat))

              precision    recall  f1-score   support

           0       0.74      0.75      0.75       186
           1       0.68      0.67      0.67       144

    accuracy                           0.72       330
   macro avg       0.71      0.71      0.71       330
weighted avg       0.71      0.72      0.71       330



## Gridsearch

In [17]:
# create parameter grid to search on 
# standard (same as pipeline)
param_grid = dict()

# construct gridsearch
GS = GridSearchCV(pl, param_grid=param_grid, scoring="f1") ## add scoring

# train gridsearch
GS.fit(X_train, y_train)

# show results
pd.DataFrame(GS.cv_results_)\
    .filter(regex=r"^(?!.*(split|time)).*$")\
    .set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,{},0.62341,0.042112


In [24]:
# create parameter grid to search on 

# standard same as pipeline
param_grid = dict()

# # # two models with default parameters
param_grid = {"model": [KNeighborsClassifier(), DecisionTreeClassifier()]}

# # tuning hyper parameters
param_grid = {
    "model": [RandomForestClassifier(), AdaBoostClassifier()],
    "model__n_estimators": [25, 50, 100],
}

# tuning different hyper parameters on different models
param_grid = [
    {
        "model": [RandomForestClassifier()],
        "model__n_estimators": [25, 50, 100],
    },
    {
        "model": [KNeighborsClassifier()],
        "model__n_neighbors": [2, 5, 10],
    },
]


# # construct gridsearch

# # standard
# GS = GridSearchCV(pl, param_grid=param_grid)

# # # # add scoring 
# GS = GridSearchCV(pl, param_grid=param_grid, scoring="f1")

# # multiple
GS = GridSearchCV(pl, param_grid=param_grid, scoring=["balanced_accuracy", "f1"], refit=False)


# train gridsearch
GS.fit(X_train, y_train)

# show results
pd.DataFrame(GS.cv_results_)\
    .filter(regex=r"^(?!.*(split|time)).*$")\
#     .set_index("rank_test_score").sort_index()

Unnamed: 0,param_model,param_model__n_estimators,param_model__n_neighbors,params,mean_test_balanced_accuracy,std_test_balanced_accuracy,rank_test_balanced_accuracy,mean_test_f1,std_test_f1,rank_test_f1
0,RandomForestClassifier(),25.0,,"{'model': RandomForestClassifier(), 'model__n_...",0.687605,0.013575,5,0.645654,0.033942,3
1,RandomForestClassifier(),50.0,,"{'model': RandomForestClassifier(), 'model__n_...",0.699575,0.0164,2,0.656883,0.030983,2
2,RandomForestClassifier(),100.0,,"{'model': RandomForestClassifier(), 'model__n_...",0.700357,0.021489,1,0.658819,0.036525,1
3,KNeighborsClassifier(),,2.0,"{'model': KNeighborsClassifier(), 'model__n_ne...",0.665567,0.048039,6,0.5599,0.087594,6
4,KNeighborsClassifier(),,5.0,"{'model': KNeighborsClassifier(), 'model__n_ne...",0.690511,0.025143,4,0.644055,0.036591,4
5,KNeighborsClassifier(),,10.0,"{'model': KNeighborsClassifier(), 'model__n_ne...",0.692775,0.026182,3,0.628044,0.042289,5


## regression

In [25]:
# create x, y
X = df_hip.filter(regex="t0")
y = df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"]


# create train, test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# make a smaller selection of our training data to play with
X_train = X_train.iloc[:1000, -5:]
y_train = y_train.iloc[:1000]

In [26]:
# make parameter grid
param_grid = {
    "balancer": ["passthrough"],
    "model": [KNeighborsRegressor()],
}

GS = GridSearchCV(pl, param_grid=param_grid)
# train gridsearch
GS.fit(X_train, y_train)

# show results
pd.DataFrame(GS.cv_results_)\
    .filter(regex=r"^(?!.*(split|time)).*$")
#     .set_index("rank_test_score").sort_index()



Unnamed: 0,param_balancer,param_model,params,mean_test_score,std_test_score,rank_test_score
0,passthrough,KNeighborsRegressor(),"{'balancer': 'passthrough', 'model': KNeighbor...",0.23894,0.037825,1


## extract feature names pl

In [None]:
# get the feature names from pipeline
def get_feature_names(sklobj, feature_names=[]):

    if isinstance(sklobj, Pipeline):
        for name, step in sklobj.steps:
            get_feature_names(step, feature_names)
    elif isinstance(sklobj, ColumnTransformer):
        for name, transformer, columns in sklobj.transformers_:
            feature_names += get_feature_names(transformer, columns)
    elif isinstance(sklobj, OneHotEncoder):
        feature_names = sklobj.get_feature_names(feature_names).tolist()
    elif isinstance(sklobj, str):
        if sklobj == "passthrough":
            pass
        elif sklobj == "drop":
            feature_names = []
            
    return feature_names

In [None]:
get_feature_names(pl)

In [None]:
# # this is slow ...
# r = permutation_importance(pl, X_train.head(1_000), y_train.head(1_000), n_repeats=2, random_state=0)

# feature_names = get_feature_names(pl)

# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
#         print(f"{feature_names[i]:<8}"
#         f"{r.importances_mean[i]:.3f}"
#         f" +/- {r.importances_std[i]:.3f}")

## a more advanced pipeline

In [None]:
# TO DO ...