# Refactored notebook for modelling

## imports

In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import warnings
import re

from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import downcast, map_labels, fillna_categories, pd_fit_resample
from NHS_PROMs.data_dictionary import meta_dict

# use adjusted fillna which can cope with non-existing categories
pd.core.frame.DataFrame.fillna = fillna_categories
pd.core.frame.DataFrame.fillna = fillna_categories

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from sklearn import set_config
set_config(display='diagram')

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline, make_pipeline

# enable autodetect by using "infer" + the use of column names
SMOTENC.fit_resample = pd_fit_resample(SMOTENC.fit_resample)

## load data
General approach is not DRY for the sake of availability of having knee and hip df's always at hand, but also keep it readable (script-wise).

In [2]:
# load data + rename columns with structired name
# df_knee_raw = load_proms(part="knee").apply(downcast).rename(structure_name, axis=1)
df_hip_raw = load_proms(part="hip").apply(downcast).rename(structure_name, axis=1)

# get meta data for each
full_meta = {t + k: v for k, v in meta_dict.items() for t in ["t0_", "t1_"]}
hip_meta = {k: v for k, v in full_meta.items() if k in df_hip_raw.columns}

df_hip_raw.sample(3)

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
29433,RRJ,Hip Replacement,1,2017/18,70 to 79,1.0,2,0,2,1,...,2,3,4,4,4,4,4,4,44.0,36.857944
27105,RN3,Hip Replacement,0,2018/19,70 to 79,1.0,2,0,2,2,...,4,4,4,3,4,3,4,3,42.0,39.813755
2307,RHU,Hip Replacement,0,2019/20,,,2,0,2,2,...,4,4,4,4,4,4,4,4,48.0,39.406384


## basic cleaning

In [3]:
endings = (
    "code",
    "procedure",
    "revision_flag",
    "assisted_by",
    "profile",
    "predicted",
)
cols2drop = [c for c in df_hip_raw.columns if c.endswith(endings)]

In [4]:
%%time
df_hip_clean = (
    df_hip_raw.apply(lambda s: filter_in_range(s, **hip_meta[s.name]))
    .apply(lambda s: filter_in_labels(s, **hip_meta[s.name]))
    .apply(lambda s: map_labels(s, **hip_meta[s.name]))
    .query("t0_revision_flag == 'no revision'")
    .drop(columns=cols2drop)
    #     .replace("missing", np.nan)
)

df_hip_clean.sample(3)

CPU times: user 532 ms, sys: 40.7 ms, total: 572 ms
Wall time: 573 ms


Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
26628,April 2017 - April 2018,70 to 79,female,no,1 to 5 years,no,with partner / spouse / family / friends,yes,missing,missing,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,all of the time,rarely/never,rarely/never,44.0
35066,April 2016 - April 2017,60 to 69,female,no,1 to 5 years,no,with partner / spouse / family / friends,yes,missing,missing,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,48.0
6207,April 2016 - April 2017,60 to 69,female,no,1 to 5 years,no,with partner / spouse / family / friends,yes,missing,missing,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,48.0


## split data

In [5]:
# split train + test set
# df_knee_seen = df_knee_clean.query("t0_year != '2019/20'")
# df_knee_unseen = df_knee_clean.query("t0_year == '2019/20'")

df_hip = df_hip_clean.query("t0_year != '2019/20'")
df_hip_unseen = df_hip_clean.query("t0_year == '2019/20'")

df_hip.sample(3)

Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t1_ohs_washing,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score
3229,April 2016 - April 2017,,,no,1 to 5 years,no,with partner / spouse / family / friends,yes,missing,missing,...,rarely/never,rarely/never,rarely/never,rarely/never,rarely/never,sometimes or just at first,rarely/never,sometimes or just at first,sometimes or just at first,43.0
12707,April 2016 - April 2017,50 to 59,male,no,less than 1 year,no,alone,no,missing,yes,...,rarely/never,rarely/never,"often, not just at first",rarely/never,rarely/never,sometimes or just at first,sometimes or just at first,rarely/never,sometimes or just at first,35.0
38291,April 2017 - April 2018,60 to 69,male,no,1 to 5 years,no,with partner / spouse / family / friends,yes,missing,yes,...,rarely/never,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,sometimes or just at first,"often, not just at first",39.0


## create delta dataframes

In [6]:
# df_org = df_hip_seen.apply(
#     lambda s: map_labels(s, backwards=True, **hip_meta[s.name])
# ).apply(np.asarray)

# # df_knee_delta = method_delta(df_knee_train)
# df_hip_delta = method_delta(df_org)

# # now you could join them again with the original df ...
# # eg: df_hip_train.join(df_hip_delta)
# df_hip_delta.sample(5)

## Make feature set

In [7]:
# asses quickly missing
print(len(df_hip), "original")
print(len(df_hip.dropna()), "after possible total dropna")
(df_hip.isna().sum() / len(df_hip)).sort_values(ascending=False).head(10)

139251 original
103346 after possible total dropna


t0_age_band          0.089507
t0_gender            0.089507
t0_eqvas_score       0.089385
t0_eq5d_score        0.056014
t1_eqvas_score       0.047734
t1_eq5d_score        0.042082
t0_ohs_score         0.010959
t1_ohs_score         0.010908
t0_eq5d_mobility     0.000000
t0_eq5d_self_care    0.000000
dtype: float64

In [8]:
# remove NaNs from non categorical/ordinal columns (numerical)
print(len(df_hip), "original")
num_cols = df_hip.select_dtypes(exclude="category").columns
df_hip = df_hip.dropna(subset=num_cols).fillna(value="missing")

print(len(df_hip), "after dropna on numerical + fillna on categories")

139251 original


ValueError: fill value must be in categories

In [None]:
# create x, y
X = df_hip.filter(regex="t0")
y = (df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"] <= 20).astype(int) # knee <= 7


# create train, test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

## make balanced

In [None]:
print("before:")
display(y_train.value_counts())

# cat_cols = X_train.dtypes == "category"
# cat_cols = X_train.columns[cat_cols] TO DO: Fix with eg column selector

resampler = SMOTENC(categorical_features="infer")
X_train_balanced, y_train_balanced = resampler.fit_resample(X_train, y_train)

print("after:")
display(y_train_balanced.value_counts())

## make + train a simple pipeline

In [None]:
# make the pipeline
ct = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include="category")),
    (StandardScaler(), make_column_selector(dtype_include="number")),
)

pl = make_pipeline(
    SMOTENC(categorical_features="infer"), ct, KNeighborsClassifier()
)

# train the pipeline/model
pl.fit(X_train, y_train)

## predict + evaluate

In [None]:
# make prediction
y_hat = pl.predict(X_test.head(500))

# evaluate
print(classification_report(y_test.head(500), y_hat))

## extract feature names pl

In [None]:
# get the feature names from pipeline
def get_feature_names(sklobj, feature_names=[]):

    if isinstance(sklobj, Pipeline):
        for name, step in sklobj.steps:
            get_feature_names(step, feature_names)
    elif isinstance(sklobj, ColumnTransformer):
        for name, transformer, columns in sklobj.transformers_:
            feature_names += get_feature_names(transformer, columns)
    elif isinstance(sklobj, OneHotEncoder):
        feature_names = sklobj.get_feature_names(feature_names).tolist()
    elif isinstance(sklobj, str):
        if sklobj == "passthrough":
            pass
        elif sklobj == "drop":
            feature_names = []
            
    return feature_names

In [None]:
get_feature_names(pl)

In [None]:
# # this is slow ...
# r = permutation_importance(pl, X_train.head(1_000), y_train.head(1_000), n_repeats=2, random_state=0)

# feature_names = get_feature_names(pl)

# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
#         print(f"{feature_names[i]:<8}"
#         f"{r.importances_mean[i]:.3f}"
#         f" +/- {r.importances_std[i]:.3f}")

## a more advanced pipeline

In [None]:
# TO DO ...