# Refactored notebook for modelling

## imports

In [None]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import warnings
import re
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import (
    downcast,
    map_labels,
    fillna_categories,
    pd_fit_resample,
    infer_categories_fit,
    KindSelector,
    get_feature_names,
    remove_categories,
)
from NHS_PROMs.data_dictionary import meta_dict, methods

import shap
shap.initjs()

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector
from sklearn.multioutput import MultiOutputClassifier

# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import (
    ColumnTransformer,
    make_column_transformer,
    make_column_selector,
)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingRegressor,
    BaggingClassifier,
)
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.inspection import permutation_importance
from sklearn import set_config

set_config(display="diagram")

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler 

# Monkey-patch pandas / imblearn / scikit-learn classes with the NHS_PROMs
# helpers imported above, so the notebook can work on categorical columns.

# Replace DataFrame.fillna; per the original author's note the replacement
# copes with fill values that are not an existing category of a
# CategoricalDtype column.
pd.DataFrame.fillna = fillna_categories
# Add a Series-level remove_categories method (used in the cleaning cell).
pd.Series.remove_categories = remove_categories
# Wrap SMOTENC.fit_resample; per the original note this lets SMOTENC detect
# the categorical columns from their CategoricalDtype (by using "infer").
SMOTENC.fit_resample = pd_fit_resample(SMOTENC.fit_resample)
# Wrap the fit method of both encoders; per the original note this lets them
# infer their categories from the CategoricalDtype of the columns.
for encoder_cls in (OneHotEncoder, OrdinalEncoder):
    encoder_cls.fit = infer_categories_fit(encoder_cls.fit)

## load data
The general approach is deliberately not DRY: the knee and hip DataFrames are kept as separate variables so that both are always at hand, which also keeps the notebook readable as a script.

In [None]:
# Load the hip records, apply `downcast` column-wise and rename the columns
# to their structured names. The knee set can be loaded the same way:
# df_knee_raw = load_proms(part="knee").apply(downcast).rename(structure_name, axis=1)
df_hip_raw = load_proms(part="hip").apply(downcast).rename(columns=structure_name)

# Metadata lookup per column: every data-dictionary entry is registered under
# both its "t0_" and its "t1_" prefixed name ...
full_meta = {
    prefix + name: meta
    for name, meta in meta_dict.items()
    for prefix in ("t0_", "t1_")
}
# ... and only the entries that match an actual hip column are kept.
hip_meta = {
    name: meta for name, meta in full_meta.items() if name in df_hip_raw.columns
}

# Cell output: three random rows as a sanity check.
df_hip_raw.sample(3)

## basic cleaning

In [None]:
%%time

# Column-name suffixes of features that are dropped once cleaning is done.
endings = (
    "code",  # is a coded score and not of interest for the case
    "procedure",  # is the same for the hip or knee set
    "revision_flag",  # revisions are out of scope, filtered away, so same for all rows after that
    "assisted_by",  # is the same for all records
    "profile",  # is a coded score and not of interest for the case
    "predicted",  # are predictions of other models that are not supposed to be used
)
cols2drop = [name for name in df_hip_raw.columns if name.endswith(endings)]


def _apply_meta_steps(frame, steps):
    """Apply each step column-wise, passing the column's own metadata as kwargs."""
    for step in steps:
        frame = frame.apply(
            lambda column, step=step: step(column, **hip_meta[column.name])
        )
    return frame


# 1. filter_in_range : filter in range numeric features
# 2. filter_in_labels: filter in labels categorical features + ordinal if ordered
# 3. map_labels      : map the labels as values for readability
df_hip_clean = (
    _apply_meta_steps(df_hip_raw, (filter_in_range, filter_in_labels, map_labels))
    .query("t0_revision_flag == 'no revision'")  # drop revision cases
    .drop(columns=cols2drop)  # drop not needed columns
    .reset_index(drop=True)  # make index unique (prevent blow ups when joining)
)

# Remove NaNs/missing/unknown from numerical and ordinal features: first strip
# the "missing" / "not known" categories, then drop every row that has a NaN
# in one of the numerical or ordinal columns.
without_unknowns = df_hip_clean.apply(
    pd.Series.remove_categories, args=(["missing", "not known"],)
)
required_columns = (
    KindSelector(kind="numerical")(df_hip_clean)
    + KindSelector(kind="ordinal")(df_hip_clean)
)
df_hip_clean = without_unknowns.dropna(subset=required_columns)

# Cell output: three random rows as a sanity check.
df_hip_clean.sample(3)

### Explanation why we can drop years

In [None]:
# def plot_year_histograms(t=0, method="eq5d"):
    
#     facet_cols = ["_".join([f"t{t}", method, dim]) for dim in methods[method]["dims"]["names"]]

#     df_plot = (
#         df_hip_clean[["t0_year"] + facet_cols]
#         .set_index("t0_year")
#         .stack()
#         .reset_index()
#         .set_axis(["year", "dimension", "value"], axis=1)
#     )

#     fig = px.histogram(
#         df_plot,
#         title=f"Distributions of values over the years for method {method} at t{t}",
#         x="value",
#         color="year",
#         barmode="group",
#         histnorm="percent",
#         facet_col="dimension",
#         facet_col_wrap=3,
#         category_orders={"value":list(methods[method]["dims"]["labels"].values())},
#     )

#     fig.update_xaxes(col=3, showticklabels=True, visible=True)
#     fig.update_layout(legend=dict(xanchor="right", x=1, yanchor="bottom", y=0))

#     fig.show()
    
# [plot_year_histograms(t=t) for t in [0, 1]];

## split data

In [None]:
# Set the '2019/20' records apart as unseen data; the records of all other
# years are used for training and testing. The year column is dropped from
# both frames afterwards.
df_hip_unseen = df_hip_clean.query("t0_year == '2019/20'").drop("t0_year", axis=1)
df_hip = df_hip_clean.query("t0_year != '2019/20'").drop("t0_year", axis=1)

# Cell output: three random rows as a sanity check.
df_hip.sample(3)

In [None]:
# Features: every column with "t0" in its name.
X = df_hip.filter(regex="t0")

# Targets (multi-output classification): every column with "t1_eq5d" in its
# name, except the aggregated score.
Y = df_hip.filter(regex="t1_eq5d").drop("t1_eq5d_score", axis=1)
# Single target kept at hand; note that the split below uses the full Y.
y = Y["t1_eq5d_discomfort"]

# Alternative targets, kept for reference:
# # regression:
# y = df_hip["t1_ohs_score"] - df_hip["t0_ohs_score"]
# # single-output classification on category codes:
# y_name = "t1_eq5d_discomfort"
# y_labels = {k:v for k, v, in enumerate(df_hip[y_name].cat.categories)}
# y = df_hip[y_name].cat.codes
# # extra target class derived from the OHS score:
# Y = Y.assign(
#     t1_total_class=pd.cut(
#         df_hip["t1_ohs_score"],
#         bins=[0, 29, 39, 48],
#         labels=["severe-moderate", "mild", "satisfactory"],
#         include_lowest=True,
#     )
# )

# # make a smaller selection of our training data to play with
# X = X.iloc[:1000, :] # [0, 1, 2, 3, 4, -4, -3, -2, -1]]
# y = y.iloc[:1000]

# Train/test split with a fixed seed; y_train / y_test hold all columns of Y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

## make + train a simple pipeline

In [None]:
# One pre-processing step per kind of column; the columns of each kind are
# picked by the project's KindSelector.
numerical_step = ("numerical", StandardScaler(), KindSelector(kind="numerical"))
# categories="categories" is presumably resolved by the patched encoder `fit`
# (infer_categories_fit) -- verify against NHS_PROMs.utils.
categorical_step = (
    "categorical",
    OneHotEncoder(categories="categories", handle_unknown="ignore"),
    KindSelector(kind="categorical"),
)
# NOTE(review): scikit-learn documents only "error" and "use_encoded_value"
# for OrdinalEncoder.handle_unknown -- confirm that "ignore" is accepted by
# the installed version / the patched fit.
ordinal_step = (
    "ordinal",
    OrdinalEncoder(categories="categories", handle_unknown="ignore"),
    KindSelector(kind="ordinal"),
)

# Columns that match none of the selectors are dropped.
ct = ColumnTransformer(
    (numerical_step, categorical_step, ordinal_step),
    remainder="drop",
)

# imblearn pipeline: a placeholder balancing step (no-op for now, so the grid
# search can swap in a sampler), the column transformer, and one
# k-nearest-neighbours classifier per target column.
pipeline_steps = (
    ("balancer", "passthrough"),
    ("by_column_kinds", ct),
    ("model", MultiOutputClassifier(KNeighborsClassifier())),
)
pl = Pipeline(pipeline_steps)

# train the pipeline/model (the fitted pipeline is also the cell output)
pl.fit(X_train, y_train)

In [None]:
# Show how many names get_feature_names returns for the fitted pipeline,
# then list the names themselves as the cell output.
n_feature_names = len(get_feature_names(pl))
display(n_feature_names)
get_feature_names(pl)

## Gridsearch

In [None]:
%%time
# Create the parameter grid to search on.
# Keys use the `<step>__<param>` convention on the steps of `pl`
# ("balancer", "by_column_kinds", "model").

# # standard same as pipeline
# param_grid = dict()

# Tuning different hyper parameters on different models: each dict is one
# sub-grid. Only the BalancedBaggingClassifier sub-grid is active; it swaps the
# estimator wrapped by the MultiOutputClassifier ("model__estimator") and
# leaves the balancer step as a no-op.
# NOTE(review): the commented-out sub-grids replace the whole "model" step
# (e.g. "model": [BaggingClassifier()]) and address "model__n_estimators"
# directly, which would bypass the MultiOutputClassifier wrapper -- confirm
# before re-enabling them with the multi-output targets.
param_grid = [
#     {
#         "balancer": [RandomUnderSampler()],
#         "balancer__replacement": [True, False],
#         "model": [BaggingClassifier()],
#         "model__n_estimators": [10, 100],
#     },
    {
        "balancer": ["passthrough"],
        "model__estimator": [BalancedBaggingClassifier()],
        "model__estimator__n_estimators": [10],
        "model__estimator__replacement": [True],
    },
#     {
#         "balancer": [RandomUnderSampler()],
#         "balancer__replacement": [True, False],
#         "model": [RandomForestClassifier()],
#         "model__n_estimators": [10, 100],
#     },
#     {
#         "balancer": ["passthrough"],
#         "model": [BalancedRandomForestClassifier()],
#         "model__n_estimators": [10, 100],
#         "model__replacement": [True, False],
#         "model__class_weight": [None, "balanced", "balanced_subsample"],
#     },
#     {
#         "balancer": [RandomUnderSampler()],
#         "balancer__replacement": [True, False],
#         "model": [AdaBoostClassifier()],
#         "model__n_estimators": [10, 100],
#     },
#     {
#         "balancer": ["passthrough"],
#         "model": [EasyEnsembleClassifier()],
#         "model__n_estimators": [10, 100],
#         "model__replacement": [True, False],
#     },
]


# Construct the grid search on the pipeline, using GridSearchCV's defaults for
# scoring and cross-validation (scoring="roc_auc_ovo_weighted" is left
# commented out as an option).
GS = GridSearchCV(estimator=pl, param_grid=param_grid)

# Multi-metric alternative:
# GS = GridSearchCV(pl, param_grid=param_grid, scoring=["balanced_accuracy", "f1"], refit=False)

# Train the grid search.
GS.fit(X_train, y_train)

# Show the results: hide the per-split and timing columns and order the
# candidates by their test-score rank.
(
    pd.DataFrame(GS.cv_results_)
    .filter(regex=r"^(?!.*(split|time)).*$")
    .set_index("rank_test_score")
    .sort_index()
)

## predict multioutput proba

In [None]:
# Draw a single random row from the test set and predict the class
# probabilities of every target for it with the refitted grid search.
X_sample = X_test.sample(n=1)
Y_hat = GS.predict_proba(X_sample)

In [None]:
# Status before surgery (t0) of the sampled row, in the long format used for
# plotting: index = "t0_eq5d_<dimension>", "variable" = the reported label and
# "value" = 1 so that the single label fills the whole pie.
df_t0 = (
    X_sample.filter(regex="t0_eq5d")
    .drop(columns="t0_eq5d_score")
#     .assign(
#         t0_total_class=pd.cut(
#             df_hip["t0_ohs_score"],
#             bins=[0, 29, 39, 48],
#             labels=["severe-moderate", "mild", "satisfactory"],
#             include_lowest=True,
#         )
#     )
    .T
    .set_axis(labels=["variable"], axis=1)
    .assign(value=1)
)

# Status after surgery (t1): one row per (dimension, class) with the predicted
# probability, indexed by "t1_eq5d_<dimension>".
# The columns of each predict_proba array follow the fitted estimator's
# `classes_` (sorted labels), which need not match the order of the
# categorical dtype. The labels are therefore read from `classes_` instead of
# being assumed from `cat.categories`; this also copes with targets whose
# estimators saw a different number of classes.
fitted_model = GS.best_estimator_.named_steps["model"]
df_hat = pd.concat(
    [
        pd.DataFrame(
            {"variable": classes, "value": np.ravel(proba)},
            index=pd.Index([dimension] * len(classes), name="dimension"),
        )
        for dimension, classes, proba in zip(Y.columns, fitted_model.classes_, Y_hat)
    ]
)

# Concatenate before and after; "variable" becomes an ordered categorical with
# the category order of the first target column (used for consistent colours).
df = pd.concat([df_t0, df_hat])
df["variable"] = df["variable"].astype(pd.CategoricalDtype(Y.iloc[:,0].cat.categories, ordered=True))
df["value"] = df["value"].astype(float)

## visualisation

In [None]:
# Figure layout: one row per dimension of the method, three columns per row --
# a pie with the status before surgery (t0), an arrow labelled with the
# dimension name, and a pie with the predicted probabilities after (t1).
method = "eq5d"
cols = ["t0", "t1"]
# Strip the "t0_<method>_" / "t1_<method>_" prefix to get the dimension names.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the index labels untouched and
# make every `filter(like=...)` below come back empty.
rows = df.index.str.replace(rf"t[01]_{method}_", "", regex=True).unique().to_list()
colormap = dict(zip(df["variable"].cat.categories, ["green", "orange", "red"]))

n_cols = len(cols) + 1  # the two pie columns plus the arrow column in between
row_specs = [{"type": "domain"}, {"type": "xy"}, {"type": "domain"}]
specs = [row_specs] * len(rows)
fig = make_subplots(
    rows=len(rows),
    cols=n_cols,
    specs=specs,
    # Titles on the first row only; the remaining cells get an empty title.
    subplot_titles=["before", "surgery", "after"] + [""] * n_cols * (len(rows) - 1),
)

# Pies: "t0" goes in the first figure column, "t1" in the third.
pie_columns = dict(zip(cols, (1, 3)))
for row_number, dimension in enumerate(rows, start=1):
    for moment, col_number in pie_columns.items():
        df_subplot = df.filter(like=f"{moment}_{method}_{dimension}", axis=0)
        fig.add_trace(
            go.Pie(
                labels=df_subplot["variable"].to_list(),
                values=df_subplot["value"].to_list(),
                marker={"colors": df_subplot["variable"].map(colormap).to_list()},
            ),
            row=row_number,
            col=col_number,
        )

# Arrow in the middle column: a filled polygon drawn as a closed line, with
# the dimension name as an annotation at its centre.
arrow = go.Scatter(
    x=[2, 1, 1, -1, -1, 1, 1, 2],
    y=[0, 2, 1, 1, -1, -1, -2, 0],
    mode="lines",
    line_color="gray",
    fill="toself",
)
for row_number, dimension in enumerate(rows, start=1):
    fig.add_trace(arrow, row=row_number, col=2)
    fig.add_annotation(
        x=0, y=0,
        text=dimension,
        font={"size": 18},
        showarrow=False,
        row=row_number, col=2,
    )

# No legend, fully transparent backgrounds.
fig.update_layout(
    showlegend=False,
    height=1000,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)'
)

# Hide the axes of the arrow column.
fig.update_xaxes(visible=False, col=2)
fig.update_yaxes(visible=False, col=2)

# Before-pies show the label, after-pies show percentages, arrows have no hover.
fig.update_traces(hoverinfo='none', textinfo='label', col=1)
fig.update_traces(hoverinfo='none', col=2)
fig.update_traces(hoverinfo='label', textinfo='percent', col=3)
fig.show()


In [None]:
# NOTE(review): `p` is not assigned anywhere in the visible part of this
# notebook, so unless it is defined outside this view the cell raises
# NameError on a fresh run. Looks like a leftover scratch cell -- confirm,
# then remove it or define `p`.
p[2]