# Refactored notebook for modelling

## imports

In [None]:
import sys
import os

sys.path.append("..")

import numpy as np
import pandas as pd
import pickle
import warnings
import re
import plotly.express as px
import plotly.graph_objects as go

from NHS_PROMs.settings import config
from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import (
    most_recent_file,
    downcast,
    map_labels,
    fillna_categories,
    pd_fit_resample,
    infer_categories_fit,
    KindSelector,
    get_feature_names,
    remove_categories,
)
from NHS_PROMs.data_dictionary import meta_dict, methods

import shap
shap.initjs()

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_selector

# from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import (
    ColumnTransformer,
    make_column_transformer,
    make_column_selector,
)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingRegressor,
    BaggingClassifier,
)
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.inspection import permutation_importance
from sklearn import set_config

set_config(display="diagram")

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler 

# let the sklearn encoders take their categories from CategoricalDtype columns
OrdinalEncoder.fit = infer_categories_fit(OrdinalEncoder.fit)
OneHotEncoder.fit = infer_categories_fit(OneHotEncoder.fit)
# let SMOTENC autodetect the categorical features from CategoricalDtype ("infer")
SMOTENC.fit_resample = pd_fit_resample(SMOTENC.fit_resample)
# swap in a fillna that can cope with categories missing from a CategoricalDtype
pd.DataFrame.fillna = fillna_categories
# give Series a remove_categories method
pd.Series.remove_categories = remove_categories

## load data
The general approach is deliberately not DRY: this keeps both the knee and the hip DataFrames always at hand, while still keeping the notebook readable as a script.

In [None]:
from NHS_PROMs.model import pl, param_grid

class PROMsModel:
    """Load NHS PROMs data for one kind of procedure and train one model per output.

    The outputs to model are read from ``config["outputs"][kind]``. Trained
    models are kept in ``self.models`` (output column name -> fitted
    ``GridSearchCV``) and can be pickled to / restored from the directory
    given by ``config["models"]["path"]``.
    """

    # row filter applied to ``t0_year`` for every supported ``mode`` of load_data
    _MODE_QUERIES = {
        "train": "t0_year != 'April 2019 - April 2020'",
        "predict": "t0_year == 'April 2019 - April 2020'",
    }

    def __init__(self, kind="hip"):
        """Store the procedure kind and look up its output settings.

        Parameters
        ----------
        kind : str
            Key into ``config["outputs"]`` (e.g. "hip").
        """
        self.kind = kind
        self.outputs = config["outputs"][kind]
        self.models = {}

    def load_data(self, mode="train"):
        """Load, preprocess and filter the PROMs data for ``self.kind``.

        Parameters
        ----------
        mode : {"train", "predict"}
            "train" keeps all rows whose ``t0_year`` is not
            'April 2019 - April 2020'; "predict" keeps only those rows.

        Returns
        -------
        pandas.DataFrame
            The preprocessed rows, without the ``t0_year`` column.

        Raises
        ------
        ValueError
            If ``mode`` is not one of the supported values.
        """
        # validate first so a typo does not cost a full data load
        if mode not in self._MODE_QUERIES:
            raise ValueError(f"No valid mode: '{mode}'")

        df = (
            load_proms(part=self.kind)
            .apply(downcast)
            .rename(structure_name, axis=1)
        )

        self.load_meta(df.columns)

        df = self.preprocess(df)

        return df.query(self._MODE_QUERIES[mode]).drop(columns="t0_year")

    def load_meta(self, columns):
        """Set ``self.meta`` to the meta data of the given columns.

        Every entry of ``meta_dict`` is made available under both a "t0_" and
        a "t1_" prefix; only the entries whose prefixed name occurs in
        ``columns`` are kept.
        """
        full_meta = {t + k: v for k, v in meta_dict.items() for t in ["t0_", "t1_"]}
        self.meta = {k: v for k, v in full_meta.items() if k in columns}

    def preprocess(self, df):
        """Clean the raw frame; requires ``load_meta`` to have been called.

        Returns
        -------
        pandas.DataFrame
            The filtered frame with a fresh, unique index.
        """
        # remove certain columns
        endings = config["preprocessing"]["remove_columns_ending_with"]
        # str.endswith accepts a str or a tuple only; a list from the config would raise
        if not isinstance(endings, str):
            endings = tuple(endings)
        cols2drop = [c for c in df.columns if c.endswith(endings)]

        df = (
            df.apply(lambda s: filter_in_range(s, **self.meta[s.name]))  # filter in range numeric features
            .apply(lambda s: filter_in_labels(s, **self.meta[s.name]))  # filter in labels categorical features + ordinal if ordered
            .apply(lambda s: map_labels(s, **self.meta[s.name]))  # map the labels as values for readability
            .query("t0_revision_flag == 'no revision'")  # drop revision cases
            .drop(columns=cols2drop)  # drop not needed columns
            .reset_index(drop=True)  # make index unique (prevent blow ups when joining)
        )

        # remove NaNs/missing/unknown from numerical and ordinal features
        required = KindSelector(kind="numerical")(df) + KindSelector(kind="ordinal")(df)
        df = (
            df.apply(pd.Series.remove_categories, args=(["missing", "not known"],))
            .dropna(subset=required)
        )

        return df

    def split_XY(self, df):
        """Split a frame into inputs ``X`` and outputs ``Y``.

        ``X`` holds every column whose name matches the regex "t0"; ``Y``
        holds the columns named by the keys of ``self.outputs``. Numeric
        output columns are binned with ``pd.cut`` using the keyword arguments
        stored for that column in ``self.outputs``.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(X, Y)``, both copies of the selected data.
        """
        # define inputs and outputs
        X = df.filter(regex="t0").copy()
        # select by an explicit list of names rather than by the dict itself
        Y = df[list(self.outputs)].copy()

        # get cut from settings
        for col in Y.columns:
            if pd.api.types.is_numeric_dtype(Y[col]):
                Y[col] = pd.cut(
                    Y[col],
                    include_lowest=True,
                    **self.outputs[col],
                )

        return X, Y

    def train_models(self):
        """Train one model per output column on the training data.

        The result is stored in ``self.models`` keyed by output column name.
        """
        X, Y = self.split_XY(self.load_data(mode="train"))
        self.models = dict()
        # DataFrame.iteritems was removed in pandas 2.0; items() is the equivalent
        for col, y in Y.items():
            self.models[col] = self.train_model(X, y)

    def train_model(self, X, y):
        """Run a grid search of the pipeline ``pl`` over ``param_grid``.

        Returns
        -------
        GridSearchCV
            The search object after fitting on ``X`` and ``y``.
        """
        GS = GridSearchCV(
            estimator=pl,
            param_grid=param_grid,
            scoring=config["score"],
        )
        # without this call the returned search would be untrained
        GS.fit(X, y)
        return GS

    def save_models(self):
        """Pickle ``self.models`` to ``<models path>/<kind>_<tag>.mdl``.

        ``tag`` is the last five hex digits of the SHA-1 of the pickled
        payload, so the file name follows from the saved content.

        Raises
        ------
        ValueError
            If there are no models to save.
        """
        import hashlib  # local import: only needed here

        if not self.models:
            raise ValueError("No models to save; train or load models first.")

        payload = pickle.dumps(self.models)
        sha = hashlib.sha1(payload).hexdigest()[-5:]
        path = os.path.join("..", config["models"]["path"])
        os.makedirs(path, exist_ok=True)
        filename = f"{self.kind}_{sha}.mdl"
        with open(os.path.join(path, filename), "wb") as fh:
            fh.write(payload)

    def load_models(self, filename=None):
        """Restore ``self.models`` from a pickle file in the models directory.

        Parameters
        ----------
        filename : str, optional
            File to load. When omitted, the file returned by
            ``most_recent_file`` for this kind and the ".mdl" extension is used.

        Raises
        ------
        ValueError
            If no file name is given and no matching file is found.

        Warns
        -----
        UserWarning
            If the given file name does not start with ``"<kind>_"``.
        """
        path = os.path.join("..", config["models"]["path"])
        if filename is None:
            filename = most_recent_file(path, ext=".mdl", prefix=self.kind)
            if filename is None:
                raise ValueError("No correct models found!")
        elif not filename.startswith(f"{self.kind}_"):
            # warn instead of raising: the name is only a hint, loading can still proceed
            warnings.warn(
                f"File '{filename}' does not seem to be having models for {self.kind}"
            )
        # SECURITY: pickle can execute arbitrary code; only load files you trust
        with open(os.path.join(path, filename), "rb") as fh:
            self.models = pickle.load(fh)

    def predict(self, X=None):
        """Predict every output with its trained model.

        NOTE(review): the original method was an unfinished stub; this
        behaviour is a proposal and should be confirmed by the author.

        Parameters
        ----------
        X : pandas.DataFrame, optional
            Input features. When omitted, the "t0" columns of
            ``load_data(mode="predict")`` are used.

        Returns
        -------
        pandas.DataFrame
            One column per entry of ``self.models``, indexed like ``X``.

        Raises
        ------
        ValueError
            If no models have been trained or loaded.
        """
        if not getattr(self, "models", None):
            raise ValueError("No models available; train or load models first.")
        if X is None:
            X = self.load_data(mode="predict").filter(regex="t0")
        return pd.DataFrame(
            {col: model.predict(X) for col, model in self.models.items()},
            index=X.index,
        )

In [None]:
# build the model wrapper for hip procedures
PM = PROMsModel(kind="hip")
# keep 30 randomly sampled rows of the training data in `df`
df = PM.load_data(mode="train").sample(30)
# NOTE: train_models() loads the training data itself; it does not use `df`
PM.train_models()

In [None]:
# show how many feature names get_feature_names reports for the pipeline `pl`,
# then the names themselves (the last expression is the cell output)
display(len(get_feature_names(pl)))
get_feature_names(pl)

## Gridsearch

In [None]:


# tabulate the grid-search results: keep only the columns whose name contains
# neither "split" nor "time", indexed and sorted by test-score rank
(
    pd.DataFrame(GS.cv_results_)
    .filter(regex=r"^(?!.*(split|time)).*$")
    .set_index("rank_test_score")
    .sort_index()
)

## force plot independent of model used

In [None]:
class shap_force_plot:
    """Build SHAP force plots for the best estimator of a fitted grid search.

    The best estimator is split in two: every step but the last is used as
    the preprocessing transform, and the ``predict_proba`` of the last step
    is what the ``shap.KernelExplainer`` explains.
    """

    def __init__(self, GS, X, feature_names=None):
        """Set up the explainer.

        Parameters
        ----------
        GS : fitted search object exposing ``best_estimator_``
        X : background data; it is preprocessed before being handed to the explainer
        feature_names : optional names passed on to ``shap.force_plot``
        """
        best_pipeline = GS.best_estimator_
        # the pipeline is split because the explainer strips the DataFrame
        # before applying the model
        self.preprocess = best_pipeline[:-1].transform
        final_step = best_pipeline[-1]
        self.explainer = shap.KernelExplainer(
            model=final_step.predict_proba,
            data=self.preprocess(X),
            link="identity",
        )
        self.feature_names = feature_names

    def plot(self, X_sample, i):
        """Return the force plot of ``X_sample`` for output index ``i``.

        ``i`` selects both the explainer's expected value and the matching
        SHAP values (presumably one entry per class of ``predict_proba`` -
        TODO confirm against the installed shap version).
        """
        transformed = self.preprocess(X_sample)
        explainer = self.explainer
        return shap.force_plot(
            base_value=explainer.expected_value[i],
            shap_values=explainer.shap_values(transformed)[i],
            feature_names=self.feature_names,
            link="identity",
        )

In [None]:
# number of rows sampled from X_train as background data for the explainer
sample_size = 100
# shorter feature names: drop every "t0_" and "_yes" substring
feature_names = [
    name.replace("t0_", "").replace("_yes", "")
    for name in get_feature_names(GS)
]
fp = shap_force_plot(
    GS,
    X_train.sample(sample_size),
    feature_names=feature_names,
)

In [None]:
# draw one random row and build a force plot for each of the output indices 0, 1 and 2
X = X_train.sample()
p = [fp.plot(X, output_idx) for output_idx in range(3)]

In [None]:
# render every force plot; display() accepts several objects, so no throwaway
# list is built and the cell no longer echoes a list of None values
display(*p)