# Get started with your EDA

## imports

In [3]:
import pandas as pd
import warnings
import re

from utils.load_data import load_proms, downcast, structure_name, get_meta, clean_data
from utils.data_dictionary import methods, comorbidities

## load data
The general approach is deliberately not DRY: the knee and hip DataFrames are kept as separate variables so that both are always at hand, while the notebook stays readable (script-wise).

In [4]:
# load the data for each joint and rename the columns to their structured names
df_knee_raw = load_proms(part="knee").rename(columns=structure_name)
df_hip_raw = load_proms(part="hip").rename(columns=structure_name)

# derive one meta-data frame per joint from the (renamed) column names
df_knee_meta = get_meta(df_knee_raw.columns)
df_hip_meta = get_meta(df_hip_raw.columns)

In [17]:
# show the hip meta data without the rows whose feature is 'predicted'
df_hip_meta[df_hip_meta["feature"] != "predicted"]

Unnamed: 0,t,method,feature,kind,labels,range
t0_provider_code,0,,provider_code,categorical,,
t0_procedure,0,,procedure,categorical,"[{'Hip Replacement': 'hip Replacement', 'Knee ...",
t0_revision_flag,0,,revision_flag,categorical,"[{0: 'no revision', 1: 'revision procedure'}]",
t0_year,0,,year,ordinal,"[{'2016/17': 'April 2016 - April 2017', '2017/...",
t0_age_band,0,,age_band,ordinal,"[{'80 to 89': '80 to 89', '60 to 69': '60 to 6...",
...,...,...,...,...,...,...
t1_ohs_limping,1,ohs,limping,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",
t1_ohs_stairs,1,ohs,stairs,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",
t1_ohs_standing,1,ohs,standing,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",
t1_ohs_work,1,ohs,work,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",


## basic cleaning

In [None]:
%%time
# clean the data based on meta (all not in range, labels or label "missing")
# + remove revision rows
df_knee_clean = clean_data(df_knee_raw, df_knee_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)
df_hip_clean = clean_data(df_hip_raw, df_hip_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)

# fill comorbidities mising with 0's as requested
cm_cols = ["t0_" + cm for cm in comorbidities]
df_knee_clean[cm_cols] = df_knee_clean[cm_cols].fillna(0)
df_hip_clean[cm_cols] = df_hip_clean[cm_cols].fillna(0)

# split train + test set
df_knee_train = df_knee_clean.query("t0_year != '2019/20'")
df_knee_test = df_knee_clean.query("t0_year == '2019/20'")

df_hip_train = df_hip_clean.query("t0_year != '2019/20'")
df_hip_test = df_hip_clean.query("t0_year == '2019/20'")

## create delta dataframes

In [None]:
def method_delta(df, method_names=None, exclude_features=("profile", "predicted")):
    """Return the per-feature change (t1 - t0) for every method column of ``df``.

    Column names are expected to look like ``t0_<method>_<feature>`` /
    ``t1_<method>_<feature>``. Columns that carry no known method prefix, and
    columns whose feature is listed in ``exclude_features``, are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``t0_*`` and ``t1_*`` columns. It is not modified.
    method_names : iterable of str, optional
        Method prefixes to recognise. Defaults to the keys of the
        module-level ``methods`` dictionary.
    exclude_features : collection of str, optional
        Feature names that are dropped before the delta is computed.

    Returns
    -------
    pandas.DataFrame
        Same row index as ``df``; one column per (method, feature) pair,
        named ``delta_<method>_<feature>``.

    Raises
    ------
    KeyError
        If no ``t0`` or no ``t1`` method column is left after filtering.
    """
    if method_names is None:
        method_names = methods.keys()

    # Escape the names so they are matched literally, and try longer names
    # first so a method that is a prefix of another one cannot shadow it.
    alternation = "|".join(
        re.escape(name) for name in sorted(method_names, key=len, reverse=True)
    )
    pattern = fr"^(t[01])_({alternation})?_?(.*)$"

    # create MultiIndex (available, method, feature) from the flat column names
    df = df.copy().sort_index(axis=1)
    df.columns = pd.MultiIndex.from_frame(
        df.columns.str.extract(pattern),
        names=["available", "method", "feature"],
    )

    # Keep only columns that belong to a method (the method group is missing
    # when the optional regex group did not match) and are not excluded.
    keep = [
        pd.notna(method) and feature not in exclude_features
        for _, method, feature in df.columns
    ]
    df = df.loc[:, keep]

    # Subtraction aligns on (method, feature); a pair that exists at only one
    # time point therefore ends up as an all-NaN column.
    df_delta = df["t1"] - df["t0"]

    df_delta.columns = ["delta_" + "_".join(col) for col in df_delta.columns]
    return df_delta

# compute the deltas on the training splits of both joints
df_knee_delta, df_hip_delta = map(method_delta, (df_knee_train, df_hip_train))

# the deltas can be joined back onto the frame they were derived from, e.g.:
#   df_hip_train.join(df_hip_delta)