# Get started with your EDA

## imports

In [1]:
import sys
sys.path.append("..")

import pandas as pd
import warnings
import re

from NHS_PROMs.load.load_data import load_proms, downcast, structure_name, get_meta, clean_data
from NHS_PROMs.utils.data_dictionary import methods, comorbidities

## load data
The general approach is deliberately not DRY: the knee and hip DataFrames are kept as separate, explicitly named variables so that both are always at hand and the notebook stays readable as a script.

In [2]:
# load the raw data and rename the columns to their structured names
df_knee_raw = load_proms(part="knee").rename(columns=structure_name)
df_hip_raw = load_proms(part="hip").rename(columns=structure_name)

# build the meta data frame that belongs to each set of columns
df_knee_meta = get_meta(df_knee_raw.columns)
df_hip_meta = get_meta(df_hip_raw.columns)

## basic cleaning

In [3]:
%%time
# clean the data based on meta (all not in range, labels or label "missing")
# + remove revision rows
df_knee_clean = clean_data(df_knee_raw, df_knee_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)
df_hip_clean = clean_data(df_hip_raw, df_hip_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)

# fill comorbidities mising with 0's as requested
cm_cols = ["t0_" + cm for cm in comorbidities]
df_knee_clean[cm_cols] = df_knee_clean[cm_cols].fillna(0)
df_hip_clean[cm_cols] = df_hip_clean[cm_cols].fillna(0)

# split train + test set
df_knee_train = df_knee_clean.query("t0_year != '2019/20'")
df_knee_test = df_knee_clean.query("t0_year == '2019/20'")

df_hip_train = df_hip_clean.query("t0_year != '2019/20'")
df_hip_test = df_hip_clean.query("t0_year == '2019/20'")

CPU times: user 5.16 s, sys: 5.42 s, total: 10.6 s
Wall time: 11.1 s


## create delta dataframes

In [4]:
def method_delta(df, method_names=None):
    """Compute the change (t1 - t0) for every method column of a PROMs frame.

    Column names are parsed as ``t0_<method>_<feature>`` or
    ``t1_<method>_<feature>``. Columns without a recognised method prefix and
    the ``profile`` / ``predicted`` features are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the t0 and the t1 columns. It is not modified.
    method_names : iterable of str, optional
        Method prefixes to recognise. Defaults to the keys of the imported
        ``methods`` data dictionary.

    Returns
    -------
    pandas.DataFrame
        Frame with the index of ``df`` and one ``delta_<method>_<feature>``
        column per selected method/feature pair. The subtraction aligns on
        column labels, so a pair that exists at only one of the two time
        points yields an all-NaN column.

    Raises
    ------
    ValueError
        If there are no method names to look for.
    KeyError
        If no t0 or no t1 method column is left after the selection.
    """
    if method_names is None:
        method_names = methods.keys()

    # longest name first so a name that is a prefix of another one cannot
    # shadow it; escaped so every name is matched literally
    alternatives = "|".join(
        re.escape(name) for name in sorted(method_names, key=len, reverse=True)
    )
    if not alternatives:
        raise ValueError("method_names must contain at least one method name")
    pattern = fr"^(t[01])_({alternatives})?_?(.*)$"

    # sort_index already returns a new frame, so no extra copy is needed to
    # keep the caller's frame untouched when the columns are replaced below
    wide = df.sort_index(axis=1)
    wide.columns = pd.MultiIndex.from_frame(
        wide.columns.str.extract(pattern),
        names=["available", "method", "feature"],
    )

    # keep only the columns that belong to a method, minus profile/predicted;
    # pd.notna handles NaN as well as pd.NA as the "no method" marker
    keep = [
        pd.notna(method) and feature not in ("profile", "predicted")
        for _, method, feature in wide.columns
    ]
    scores = wide.loc[:, keep]

    # delta = value after (t1) minus value before (t0)
    # NOTE(review): if downcast produced unsigned integer columns this
    # subtraction could wrap around — confirm the dtypes coming in
    df_delta = scores["t1"] - scores["t0"]

    df_delta.columns = ["delta_" + "_".join(col) for col in df_delta.columns]
    return df_delta

# deltas of the training sets, knee first and hip second
df_knee_delta, df_hip_delta = method_delta(df_knee_train), method_delta(df_hip_train)

# the result can be joined back onto the original frame, e.g.:
# df_hip_train.join(df_hip_delta)

## Make feature set

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
# t0 columns that are left out of the feature set
cols2drop = ["t0_provider_code", "t0_procedure", "t0_assisted_by", "t0_eq5d_profile"]

# Keep only the t0 columns. The pattern is anchored because DataFrame.filter
# matches with re.search: a bare "t0" would also select any column that merely
# contains "t0" somewhere in its name. filter() already returns a new frame,
# so no explicit copy of df_hip_clean is needed.
df_hip = (
    df_hip_clean
    .filter(regex="^t0_")
    .drop(columns=cols2drop)
    .dropna()  # drop every row that still has a missing value
)

In [20]:
# meta data rows for exactly the columns that are present in df_hip
# NOTE(review): the displayed output lists kind "categorcial" for
# t0_previous_surgery — looks like a typo in the meta data source; confirm
df_hip_meta.loc[df_hip.columns]

Unnamed: 0,t,method,feature,kind,labels,range
t0_year,0,,year,ordinal,"[{'2016/17': 'April 2016 - April 2017', '2017/...",
t0_age_band,0,,age_band,ordinal,"[{'80 to 89': '80 to 89', '60 to 69': '60 to 6...",
t0_gender,0,,gender,categorical,"[{0: 'not known', 1: 'male', 2: 'female', 9: '...",
t0_assisted,0,,assisted,categorical,"[{1: 'yes', 2: 'no', 9: 'missing'}]",
t0_symptom_period,0,,symptom_period,ordinal,"[{1: 'less than 1 year', 2: '1 to 5 years', 3:...",
t0_previous_surgery,0,,previous_surgery,categorcial,"[{1: 'yes', 2: 'no', 9: 'missing'}]",
t0_living_arrangements,0,,living_arrangements,categorical,[{1: 'with partner / spouse / family / friends...,
t0_disability,0,,disability,categorical,"[{1: 'yes', 2: 'no', 9: 'missing'}]",
t0_heart_disease,0,,heart_disease,categorical,"[{1: 'yes', 9: 'missing'}]",
t0_high_bp,0,,high_bp,categorical,"[{1: 'yes', 9: 'missing'}]",


In [13]:
# inspect the hip feature frame
# NOTE(review): the displayed rows still show empty t0_age_band / t0_gender
# cells although df_hip is built with .dropna(), and this cell's execution
# count (13) is lower than that of the build cell (19) — the output is
# probably stale; re-run the notebook top to bottom to confirm
df_hip

Unnamed: 0,t0_year,t0_age_band,t0_gender,t0_assisted,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,t0_disability,t0_heart_disease,t0_high_bp,...,t0_ohs_washing,t0_ohs_transport,t0_ohs_dressing,t0_ohs_shopping,t0_ohs_walking,t0_ohs_limping,t0_ohs_stairs,t0_ohs_standing,t0_ohs_work,t0_ohs_score
0,2016/17,,,2.0,2.0,1.0,1.0,1.0,0.0,0.0,...,3.0,1.0,2.0,3.0,2.0,1.0,0.0,2.0,2.0,19.0
1,2016/17,,,2.0,3.0,2.0,1.0,1.0,0.0,0.0,...,2.0,2.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,13.0
2,2016/17,,,1.0,2.0,2.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,7.0
3,2016/17,,,2.0,4.0,2.0,1.0,2.0,0.0,0.0,...,2.0,2.0,1.0,3.0,2.0,1.0,2.0,2.0,2.0,21.0
4,2016/17,,,2.0,4.0,2.0,1.0,2.0,0.0,0.0,...,1.0,1.0,1.0,2.0,2.0,0.0,2.0,1.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21874,2019/20,80 to 89,2.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,...,2.0,2.0,3.0,3.0,0.0,1.0,1.0,1.0,1.0,15.0
21876,2019/20,80 to 89,2.0,2.0,2.0,2.0,1.0,2.0,0.0,1.0,...,3.0,2.0,1.0,1.0,4.0,1.0,2.0,1.0,1.0,20.0
21877,2019/20,80 to 89,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,2.0,3.0,2.0,21.0
21878,2019/20,80 to 89,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,...,1.0,1.0,1.0,2.0,4.0,0.0,2.0,1.0,1.0,18.0


In [22]:
# label mapping stored in the meta data for the t0_age_band column
df_hip_meta.loc["t0_age_band", "labels"]

[{'80 to 89': '80 to 89',
  '60 to 69': '60 to 69',
  '50 to 59': '50 to 59',
  '40 to 49': '40 to 49',
  '30 to 39': '30 to 39',
  '70 to 79': '70 to 79',
  '20 to 29': '20 to 29',
  '90 to 120': '90 to 120'}]