# Get started with your EDA

## imports

In [1]:
import pandas as pd
import warnings
import re

from utils.load_data import load_proms, downcast, structure_name, get_meta, clean_data
from utils.data_dictionary import methods, comorbidities

## load data
General approach is not DRY for the sake of availability of having knee and hip df's always at hand, but also keep it readable (script-wise).

In [2]:
# load data + rename columns with structired name
df_knee_raw=load_proms(part="knee").rename(structure_name, axis=1)
df_hip_raw=load_proms(part="hip").rename(structure_name, axis=1)

# get df with meta data for each
df_knee_meta = get_meta(df_knee_raw.columns)
df_hip_meta = get_meta(df_hip_raw.columns)  

In [3]:
methods["ohs"]["dims"]["labels"]

{0: 'all of the time',
 1: 'most of the time',
 2: 'often, not just at first',
 3: 'sometimes or just at first',
 4: 'rarely/never',
 9: 'missing'}

In [4]:
[(key, value) for key, value in methods["ohs"]["dims"]["labels"].items()] # keys() / values()

[(0, 'all of the time'),
 (1, 'most of the time'),
 (2, 'often, not just at first'),
 (3, 'sometimes or just at first'),
 (4, 'rarely/never'),
 (9, 'missing')]

In [5]:
labels = methods["ohs"]["dims"]["labels"]
[k for k, v in labels.items() if v != "missing"]

[0, 1, 2, 3, 4]

In [6]:
df_hip_meta.query("feature != 'predicted'")

Unnamed: 0,t,method,feature,kind,labels,range
t0_provider_code,0,,provider_code,categorical,,
t0_procedure,0,,procedure,categorical,"[{'Hip Replacement': 'hip Replacement', 'Knee ...",
t0_revision_flag,0,,revision_flag,categorical,"[{0: 'no revision', 1: 'revision procedure'}]",
t0_year,0,,year,ordinal,"[{'2016/17': 'April 2016 - April 2017', '2017/...",
t0_age_band,0,,age_band,ordinal,"[{'80 to 89': '80 to 89', '60 to 69': '60 to 6...",
...,...,...,...,...,...,...
t1_ohs_limping,1,ohs,limping,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",
t1_ohs_stairs,1,ohs,stairs,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",
t1_ohs_standing,1,ohs,standing,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",
t1_ohs_work,1,ohs,work,ordinal,"[{0: 'all of the time', 1: 'most of the time',...",


## basic cleaning

In [7]:
%%time
# clean the data based on meta (all not in range, labels or label "missing")
# + remove revision rows
df_knee_clean = clean_data(df_knee_raw, df_knee_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)
df_hip_clean = clean_data(df_hip_raw, df_hip_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)

# fill comorbidities mising with 0's as requested
cm_cols = ["t0_" + cm for cm in comorbidities]
df_knee_clean[cm_cols] = df_knee_clean[cm_cols].fillna(0)
df_hip_clean[cm_cols] = df_hip_clean[cm_cols].fillna(0)

# split train + test set
df_knee_train = df_knee_clean.query("t0_year != '2019/20'")
df_knee_test = df_knee_clean.query("t0_year == '2019/20'")

df_hip_train = df_hip_clean.query("t0_year != '2019/20'")
df_hip_test = df_hip_clean.query("t0_year == '2019/20'")

Wall time: 17.4 s


In [8]:
df_knee_train.loc[:, ["t0_heart_disease", "t0_high_bp", "t0_stroke", "t0_circulation"]]

Unnamed: 0,t0_heart_disease,t0_high_bp,t0_stroke,t0_circulation
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0
...,...,...,...,...
45630,0.0,1.0,0.0,1.0
45631,0.0,0.0,0.0,0.0
45632,1.0,1.0,0.0,0.0
45633,0.0,1.0,0.0,0.0


In [9]:
df_knee_clean[cm_cols]

Unnamed: 0,t0_heart_disease,t0_high_bp,t0_stroke,t0_circulation,t0_lung_disease,t0_diabetes,t0_kidney_disease,t0_nervous_system,t0_liver_disease,t0_cancer,t0_depression,t0_arthritis
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
24591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24593,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
cm_cols

['t0_heart_disease',
 't0_high_bp',
 't0_stroke',
 't0_circulation',
 't0_lung_disease',
 't0_diabetes',
 't0_kidney_disease',
 't0_nervous_system',
 't0_liver_disease',
 't0_cancer',
 't0_depression',
 't0_arthritis']

In [11]:
comorbidities.keys()

dict_keys(['heart_disease', 'high_bp', 'stroke', 'circulation', 'lung_disease', 'diabetes', 'kidney_disease', 'nervous_system', 'liver_disease', 'cancer', 'depression', 'arthritis'])

## create delta dataframes

Unnamed: 0,t0_provider_code,t0_procedure,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
0,ADP02,Hip Replacement,2016/17,,,2.0,,2.0,1.0,1.0,...,3.0,2.0,4.0,4.0,3.0,2.0,2.0,3.0,36.0,37.773033
1,ADP02,Hip Replacement,2016/17,,,2.0,,3.0,2.0,1.0,...,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,46.0,35.586815
2,ADP02,Hip Replacement,2016/17,,,1.0,,2.0,2.0,1.0,...,2.0,3.0,2.0,4.0,1.0,2.0,4.0,2.0,31.0,32.934055
3,ADP02,Hip Replacement,2016/17,,,2.0,,4.0,2.0,1.0,...,2.0,2.0,2.0,1.0,1.0,2.0,2.0,0.0,15.0,38.679630
4,ADP02,Hip Replacement,2016/17,,,2.0,,4.0,2.0,1.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,47.0,36.867462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41272,RYR,Hip Replacement,2018/19,80 to 89,2.0,2.0,,2.0,2.0,2.0,...,3.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,41.0,39.284161
41274,RYR,Hip Replacement,2018/19,80 to 89,2.0,2.0,,2.0,2.0,1.0,...,3.0,4.0,2.0,4.0,0.0,2.0,4.0,4.0,39.0,35.703945
41276,RYR,Hip Replacement,2018/19,80 to 89,2.0,2.0,,2.0,2.0,1.0,...,3.0,4.0,1.0,4.0,4.0,3.0,4.0,4.0,36.0,40.521175
41277,RYR,Hip Replacement,2018/19,80 to 89,2.0,2.0,,3.0,2.0,2.0,...,3.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,38.0,41.542564


In [18]:
df = df_hip_train.copy().sort_index(axis=1)
df

Unnamed: 0,t0_age_band,t0_arthritis,t0_assisted,t0_assisted_by,t0_cancer,t0_circulation,t0_depression,t0_diabetes,t0_disability,t0_eq5d_activity,...,t1_ohs_sudden_pain,t1_ohs_transport,t1_ohs_walking,t1_ohs_washing,t1_ohs_work,t1_readmitted,t1_satisfaction,t1_success,t1_urine,t1_wound
0,,1.0,2.0,,0.0,0.0,0.0,0.0,1.0,,...,4.0,3.0,4.0,4.0,3.0,2.0,2.0,1.0,2.0,2.0
1,,0.0,2.0,,0.0,0.0,0.0,0.0,1.0,2.0,...,4.0,3.0,4.0,4.0,4.0,2.0,1.0,1.0,2.0,2.0
2,,1.0,1.0,,0.0,0.0,0.0,0.0,1.0,3.0,...,4.0,2.0,4.0,3.0,2.0,2.0,2.0,1.0,2.0,2.0
3,,0.0,2.0,,0.0,0.0,0.0,0.0,2.0,2.0,...,0.0,2.0,1.0,3.0,0.0,1.0,3.0,2.0,1.0,2.0
4,,0.0,2.0,,0.0,0.0,0.0,0.0,2.0,2.0,...,4.0,4.0,4.0,4.0,4.0,2.0,1.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41272,80 to 89,1.0,2.0,,0.0,0.0,0.0,1.0,1.0,2.0,...,4.0,3.0,3.0,4.0,3.0,2.0,1.0,1.0,2.0,2.0
41274,80 to 89,0.0,2.0,,0.0,0.0,0.0,0.0,1.0,2.0,...,4.0,3.0,4.0,4.0,4.0,2.0,4.0,,2.0,2.0
41276,80 to 89,1.0,2.0,,0.0,0.0,0.0,0.0,2.0,2.0,...,1.0,3.0,4.0,4.0,4.0,2.0,3.0,1.0,2.0,2.0
41277,80 to 89,1.0,2.0,,1.0,0.0,0.0,0.0,2.0,2.0,...,3.0,3.0,4.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0


In [23]:
# df.columns = pd.MultiIndex.from_frame(
#         df.columns.str.extract(fr"^(t[01])_({'|'.join(methods.keys())})?_?(.*)$"),
#         names=["available", "method", "feature"],
#     )
df = df["t0"]
df

method,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,eq5d,...,ohs,ohs,ohs,ohs,NaN,NaN,NaN,NaN,NaN,NaN
feature,age_band,arthritis,assisted,assisted_by,cancer,circulation,depression,diabetes,disability,activity,...,transport,walking,washing,work,previous_surgery,procedure,provider_code,stroke,symptom_period,year
0,,1.0,2.0,,0.0,0.0,0.0,0.0,1.0,,...,1.0,2.0,3.0,2.0,1.0,Hip Replacement,ADP02,0.0,2.0,2016/17
1,,0.0,2.0,,0.0,0.0,0.0,0.0,1.0,2.0,...,2.0,1.0,2.0,1.0,2.0,Hip Replacement,ADP02,0.0,3.0,2016/17
2,,1.0,1.0,,0.0,0.0,0.0,0.0,1.0,3.0,...,1.0,1.0,1.0,0.0,2.0,Hip Replacement,ADP02,0.0,2.0,2016/17
3,,0.0,2.0,,0.0,0.0,0.0,0.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,Hip Replacement,ADP02,0.0,4.0,2016/17
4,,0.0,2.0,,0.0,0.0,0.0,0.0,2.0,2.0,...,1.0,2.0,1.0,0.0,2.0,Hip Replacement,ADP02,0.0,4.0,2016/17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41272,80 to 89,1.0,2.0,,0.0,0.0,0.0,1.0,1.0,2.0,...,2.0,3.0,2.0,1.0,2.0,Hip Replacement,RYR,0.0,2.0,2018/19
41274,80 to 89,0.0,2.0,,0.0,0.0,0.0,0.0,1.0,2.0,...,1.0,1.0,2.0,1.0,2.0,Hip Replacement,RYR,0.0,2.0,2018/19
41276,80 to 89,1.0,2.0,,0.0,0.0,0.0,0.0,2.0,2.0,...,2.0,3.0,3.0,2.0,2.0,Hip Replacement,RYR,0.0,2.0,2018/19
41277,80 to 89,1.0,2.0,,1.0,0.0,0.0,0.0,2.0,2.0,...,3.0,4.0,4.0,3.0,2.0,Hip Replacement,RYR,0.0,3.0,2018/19


In [12]:
def method_delta(df):
    # create MultiIndex
    df = df.copy().sort_index(axis=1)
    df.columns = pd.MultiIndex.from_frame(
        df.columns.str.extract(fr"^(t[01])_({'|'.join(methods.keys())})?_?(.*)$"),
        names=["available", "method", "feature"],
    )
    # select only methods dim and scores + get delta (t1 - t0)
    df = df.loc[
        :, [(m == m) & (f not in ["profile", "predicted"]) for t, m, f in df.columns]
    ]
    df_delta = (df["t1"] - df["t0"])

    df_delta.columns = ["delta_" + "_".join(col) for col in df_delta.columns]
    return df_delta

df_knee_delta = method_delta(df_knee_train)
df_hip_delta = method_delta(df_hip_train)


# now you could join them again with the original df ...
# eg: df_hip_train.join(df_hip_delta)

In [26]:
df_hip_tot = df_hip_train.join(df_hip_delta)
df_hip_tot

Unnamed: 0,t0_provider_code,t0_procedure,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,...,delta_ohs_pain,delta_ohs_score,delta_ohs_shopping,delta_ohs_stairs,delta_ohs_standing,delta_ohs_sudden_pain,delta_ohs_transport,delta_ohs_walking,delta_ohs_washing,delta_ohs_work
0,ADP02,Hip Replacement,2016/17,,,2.0,,2.0,1.0,1.0,...,3.0,17.0,1.0,2.0,0.0,2.0,2.0,2.0,1.0,1.0
0,ADP02,Hip Replacement,2016/17,,,2.0,,2.0,1.0,1.0,...,2.0,17.0,0.0,0.0,1.0,4.0,1.0,1.0,1.0,3.0
0,ADP02,Hip Replacement,2016/17,,,2.0,,2.0,1.0,1.0,...,-1.0,21.0,2.0,2.0,3.0,2.0,1.0,3.0,2.0,2.0
0,ADP02,Hip Replacement,2017/18,,,2.0,,2.0,1.0,1.0,...,3.0,17.0,1.0,2.0,0.0,2.0,2.0,2.0,1.0,1.0
0,ADP02,Hip Replacement,2017/18,,,2.0,,2.0,1.0,1.0,...,2.0,17.0,0.0,0.0,1.0,4.0,1.0,1.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44236,RYR,Hip Replacement,2016/17,80 to 89,2.0,2.0,,2.0,2.0,1.0,...,0.0,10.0,4.0,0.0,1.0,4.0,1.0,1.0,0.0,0.0
44237,RYR,Hip Replacement,2016/17,80 to 89,2.0,2.0,,1.0,2.0,2.0,...,0.0,-2.0,-1.0,1.0,1.0,-2.0,1.0,-1.0,0.0,0.0
44238,RYR,Hip Replacement,2016/17,80 to 89,2.0,2.0,,1.0,2.0,2.0,...,4.0,20.0,0.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0
44239,RYR,Hip Replacement,2016/17,80 to 89,2.0,1.0,,2.0,2.0,2.0,...,3.0,22.0,1.0,2.0,3.0,3.0,2.0,2.0,0.0,1.0


In [54]:
df_hip_tot["t1_success_cat"] = (df_hip_tot["t1_success"]>=3) * 1
df_hip_tot["t1_satisfaction_cat"] = (df_hip_tot["t1_satisfaction"]>=4) * 1
df_hip_tot["t1_combo_cat"] = df_hip_tot["t1_satisfaction_cat"] * df_hip_tot["t1_success_cat"]
df_hip_tot[["t1_success_cat", "t1_satisfaction_cat"]].value_counts()

t1_success_cat  t1_satisfaction_cat
0               0                      309174
                1                       11337
1               1                        9847
                0                        4135
dtype: int64

In [51]:
table = pd.crosstab(df_hip_tot["t1_satisfaction_cat"], df_hip_tot["t1_success_cat"])
table

t1_success_cat,0,1
t1_satisfaction_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
0,309174,4135
1,11337,9847


In [52]:
table.sum(axis=0)

t1_success_cat
0    320511
1     13982
dtype: int64

In [53]:
table.sum(axis=1)

t1_satisfaction_cat
0    313309
1     21184
dtype: int64

## plotjes!

In [55]:
df_hip_tot

Unnamed: 0,t0_provider_code,t0_procedure,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,t0_living_arrangements,...,delta_ohs_stairs,delta_ohs_standing,delta_ohs_sudden_pain,delta_ohs_transport,delta_ohs_walking,delta_ohs_washing,delta_ohs_work,t1_success_cat,t1_satisfaction_cat,t1_combo_cat
0,ADP02,Hip Replacement,2016/17,,,2.0,,2.0,1.0,1.0,...,2.0,0.0,2.0,2.0,2.0,1.0,1.0,0,0,0
0,ADP02,Hip Replacement,2016/17,,,2.0,,2.0,1.0,1.0,...,0.0,1.0,4.0,1.0,1.0,1.0,3.0,0,0,0
0,ADP02,Hip Replacement,2016/17,,,2.0,,2.0,1.0,1.0,...,2.0,3.0,2.0,1.0,3.0,2.0,2.0,0,0,0
0,ADP02,Hip Replacement,2017/18,,,2.0,,2.0,1.0,1.0,...,2.0,0.0,2.0,2.0,2.0,1.0,1.0,0,0,0
0,ADP02,Hip Replacement,2017/18,,,2.0,,2.0,1.0,1.0,...,0.0,1.0,4.0,1.0,1.0,1.0,3.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44236,RYR,Hip Replacement,2016/17,80 to 89,2.0,2.0,,2.0,2.0,1.0,...,0.0,1.0,4.0,1.0,1.0,0.0,0.0,0,0,0
44237,RYR,Hip Replacement,2016/17,80 to 89,2.0,2.0,,1.0,2.0,2.0,...,1.0,1.0,-2.0,1.0,-1.0,0.0,0.0,0,0,0
44238,RYR,Hip Replacement,2016/17,80 to 89,2.0,2.0,,1.0,2.0,2.0,...,1.0,1.0,1.0,1.0,2.0,2.0,2.0,0,0,0
44239,RYR,Hip Replacement,2016/17,80 to 89,2.0,1.0,,2.0,2.0,2.0,...,2.0,3.0,3.0,2.0,2.0,0.0,1.0,0,0,0


In [65]:
df_plot = df_hip_tot[["delta_ohs_score", "delta_eq5d_score", "t1_success_cat", "t1_satisfaction_cat", "t1_combo_cat"]].dropna()
df_plot.sample(5)

Unnamed: 0,delta_ohs_score,delta_eq5d_score,t1_success_cat,t1_satisfaction_cat,t1_combo_cat
11232,29.0,-0.071,0,0,0
20215,22.0,0.59,0,0,0
33925,33.0,0.892,0,0,0
23910,32.0,0.484,0,0,0
37127,22.0,0.672,1,1,1


In [84]:
import altair as alt
alt.Chart(df_plot.sample(5_000)).mark_bar().encode(
    alt.X("delta_ohs_score:Q", bin=True),
    alt.Y("count()"),
    color="t1_success_cat:N"
)

# sp2 = alt.Chart(df_plot.sample(5_000)).mark_bar().encode(
#     alt.X("delta_ohs_score:Q", bin=True),
#     alt.Y("count()")
# )

# sp1 | sp2

In [123]:
def plot_kde(df, x="delta_ohs_score", color="t1_success_cat"):
    return (
        alt.Chart(df_plot.sample(5_000))
        .transform_density(
            x,
            groupby=[color],
            steps=500,
#             extent=[-30, 50],
            as_=[x, "density"],
        )
        .mark_area(opacity=0.5)
        .encode(
            alt.X(x + ":Q"),
            alt.Y("density:Q"),
            color=color + ":N",
        )
    )

In [125]:
tl = plot_kde(df_plot, x="delta_ohs_score", color="t1_success_cat")
tr = plot_kde(df_plot, x="delta_eq5d_score", color="t1_success_cat")
ml = plot_kde(df_plot, x="delta_ohs_score", color="t1_satisfaction_cat")
mr = plot_kde(df_plot, x="delta_eq5d_score", color="t1_satisfaction_cat")
bl = plot_kde(df_plot, x="delta_ohs_score", color="t1_combo_cat")
br = plot_kde(df_plot, x="delta_eq5d_score", color="t1_combo_cat")


display(tl | tr) 
display(ml | mr)
display(bl | br)

In [130]:
df_knee_tot = df_knee_train.join(df_knee_delta)

df_knee_tot["t1_success_cat"] = (df_knee_tot["t1_success"]>=3) * 1
df_knee_tot["t1_satisfaction_cat"] = (df_knee_tot["t1_satisfaction"]>=4) * 1
df_knee_tot["t1_combo_cat"] = df_knee_tot["t1_satisfaction_cat"] * df_knee_tot["t1_success_cat"]
df_knee_tot[["t1_success_cat", "t1_satisfaction_cat"]].value_counts()

df_plot = df_knee_tot[["delta_oks_score", "delta_eq5d_score", "t1_success_cat", "t1_satisfaction_cat", "t1_combo_cat"]].dropna()

tl = plot_kde(df_plot, x="delta_oks_score", color="t1_success_cat")
tr = plot_kde(df_plot, x="delta_eq5d_score", color="t1_success_cat")
ml = plot_kde(df_plot, x="delta_oks_score", color="t1_satisfaction_cat")
mr = plot_kde(df_plot, x="delta_eq5d_score", color="t1_satisfaction_cat")
bl = plot_kde(df_plot, x="delta_oks_score", color="t1_combo_cat")
br = plot_kde(df_plot, x="delta_eq5d_score", color="t1_combo_cat")


display(tl | tr) 
display(ml | mr)
display(bl | br)

In [131]:
print("hoi")

hoi
