# first setup for EDA
This notebooks gives a first setup for the EDA for the NHS data.
The wonderfull work by Laurence resented in the following [notebook](https://github.com/laurencefrank/NHS-PROMs/blob/main/notebooks/nb1_NHS_PROMs_G5.ipynb)

## imports

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
import re

from utils.load_data import dir_digger, read_csv_zip, structure_name, get_meta, clean_data
from utils.data_dictionary import methods, column_meta


In [2]:
EXT_DATA = "..\data\external"
INT_DATA = "..\data\interim"

## parse files

### load + clean interim

In [3]:
files = dir_digger(INT_DATA, ext=".parquet")
print(f"Found {len(files)} files:")
display(files)

df_hip_raw = (
    pd.concat([pd.read_parquet(file) for file in files if "hip" in file])
    .drop_duplicates()
    .rename(structure_name, axis=1)
)
df_knee_raw = (
    pd.concat([pd.read_parquet(file) for file in files if "knee" in file])
    .drop_duplicates()
    .rename(structure_name, axis=1)
)

Found 4 files:


['..\\data\\interim\\hip-ccg.parquet',
 '..\\data\\interim\\hip-provider.parquet',
 '..\\data\\interim\\knee-ccg.parquet',
 '..\\data\\interim\\knee-provider.parquet']

In [4]:
df_meta = get_meta(df_hip_raw.columns)  
df_hip_clean = clean_data(df_hip_raw, df_meta)

if (df_hip_clean.describe(include="all").loc["max"]==9).any():
    to_check = list(df_hip_clean.loc[:, df_hip_clean.describe(include="all").loc["max"]==9].columns)
    warning.warn("Please check the folowing columns for meta data:")
    display(to_check)

## general descriptives

In [5]:
df_hip_clean.head(5)

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
0,00C,Hip Replacement,0.0,2018/19,,,2.0,,4.0,2.0,...,4.0,2.0,4.0,4.0,3.0,4.0,3.0,4.0,43.0,42.200172
1,00C,Hip Replacement,0.0,2018/19,,,1.0,,2.0,2.0,...,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,25.0,33.264294
2,00C,Hip Replacement,0.0,2018/19,,,2.0,,4.0,2.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,33.0,33.289623
3,00C,Hip Replacement,0.0,2018/19,,,2.0,,3.0,2.0,...,3.0,2.0,3.0,3.0,4.0,3.0,3.0,4.0,38.0,34.730251
4,00C,Hip Replacement,0.0,2018/19,,,2.0,,4.0,2.0,...,4.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,42.0,37.042629


In [6]:
df_hip_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247138 entries, 0 to 124843
Data columns (total 81 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   t0_provider_code        247138 non-null  object  
 1   t0_procedure            247138 non-null  category
 2   t0_revision_flag        247138 non-null  float64 
 3   t0_year                 247138 non-null  category
 4   t0_age_band             231201 non-null  object  
 5   t0_gender               231201 non-null  float32 
 6   t0_assisted             244470 non-null  float64 
 7   t0_assisted_by          0 non-null       float64 
 8   t0_symptom_period       244972 non-null  float64 
 9   t0_previous_surgery     245337 non-null  float64 
 10  t0_living_arrangements  243834 non-null  float64 
 11  t0_disability           232612 non-null  float64 
 12  t0_heart_disease        21313 non-null   float64 
 13  t0_high_bp              93756 non-null   float64 
 14  t0_s

In [7]:
df_hip_clean.describe(include="all")

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
count,247138,247138,247138.0,247138,231201,231201.0,244470.0,0.0,244972.0,245337.0,...,245535.0,245524.0,245403.0,244949.0,245184.0,245270.0,245364.0,245229.0,244289.0,240182.0
unique,490,1,,3,8,,,,,,...,,,,,,,,,,
top,RVR,Hip Replacement,,2016/17,70 to 79,,,,,,...,,,,,,,,,,
freq,2908,247138,,87511,92641,,,,,,...,,,,,,,,,,
mean,,,0.051396,,,1.609163,1.844386,,2.133897,1.914285,...,3.344643,2.964737,3.260804,3.481406,3.165459,3.284075,3.456656,3.371,39.6936,38.435741
std,,,0.220805,,,0.487939,0.36249,,0.70902,0.279943,...,0.817938,1.107218,1.172747,0.942803,1.122328,0.968217,0.775134,0.902881,8.709873,4.216595
min,,,0.0,,,1.0,1.0,,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.187914
25%,,,0.0,,,1.0,2.0,,2.0,2.0,...,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,36.0,35.939915
50%,,,0.0,,,2.0,2.0,,2.0,2.0,...,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,43.0,39.032459
75%,,,0.0,,,2.0,2.0,,2.0,2.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,46.0,41.626392


In [8]:
df_hip_raw.head(5)

Unnamed: 0,t0_provider_code,t0_procedure,t0_revision_flag,t0_year,t0_age_band,t0_gender,t0_assisted,t0_assisted_by,t0_symptom_period,t0_previous_surgery,...,t1_ohs_transport,t1_ohs_dressing,t1_ohs_shopping,t1_ohs_walking,t1_ohs_limping,t1_ohs_stairs,t1_ohs_standing,t1_ohs_work,t1_ohs_score,t1_ohs_predicted
0,00C,Hip Replacement,0,2018/19,,,2,0,4,2,...,4,2,4,4,3,4,3,4,43.0,42.200172
1,00C,Hip Replacement,0,2018/19,,,1,0,2,2,...,3,2,2,2,2,2,2,2,25.0,33.264294
2,00C,Hip Replacement,0,2018/19,,,2,0,4,2,...,3,3,3,3,3,3,3,3,33.0,33.289623
3,00C,Hip Replacement,0,2018/19,,,2,0,3,2,...,3,2,3,3,4,3,3,4,38.0,34.730251
4,00C,Hip Replacement,0,2018/19,,,2,0,4,2,...,4,3,4,4,3,4,4,4,42.0,37.042629


## first plots

In [95]:
method_cols = df_meta["method"].notna() & (df_meta["feature"].isin(["predicted", "profile"]) == False)
method_cols = method_cols[method_cols].index

new_index = [
    re.search(
        fr"^(?P<t>t[01])_?(?P<method>{'|'.join(methods)})?_(?P<feature>.*)$", col
    ).groups()
    for col in method_cols
]

df_methods = df_hip_clean[method_cols]
names = ["available", "method", "feature"]
df_methods.columns = pd.MultiIndex.from_tuples(new_index, names=names)

df_long = df_methods.stack(names).reset_index(names).rename(columns={0:"values"})
df_long.sample(5)

Unnamed: 0,available,method,feature,values
85098,t1,ohs,standing,3.0
62400,t1,ohs,sudden_pain,2.0
19788,t0,ohs,washing,3.0
107824,t0,ohs,score,22.0
42011,t0,eqvas,score,70.0


In [112]:
for method in ["eq5d", "ohs"]:
    data = (
        df_long[(df_long["method"] == method) & (df_long["feature"] != "score")]
        .sample(10_000)
        .sort_values("available")
    )

    if len(data):
        fig, ax = plt.subplots(figsize=(15, 5))
        ax = sns.violinplot(
            title=method,
            x="feature",
            y="values",
            hue="available",
            data=data,
            palette="muted",
            split=True,
        )

ValueError: a must be greater than 0 unless no samples are taken

In [109]:
methods["eq5d"]["dims"]["labels"]

{1: 'no problems', 2: 'some problems', 3: 'severe problems', 9: 'missing'}

In [115]:
methods["eq5d"]["dims"]["names"]

('mobility', 'self_care', 'activity', 'discomfort', 'anxiety')

In [110]:
methods["ohs"]["dims"]["labels"]

{0: 'all of the time',
 1: 'most of the time',
 2: 'often, not just at first',
 3: 'sometimes or just at first',
 4: 'rarely/never',
 9: 'missing'}

In [114]:
methods["ohs"]["dims"]["names"]

('pain',
 'sudden_pain',
 'night_pain',
 'washing',
 'transport',
 'dressing',
 'shopping',
 'walking',
 'limping',
 'stairs',
 'standing',
 'work')