# first setup for EDA
This notebook gives a first setup for the EDA of the NHS data.
The wonderful work by Laurence is presented in the following [notebook](https://github.com/laurencefrank/NHS-PROMs/blob/main/notebooks/nb1_NHS_PROMs_G5.ipynb).

## imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
import re

from utils.load_data import dir_digger, read_csv_zip, structure_name, get_meta, clean_data
from utils.data_dictionary import methods, column_meta


In [None]:
# Relative locations of the external (raw download) and interim data folders.
# Forward slashes are used on purpose: they work on Windows, Linux and macOS,
# and avoid the invalid escape sequences ("\d", "\e", "\i") that the
# backslash-separated literals contained.
EXT_DATA = "../data/external"
INT_DATA = "../data/interim"

## parse files

### load + clean interim

In [None]:
# Collect every parquet file below the interim data folder.
files = dir_digger(INT_DATA, ext=".parquet")
print(f"Found {len(files)} files:")
display(files)


def _load_joint(paths, joint):
    """Read and combine all parquet files whose path contains ``joint``.

    Parameters
    ----------
    paths : iterable
        Candidate file paths; each must support the ``joint in path``
        substring test (presumably plain strings -- verify against
        ``dir_digger``).
    joint : str
        Substring that selects the files to load, e.g. ``"hip"``.

    Returns
    -------
    pandas.DataFrame
        The selected files concatenated, with duplicate rows dropped and the
        columns renamed through ``structure_name``.

    Raises
    ------
    ValueError
        If no path contains ``joint``.
    """
    selected = [path for path in paths if joint in path]
    if not selected:
        # pd.concat([]) would raise a far less helpful "No objects to concatenate".
        raise ValueError(f"No parquet files found with {joint!r} in their path")
    return (
        pd.concat([pd.read_parquet(path) for path in selected])
        .drop_duplicates()
        .rename(structure_name, axis=1)
    )


df_hip_raw = _load_joint(files, "hip")
df_knee_raw = _load_joint(files, "knee")

In [None]:
# Build the per-column metadata from the column names and use it to clean the
# raw hip data.
df_meta = get_meta(df_hip_raw.columns)
df_hip_clean = clean_data(df_hip_raw, df_meta)

# Flag columns whose maximum is still 9 after cleaning so their metadata can be
# reviewed.
# NOTE(review): 9 is presumably a "missing" code that clean_data should have
# replaced -- confirm against the data dictionary.
col_max = df_hip_clean.describe(include="all").loc["max"]
to_check = list(col_max[col_max == 9].index)
if to_check:
    # `warnings` (plural) is the imported module; `warning.warn` raised a NameError.
    warnings.warn("Please check the folowing columns for meta data:")
    display(to_check)

## general descriptives

In [None]:
# Preview the first five rows of the cleaned hip data.
df_hip_clean.head(5)

In [None]:
# Column dtypes and non-null counts of the cleaned hip data.
df_hip_clean.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df_hip_clean.describe(include="all")

In [None]:
# First five rows of the raw (uncleaned) hip data, for comparison with the
# cleaned frame.
df_hip_raw.head(5)

## first plots

In [None]:
# Select the columns that belong to a method, leaving out the "predicted" and
# "profile" features.
is_method_col = df_meta["method"].notna() & ~df_meta["feature"].isin(
    ["predicted", "profile"]
)
method_cols = is_method_col[is_method_col].index

# Split every column name into (time point, method, feature): a "t0"/"t1"
# prefix, an optional method name and the remaining feature name.
col_pattern = re.compile(
    fr"^(?P<t>t[01])_?(?P<method>{'|'.join(methods)})?_(?P<feature>.*)$"
)
matches = [col_pattern.search(col) for col in method_cols]
unparsed = [col for col, match in zip(method_cols, matches) if match is None]
if unparsed:
    # Fail with the offending names instead of an AttributeError on None.
    raise ValueError(f"Column names do not match the expected pattern: {unparsed}")
new_index = [match.groups() for match in matches]

df_methods = df_hip_clean[method_cols]
names = ["available", "method", "feature"]
df_methods.columns = pd.MultiIndex.from_tuples(new_index, names=names)

# Long format: the three column levels become regular columns and the cell
# values end up in a single "response" column.
df_long = df_methods.stack(names).reset_index(names).rename(columns={0: "response"})
df_long.sample(5)

In [None]:
# Split violin plots of the per-feature responses for two methods, with the
# two halves of each violin given by the "available" column.
for method in ["eq5d", "ohs"]:
    # All rows of this method except the "score" feature.
    subset = df_long[(df_long["method"] == method) & (df_long["feature"] != "score")]
    if subset.empty:
        continue

    # Plot at most 10,000 rows; capping at the subset size keeps `sample`
    # from raising when fewer rows are available.
    data = subset.sample(min(len(subset), 10_000)).sort_values("available")

    fig, ax = plt.subplots(figsize=(15, 5))
    sns.violinplot(
        x="feature",
        y="response",
        hue="available",
        data=data,
        palette="muted",
        split=True,
        ax=ax,
    )
    # violinplot has no `title` parameter; the title is set on the Axes.
    ax.set_title(method)

In [None]:
# Inspect the `labels` entry of the EQ-5D dimensions (the same entry is used
# for the y-tick labels of the violin plots).
methods["eq5d"]["dims"]["labels"]

In [None]:
# Inspect the `names` entry of the EQ-5D dimensions.
methods["eq5d"]["dims"]["names"]

In [None]:
# Inspect the `labels` entry of the OHS dimensions (the same entry is used for
# the y-tick labels of the violin plots).
methods["ohs"]["dims"]["labels"]

In [None]:
# Inspect the `names` entry of the OHS dimensions.
methods["ohs"]["dims"]["names"]

In [None]:
    data = (
        df_long[(df_long["method"] == "ohs") & (df_long["feature"] != "score")]
        .sample(10_000)
    )
#     data["values"] = data["values"].map(methods["ohs"]["dims"]["labels"])

fig, ax = plt.subplots(figsize=(18, 5))
ax = sns.violinplot(
    title=method,
    x="feature",
    y="response",
    hue="available",
    hue_order=["t0", "t1"],
    data=data,
    inner=None,
    cut=0,
    palette="muted",
    split=True,
)

# ax = sns.boxenplot(
#     data=data,
#     x="feature",
#     y="values",
#     order=methods["ohs"]["dims"]["names"],
#     hue="available",
#     hue_order=["t0", "t1"],
#     palette="muted",
#     k_depth="proportion",
    
#     scale="linear",
#     width=.7,
# #     split=True,
# )

ax.set_title("OHS features (dimensions)")
labels = methods["ohs"]["dims"]["labels"].copy()
labels.pop(9)
ax.set_yticks(list(labels))
ax.set_yticklabels(labels.values())
plt.show()

In [None]:
# One split violin figure per method that defines dimensions and actually
# occurs in the data.
methods_in_data = set(df_long["method"].unique())  # computed once, not per item
for method in [
    k for k, v in methods.items() if v.get("dims") and (k in methods_in_data)
]:
    # query subset for method, leaving out the "score" feature
    subset = df_long.query(f"method == '{method}' & feature != 'score'")
    # Plot at most 10,000 rows; capping at the subset size keeps `sample`
    # from raising when fewer rows are available.
    data = subset.sample(min(len(subset), 10_000))

    # plot violin per method dimension
    fig, ax = plt.subplots(figsize=(18, 5))
    sns.violinplot(
        x="feature",
        y="response",
        hue="available",
        hue_order=["t0", "t1"],
        data=data,
        inner=None,
        cut=0,
        palette="muted",
        split=True,
        ax=ax,
    )

    ax.set_title(f"{method.upper()} features (dimensions)")
    # The keys of the label mapping become the y-tick positions and its values
    # the tick texts.
    labels = methods[method]["dims"]["labels"].copy()
    # Leave code 9 off the axis; the default avoids a KeyError when a method
    # has no such code.
    # NOTE(review): 9 is presumably the "missing" code -- confirm.
    labels.pop(9, None)
    ax.set_yticks(list(labels))
    ax.set_yticklabels(labels.values())
    plt.show()

In [None]:
# All EQ-5D rows except the "score" feature, selected with boolean masks.
df_long[(df_long["method"] == "eq5d") & (df_long["feature"] != "score")]