# first setup for EDA
This notebook gives a first setup for the EDA of the NHS data.
See also the wonderful work by Laurence presented in the following [notebook](https://github.com/laurencefrank/NHS-PROMs/blob/main/notebooks/nb1_NHS_PROMs_G5.ipynb).

## imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
import re

from sklearn.preprocessing import OrdinalEncoder

from utils.load_data import dir_digger, read_csv_zip, structure_name, get_meta, clean_data
from utils.data_dictionary import methods, column_meta


In [None]:
# Data directories, relative to the folder this notebook is run from.
# Forward slashes are used on purpose: they resolve on Windows, macOS and Linux,
# whereas the backslash spelling relied on invalid string escapes (e.g. "\d")
# and only resolved on Windows.
EXT_DATA = "../data/external"
INT_DATA = "../data/interim"
# Number of rows sampled for each plot.
N_PLOT_SAMPLES = 10_000

## parse files

### load + clean interim

In [None]:
def load_procedure(paths, keyword):
    """Load and merge every parquet file whose path contains `keyword`.

    Parameters
    ----------
    paths : iterable of str
        Candidate parquet file paths.
    keyword : str
        Substring a path must contain to be loaded (e.g. "hip" or "knee").

    Returns
    -------
    pandas.DataFrame
        The concatenated files with duplicate rows dropped and the columns
        renamed through `structure_name`.

    Raises
    ------
    ValueError
        If no path contains `keyword`.
    """
    selected = [path for path in paths if keyword in path]
    if not selected:
        # pd.concat([]) would also raise ValueError, but without saying which
        # procedure is missing
        raise ValueError(f"No parquet files containing '{keyword}' were found")
    return (
        pd.concat([pd.read_parquet(path) for path in selected])
        .drop_duplicates()
        .rename(structure_name, axis=1)
    )


files = dir_digger(INT_DATA, ext=".parquet")
print(f"Found {len(files)} files:")
display(files)

df_hip_raw = load_procedure(files, "hip")
df_knee_raw = load_procedure(files, "knee")

In [None]:
# Procedure explored in the rest of the notebook (df_knee_raw is loaded as well).
df_raw = df_hip_raw

df_meta = get_meta(df_raw.columns)
df_clean = clean_data(df_raw, df_meta)

# Sanity check: list the columns whose maximum is still 9 after cleaning.
# NOTE(review): presumably 9 is a "missing" code that the meta data should
# have mapped away -- confirm against utils.data_dictionary.
col_max = df_clean.describe(include="all").loc["max"]
to_check = list(col_max[col_max == 9].index)
if to_check:
    # the module is `warnings`; the previous `warning.warn` raised a NameError
    warnings.warn("Please check the folowing columns for meta data:")
    display(to_check)

## general descriptives

In [None]:
# peek at the first five rows of the cleaned data
df_clean.head(5)

In [None]:
# column dtypes and non-null counts of the cleaned data
df_clean.info()

In [None]:
# summary statistics for every column, numeric and non-numeric alike
df_clean.describe(include="all")

In [None]:
# NOTE(review): same call as the first cell of this section -- likely redundant
df_clean.head(5)

## first plots

In [None]:
# Reshape the method columns into an easy long format for plotting.

# columns that belong to a method, leaving out predicted and profile features
is_method_col = df_meta["method"].notna() & ~df_meta["feature"].isin(
    ["predicted", "profile"]
)
method_cols = is_method_col[is_method_col].index

# split every column name into its (t, method, feature) parts
col_pattern = re.compile(
    fr"^(?P<t>t[01])_?(?P<method>{'|'.join(methods)})?_(?P<feature>.*)$"
)
new_index = []
for col in method_cols:
    match = col_pattern.search(col)
    if match is None:
        # fail with the offending name instead of "'NoneType' has no attribute 'groups'"
        raise ValueError(f"Column {col!r} does not match the expected naming pattern")
    new_index.append(match.groups())

# copy first, so relabelling the columns never acts on a slice of df_clean
df_methods = df_clean[method_cols].copy()
names = ["available", "method", "feature"]
df_methods.columns = pd.MultiIndex.from_tuples(new_index, names=names)

# stacking all three column levels leaves a single unnamed value column (0)
df_long = df_methods.stack(names).reset_index(names).rename(columns={0: "response"})
df_long.sample(5)

### Data from methods

In [None]:
# Boxen plot of the overall score per method, split by availability (t0 / t1).

# `method == method` keeps only the rows where method is not NaN
scores = df_long.query("method == method & feature in 'score'")
# never ask for more rows than there are: DataFrame.sample raises otherwise
data = scores.sample(min(N_PLOT_SAMPLES, len(scores))).copy()
methods_order = np.sort(data["method"].unique())

# min-max scale every method's score to a percentage of its full range
for method in methods_order:
    min_, max_ = methods[method]["score"]["range"]
    is_method = data["method"] == method
    data.loc[is_method, "response"] = (
        data.loc[is_method, "response"] - min_
    ) / (max_ - min_) * 100

# plot boxen per method score
fig, ax = plt.subplots(figsize=(18, 6))
sns.boxenplot(
    x="method",
    y="response",
    order=methods_order,
    hue="available",
    hue_order=["t0", "t1"],
    data=data,
    palette="muted",
    scale="linear",
    ax=ax,  # draw on the axes created above instead of relying on pyplot state
)

ax.set_title("Method scores (scaled on full range, bin width~count)")
ax.set_ylabel("score [% of range]")
plt.show()

In [None]:
# One split violin plot per method that defines dimensions ("dims") and is
# present in the data.
available_methods = set(df_long["method"].unique())  # computed once, not per candidate
methods_with_dims = [
    name
    for name, spec in methods.items()
    if spec.get("dims") and name in available_methods
]

for method in methods_with_dims:
    # query subset for method: every feature except the overall score
    subset = df_long.query(f"method == '{method}' & feature not in 'score'")
    # never ask for more rows than there are: DataFrame.sample raises otherwise
    data = subset.sample(min(N_PLOT_SAMPLES, len(subset)))

    # plot violin per method dimension
    fig, ax = plt.subplots(figsize=(18, 6))
    sns.violinplot(
        x="feature",
        y="response",
        hue="available",
        hue_order=["t0", "t1"],
        data=data,
        inner=None,
        cut=0,
        palette="muted",
        split=True,
        ax=ax,  # draw on the axes created above instead of relying on pyplot state
    )

    ax.set_title(f"KDE of {method.upper()} features (dimensions)")
    labels = methods[method]["dims"]["labels"].copy()
    # leave code 9 out of the tick labels; tolerate methods that do not define it
    # NOTE(review): presumably 9 is the "missing" answer code -- confirm
    labels.pop(9, None)
    ax.set_yticks(list(labels))
    ax.set_yticklabels(list(labels.values()))
    plt.show()

### Other data available

In [None]:
# Violin plot of the features that do not belong to a method, one figure per
# moment of availability (t0 / t1).
for available in ["t0", "t1"]:
    # `method != method` selects the rows of df_meta where method is NaN
    cols = df_meta.query(fr"method != method & t=='{available[1]}'").index
    # copy first, so relabelling the columns never acts on a slice of df_clean
    data = df_clean.loc[:, cols].copy()
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, so the "t0_"/"t1_" prefix was never removed
    data.columns = pd.Index(
        data.columns.str.replace(r"^t[01]_", "", regex=True), name="feature"
    )
    if available == "t0":
        data = data.drop(columns=["provider_code", "procedure", "year", "age_band"])

    # long format: one row per (feature, value) pair
    melted = data.melt()

    # plot per feature
    fig, ax = plt.subplots(figsize=(18, 6))
    sns.violinplot(
        x="feature",
        y="value",
        order=data.columns,
        inner=None,
        # never ask for more rows than there are: DataFrame.sample raises otherwise
        data=melted.sample(min(N_PLOT_SAMPLES, len(melted))),
        palette="muted",
        ax=ax,  # draw on the axes created above instead of relying on pyplot state
    )

    ax.set_title(f"Distributions of features known on {available}")
    # NOTE(review): the fixed y-range assumes the features are coded 1 / 2 -- confirm
    ax.set_ylim((.5, 2.5))
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
    ax.yaxis.set_visible(False)
    plt.show()