# Get started with your EDA

## imports

In [None]:
import numpy as np
import pandas as pd
from os import path
import warnings

import altair as alt
from sklearn.decomposition import PCA

from utils.load_data import read_online_proms_data, downcast, structure_name, get_meta, clean_data
from utils.data_dictionary import methods

In [None]:
# PROMs CSV data packs, one zip file per reporting year; taken from
# https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/patient-reported-outcome-measures-proms
URL_PROMS_DATA = [
    # 2016-17 (finalised)
    "https://files.digital.nhs.uk/6C/A1D581/CSV%20Data%20Pack%202016-17%20Finalised.zip",
    # 2017-18 (final)
    "https://files.digital.nhs.uk/70/5176AA/CSV%20Data%20Pack%20Final%201718.zip",
    # 2018-19 (finalised)
    "https://files.digital.nhs.uk/52/A8FF7F/PROMs%20CSV%20Data%20Pack%20Finalised%202018-19.zip",
    # 2019-20 (provisional)
    "https://files.digital.nhs.uk/1F/51FEDE/PROMs%20CSV%20Data%20Pack%20Provisional%201920.zip",
]

# folder that holds the local parquet copy of the data
DATA_PATH = "../data"

# selection that is loaded; both values also end up in the parquet file name
PART = "knee"
ORG = "provider"

## load data

In [None]:
%%time

# define path location of the local parquet copy
file_name = f"{PART}-{ORG}.parquet"
full_path = path.join(DATA_PATH, file_name)

# load from disk if present,
# otherwise get it directly from NHS-source + rename columns + save to parquet
if path.isfile(full_path):
    df_raw = pd.read_parquet(full_path)
else:
    df_raw = read_online_proms_data(urls=URL_PROMS_DATA, part=PART, org=ORG).apply(downcast)
    # all patterns are plain text, so regex=False makes the renaming
    # independent of the pandas default for `regex`
    df_raw.columns = (
        df_raw.columns.str.replace("Pre-Op Q", "t0", regex=False)
        .str.replace("Post-Op Q", "t1", regex=False)
        .str.replace("Knee Replacement", "oks", regex=False)
        .str.replace("Hip Replacement", "ohs", regex=False)
        .str.replace("-", "_", regex=False)
        .str.replace(" ", "_", regex=False)
        .str.lower()
    )
    if len(df_raw) == 0:
        print("No data found!")
    else:
        # saving is best effort: the dataframe stays available when writing fails
        try:
            df_raw.to_parquet(full_path)
        except Exception as err:
            # `Exception` rather than a bare `except`, so KeyboardInterrupt and
            # SystemExit are not swallowed; the reason is shown for debugging
            print(f"Could not save {full_path}, but has the dataframe in memory.")
            print(f"Reason: {err!r}")

# display basic info df_raw
df_raw.info(verbose=False)

## basic cleaning

In [None]:
%%time
# restructure names
df_raw.columns = [structure_name(col) for col in df_raw.columns]

# construct a meta data df
df_meta = get_meta(df_raw.columns)

# clean the data based on meta (all not in range, labels or label "missing")
df_clean = clean_data(df_raw, df_meta).apply(downcast)

# raise warning if somewhere there is a hidden 9 as max
# (describe() is computed once and reused for the check and the column selection)
col_max = df_clean.describe(include="all").loc["max"]
has_max_9 = col_max == 9
if has_max_9.any():
    to_check = list(df_clean.loc[:, has_max_9].columns)
    # the module is imported as `warnings`; `warning.warn` raised a NameError
    warnings.warn("Please check the folowing columns for meta data:")
    display(to_check)

df_clean.sample(3)

### drop revisions and last year of data

In [None]:
# keep the rows with revision flag 0 and a year other than '2019/20';
# the flag column is constant afterwards, so it is dropped
df_eda = (
    df_clean
    .query("t0_revision_flag == 0 & t0_year != '2019/20'")
    .drop(columns="t0_revision_flag")
)

# years that remain after filtering
df_eda["t0_year"].unique()

## simple descriptives

In [None]:
# summary statistics per column; include="all" also covers the non-numeric columns
df_eda.describe(include="all")

In [None]:
# compact overview of df_eda; verbose=False leaves out the per-column listing
df_eda.info(verbose=False)

## really terrible query

In [None]:
# show the column names that can be used in the query of the next cell
df_eda.columns
# df_eda.shape

In [None]:
# one mask per outcome score: lower after the operation (t1) than before (t0)
eq5d_worse = df_eda["t1_eq5d_score"] < df_eda["t0_eq5d_score"]
eqvas_worse = df_eda["t1_eqvas_score"] < df_eda["t0_eqvas_score"]
oks_worse = df_eda["t1_oks_score"] < df_eda["t0_oks_score"]

# rows where all three scores went down, narrowed on satisfaction and success
df_terrible = df_eda[eq5d_worse & eqvas_worse & oks_worse].query(
    "t1_satisfaction >3 & t1_success > 3"
)

# share of df_eda that matches, as a percentage
len(df_terrible) / len(df_eda) * 100

## PCA quick start

In [None]:
# select method columns
# ("method == method" is the usual not-NaN check: NaN never equals itself)
df_meta_methods = df_meta.query("method == method & feature not in ('profile', 'predicted') ")
df_pca = df_eda.loc[:, df_meta_methods.index]

# create MultiIndex
# every column name is split into three parts: t0/t1, an optional method name
# (any key of `methods`) and the remainder of the name
column_pattern = fr"^(t[01])_({'|'.join(methods.keys())})?_?(.*)$"
column_parts = df_pca.columns.str.extract(column_pattern)
df_pca.columns = pd.MultiIndex.from_frame(
    column_parts,
    names=["available", "method", "feature"],
)

# scale before applying PCA
def normalise(s):
    """Rescale a column with the value range that `methods` defines for it.

    `s.name` has to be the (available, method, feature) tuple of the column.
    A "score" feature uses methods[method]["score"]["range"]; any other
    feature uses the smallest and largest key of
    methods[method]["dims"]["labels"] whose label is not "missing".

    Returns (s - lower bound) / (upper bound - lower bound).
    """
    _available, method, feature = s.name
    spec = methods[method]

    if feature != "score":
        valid_keys = [key for key, label in spec["dims"]["labels"].items() if label != "missing"]
        bounds = (min(valid_keys), max(valid_keys))
    else:
        bounds = spec["score"]["range"]

    lower = np.min(bounds)
    width = np.ptp(bounds)
    return (s - lower) / width
    
# shorthand used when slicing the MultiIndex columns
idx = pd.IndexSlice

# rescale every column, then keep only the rows without a missing value
df_pca = (
    df_pca
    .apply(normalise)
    .dropna()
)

# get in correct format:
# move the "available" column level into the rows and drop the first index level
df_pca = (
    df_pca
    .stack(["available"])
    .droplevel(0)
)

In [None]:
def plot_pca(df, title):
    """Bar chart of the explained variance ratio per PCA component.

    PCA is fitted three times: on all rows of `df`, on `df.loc["t0"]` and on
    `df.loc["t1"]`. Every fit becomes one stacked bar, split by component.

    Args:
        df: data to decompose; its index must contain the labels "t0" and "t1".
        title: title of the chart.

    Returns:
        The altair bar chart.
    """
    n_components = len(df.columns)
    subsets = {"t0 + t1": df, "t0": df.loc["t0"], "t1": df.loc["t1"]}
    labels = list(subsets)

    # one array of explained variance ratios per subset, in the order of `labels`
    ratios = [PCA().fit(subset).explained_variance_ratio_ for subset in subsets.values()]

    chart_data = pd.DataFrame(
        data={
            "explained_var": np.hstack(ratios),
            "applied_on": np.repeat(labels, n_components),
            "component": list(range(n_components)) * 3,
        }
    )

    bars = alt.Chart(chart_data).mark_bar()
    encoded = bars.encode(
        x=alt.X("explained_var", scale=alt.Scale(domain=(0, 1))),
        y=alt.Y("applied_on", sort=["t0 + t1", "t0", "t1"]),
        color=alt.Color("component", sort="descending"),
        order=alt.Order("component", sort="ascending"),
    )
    return encoded.properties(title=title)

In [None]:
def biplot(df, x=0, y=1, max_points=5_000):
    """Draw a PCA biplot: sampled observations plus one vector per feature.

    PCA is fitted on all columns of `df`. The observations are plotted on the
    components `x` and `y` and coloured by the index of `df`; every original
    column is drawn as a line from the origin to its weights on both components.

    Args:
        df: data to decompose.
        x: number of the component on the horizontal axis.
        y: number of the component on the vertical axis.
        max_points: upper bound on the number of plotted observations. The
            default equals Altair's default row limit for embedded data.

    Returns:
        A layered altair chart (scatter + feature vectors).
    """
    # get the administration in order
    index_name = df.index.name or "index"
    org_cols = list(df.columns)

    # add PCA components to df
    pca = PCA()
    scores = pca.fit_transform(df)
    # label by the actual number of components: PCA returns fewer components
    # than features when df has fewer rows than columns
    comps = pd.DataFrame(scores, columns=[str(i) for i in range(scores.shape[1])])
    comps = pd.concat([df.reset_index(), comps], axis=1)
    # sample() raises a ValueError when asked for more rows than there are,
    # so never request more than what is available
    comps = comps.sample(n=min(max_points, len(comps)))

    # extract explained variance for title
    expl_var = pca.explained_variance_ratio_
    title = f"PCA biplot of components {x} ({expl_var[x]:.2f} var) and {y} ({expl_var[y]:.2f} var)"

    # create df for features->components
    # (every feature gets two rows: its vector tip and the origin)
    vec = pca.components_[[x, y], :].T
    df_vec = pd.DataFrame(data=np.vstack([vec, np.zeros_like(vec)]),
                          columns=["x", "y"])
    df_vec["feature"] = org_cols * 2

    # use altair to plot results
    components_scatter = alt.Chart(comps).mark_circle(size=10).encode(
        x=f"{x}:Q",
        y=f"{y}:Q",
        color=index_name,
        tooltip=org_cols,
    ).interactive().properties(title=title)

    features_vector = alt.Chart(df_vec).mark_line().encode(
        x="x:Q",
        y="y:Q",
        color="feature",
        tooltip=["feature"]
    ).interactive()

    return components_scatter + features_vector

In [None]:
# only the "score" feature of every method; the feature level is then redundant
df_plot = (
    df_pca
    .loc[:, idx[:, "score"]]
    .droplevel("feature", axis=1)
)
display(
    plot_pca(df_plot, title="PCA on EQ5D, EQVAS, OKS scores"),
    biplot(df_plot),
)

In [None]:
# all eq5d columns except the score
df_plot = (
    df_pca
    .loc[:, "eq5d"]
    .drop(columns="score")
)
display(
    plot_pca(df_plot,title="PCA on EQ5D dimensions"),
    biplot(df_plot),
)

In [None]:
# all oks columns except the score
df_plot = (
    df_pca
    .loc[:, "oks"]
    .drop(columns="score")
)
display(
    plot_pca(df_plot, title="PCA on OKS dimensions"),
    biplot(df_plot),
)

In [None]:
# work on a copy, because the column labels are replaced below
df_plot = df_pca.copy()

# flatten the (method, feature) column tuples into "<method>_<feature>" names
df_plot.columns = ["_".join(levels) for levels in df_plot.columns]

display(
    plot_pca(df_plot, title="PCA on all dimensions and scores"),
    biplot(df_plot),
)

In [None]:
from pca import pca

# fit the model of the `pca` package with as many components as df_pca has columns
model = pca(n_components=df_pca.shape[1])
model.fit_transform(df_pca)

# Plot explained variance
fig, ax = model.plot()

# Scatter first 2 PCs
fig, ax = model.scatter()

# Make biplot with the number of features
fig, ax = model.biplot(n_feat=2)