# Get started with your EDA

## imports

In [None]:
import warnings
from os import makedirs, path

import pandas as pd

from utils.data_dictionary import methods
from utils.load_data import read_online_proms_data, downcast, structure_name, get_meta, clean_data

In [None]:
# Yearly PROMs CSV data packs (zip archives), linked from
# https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/patient-reported-outcome-measures-proms
URL_PROMS_DATA = [
    # 2016-17, finalised
    "https://files.digital.nhs.uk/6C/A1D581/CSV%20Data%20Pack%202016-17%20Finalised.zip",
    # 2017-18, final
    "https://files.digital.nhs.uk/70/5176AA/CSV%20Data%20Pack%20Final%201718.zip",
    # 2018-19, finalised
    "https://files.digital.nhs.uk/52/A8FF7F/PROMs%20CSV%20Data%20Pack%20Finalised%202018-19.zip",
    # 2019-20, provisional
    "https://files.digital.nhs.uk/1F/51FEDE/PROMs%20CSV%20Data%20Pack%20Provisional%201920.zip",
]

# directory (relative to this notebook) where the parquet copy is kept
DATA_PATH = "../data"

# selectors handed to read_online_proms_data as `part` and `org`;
# they also make up the parquet file name ("<PART>-<ORG>.parquet")
PART = "knee"
ORG = "provider"

## load data

In [None]:
%%time

# define path location
file_name = f"{PART}-{ORG}.parquet"
full_path = path.join(DATA_PATH, file_name)

# load from disk if present,
# otherwise get it directly from NHS-source + rename columns + save to parquet
if path.isfile(full_path):
    df_raw = pd.read_parquet(full_path)
else:
    df_raw = read_online_proms_data(urls=URL_PROMS_DATA, part=PART, org=ORG).apply(downcast)
    df_raw.columns = (
        df_raw.columns.str.replace("Pre-Op Q", "t0")
        .str.replace("Post-Op Q", "t1")
        .str.replace("Knee Replacement", "oks")
        .str.replace("Hip Replacement", "ohs")
        .str.replace("-", "_")
        .str.replace(" ", "_")
        .str.lower()
    )
    df_raw.to_parquet(full_path)

# display basic info df_raw    
df_raw.info(verbose=False)

## basic cleaning

In [None]:
%%time
# restructure names
df_raw.columns = [structure_name(col) for col in df_raw.columns]

# construct a meta data df
df_meta = get_meta(df_raw.columns)  

# clean the data based on meta (all not in range, labels or label "missing")
df_clean = clean_data(df_raw, df_meta).apply(downcast)

# raise warning if somewhere there is a hidden 9 as max
if (df_clean.describe(include="all").loc["max"]==9).any():
    to_check = list(df_clean.loc[:, df_clean.describe(include="all").loc["max"]==9].columns)
    warning.warn("Please check the folowing columns for meta data:")
    display(to_check)
    
df_clean.sample(3)

### drop revisions and last year of data

In [None]:
# keep rows whose revision flag is 0 and whose year is not '2019/20'
# (the 2019/20 pack in URL_PROMS_DATA is the provisional one);
# the flag column is the same for every remaining row, so it is dropped
df_eda = (
    df_clean
    .query("t0_revision_flag == 0 & t0_year != '2019/20'")
    .drop(columns="t0_revision_flag")
)

# list the year values present after filtering
df_eda["t0_year"].unique()

## simple descriptives

In [None]:
# summary statistics for all columns of the filtered frame (include="all"
# covers non-numeric columns as well as numeric ones)
df_eda.describe(include="all")

In [None]:
# short summary of the filtered frame; verbose=False skips the full per-column listing
df_eda.info(verbose=False)

In [None]:
# It's up to you now!