# Get started with your EDA

## imports

In [None]:
%load_ext nb_black

In [None]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import warnings
import re

from NHS_PROMs.load_data import load_proms, structure_name
from NHS_PROMs.preprocess import filter_in_range, filter_in_labels, method_delta
from NHS_PROMs.utils import downcast, map_labels
from NHS_PROMs.data_dictionary import meta_dict

## load data
The general approach is deliberately not DRY: the knee and hip DataFrames are kept as separate variables so that both are always at hand, while the notebook stays readable as a script.

In [None]:
# Load the hip data, downcast every column and rename the columns to their
# structured names.
# (knee equivalent, currently disabled)
# df_knee_raw = load_proms(part="knee").apply(downcast).rename(structure_name, axis=1)
df_hip_raw = (
    load_proms(part="hip")
    .apply(downcast)
    .rename(columns=structure_name)
)

# Metadata lookup: every entry of meta_dict is registered under both the
# "t0_" and the "t1_" prefix, then narrowed to the columns present in the
# hip frame.
full_meta = {
    prefix + key: meta
    for key, meta in meta_dict.items()
    for prefix in ("t0_", "t1_")
}
hip_meta = {col: full_meta[col] for col in full_meta if col in df_hip_raw.columns}

df_hip_raw.sample(3)

## basic cleaning

In [None]:
# Column-name suffixes; every column of the raw hip frame whose name ends
# with one of them is collected in cols2drop.
endings = (
    "code", "procedure", "revision_flag", "assisted_by",
    "profile", "score", "predicted",
)
cols2drop = list(filter(lambda name: name.endswith(endings), df_hip_raw.columns))

In [None]:
%%time
# Clean the raw hip frame:
#   1. run each column through filter_in_range, filter_in_labels and
#      map_labels, passing that column's metadata entry as keyword arguments;
#   2. keep only the rows whose t0_revision_flag equals 'no revision';
#   3. drop the columns collected in cols2drop.
# The chain is wrapped in parentheses instead of backslash continuations, so
# individual steps can be commented out without leaving a dangling "\".
df_hip_clean = (
    df_hip_raw.apply(lambda s: filter_in_range(s, **hip_meta[s.name]))
    .apply(lambda s: filter_in_labels(s, **hip_meta[s.name]))
    .apply(lambda s: map_labels(s, **hip_meta[s.name]))
    # must run before the drop: cols2drop includes t0_revision_flag
    .query("t0_revision_flag == 'no revision'")
    .drop(columns=cols2drop)
    # .replace("missing", np.nan)
)

df_hip_clean.sample(3)

## split data

In [None]:
# Split into a seen (train) and an unseen (test) set: rows with t0_year
# '2019/20' form the unseen set, every other row goes to the seen set.
# (knee equivalent, currently disabled)
# df_knee_seen = df_knee_clean.query("t0_year != '2019/20'")
# df_knee_unseen = df_knee_clean.query("t0_year == '2019/20'")
unseen_mask = df_hip_clean["t0_year"] == "2019/20"

df_hip_seen = df_hip_clean[~unseen_mask]
df_hip_unseen = df_hip_clean[unseen_mask]

## create delta dataframes

In [None]:
def _map_back(col):
    """Run map_labels with backwards=True on one column, using its metadata."""
    return map_labels(col, backwards=True, **hip_meta[col.name])


# Map the labels of the seen hip data backwards, pass every column through
# np.asarray, and feed the result to method_delta.
# NOTE(review): the np.asarray step presumably strips the categorical dtype
# before the deltas are computed - confirm against method_delta.
df_org = df_hip_seen.apply(_map_back).apply(np.asarray)
# df_knee_delta = method_delta(df_knee_train)
df_hip_delta = method_delta(df_org)
# The deltas can be joined back onto the original frame if needed,
# e.g. df_hip_seen.join(df_hip_delta)

## Make feature set

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Build the feature frame: keep the columns whose name contains "t0", remove
# the listed columns and drop every row that still has a missing value.
#
# NOTE: this rebinds `cols2drop`, which the basic-cleaning cell also reads;
# re-run the cell that builds the suffix-based list before re-running the
# cleaning cell.
cols2drop = ["t0_provider_code", "t0_procedure", "t0_assisted_by", "t0_eq5d_profile"]
df_hip = (
    # filter() already returns a new frame, so no explicit copy() is needed
    df_hip_clean.filter(regex="t0")
    # All four names end in a suffix ("code", "procedure", "assisted_by",
    # "profile") that was already dropped while building df_hip_clean, so
    # they are normally absent here; errors="ignore" avoids the KeyError
    # that a plain drop() raises for missing labels.
    .drop(columns=cols2drop, errors="ignore")
    .dropna()
)

In [None]:
# `df_hip_meta` is not defined in any earlier cell, so this lookup raised a
# NameError on a fresh kernel. Build it from hip_meta: one row per column
# name, one column per metadata key.
df_hip_meta = pd.DataFrame.from_dict(hip_meta, orient="index")

# Show the metadata of the columns that made it into the feature frame.
df_hip_meta.loc[df_hip.columns]

In [None]:
# Display the feature frame built in the previous cells.
df_hip

In [None]:
# Show the "labels" metadata entry of the t0_age_band column. It is read
# straight from hip_meta because `df_hip_meta` is not defined in any earlier
# cell (NameError on a fresh kernel).
hip_meta["t0_age_band"]["labels"]

In [None]:
# NOTE(review): creates and displays an empty DataFrame; this looks like a
# leftover scratch cell - confirm before removing.
pd.DataFrame()