# Group project 5

## Import and load data

In [1]:
from utils.load_data import load_proms, downcast, structure_name, get_meta, clean_data
from utils.data_dictionary import methods, comorbidities
from sklearn.model_selection import train_test_split

import pandas as pd
import warnings
import re
import numpy as np

SEED = 1234

In [2]:
# load data + rename columns with structired name
df_knee_raw=load_proms(part="knee").rename(structure_name, axis=1)
df_hip_raw=load_proms(part="hip").rename(structure_name, axis=1)

# get df with meta data for each
df_knee_meta = get_meta(df_knee_raw.columns)
df_hip_meta = get_meta(df_hip_raw.columns)  

## Basic cleaning

In [3]:
%%time
# Clean the data based on meta (all not in range, labels or label "missing")
# + remove revision rows
df_knee_clean = clean_data(df_knee_raw, df_knee_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)
df_hip_clean = clean_data(df_hip_raw, df_hip_meta)\
                .query("t0_revision_flag == 0")\
                .drop(columns=["t0_revision_flag"])\
                .apply(downcast)

# Fill comorbidities missing with 0's as requested
cm_cols = ["t0_" + cm for cm in comorbidities]
df_knee_clean[cm_cols] = df_knee_clean[cm_cols].fillna(0)
df_hip_clean[cm_cols] = df_hip_clean[cm_cols].fillna(0)

Wall time: 11.2 s


## Create delta frames

In [4]:
def method_delta(df):
    # create MultiIndex
    df = df.copy().sort_index(axis=1)
    df.columns = pd.MultiIndex.from_frame(
        df.columns.str.extract(fr"^(t[01])_({'|'.join(methods.keys())})?_?(.*)$"),
        names=["available", "method", "feature"],
    )
    # select only methods dim and scores + get delta (t1 - t0)
    df = df.loc[
        :, [(m == m) & (f not in ["profile", "predicted"]) for t, m, f in df.columns]
    ]
    df_delta = (df["t1"] - df["t0"])

    df_delta.columns = ["delta_" + "_".join(col) for col in df_delta.columns]
    return df_delta

df_knee_delta = method_delta(df_knee_clean)
df_hip_delta = method_delta(df_hip_clean)

In [5]:
# Join df_seen with df_delta
df_knee_tot = df_knee_clean.join(df_knee_delta).reset_index(drop = True)
df_hip_tot = df_hip_clean.join(df_hip_delta).reset_index(drop = True)

In [6]:
# Define your Y
df_knee_tot["Y"] = (df_knee_tot["delta_oks_score"] < 7)*1
df_hip_tot["Y"] = (df_hip_tot["delta_ohs_score"] < 8)*1

In [7]:
# Split in seen and unseen datasets
df_knee_seen = df_knee_tot.query("t0_year != '2019/20'")
df_knee_unseen = df_knee_tot.query("t0_year == '2019/20'")

df_hip_seen = df_hip_tot.query("t0_year != '2019/20'")
df_hip_unseen = df_hip_tot.query("t0_year == '2019/20'")

## Split train and test set

In [8]:
# Optie 1: Split train and test set -> Stratified ERROR
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)

y = df_knee_seen["Y"]
x = df_knee_seen[df_knee_seen.columns.difference(["Y"])]

for train_index, test_index in split.split(x, y):
    Knee_train = df_knee_seen.loc[train_index]
    Knee_test = df_knee_seen.loc[test_index]
    
print("The distribution of Outcome in the training set is: \n", Knee_train["Y"].value_counts(normalize=True))
print("The distribution of Outcome in the test set is: \n", Knee_test["Y"].value_counts(normalize=True))   

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Int64Index([133057, 266048,  78448, 245138, 227728,\n            ...\n            128612, 299287, 235596, 322108, 295284],\n           dtype='int64', length=74243). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

In [9]:
# Optie 2: Split train and test set -> Train_test_split
y = df_knee_seen["Y"]
x = df_knee_seen[df_knee_seen.columns.difference(["Y"])]

x_knee_train, x_knee_test, y_knee_train, y_knee_test = train_test_split(x, y, train_size = 0.70, stratify = y)

print("The distribution of Outcome in the training set is: \n", y_knee_train.value_counts(normalize=True))
print("The distribution of Outcome in the test set is: \n", y_knee_test.value_counts(normalize=True))   

The distribution of Outcome in the training set is: 
 0    0.861724
1    0.138276
Name: Y, dtype: float64
The distribution of Outcome in the test set is: 
 0    0.861722
1    0.138278
Name: Y, dtype: float64


In [10]:
# Drop all t1 features
x_knee_train = x_knee_train.filter(regex = "t0_")
x_knee_test = x_knee_test.filter(regex = "t0_")

In [11]:
# Drop t0 features not interesting for model
x_knee_train = x_knee_train.drop(columns= ["t0_eqvas_score","t0_provider_code","t0_assisted_by","t0_year","t0_eq5d_profile","t0_procedure","t0_year"])
x_knee_test = x_knee_test.drop(columns= ["t0_eqvas_score","t0_provider_code","t0_assisted_by","t0_year","t0_eq5d_profile","t0_procedure","t0_year"])

In [12]:
# Check percentage missing
x_knee_train.isna().sum().sort_values()/len(x_knee_train)*100

t0_liver_disease          0.000000
t0_arthritis              0.000000
t0_cancer                 0.000000
t0_circulation            0.000000
t0_depression             0.000000
t0_diabetes               0.000000
t0_nervous_system         0.000000
t0_lung_disease           0.000000
t0_stroke                 0.000000
t0_kidney_disease         0.000000
t0_high_bp                0.000000
t0_heart_disease          0.000000
t0_oks_washing            0.086217
t0_oks_pain               0.137136
t0_previous_surgery       0.734097
t0_symptom_period         0.841557
t0_oks_night_pain         0.945268
t0_oks_confidence         0.965572
t0_oks_stairs             0.973070
t0_oks_shopping           0.973694
t0_oks_work               0.979317
t0_oks_transport          0.984940
t0_oks_kneeling           1.018677
t0_oks_limping            1.021489
t0_oks_walking            1.047104
t0_oks_standing           1.047729
t0_assisted               1.118327
t0_oks_score              1.197360
t0_living_arrangemen

In [13]:
# Replace NaN for t0_gender and t0_age_band
x_knee_train["t0_gender"] = x_knee_train["t0_gender"].fillna(0)
x_knee_train["t0_age_band"] = x_knee_train["t0_age_band"].astype(str).replace("nan", "Unknown")

x_knee_test["t0_gender"] = x_knee_test["t0_gender"].fillna(0)
x_knee_test["t0_age_band"] = x_knee_test["t0_age_band"].astype(str).replace("nan", "Unknown")

In [20]:
x_knee_test["t0_age_band"].value_counts()

70 to 79     46751
60 to 69     39987
80 to 89     12638
50 to 59     11446
Unknown       9662
40 to 49       181
90 to 120       23
Name: t0_age_band, dtype: int64

In [15]:
x_knee_train["t0_age_band"].isna().sum()

0

In [16]:
# Remove all NaNs
x_knee_train = x_knee_train.dropna()
x_knee_test = x_knee_test.dropna()

In [17]:
# Loop for OneHotEncoding TRAIN
from sklearn.preprocessing import OneHotEncoder

x_knee_train_ohe = pd.DataFrame()

for Hot in x_knee_train.columns:
    mapping = df_knee_meta.loc[Hot, "labels"]
    OH1 = OneHotEncoder(sparse=False)
    if isinstance(mapping, list):
        data = OH1.fit_transform(x_knee_train[Hot].map(mapping[0]).to_frame())
        columns = OH1.categories_
        df = pd.DataFrame(data = data, columns = columns, index = x_knee_train.index).add_prefix(Hot + "_")
        x_knee_train_ohe = pd.concat([x_knee_train_ohe, df], axis = 1)

In [18]:
y_knee_train_ohe = y_knee_train.loc[x_knee_train_ohe.index]

In [29]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(x_knee_train_ohe.iloc[:250_000,:], y_knee_train_ohe.iloc[:250_000])
y_predict = clf.predict(x_knee_train_ohe.iloc[250_000:,:])


In [21]:
x_knee_train_ohe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 281640 entries, 69850 to 445962
Columns: 123 entries, ('t0_age_band_40 to 49',) to ('t0_symptom_period_more than 10 years',)
dtypes: float64(123)
memory usage: 266.4 MB


In [32]:
from sklearn.metrics import classification_report

print(classification_report(y_knee_train_ohe.iloc[250_000:], y_predict))

              precision    recall  f1-score   support

           0       0.87      0.82      0.85     27310
           1       0.18      0.25      0.21      4330

    accuracy                           0.74     31640
   macro avg       0.53      0.54      0.53     31640
weighted avg       0.78      0.74      0.76     31640

