# PyCon UK: Alzheimer's Disease Challenge Hackathon
### Create local train/test dataset


In [None]:
from dateutil import rrule
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn import model_selection, preprocessing, pipeline

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Remove pandas' limit on the number of columns shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read the TADPOLE table from the data directory (path is relative to the
# notebook's working directory) and preview its first rows.
DATA_DIR = Path('../data')
tadpole = pd.read_csv(
    DATA_DIR.joinpath('TADPOLE_LB1_LB2.csv'),
    low_memory=False,  # parse the file in one pass instead of in chunks
)
tadpole.head(n=5)

In [None]:
# Show the first ten column labels of the loaded table.
tadpole.columns[:10]

In [None]:
# Column names grouped by kind of measurement.
# Each column belongs to exactly one group, so the groups can be concatenated
# without producing duplicate column labels.

# Outcome columns.
outcomes = ["ADAS13", "DX", "Ventricles"]
# Cognitive test scores.
cog_tests_attributes = ["CDRSB", "ADAS11", "MMSE", "RAVLT_immediate"]
# MRI measures. "FDG" and "AV45" used to be repeated here, but they are PET
# measures and are listed in `pet_measures` only.
mri_measures = ['Hippocampus', 'WholeBrain', 'Entorhinal', 'MidTemp']
# PET measures.
pet_measures = ["FDG", "AV45"]
# CSF measures.
csf_measures = ["ABETA_UPENNBIOMK9_04_19_17", "TAU_UPENNBIOMK9_04_19_17", "PTAU_UPENNBIOMK9_04_19_17"]
# Risk factors.
risk_factors = ["APOE4", "AGE"]

In [None]:
# Add age at exam:
#   AGE_AT_EXAM = (smallest AGE recorded for the participant)
#                 + years elapsed since the participant's earliest EXAMDATE
# NOTE(review): this assumes AGE holds the age at the first visit - confirm
# against the data dictionary.
tadpole.EXAMDATE = pd.to_datetime(tadpole.EXAMDATE)

# Keep the frame ordered by participant and visit date, as before.
tadpole.sort_values(by=["RID", "EXAMDATE"], inplace=True)

# transform() returns one value per row, aligned on tadpole's own index, so
# the assignment below cannot be shifted by row order. The previous version
# copied the result of groupby().apply() in positionally via `.values`, which
# was only correct when that result happened to be ordered exactly like the
# sorted frame.
visits_by_rid = tadpole.groupby("RID")
first_exam = visits_by_rid["EXAMDATE"].transform("min")
baseline_age = visits_by_rid["AGE"].transform("min")

# The name `tadpole_grouped` is kept so cells that reference it still run; it
# now holds the per-visit age indexed like `tadpole`.
tadpole_grouped = (tadpole["EXAMDATE"] - first_exam).dt.days / 365.25 + baseline_age
tadpole["AGE_AT_EXAM"] = tadpole_grouped

# Whole years (truncated). int() raises on a missing age, as it did before.
tadpole['AGE_INT'] = tadpole['AGE_AT_EXAM'].apply(int)

# Create X, y datasets

In [None]:
# Collapse the raw DX labels into three classes: CN, MCI and AD.
# A transition label "A to B" is given the class of its destination B.
dx_map = {
    raw_label: dx_class
    for dx_class, raw_labels in (
        ('CN', ('NL', 'MCI to NL')),
        ('MCI', ('MCI', 'NL to MCI', 'Dementia to MCI')),
        ('AD', ('Dementia', 'MCI to Dementia', 'NL to Dementia')),
    )
    for raw_label in raw_labels
}
# DX values that are not keys of dx_map (including missing ones) become NaN.
tadpole['diagnosis'] = tadpole['DX'].map(dx_map)

In [None]:
# Feature columns. Every name appears exactly once: "FDG" and "AV45" were
# previously listed under both the MRI and the PET group, which made
# tadpole[X_cols] return each of those columns twice.
X_cols = [
    # cog_tests_attributes
    "CDRSB", "ADAS11", "MMSE", "RAVLT_immediate",
    # mri_measures
    'Hippocampus', 'WholeBrain', 'Entorhinal', 'MidTemp',
    # pet_measures
    "FDG", "AV45",
    # csf_measures
    "ABETA_UPENNBIOMK9_04_19_17", "TAU_UPENNBIOMK9_04_19_17", "PTAU_UPENNBIOMK9_04_19_17",
    # risk_factors
    "APOE4", "AGE",
    # age
    'AGE_AT_EXAM',
]

# Feature matrix and the three target columns, all row-aligned with `tadpole`.
X = tadpole[X_cols]
y_diag = tadpole['diagnosis']  # class label derived from DX
y_adas = tadpole['ADAS13']
y_vent = tadpole['Ventricles']

# Create local train/test sets

In [None]:
# Train on all patients except last observation!
# The test set is the most recent visit of every patient who has at least one
# visit dated 2016-01-01 or later.
patients_from_2016 = tadpole[tadpole['EXAMDATE'] >= pd.to_datetime('2016-01-01')]  # 347 of these
recent_rids = patients_from_2016['RID'].unique()

# One grouped pass over the frame instead of filtering, copying and sorting
# the whole table once per patient. idxmax() skips missing exam dates (a
# per-patient sort would put them last and pick them as "most recent") and
# returns the first matching row when several visits share the latest date.
last_visit_idx = (
    tadpole.loc[tadpole['RID'].isin(recent_rids)]
    .groupby('RID')['EXAMDATE']
    .idxmax()
)

# One index label per patient, in the order the patients appear in
# patients_from_2016 (the same order the per-patient loop produced).
test_idx = last_visit_idx.loc[recent_rids].tolist()

In [None]:
# Exclude visits in test data from training data.
# A boolean mask keeps the remaining labels in the frame's current row order;
# a set difference would hand them back in arbitrary hash order.
train_idx = tadpole.index[~tadpole.index.isin(test_idx)].tolist()

In [None]:
# Build the two splits by selecting rows by index label (all columns kept).
train = tadpole.loc[train_idx, :]
test = tadpole.loc[test_idx, :]

In [None]:
# Keep only training visits dated strictly before 2016-01-01. Rows with a
# missing exam date compare False and are dropped as well. The copy detaches
# the result from the frame it was sliced from.
train = train.loc[train['EXAMDATE'].lt(pd.to_datetime('2016-01-01'))].copy()

In [None]:
# Checks
# Display the earliest and latest exam date left in the training data; once
# the 2016 cut-off has been applied the latest one should be before 2016-01-01.
(train['EXAMDATE'].min(), train['EXAMDATE'].max())

In [None]:
# Convert dates to first day of month
def month_year(x):
    """Return midnight on the first day of the month that contains *x*.

    Parameters
    ----------
    x : datetime-like scalar
        Anything exposing ``.year`` and ``.month`` (``pandas.Timestamp``,
        ``datetime.datetime``, ``datetime.date``), or a missing value.

    Returns
    -------
    pandas.Timestamp
        Timezone-naive timestamp for day 1 of ``x``'s month, or
        ``pandas.NaT`` when *x* is missing.
    """
    # min()/max() of an empty or all-missing date column is NaT, which has no
    # usable calendar fields; pass it through instead of raising.
    if pd.isna(x):
        return pd.NaT
    # Build the result from the calendar fields directly rather than
    # formatting to a string and parsing it back.
    return pd.Timestamp(year=x.year, month=x.month, day=1)

# Display the first-of-month date for the earliest exam date in the test set.
month_year(test['EXAMDATE'].min())