In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
def create_stratifying_varaible(a, nbins=15, motor_min=0, motor_max=423):
    """Build a per-row stratification label from MOTOR_TOT, DOMSIDE and Gender.

    The label is ``100 * motor_bin + 10 * DOMSIDE + Gender``. For this label
    only, Gender 0.0 is treated as 1.0 and DOMSIDE 3.0 is treated as 1.0, so
    those categories end up in the same stratum.

    Parameters
    ----------
    a : pandas.DataFrame
        Must provide the columns ``MOTOR_TOT``, ``DOMSIDE`` and ``Gender``.
        The frame is left untouched.
    nbins : int, optional
        Number of equally spaced bin edges generated with ``np.linspace``.
    motor_min, motor_max : float, optional
        First and last bin edge for ``MOTOR_TOT``. Values below ``motor_min``
        fall in bin 0; values at or above ``motor_max`` fall in bin ``nbins``.

    Returns
    -------
    pandas.Series
        One label per row, indexed like ``a``.
    """
    bins = np.linspace(motor_min, motor_max, nbins)
    discretized_motor = np.digitize(a.MOTOR_TOT, bins)

    # Merge the categories on derived Series rather than writing into `a`:
    # chained assignment (a.Gender[mask] = ...) either mutates the caller's
    # data or silently does nothing, depending on the pandas version.
    gender = a.Gender.mask(a.Gender == 0.0, 1.)
    domside = a.DOMSIDE.mask(a.DOMSIDE == 3.0, 1.)

    return 100 * discretized_motor + 10 * domside + gender

def gen_Stratified_split(a, strat_variable, test_size=0.2, random_state=None):
    """Split the index of ``a`` into stratified train and test sets.

    Parameters
    ----------
    a : pandas.DataFrame
        Frame whose index values are the identifiers being split; they are
        written to the ``PATNO`` column of the returned frames.
    strat_variable : array-like
        One label per row of ``a``, passed as ``stratify`` to
        ``train_test_split``.
    test_size : float, optional
        Forwarded to ``train_test_split``; defaults to the original 0.2.
    random_state : int or RandomState, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a repeatable split.

    Returns
    -------
    tuple
        ``(train_ids, test_ids, a_train, a_test)`` where the last two are
        single-column ``PATNO`` DataFrames built from the first two.
    """
    # Only the identifiers need splitting; the stratification labels are
    # consumed through `stratify`, so they are not passed as a second array.
    train_ids, test_ids = train_test_split(
        a.index,
        test_size=test_size,
        stratify=strat_variable,
        random_state=random_state,
    )
    a_train = pd.DataFrame(data={'PATNO': train_ids})
    a_test = pd.DataFrame(data={'PATNO': test_ids})
    return train_ids, test_ids, a_train, a_test

def sainty_check(a_train, a_test, a, n_patients=423):
    """Verify the split covers every patient and plot train vs. test distributions.

    Parameters
    ----------
    a_train, a_test : pandas.DataFrame
        Frames with a ``PATNO`` column listing the patients of each split.
    a : pandas.DataFrame
        Per-patient table with a ``PATNO`` column and the plotted columns
        (MOTOR_TOT, NHY, STAI_TOT, Gender, DOMSIDE, SDMTOTAL, PD_MED_USE,
        SFT_TOT, UPSIT_TOT).
    n_patients : int, optional
        Expected total number of patients across both splits.

    Returns
    -------
    tuple
        ``(train_PD, test_PD)``: ``a`` inner-merged with each split on PATNO.

    Raises
    ------
    AssertionError
        If the split sizes, or the merged row counts, do not add up to
        ``n_patients``.
    """
    # Count from the arguments, not from notebook globals, so the check
    # always describes the split that was actually passed in.
    n_split = a_train.shape[0] + a_test.shape[0]
    assert n_split == n_patients, \
        'split holds {} patients, expected {}'.format(n_split, n_patients)

    train_PD = pd.merge(a, a_train, on='PATNO', how='inner')
    test_PD = pd.merge(a, a_test, on='PATNO', how='inner')
    n_merged = train_PD.shape[0] + test_PD.shape[0]
    assert n_merged == n_patients, \
        'merged splits hold {} rows, expected {}'.format(n_merged, n_patients)

    # One inner list per figure; each entry is (column, legend tag, plot kind).
    figures = [
        [('MOTOR_TOT', 'MOT', 'kde'), ('NHY', 'NHY', 'kde'), ('STAI_TOT', 'STAI', 'kde')],
        [('Gender', 'GENDER', 'hist'), ('DOMSIDE', 'DOMSIDE', 'hist'), ('SDMTOTAL', 'SDMT', 'kde')],
        [('PD_MED_USE', 'MED USE', 'kde'), ('SFT_TOT', 'SFT', 'kde'), ('UPSIT_TOT', 'UPSIT', 'kde')],
    ]
    splits = [(train_PD, 'Train', 'r'), (test_PD, 'Test', 'm')]

    # NOTE(review): `shade`, `shade_lowest` and `distplot` are deprecated in
    # recent seaborn releases; they are kept as-is to match the seaborn
    # version this notebook was written against.
    for panels in figures:
        fig, axs = plt.subplots(1, 3, figsize=(15, 5))
        for ax, (column, tag, kind) in zip(axs, panels):
            for frame, split_name, color in splits:
                label = '{} {}'.format(split_name, tag)
                if kind == 'kde':
                    sb.kdeplot(frame[column], color=color, label=label, ax=ax,
                               shade=True, shade_lowest=False)
                else:
                    sb.distplot(frame[column], color=color, label=label, ax=ax,
                                kde=False, norm_hist=True)
    return train_PD, test_PD

Creates a stratified 80/20 split of the 423 PD patients. 

In [None]:
# PUT PATH TO DATA HERE. Defaults to the current working directory so that
# this cell is valid Python and the notebook can run top to bottom.
# NOTE(review): `path` is not referenced by the pd.read_csv calls in this
# notebook, which load 'pd_on.csv' and 'demographics.csv' by bare file name.
path = '.'

In [None]:
# Load the processed data exported by 'Full_Data_Processing.ipynb';
# the first column of the file becomes the row index.
pd_patients = pd.read_csv(filepath_or_buffer='pd_on.csv', index_col=0)

In [None]:
# Load the demographic data exported by 'Demographics_Data_Processing.ipynb'
# (it is merged into pd_patients in a later cell).
demo = pd.read_csv(filepath_or_buffer='demographics.csv')

In [None]:
# Left-join the demographic columns onto the patient rows, keyed by PATNO.
pd_patients = pd.merge(pd_patients, demo, how='left', on='PATNO')

In [None]:
# sanity check: the merged table must hold exactly 423 distinct PATNO values
assert pd_patients['PATNO'].nunique() == 423

In [None]:
# select motor only features: take columns 4..65 by position, then drop the
# PAG_NAME, CMEDTM and EXAMTM columns from that window
# NOTE(review): the 4:66 window depends on the column order of pd_patients.
pd_motor = pd_patients.iloc[:, 4:66].drop(columns=['PAG_NAME', 'CMEDTM', 'EXAMTM'])

In [None]:
# create data frame for stratifying
# Built with a single .assign() so the new columns land on a fresh frame
# instead of being written onto a selection of pd_patients, which raises
# SettingWithCopyWarning. Column order matches the original cell.
df_strat = pd_patients[['PATNO', 'SDMTOTAL', 'NHY']].assign(
    # skipna=False: a visit with any missing motor item gets a NaN total
    MOTOR_TOT=pd_motor.sum(axis=1, skipna=False),
    PD_MED_USE=pd_patients.PD_MED_USE,
    DOMSIDE=pd_patients.DOMSIDE.astype(float),
    STAI_TOT=pd_patients.STAI_TOT,
    SFT_TOT=pd_patients.SFT_TOT,
    Gender=pd_patients.GENDER,
    UPSIT_TOT=pd_patients.UPSIT_TOT,
    INFODT=pd.to_datetime(pd_patients.INFODT),
)

In [None]:
# Average every stratification column per patient; PATNO is kept both as the
# index and as a regular column.
a = (
    df_strat
    .groupby('PATNO')
    .mean()
    .assign(PATNO=lambda per_patient: per_patient.index)
)

In [None]:
# Seed NumPy's global RNG so the split is repeatable across runs:
# train_test_split draws from it when no random_state is supplied.
np.random.seed(0)

strat_varaible = create_stratifying_varaible(a)
train_ids, test_ids, a_train, a_test = gen_Stratified_split(a, strat_varaible)

# Clear the index name before the check; presumably this keeps the merge on
# the 'PATNO' column from clashing with an index level of the same name.
a.index.name = None
_, _ = sainty_check(a_train, a_test, a)

## Redo analysis with slopes rather than means representing patients (takes some temporality into account)

In [None]:
def process(x):
    """Summarise one patient's visits, using the slope of MOTOR_TOT.

    Parameters
    ----------
    x : pandas.DataFrame
        Visits of a single patient. Must provide the columns ``INFODT``
        (datetime), ``MOTOR_TOT``, ``Gender`` and ``DOMSIDE``.

    Returns
    -------
    pandas.Series
        Mean of every numeric column of ``x``, except that

        * ``MOTOR_TOT`` is the change between the earliest and the latest
          visit divided by the elapsed time in years (365-day years), or
          NaN when both visits fall on the same date;
        * ``Gender`` and ``DOMSIDE`` are taken from the first row.
    """
    tmax_id = x.INFODT.idxmax()
    tmin_id = x.INFODT.idxmin()
    tdiff_in_years = (x.INFODT.max() - x.INFODT.min()).days / 365

    # numeric_only=True keeps the INFODT datetime column out of the summary
    # regardless of how the installed pandas averages datetimes.
    processed_x = x.mean(numeric_only=True)

    if tdiff_in_years == 0:
        # Single visit (or all visits on one date): the slope is undefined.
        motor_slope = float('nan')
    else:
        motor_latest = x.loc[[tmax_id], 'MOTOR_TOT'].iloc[0]
        motor_earliest = x.loc[[tmin_id], 'MOTOR_TOT'].iloc[0]
        motor_slope = (motor_latest - motor_earliest) / tdiff_in_years

    # Item assignment always writes an entry of the Series; attribute
    # assignment would only do so when the label already exists.
    processed_x['MOTOR_TOT'] = motor_slope
    # SDMTOTAL, STAI_TOT and SFT_TOT are deliberately left as means.
    # UPSIT is only measured at baseline, so its slope doesn't make sense.
    processed_x['Gender'] = x.Gender.iloc[0]
    processed_x['DOMSIDE'] = x.DOMSIDE.iloc[0]
    return processed_x



In [None]:
# Summarise each patient with process() (MOTOR_TOT as a slope instead of a
# mean); PATNO is kept both as the index and as a regular column.
a = (
    df_strat
    .groupby('PATNO')
    .apply(process)
    .assign(PATNO=lambda per_patient: per_patient.index)
)

In [None]:
# Seed NumPy's global RNG so the split that gets saved is repeatable:
# train_test_split draws from it when no random_state is supplied.
np.random.seed(0)

strat_varaible = create_stratifying_varaible(a, nbins=5)
train_ids, test_ids, a_train, a_test = gen_Stratified_split(a, strat_varaible)

# Clear the index name before the check; presumably this keeps the merge on
# the 'PATNO' column from clashing with an index level of the same name.
a.index.name = None
train_PD, test_PD = sainty_check(a_train, a_test, a)

In [None]:
# Bundle the split identifiers with a short human-readable description.
data_split = {
    'train_ids': train_ids,
    'test_ids': test_ids,
    'desc': 'An 80/20 split of the data based on Motor (changein motor), Dominant Side and Gender',
}

In [None]:
# Serialise data_split to 'train_test_split.pkl' (relative to the working directory).
with open('train_test_split.pkl', 'wb') as pkl_file:
    pickle.dump(data_split, pkl_file)