Merge the motor and non-motor measures while also selecting only enrolled patients. Reconcile 'INFODT' errors using the signature form csv.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw study files. It is joined to file names by plain string
# concatenation (path + "Signature_Form.csv"), so it must end with a path separator.
path = ### PUT PATH TO DATA HERE ###

In [None]:
# Processed tables written by the upstream notebooks:
#   non-motor.csv     <- 'Non_Motor_Data_Processing.ipynb'
#   motor.csv         <- 'Motor_Data_Processing.ipynb'
#   demographics.csv  <- 'Demographics_Data_Processing.ipynb'
non_motor, motor, demographics = (
    pd.read_csv(filename)
    for filename in ('non-motor.csv', 'motor.csv', 'demographics.csv')
)

In [None]:
# Preview the first rows of the processed motor table.
motor.head()

In [None]:
# Only patients with an enrollment date count as enrolled; APPRDX then
# separates the two cohorts.
enrolled = demographics.ENROLLDT.notnull()
pd_ids = demographics.loc[enrolled & (demographics.APPRDX == 1.0), 'PATNO']  # PD cohort
hc_ids = demographics.loc[enrolled & (demographics.APPRDX == 2.0), 'PATNO']  # HC cohort

In [None]:
# Cohort sizes: enrolled PD patients first, then enrolled HC patients.
print(len(pd_ids), len(hc_ids))

In [None]:
# Outer-join the motor and non-motor tables so rows present in only one of them
# are kept. INFODT is part of the join key, so a patient visit whose INFODT
# differs between the tables produces more than one row for that visit.
df = pd.merge(
    motor,
    non_motor,
    on=['PATNO', 'EVENT_ID', 'INFODT'],
    how='outer',
)

In [None]:
# Preview the first rows of the merged motor / non-motor table.
df.head()

In [None]:
# Keep only rows belonging to a patient in either the PD or the HC cohort.
cohort_ids = pd.concat([pd_ids, hc_ids])
df = df[df.PATNO.isin(cohort_ids)]

In [None]:
# Create an empty data frame with the merged table's columns. The cells below
# fill it with the PD-cohort rows: visits with a single row are kept as-is, and
# for visits with several rows only the 'NUPDRS3A' rows are taken (the "on"
# measurements when paired testing was performed).
pd_on = pd.DataFrame(columns = df.columns)

In [None]:
# Walk every (patient, visit) pair of the PD cohort and sort it into one of
# three outcomes:
#   * at most one row                -> kept as-is
#   * several rows, some 'NUPDRS3A'  -> only the 'NUPDRS3A' rows are kept
#   * several rows, none 'NUPDRS3A'  -> queued in reconcile_pn / reconcile_ei
#                                       for reconciliation in a later cell
reconcile_pn = []
reconcile_ei = []
kept_frames = []
for pn in pd_ids:
    # Filter by patient once instead of re-scanning the whole table per visit.
    patient_rows = df[df.PATNO == pn]
    for ei in patient_rows.EVENT_ID.unique():
        rec = patient_rows[patient_rows.EVENT_ID == ei]
        if len(rec) <= 1:
            kept_frames.append(rec)
        elif (rec.PAG_NAME != 'NUPDRS3A').all():
            print(pn, ei)
            reconcile_pn.append(pn)
            reconcile_ei.append(ei)
        else:
            kept_frames.append(rec[rec.PAG_NAME == 'NUPDRS3A'])

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; gather the pieces and concatenate once instead. Empty pieces are
# skipped so they cannot influence the resulting column dtypes.
kept_frames = [frame for frame in [pd_on] + kept_frames if not frame.empty]
if kept_frames:
    pd_on = pd.concat(kept_frames, ignore_index=True)

In [None]:
# Signature form table; process_multi_rows reads INFODT values from it.
# `path` is concatenated as-is, so it must already end with a path separator.
signature_csv = path + "Signature_Form.csv"
df_sig = pd.read_csv(signature_csv)

In [None]:
# Columns that process_multi_rows coalesces when several rows exist for one
# patient visit: for each column listed here, the first non-missing value found
# among those rows is written into the single merged row.
interest_cols = [
'NP1COG',
'NP1HALL',
'NP1DPRS',
'NP1ANXS',
'NP1APAT',
'NP1DDS',
'NP1SLPN',
'NP1SLPD',
'NP1PAIN',
'NP1URIN',
'NP1CNST',
'NP1LTHD',
'NP1FATG',
'NP2SPCH',
'NP2SALV',
'NP2SWAL',
'NP2EAT',
'NP2DRES',
'NP2HYGN',
'NP2HWRT',
'NP2HOBB',
'NP2TURN',
'NP2TRMR',
'NP2RISE',
'NP2WALK',
'NP2FREZ',
'MSEADLG',
'PAG_NAME',
'DYSKPRES',
'DYSKIRAT',
'NHY',
'ON_OFF_DOSE',
'PD_MED_USE',
'NP3SPCH',
'NP3FACXP',
'NP3RIGN',
'NP3RIGRU',
'NP3RIGLU',
# NOTE(review): 'PN3RIGRL' breaks the 'NP3' prefix pattern of its neighbours;
# confirm against the actual column name in the data before "fixing" it.
'PN3RIGRL',
'NP3RIGLL',
'NP3FTAPR',
'NP3FTAPL',
'NP3HMOVR',
'NP3HMOVL',
'NP3PRSPR',
'NP3PRSPL',
'NP3TTAPR',
'NP3TTAPL',
'NP3LGAGR',
'NP3LGAGL',
'NP3RISNG',
'NP3GAIT',
'NP3FRZGT',
'NP3PSTBL',
'NP3POSTR',
'NP3BRADY',
'NP3PTRMR',
'NP3PTRML',
'NP3KTRMR',
'NP3KTRML',
'NP3RTARU',
'NP3RTALU',
'NP3RTARL',
'NP3RTALL',
'NP3RTALJ',
'NP3RTCON',
'SDMTOTAL',
'STAI_TOT',
'SFT_TOT',
'SCOPA_AUT_TOT',
'REMSLEEP_TOT',
'QUIP_A',
'QUIP_B',
'QUIP_C',
'QUIP_D',
'QUIP_E',
'UPSIT_TOT',
'MoCA_score',
'LNS_TOT',
'HVLT_TOT_Recall',
'HVLT_DCR_REC',
'HVLT_RETENTION',
'GDS_TOT',
'GDS_Depressed',
'ESS_TOT',
'ESS_Sleepy',
'BJLOT_TOT',
'CAUDATE_R',
'CAUDATE_L',
'PUTAMEN_R',
'PUTAMEN_L']

def process_multi_rows(pat, visit, df, sig=None, cols=None):
    """Collapse the rows of one patient visit into a single reconciled row.

    The first row of ``df`` is taken as the base. Its 'INFODT' is replaced by
    the signature-form date for (``pat``, ``visit``), normalised to '%m/%Y',
    and every column in ``cols`` is set to the first non-missing value found
    in ``df`` for that column. All rows of ``df`` matching (``pat``,
    ``visit``) are then replaced by this single row.

    Args:
        pat: PATNO value of the patient.
        visit: EVENT_ID value of the visit.
        df: Rows to collapse; must contain 'PATNO', 'EVENT_ID' and every
            column named in ``cols``.
        sig: Table with 'PATNO', 'EVENT_ID' and 'INFODT' columns. Defaults to
            the module-level ``df_sig``.
        cols: Columns to coalesce. Defaults to the module-level
            ``interest_cols``.

    Returns:
        ``df`` itself when it has at most one row; otherwise a new frame
        holding the rows of ``df`` that do not match (``pat``, ``visit``)
        followed by the reconciled row.

    Raises:
        IndexError: If ``sig`` has no row for (``pat``, ``visit``).
    """
    print("Working on",pat,visit)
    if len(df)<=1:
        print("Length of selection for ",pat,visit," is ",len(df))
        return df

    if sig is None:
        sig = df_sig
    if cols is None:
        cols = interest_cols

    # astype(object) yields an independent row, so the assignments below never
    # write through to the caller's frame and may hold values of any type.
    res_row = df.iloc[0].astype(object)

    sig_dates = sig.loc[(sig['PATNO'] == pat) & (sig['EVENT_ID'] == visit), 'INFODT']
    if sig_dates.empty:
        raise IndexError(
            "no signature-form entry for PATNO={!r}, EVENT_ID={!r}".format(pat, visit)
        )
    # Round-trip through a timestamp to normalise the date to zero-padded '%m/%Y'.
    info_dt = pd.to_datetime(sig_dates.iloc[0], format='%m/%Y')
    res_row['INFODT'] = info_dt.strftime('%m/%Y')

    for col in cols:
        # dropna() treats None/NaN/NaT uniformly, whatever the column's dtype.
        present = df[col].dropna()
        if not present.empty:
            res_row[col] = present.iloc[0]

    # DataFrame.append was removed in pandas 2.0; build the one-row frame
    # explicitly and restore column dtypes lost by the object-typed row.
    merged = res_row.to_frame().T.infer_objects()
    matched = df.index[(df['PATNO'] == pat) & (df['EVENT_ID'] == visit)]
    remaining = df.drop(index=matched)
    if remaining.empty:
        return merged
    return pd.concat([remaining, merged])

In [None]:
# Collapse each queued PD (patient, visit) pair into a single row and add the
# results to pd_on. DataFrame.append was removed in pandas 2.0, so the
# reconciled frames are gathered and concatenated in one call.
reconciled = [
    process_multi_rows(pn, ei, df[(df.PATNO == pn) & (df.EVENT_ID == ei)])
    for pn, ei in zip(reconcile_pn, reconcile_ei)
]
# Skip empty frames so they cannot influence the resulting column dtypes.
reconciled = [frame for frame in [pd_on] + reconciled if not frame.empty]
if reconciled:
    pd_on = pd.concat(reconciled, ignore_index=True)

In [None]:
# Create an empty dataframe with the merged table's columns; the cells below
# fill it with the rows of the healthy controls only.
hc = pd.DataFrame(columns = df.columns)

In [None]:
# Walk every (patient, visit) pair of the HC cohort: visits with at most one
# row are kept as-is, visits with several rows are queued in
# reconcile_pn / reconcile_ei for reconciliation in a later cell.
reconcile_pn = []
reconcile_ei = []
kept_frames = []
for pn in hc_ids:
    # Filter by patient once instead of re-scanning the whole table per visit.
    patient_rows = df[df.PATNO == pn]
    for ei in patient_rows.EVENT_ID.unique():
        rec = patient_rows[patient_rows.EVENT_ID == ei]
        if len(rec) > 1:
            print(pn, ei)
            reconcile_pn.append(pn)
            reconcile_ei.append(ei)
        else:
            kept_frames.append(rec)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; gather the pieces and concatenate once instead. Empty pieces are
# skipped so they cannot influence the resulting column dtypes.
kept_frames = [frame for frame in [hc] + kept_frames if not frame.empty]
if kept_frames:
    hc = pd.concat(kept_frames, ignore_index=True)

In [None]:
# Collapse each queued HC (patient, visit) pair into a single row and add the
# results to hc. DataFrame.append was removed in pandas 2.0, so the reconciled
# frames are gathered and concatenated in one call.
reconciled = [
    process_multi_rows(pn, ei, df[(df.PATNO == pn) & (df.EVENT_ID == ei)])
    for pn, ei in zip(reconcile_pn, reconcile_ei)
]
# Skip empty frames so they cannot influence the resulting column dtypes.
reconciled = [frame for frame in [hc] + reconciled if not frame.empty]
if reconciled:
    hc = pd.concat(reconciled, ignore_index=True)

In [None]:
# Discard the healthy-control rows recorded under the 'U01' event.
u01_rows = hc.index[hc.EVENT_ID == 'U01']
hc = hc.drop(index=u01_rows)

In [None]:
# Discard the PD rows that have no INFODT.
undated_rows = pd_on.index[pd_on.INFODT.isnull()]
pd_on = pd_on.drop(index=undated_rows)

In [None]:
# Write both cohorts to the working directory. to_csv is called with its
# defaults, so the frame index is written as the first column.
for cohort_frame, filename in ((pd_on, 'pd_on.csv'), (hc, 'hc.csv')):
    cohort_frame.to_csv(filename)