In [30]:
import pandas as pd 
import numpy as np 

def process_chen_data(file):
    """Load the Chen 2016 spreadsheet and reshape it into a long-format table.

    The first two spreadsheet columns are renamed, by position, to
    ``participant_category`` and ``participant``. A boolean ``diseased`` flag
    is derived from the category, participant identifiers are replaced by
    integer codes, and the five biomarker columns are melted so that each
    output row holds one measurement of one biomarker for one participant.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything accepted by ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Columns ``participant_category``, ``participant``, ``timestamp``,
        ``diseased``, ``biomarker`` and ``measurement``.

        * ``participant``: integer codes starting at 0, assigned in order of
          first appearance in the sheet.
        * ``diseased``: ``True`` for every category other than ``'CN'``
          (surrounding whitespace in the label is ignored for this test;
          the stored label itself is left untouched).
        * ``biomarker``: ``"<column name>-<number>"``, e.g. ``"FCI(HIP)-1"``.

    Raises
    ------
    KeyError
        If ``timestamp`` or one of the biomarker columns is missing.
    """
    # Number appended to each biomarker name in the output label. The key
    # order also fixes the order in which the columns are melted.
    biomarker_numbers = {
        'FCI(HIP)': 1,
        'GMI(HIP)': 3,
        'FCI(Fusi)': 5,
        'FCI(PCC)': 2,
        'GMI(FUS)': 4,
    }
    biomarker_labels = {
        name: f"{name}-{number}" for name, number in biomarker_numbers.items()
    }

    raw = pd.read_excel(file)
    # The first two columns are identified by position, not by header text.
    wide = raw.rename(columns={
        raw.columns[0]: 'participant_category',
        raw.columns[1]: 'participant',
    })

    # Compare on a stripped copy so a padded label such as 'CN ' is still
    # recognised as a control.
    # NOTE(review): padding is assumed possible in the sheet -- confirm.
    category = wide['participant_category'].astype(str).str.strip()
    wide['diseased'] = category != 'CN'

    # Integer codes in order of first appearance. A missing identifier, if
    # any, is coded as -1 by pandas.factorize.
    wide['participant'] = pd.factorize(wide['participant'])[0]

    long = pd.melt(
        wide,
        id_vars=['participant_category', 'participant', 'timestamp', 'diseased'],
        value_vars=list(biomarker_numbers),
        var_name='biomarker',
        value_name='measurement',
    )
    # Every melted name is a key of biomarker_labels, so the map is total.
    long['biomarker'] = long['biomarker'].map(biomarker_labels)
    return long

# Load the Chen 2016 spreadsheet unmodified to inspect its original layout.
df = pd.read_excel("data/Chen2016Data.xlsx")
df

Unnamed: 0.1,Unnamed: 0,SubjectID,FCI(HIP),GMI(HIP),FCI(Fusi),FCI(PCC),GMI(FUS),timestamp
0,CN,002_S_0295,-2.544567,0.370580,-5.549706,10.366552,0.493363,6/2/2011
1,CN,002_S_4213,-1.603212,0.498440,-4.185865,2.926323,0.581211,9/2/2011
2,CN,002_S_4270,-4.716009,0.492732,-11.288656,7.100353,0.540190,10/11/2011
3,CN,006_S_4150,-4.232625,0.489120,-12.353966,5.970476,0.547796,8/8/2011
4,CN,006_S_4357,3.628361,0.507251,-10.554712,7.808309,0.555537,11/28/2011
...,...,...,...,...,...,...,...,...
139,AD,130_S_4984,3.899376,0.345804,-20.584442,14.900567,0.522749,10/17/2012
140,AD,130_S_5006,-2.452325,0.371281,-15.325589,4.977447,0.526017,2/15/2013
141,AD,130_S_5059,-7.310156,0.257667,-20.264099,2.833997,0.342435,2/6/2013
142,AD,130_S_5231,-5.092941,0.239180,-6.436553,2.258336,0.415011,7/16/2013


In [31]:
# Rebind `df` to the reshaped table returned by process_chen_data and display it.
df = process_chen_data("data/Chen2016Data.xlsx")
df

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement
0,CN,0,6/2/2011,False,FCI(HIP)-1,-2.544567
1,CN,1,9/2/2011,False,FCI(HIP)-1,-1.603212
2,CN,2,10/11/2011,False,FCI(HIP)-1,-4.716009
3,CN,3,8/8/2011,False,FCI(HIP)-1,-4.232625
4,CN,4,11/28/2011,False,FCI(HIP)-1,3.628361
...,...,...,...,...,...,...
715,AD,139,10/17/2012,True,GMI(FUS)-4,0.522749
716,AD,140,2/15/2013,True,GMI(FUS)-4,0.526017
717,AD,141,2/6/2013,True,GMI(FUS)-4,0.342435
718,AD,142,7/16/2013,True,GMI(FUS)-4,0.415011


In [32]:
# List the distinct biomarker labels currently held in `df`.
df.biomarker.unique()

array(['FCI(HIP)-1', 'GMI(HIP)-3', 'FCI(Fusi)-5', 'FCI(PCC)-2',
       'GMI(FUS)-4'], dtype=object)

In [33]:
# Rebind `df` to the contents of data/participant_data.csv and list its
# distinct biomarker names.
df = pd.read_csv('data/participant_data.csv')
df.biomarker.unique()

array(['HIP-FCI', 'HIP-GMI', 'FUS-FCI', 'PCC-FCI', 'FUS-GMI'],
      dtype=object)

In [34]:
def get_data_we_have(data_source):
    """Return the participant measurement table for the requested source.

    Parameters
    ----------
    data_source : str
        ``"Chen Data"`` loads ``data/Chen2016Data.xlsx`` through
        ``process_chen_data``. Any other value loads
        ``data/participant_data.csv``.

    Returns
    -------
    pandas.DataFrame
        For ``"Chen Data"``, whatever ``process_chen_data`` returns.
        Otherwise the CSV contents with the columns ``k_j``, ``S_n`` and
        ``affected_or_not`` dropped, a boolean ``diseased`` column added
        (``True`` where ``k_j > 0``), and ``biomarker`` rewritten as
        ``"<name> (<number>)"``, e.g. ``"HIP-FCI (1)"``.

    Raises
    ------
    KeyError
        If the CSV contains a biomarker name that has no number assigned,
        or lacks one of the columns used here.
    """
    if data_source == "Chen Data":
        return process_chen_data("data/Chen2016Data.xlsx")

    # Number appended, in parentheses, to each biomarker name.
    biomarker_numbers = {
        'HIP-FCI': 1,
        'HIP-GMI': 3,
        'FUS-FCI': 5,
        'PCC-FCI': 2,
        'FUS-GMI': 4,
    }
    biomarker_labels = {
        name: f"{name} ({number})" for name, number in biomarker_numbers.items()
    }

    original_data = pd.read_csv('data/participant_data.csv')

    # Fail loudly on an unexpected biomarker name: Series.map would otherwise
    # turn it into a silent NaN label.
    unknown = set(original_data['biomarker'].unique()) - set(biomarker_labels)
    if unknown:
        raise KeyError(f"unknown biomarker name(s): {unknown!r}")

    data_we_have = (
        original_data
        .assign(diseased=original_data['k_j'] > 0)
        .drop(columns=['k_j', 'S_n', 'affected_or_not'])
    )
    data_we_have['biomarker'] = data_we_have['biomarker'].map(biomarker_labels)
    return data_we_have

In [35]:
# Any value other than "Chen Data" makes get_data_we_have read
# data/participant_data.csv.
get_data_we_have("Simulated Data")

Unnamed: 0,participant,biomarker,measurement,diseased
0,0,HIP-FCI (1),-6.565792,True
1,1,HIP-FCI (1),4.020001,True
2,2,HIP-FCI (1),-0.739722,True
3,3,HIP-FCI (1),-8.939073,True
4,4,HIP-FCI (1),-16.548433,True
...,...,...,...,...
495,95,FUS-GMI (4),0.579851,False
496,96,FUS-GMI (4),0.685910,True
497,97,FUS-GMI (4),0.533357,True
498,98,FUS-GMI (4),0.683437,True
