In [1]:
import pandas as pd 
import numpy as np 

In [53]:
def expand_data(df, participant_size, noise_mean=0, noise_variance=10**(-4)):
    """Resample participants into a synthetic cohort of ``participant_size``.

    Participant ids are drawn from ``df.participant`` with ``np.random.choice``,
    which samples *with replacement* by default, so one original participant
    may be drawn several times. Each draw becomes a new participant numbered
    ``0 .. participant_size - 1`` in draw order.

    The first draw is copied unchanged. Every later draw gets a single
    Gaussian offset added to all of its measurements (one shift per draw, not
    one per row), so repeated draws of the same participant are not exact
    duplicates.

    Randomness comes from NumPy's global legacy RNG (``np.random``); seed it
    with ``np.random.seed`` beforehand for reproducible output.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with at least ``participant`` and ``measurement``
        columns. The frame is not modified.
    participant_size : int
        Number of participants in the returned frame.
    noise_mean : float, optional
        Mean of the Gaussian offset (default 0).
    noise_variance : float, optional
        Variance of the Gaussian offset (default 1e-4).

    Returns
    -------
    pandas.DataFrame
        Rows of the sampled participants stacked in draw order, with
        ``participant`` renumbered and a fresh ``0..n-1`` index. Empty (same
        columns) when ``participant_size`` is 0.
    """
    participant_ids = df.participant.unique()
    sampled_participant_ids = np.random.choice(participant_ids, participant_size)

    # Nothing was requested: return an empty frame instead of failing on [0].
    if len(sampled_participant_ids) == 0:
        return df.iloc[0:0].reset_index(drop=True)

    std_deviation = np.sqrt(noise_variance)

    # Collect the per-participant frames and concatenate once at the end;
    # concatenating inside the loop re-copies the accumulated frame every time.
    frames = []
    for new_id, old_participant_id in enumerate(sampled_participant_ids):
        subset_df = df[df.participant == old_participant_id].assign(participant=new_id)
        if new_id > 0:
            # One offset per sampled participant, drawn in loop order so a
            # seeded run yields the same values as sequential scalar draws.
            delta = np.random.normal(noise_mean, std_deviation)
            subset_df = subset_df.assign(measurement=subset_df.measurement + delta)
        frames.append(subset_df)

    return pd.concat(frames, ignore_index=True)

In [54]:
def process_chen_data(file, real_order, participant_size, seed = None):
    """Load the Chen 2016 spreadsheet and reshape it to long format.

    Steps:

    1. Read ``file`` with ``pd.read_excel``; the first two columns are renamed
       to ``participant_category`` and ``participant``.
    2. Add a boolean ``diseased`` column: True for every category that is not
       exactly the string ``'CN'`` (no whitespace normalisation is applied).
    3. Replace the original participant ids by integers ``0..n-1`` in order of
       first appearance.
    4. Melt the five biomarker columns into ``biomarker`` / ``measurement``
       rows and relabel each biomarker as ``"<name>-<order>"``.
    5. If ``participant_size`` exceeds the number of participants in the file,
       resample up to that size with ``expand_data``. A smaller or equal
       ``participant_size`` leaves the data as loaded (no subsampling).

    Parameters
    ----------
    file : str or path-like
        Excel workbook to read. It must contain a ``timestamp`` column and the
        biomarker columns ``FCI(HIP)``, ``GMI(HIP)``, ``FCI(Fusi)``,
        ``FCI(PCC)`` and ``GMI(FUS)``.
    real_order : sequence
        One entry per biomarker, paired positionally with the list above; each
        entry is appended to the biomarker's name in the output label.
    participant_size : int
        Desired number of participants.
    seed : int, optional
        When given, seeds NumPy's *global* RNG (``np.random.seed``), which the
        resampling in ``expand_data`` draws from.

    Returns
    -------
    pandas.DataFrame
        Columns ``participant_category``, ``participant``, ``timestamp``,
        ``diseased``, ``biomarker``, ``measurement``.

    Raises
    ------
    KeyError
        If ``real_order`` has fewer entries than there are biomarkers and the
        data contains a biomarker without an assigned order.
    """
    # Single source of truth for the biomarker columns (used for both the
    # label mapping and the melt).
    biomarkers = ['FCI(HIP)', 'GMI(HIP)', 'FCI(Fusi)', 'FCI(PCC)', 'GMI(FUS)']

    if seed is not None:
        np.random.seed(seed)  # Set the seed for numpy's random number generator

    df = pd.read_excel(file)
    df = df.rename(columns={df.columns[0]: 'participant_category',
                            df.columns[1]: 'participant'})

    # Vectorised comparison instead of a row-wise apply.
    df['diseased'] = df['participant_category'] != 'CN'

    # Integer participant ids in order of first appearance. Done on the wide
    # frame: melt keeps the original row order in its first block, so the
    # numbering is the same as computing it after the melt.
    participant_codes, original_ids = pd.factorize(df['participant'])
    df['participant'] = participant_codes
    n_participant = len(original_ids)

    df = pd.melt(df, id_vars=['participant_category', "participant", "timestamp", 'diseased'],
                 value_vars=biomarkers,
                 var_name='biomarker', value_name='measurement')

    # Build each "<name>-<order>" label once; the plain dict lookup keeps the
    # KeyError for biomarkers that have no entry in real_order.
    biomarker_labels = {name: f"{name}-{order}" for name, order in zip(biomarkers, real_order)}
    df['biomarker'] = [biomarker_labels[name] for name in df['biomarker']]

    if participant_size > n_participant:
        df = expand_data(df, participant_size)
    return df


In [55]:
# Build the long-format analysis frame from the Chen 2016 workbook.
#   real_order       - paired positionally with the biomarker columns inside
#                      process_chen_data and appended to each biomarker label.
#   participant_size - the data is resampled up to 500 participants when the
#                      workbook holds fewer than that.
#   seed=None        - NumPy's global RNG is left unseeded, so any resampling
#                      differs from run to run.
df = process_chen_data("data/Chen2016Data.xlsx", real_order=[1, 3, 5, 2, 4], participant_size=500, seed=None)
df

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement
0,MCI,0,8/2/2011,True,FCI(HIP)-1,3.380574
1,MCI,0,8/2/2011,True,GMI(HIP)-3,0.484600
2,MCI,0,8/2/2011,True,FCI(Fusi)-5,-16.726608
3,MCI,0,8/2/2011,True,FCI(PCC)-2,6.552117
4,MCI,0,8/2/2011,True,GMI(FUS)-4,0.569492
...,...,...,...,...,...,...
2495,CN,499,5/16/2013,False,FCI(HIP)-1,-0.185660
2496,CN,499,5/16/2013,False,GMI(HIP)-3,0.463110
2497,CN,499,5/16/2013,False,FCI(Fusi)-5,-12.692183
2498,CN,499,5/16/2013,False,FCI(PCC)-2,11.475025
