In [72]:
import pandas as pd 
import numpy as np 
import utils
import random

# Candidate biomarker orderings compared in this notebook. Each list is passed as
# `order` to compute_ll_based_on_order_dict, which pairs its i-th entry with the
# i-th biomarker (in order of first appearance in the data) and stores it as S_n.
order_one_simulated = [1,2,4,5,3]  # scored on the simulated data (result: simulated_ll_max_order)
max_order_chen = [1,4,3,2,5]  # scored on Chen's data (result: chen_ll_max_order)
real_order = [1,3,5,2,4]  # scored on both datasets; presumably the ground-truth order -- TODO confirm

In [64]:
def process_chen_data(file):
    """Load Chen's spreadsheet and reshape it to one row per (participant, biomarker).

    Steps:

    1. Read the workbook and rename its first two columns to
       ``participant_category`` and ``participant``.
    2. Add a boolean ``diseased`` column that is True for every row whose
       category is not exactly ``'CN'``. The comparison is exact, so a label
       with stray whitespace (e.g. ``'CN '``) counts as diseased.
    3. Melt the five biomarker columns into long format
       (``biomarker`` / ``measurement``).
    4. Replace the original participant identifiers with integer codes
       ``0, 1, 2, ...`` assigned in order of first appearance. Missing
       identifiers receive ``-1`` (``pd.factorize`` behaviour).

    Parameters
    ----------
    file : str or path-like
        Location of the Excel workbook; passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``participant_category``,
        ``participant``, ``timestamp``, ``diseased``, ``biomarker`` and
        ``measurement``.

    Raises
    ------
    KeyError
        If ``timestamp`` or any of the expected biomarker columns is missing
        from the sheet (raised by ``pd.melt``).
    """
    id_columns = ['participant_category', 'participant', 'timestamp', 'diseased']
    biomarker_columns = ["FCI(HIP)", "GMI(HIP)", "FCI(Fusi)", "FCI(PCC)", "GMI(FUS)"]

    raw = pd.read_excel(file)
    # Non-mutating rename: `raw` stays exactly as it was read from disk.
    wide = raw.rename(columns={
        raw.columns[0]: 'participant_category',
        raw.columns[1]: 'participant',
    })
    # Vectorised comparison instead of a Python-level call per row.
    wide['diseased'] = wide['participant_category'] != 'CN'

    long_format = pd.melt(
        wide,
        id_vars=id_columns,
        value_vars=biomarker_columns,
        var_name='biomarker',
        value_name='measurement',
    )
    # factorize numbers the identifiers in order of first appearance, which is
    # the same numbering a dict built from `.unique()` would produce.
    long_format['participant'] = pd.factorize(long_format['participant'])[0]
    return long_format

def get_data_we_have(
    data_source,
    chen_data_path="data/Chen2016Data.xlsx",
    simulated_data_path='data/participant_data.csv',
):
    """Return the dataset selected by ``data_source`` in analysis-ready form.

    Parameters
    ----------
    data_source : str
        ``"Chen Data"`` selects Chen's spreadsheet, which is loaded through
        ``process_chen_data``. Any other value selects the simulated CSV.
    chen_data_path : str, optional
        Workbook read when ``data_source == "Chen Data"``.
    simulated_data_path : str, optional
        CSV read for every other ``data_source``.

    Returns
    -------
    pandas.DataFrame
        For Chen's data, whatever ``process_chen_data`` returns. For the
        simulated data, the CSV contents plus a boolean ``diseased`` column
        (True where ``k_j > 0``), with the ``k_j``, ``S_n`` and
        ``affected_or_not`` columns removed.

    Raises
    ------
    KeyError
        If the simulated CSV lacks ``k_j``, ``S_n`` or ``affected_or_not``.
    """
    if data_source == "Chen Data":
        return process_chen_data(chen_data_path)

    simulated = pd.read_csv(simulated_data_path)
    return (
        simulated
        # Vectorised flag: a participant is diseased when k_j is positive.
        .assign(diseased=simulated['k_j'] > 0)
        .drop(columns=['k_j', 'S_n', 'affected_or_not'])
    )

In [65]:
# Load the simulated dataset and preview its first rows.
data_we_have = get_data_we_have(data_source="Simulated Data")
data_we_have.head(n=5)

Unnamed: 0,participant,biomarker,measurement,diseased
0,0,HIP-FCI,23.239077,True
1,1,HIP-FCI,27.889339,False
2,2,HIP-FCI,19.442578,True
3,3,HIP-FCI,18.56904,True
4,4,HIP-FCI,24.009016,True


In [66]:
# Use only the last row of the stage log as the participant stages.
# Presumably one row per iteration and one column per participant (per the file name) -- confirm.
all_participant_stages = pd.read_csv(
    "logs/simulated_data_conjugate_priors/participant_stages_at_the_end_of_each_iteartion.csv"
)
participant_stages = np.array(all_participant_stages.iloc[-1])
participant_stages

array([5., 0., 1., 5., 2., 2., 0., 3., 4., 3., 4., 0., 5., 5., 2., 0., 1.,
       0., 0., 2., 5., 0., 1., 1., 5., 1., 3., 3., 1., 2., 2., 2., 1., 3.,
       1., 1., 2., 0., 1., 4., 4., 1., 0., 5., 4., 0., 4., 1., 5., 0., 1.,
       5., 3., 1., 5., 5., 4., 0., 1., 5., 4., 4., 0., 2., 5., 2., 0., 1.,
       2., 0., 4., 5., 3., 3., 5., 3., 4., 3., 2., 5., 3., 3., 4., 5., 5.,
       5., 3., 0., 5., 3., 0., 3., 5., 0., 3., 4., 3., 2., 0., 0.])

In [67]:
def compute_ll_based_on_order_dict(
        data_we_have,
        order,
        participant_stages,
):
    """Score one candidate biomarker ordering against a dataset.

    Parameters
    ----------
    data_we_have : pandas.DataFrame
        Long-format data with at least the columns ``participant``,
        ``biomarker`` and ``diseased``. The frame is not modified; all work
        happens on a copy.
    order : sequence
        ``order[i]`` becomes the ``S_n`` value of the i-th biomarker, where
        biomarkers are taken in order of first appearance in
        ``data_we_have.biomarker``.
    participant_stages : array-like
        Stage assigned to each participant. The function works on a copy, so
        the caller's object is left untouched.

    Returns
    -------
    The value returned by
    ``utils.compute_all_participant_ln_likelihood_and_update_participant_stages``
    (a single float in this notebook's outputs).

    Raises
    ------
    KeyError
        If ``order`` has fewer entries than there are biomarkers.
    """
    import copy

    biomarkers = data_we_have.biomarker.unique()
    n_participants = len(data_we_have.participant.unique())
    n_stages = len(biomarkers) + 1
    # Stage 0 is excluded here; diseased stages run from 1 to n_biomarkers.
    diseased_stages = np.arange(start=1, stop=n_stages, step=1)
    # Element-wise comparison on a Series, so `== False` is intentional.
    non_diseased_participant_ids = data_we_have.loc[
        data_we_have.diseased == False  # noqa: E712
    ].participant.unique()

    # NOTE(review): the likelihood helper's name says it updates participant
    # stages. Whether it writes into the array in place is not visible here, so
    # work on a private copy: repeated calls with the same stages (e.g. when
    # comparing two orders) then always start from identical input.
    stages = copy.copy(participant_stages)

    order_dict = dict(zip(biomarkers, order))
    data = data_we_have.copy()
    # Direct lookup per biomarker label; a missing label still raises KeyError.
    data['S_n'] = [order_dict[name] for name in data['biomarker']]

    # Add k_j / affected for the whole dataset based on the stages, and modify
    # the diseased column (it feeds the theta/phi estimation below).
    data = utils.add_kj_and_affected_and_modify_diseased(data, stages, n_participants)
    theta_phi_kmeans = utils.get_theta_phi_kmeans(data.copy(), biomarkers, n_clusters = 2)
    estimated_theta_phi = utils.get_theta_phi_conjugate_priors(biomarkers, data.copy(), theta_phi_kmeans)

    return utils.compute_all_participant_ln_likelihood_and_update_participant_stages(
        n_participants,
        data,
        non_diseased_participant_ids,
        estimated_theta_phi,
        diseased_stages,
        stages,
    )
    

In [69]:
# Score the simulated data under `order_one_simulated`.
# Relies on `data_we_have` / `participant_stages` currently holding the simulated versions.
simulated_ll_max_order = compute_ll_based_on_order_dict(
    data_we_have=data_we_have,
    order=order_one_simulated,
    participant_stages=participant_stages,
)
simulated_ll_max_order

-1735.5572304632897

In [70]:
# Score the simulated data under `real_order`, for comparison with the value above.
# Relies on `data_we_have` / `participant_stages` currently holding the simulated versions.
simulated_ll_real_order = compute_ll_based_on_order_dict(
    data_we_have=data_we_have,
    order=real_order,
    participant_stages=participant_stages,
)
simulated_ll_real_order

-1774.6685699166583

## Chen's data

In [71]:
# Switch to Chen's dataset. This rebinds `data_we_have`, so the simulated frame is gone from here on.
data_we_have = get_data_we_have(data_source="Chen Data")
data_we_have.head(n=5)

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement
0,CN,0,6/2/2011,False,FCI(HIP),-2.544567
1,CN,1,9/2/2011,False,FCI(HIP),-1.603212
2,CN,2,10/11/2011,False,FCI(HIP),-4.716009
3,CN,3,8/8/2011,False,FCI(HIP),-4.232625
4,CN,4,11/28/2011,False,FCI(HIP),3.628361


In [73]:
# Use only the last row of the Chen stage log; this rebinds `participant_stages` for Chen's data.
# Presumably one row per iteration and one column per participant (per the file name) -- confirm.
all_participant_stages = pd.read_csv(
    "logs/chen_data_conjugate_priors/participant_stages_at_the_end_of_each_iteartion.csv"
)
participant_stages = np.array(all_participant_stages.iloc[-1])
participant_stages

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 4., 1., 1., 3., 2.,
       3., 3., 3., 5., 3., 1., 3., 1., 1., 1., 3., 1., 5., 2., 1., 4., 3.,
       4., 2., 5., 1., 4., 3., 3., 1., 2., 3., 3., 2., 3., 5., 1., 2., 3.,
       3., 2., 5., 5., 1., 1., 3., 1., 2., 4., 4., 5., 2., 5., 1., 2., 2.,
       2., 4., 2., 4., 1., 3., 3., 2., 5., 3., 5., 5., 4., 4., 1., 5., 5.,
       3., 4., 5., 5., 2., 4., 1., 5., 2., 3., 2., 2., 1., 5., 5., 2., 5.,
       4., 5., 5., 5., 5., 5., 5., 1.])

In [74]:
# Score Chen's data under `max_order_chen`.
# Relies on `data_we_have` / `participant_stages` currently holding the Chen versions.
chen_ll_max_order = compute_ll_based_on_order_dict(
    data_we_have=data_we_have,
    order=max_order_chen,
    participant_stages=participant_stages,
)
chen_ll_max_order

-881.7017393719559

In [75]:
# Score Chen's data under `real_order`, for comparison with the value above.
# Relies on `data_we_have` / `participant_stages` currently holding the Chen versions.
chen_ll_real_order = compute_ll_based_on_order_dict(
    data_we_have=data_we_have,
    order=real_order,
    participant_stages=participant_stages,
)
chen_ll_real_order

-900.2773076608739