# Calculate likelihoods

In [161]:
import pandas as pd
import numpy as np
import utils

def process_chen_data(file):
    """Load the Chen Excel sheet and reshape it into long format.

    The first two columns of the sheet are renamed to ``participant_category``
    and ``participant``. Every row whose category is not exactly ``'CN'`` is
    flagged as diseased, the five biomarker columns are melted into
    ``biomarker`` / ``measurement`` pairs, and the original participant
    identifiers are replaced by consecutive integers (0, 1, 2, ...) assigned
    in order of first appearance.

    Parameters
    ----------
    file : str or path-like
        Location of the Excel file; passed straight to ``pandas.read_excel``.
        The sheet must contain a ``timestamp`` column and the biomarker
        columns named in ``value_vars`` below.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns ``participant_category``,
        ``participant``, ``timestamp``, ``diseased``, ``biomarker`` and
        ``measurement``.
    """
    df = pd.read_excel(file)
    df = df.rename(columns={
        df.columns[0]: 'participant_category',
        df.columns[1]: 'participant',
    })

    # All categories are kept; anything other than 'CN' counts as diseased.
    # A column-wise comparison replaces the row-wise apply: same values,
    # but it also works on an empty sheet and avoids a Python-level loop.
    df['diseased'] = df['participant_category'] != 'CN'

    df = pd.melt(
        df,
        id_vars=['participant_category', "participant", "timestamp", 'diseased'],
        value_vars=["FCI(HIP)", "GMI(HIP)", "FCI(Fusi)", "FCI(PCC)", "GMI(FUS)"],
        var_name='biomarker',
        value_name='measurement',
    )

    # Replace the original ids with integer codes numbered by first appearance
    # (the same numbering as zipping unique() with range()).
    # Note: a missing participant id would be coded as -1 by factorize.
    participant_codes, _ = pd.factorize(df['participant'])
    df['participant'] = participant_codes
    return df

def get_data_we_have(data_source,
                     chen_path="data/Chen2016Data.xlsx",
                     synthetic_path='data/participant_data.csv'):
    """Return the long-format participant data for the requested source.

    Parameters
    ----------
    data_source : str
        ``"Chen Data"`` loads the Excel sheet through ``process_chen_data``.
        Any other value loads the CSV at ``synthetic_path``.
    chen_path : str, optional
        Excel file used when ``data_source == "Chen Data"``.
    synthetic_path : str, optional
        CSV file used otherwise. It must contain the columns ``k_j``,
        ``S_n`` and ``affected_or_not``.

    Returns
    -------
    pandas.DataFrame
        For the CSV source: the file's columns minus ``k_j``, ``S_n`` and
        ``affected_or_not``, plus a boolean ``diseased`` column that is True
        where ``k_j > 0``. For the Chen source: whatever
        ``process_chen_data`` returns.
    """
    if data_source == "Chen Data":
        return process_chen_data(chen_path)

    original_data = pd.read_csv(synthetic_path)
    # Column-wise comparison instead of a row-wise apply: same values,
    # and it also works when the file has no rows.
    original_data['diseased'] = original_data['k_j'] > 0
    # Drop the columns this notebook does not take as given.
    return original_data.drop(columns=['k_j', 'S_n', 'affected_or_not'])

In [162]:
# Log-likelihood value that is looked up in the sampler's likelihood log in the next cell.
# Hard-coded; presumably copied from the output of an earlier run — TODO confirm its source.
current_accepted_ll = -866.6745525609758

In [163]:
all_current_ll = pd.read_csv(
    "logs/chen_data_conjugate_priors/all_current_likelihoods.csv")
# Rows whose logged likelihood equals current_accepted_ll exactly.
is_accepted_ll = all_current_ll.all_current_likelihoods == current_accepted_ll
# First such row; one less than its 'iteration' value is the iteration
# that generated this likelihood.
first_match = all_current_ll[is_accepted_ll].reset_index().iloc[0, :]
that_iteration = first_match['iteration'] - 1
that_iteration

266.0

In [164]:
# Quick dtype check: that_iteration is a NumPy float, which is why it is cast with int()
# before being used as a row position in the next cell.
type(that_iteration - 1)

numpy.float64

In [165]:
participant_stages_df = pd.read_csv(
    "logs/chen_data_conjugate_priors/all_current_participant_stages.csv")
# Row position of the iteration found above; the first column is skipped
# and the remaining columns are kept as the participant stages.
stage_row = int(that_iteration) - 1
participant_stages = participant_stages_df.iloc[stage_row, 1:]
# Shown as a plain array for readability; participant_stages itself stays a Series.
np.array(participant_stages)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 3, 0, 2, 2, 4, 5, 2, 5, 1, 0, 1, 2, 2, 2, 1, 0, 5, 4, 4,
       3, 1, 2, 2, 5, 2, 5, 5, 0, 1, 2, 2, 2, 4, 2, 0, 1, 2, 2, 2, 3, 4,
       5, 0, 4, 4, 2, 3, 4, 3, 5, 2, 3, 2, 0, 2, 2, 1, 1, 1, 5, 2, 0, 1,
       5, 2, 3, 1, 5, 4, 1, 5, 5, 1, 3, 3, 5, 5, 3, 0, 4, 4, 5, 5, 2, 0,
       2, 5, 0, 0, 0, 5, 5, 3, 4, 5, 3, 3])

In [166]:
# Bind only current_orderings here, so participant_stages_df keeps holding the
# participant-stage log instead of being overwritten by the orderings table.
current_orderings = pd.read_csv(
    "logs/chen_data_conjugate_priors/all_current_order_dicts.csv")
# Last logged row, skipping the first column, as {biomarker: position}.
# NOTE(review): participant stages are read from row int(that_iteration) - 1 of their
# log, while the ordering is read from the last row of this one — confirm both refer
# to the same iteration.
current_ordering_dict = dict(current_orderings.iloc[-1, 1:])
current_ordering_dict

{'FCI(HIP)': 5, 'GMI(HIP)': 3, 'FCI(Fusi)': 2, 'FCI(PCC)': 1, 'GMI(FUS)': 4}

In [167]:
# Reference ordering ({biomarker: position}) to compare against current_ordering_dict.
# Presumably the known/published order for this dataset — TODO confirm the source.
real_ordering_dict = {'FCI(HIP)': 1, 'GMI(HIP)': 3, 'FCI(Fusi)': 5, 'FCI(PCC)': 2, 'GMI(FUS)': 4}

In [168]:
# Load the Chen dataset in long format and derive the basic dimensions used below.
data_we_have = get_data_we_have("Chen Data")
biomarkers = data_we_have.biomarker.unique()
n_biomarkers = len(biomarkers)
# One more stage than there are biomarkers.
n_stages = n_biomarkers + 1
n_participants = len(data_we_have.participant.unique())
# Per-biomarker theta/phi table from utils.get_theta_phi_kmeans (two clusters);
# it is later passed to utils.get_theta_phi_conjugate_priors.
theta_phi_kmeans = utils.get_theta_phi_kmeans(data_we_have, biomarkers, n_clusters = 2)
# Work on a copy so the following cells can add columns without touching data_we_have.
data = data_we_have.copy()
data.head()

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement
0,CN,0,6/2/2011,False,FCI(HIP),-2.544567
1,CN,1,9/2/2011,False,FCI(HIP),-1.603212
2,CN,2,10/11/2011,False,FCI(HIP),-4.716009
3,CN,3,8/8/2011,False,FCI(HIP),-4.232625
4,CN,4,11/28/2011,False,FCI(HIP),3.628361


In [169]:
# Preview the per-biomarker table returned by utils.get_theta_phi_kmeans.
theta_phi_kmeans.head()

Unnamed: 0,biomarker,theta_mean,theta_std,phi_mean,phi_std
0,FCI(HIP),2.794918,2.983066,-5.546734,2.887983
1,GMI(HIP),0.341778,0.054571,0.482245,0.037704
2,FCI(Fusi),-19.767018,4.183856,-10.216922,3.021294
3,FCI(PCC),11.583134,3.358987,1.781998,3.292063
4,GMI(FUS),0.457107,0.044667,0.56915,0.035734


In [170]:
# Ids of the participants whose rows are flagged as not diseased.
is_healthy = ~data_we_have["diseased"]
non_diseased_participants = data_we_have.loc[is_healthy, "participant"].unique()
non_diseased_participants

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])

In [171]:
# Sanity check: number of distinct participant ids in data_we_have.
n_participants

144

In [172]:
# Attach each row's position in the current ordering (S_n) by looking up its biomarker.
data['S_n'] = data.apply(lambda row: current_ordering_dict[row['biomarker']], axis = 1)
data.head()

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement,S_n
0,CN,0,6/2/2011,False,FCI(HIP),-2.544567,5
1,CN,1,9/2/2011,False,FCI(HIP),-1.603212,5
2,CN,2,10/11/2011,False,FCI(HIP),-4.716009,5
3,CN,3,8/8/2011,False,FCI(HIP),-4.232625,5
4,CN,4,11/28/2011,False,FCI(HIP),3.628361,5


In [173]:
# utils.add_kj_and_affected returns the frame with the k_j and affected columns shown in the preview.
# Presumably k_j comes from participant_stages and affected from comparing k_j with S_n — confirm in utils.
data = utils.add_kj_and_affected(data, participant_stages, n_participants)
data.head()

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement,S_n,k_j,affected
0,CN,0,6/2/2011,False,FCI(HIP),-2.544567,5,0,False
1,CN,1,9/2/2011,False,FCI(HIP),-1.603212,5,0,False
2,CN,2,10/11/2011,False,FCI(HIP),-4.716009,5,0,False
3,CN,3,8/8/2011,False,FCI(HIP),-4.232625,5,0,False
4,CN,4,11/28/2011,False,FCI(HIP),3.628361,5,0,False


In [174]:
# Re-estimate the per-biomarker theta/phi means and standard deviations from the annotated data,
# passing the k-means table as theta_phi_kmeans (presumably used as the prior — confirm in utils).
estimated_theta_phi = utils.get_theta_phi_conjugate_priors(
            biomarkers, data, theta_phi_kmeans=theta_phi_kmeans)
estimated_theta_phi.head()

Unnamed: 0,biomarker,theta_mean,theta_std,phi_mean,phi_std
0,FCI(HIP),-2.513914,5.279185,-1.724161,5.033141
1,GMI(HIP),0.342163,0.067984,0.467731,0.051609
2,FCI(Fusi),-14.318009,5.821399,-12.246062,5.227981
3,FCI(PCC),4.647519,6.051305,5.62751,5.060392
4,GMI(FUS),0.479318,0.066167,0.554471,0.05142


In [175]:
def cal_all_participant_ln_likelihood_based_on_ordering_and_participant_stages(
        data_we_have,
        ordering_dict,
        participant_stages,
        n_participants,
        non_diseased_participants,
        n_stages,
        theta_phi_kmeans
    ):
    """Sum the log-likelihoods of all participants under a biomarker ordering.

    Steps:

    1. Copy ``data_we_have``, attach ``S_n`` (the position of each row's
       biomarker in ``ordering_dict``) and let ``utils.add_kj_and_affected``
       annotate the copy using ``participant_stages``.
    2. Estimate theta/phi with ``utils.get_theta_phi_conjugate_priors``.
    3. For each participant id in ``range(n_participants)``:

       * if the id is in ``non_diseased_participants`` the stage is known to
         be 0, so the likelihood is ``utils.compute_likelihood(..., k_j=0)``;
       * otherwise the likelihood of every stage ``0..n_stages-1`` is
         computed, normalised into weights, and combined by
         ``utils.weighted_average_likelihood``.

    4. Add up the natural logs of the per-participant likelihoods.

    A likelihood of exactly zero is replaced by ``1e-20`` before taking the
    log, in both branches, so that one participant cannot turn the total into
    ``-inf``. If every stage likelihood of a participant is zero, uniform
    stage weights are used instead of dividing by zero.

    Parameters
    ----------
    data_we_have : pandas.DataFrame
        Long-format data with at least ``participant`` and ``biomarker``
        columns, plus whatever the ``utils`` helpers read.
    ordering_dict : mapping
        ``{biomarker: position}``. A biomarker missing from the mapping
        raises ``KeyError``.
    participant_stages : sequence
        Stage per participant, forwarded to ``utils.add_kj_and_affected``.
    n_participants : int
        Participants are assumed to be identified by ``0..n_participants-1``.
    non_diseased_participants : array-like
        Ids of participants treated as stage 0.
    n_stages : int
        Number of candidate stages (``0..n_stages-1``).
    theta_phi_kmeans : object
        Forwarded to ``utils.get_theta_phi_conjugate_priors``.

    Returns
    -------
    float
        Sum of the per-participant log-likelihoods.
    """
    # Stand-in for a likelihood of exactly 0 so that np.log stays finite.
    zero_likelihood_floor = 1e-20

    data = data_we_have.copy()
    # Dict lookup per row: raises KeyError for an unknown biomarker.
    data['S_n'] = [ordering_dict[b] for b in data['biomarker']]
    data = utils.add_kj_and_affected(data, participant_stages, n_participants)

    # Take the biomarker list from the data passed in rather than from a
    # notebook-level global, so the function does not depend on hidden state.
    biomarker_names = data['biomarker'].unique()
    estimated_theta_phi = utils.get_theta_phi_conjugate_priors(
        biomarker_names, data, theta_phi_kmeans)

    # Set membership is equivalent to `p in array` here and avoids a scan per participant.
    known_stage_zero = set(np.asarray(non_diseased_participants).tolist())

    all_participant_ln_likelihood = 0.0
    for p in range(n_participants):
        pdata = data[data.participant == p].reset_index(drop=True)

        if p in known_stage_zero:
            # Stage is known (k_j = 0); the likelihood is still needed for the total.
            this_participant_likelihood = utils.compute_likelihood(
                pdata, k_j = 0, theta_phi = estimated_theta_phi)
        else:
            # Stage is unknown: likelihood of this participant's measurements at every stage.
            stage_likelihood = np.array(
                [utils.compute_likelihood(pdata, k_j, estimated_theta_phi)
                 for k_j in range(n_stages)],
                dtype=float)
            likelihood_sum = stage_likelihood.sum()
            if likelihood_sum > 0:
                stage_weights = stage_likelihood / likelihood_sum
            else:
                # Every stage has zero likelihood; avoid 0/0 producing NaN weights.
                stage_weights = np.full(n_stages, 1.0 / max(n_stages, 1))
            normalized_stage_likelihood = list(stage_weights)

            # Weighted average over stages, because the exact stage is not known.
            this_participant_likelihood = utils.weighted_average_likelihood(
                pdata, n_stages, normalized_stage_likelihood, estimated_theta_phi)

        if this_participant_likelihood == 0:
            this_participant_likelihood = zero_likelihood_floor
        all_participant_ln_likelihood += np.log(this_participant_likelihood)

    return all_participant_ln_likelihood

In [176]:
# Recompute the total log-likelihood under the sampler's current ordering,
# using the participant stages loaded from the stage log.
current_ll = cal_all_participant_ln_likelihood_based_on_ordering_and_participant_stages(
    data_we_have,
    current_ordering_dict,
    participant_stages,
    n_participants, 
    non_diseased_participants,
    n_stages,
    theta_phi_kmeans,
)

In [177]:
# NOTE(review): the recorded value differs from current_accepted_ll defined earlier —
# confirm whether the two are expected to match.
current_ll

-888.9376221102552

In [178]:
# Same computation under the reference ordering (real_ordering_dict).
# NOTE(review): participant_stages is the same stage vector used for the current ordering;
# confirm that reusing it here is intended for this comparison.
real_ll = cal_all_participant_ln_likelihood_based_on_ordering_and_participant_stages(
    data_we_have,
    real_ordering_dict,
    participant_stages,
    n_participants, 
    non_diseased_participants,
    n_stages,
    theta_phi_kmeans
)

In [179]:
# Log-likelihood under real_ordering_dict, for comparison with current_ll.
real_ll

-881.9991419141088