In [1]:
import pandas as pd 
import numpy as np 
import eval 
import utils
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): with no category argument this ignores every warning class,
# so numerical RuntimeWarnings (overflow, invalid value) raised further down
# will not be shown — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")


In [2]:
# Load the measurements table through the project-local `eval` module
# (the name refers to the module imported above, not the built-in eval()).
# NOTE(review): "Chen Data" is presumably a dataset/folder name — confirm
# against eval.get_data_we_have.
data_we_have = eval.get_data_we_have("Chen Data")
# Preview the first rows.
data_we_have.head()

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement
0,CN,0,6/2/2011,False,FCI(HIP),-2.544567
1,CN,1,9/2/2011,False,FCI(HIP),-1.603212
2,CN,2,10/11/2011,False,FCI(HIP),-4.716009
3,CN,3,8/8/2011,False,FCI(HIP),-4.232625
4,CN,4,11/28/2011,False,FCI(HIP),3.628361


In [3]:
# Distinct biomarker names, in order of first appearance in the table.
biomarkers = data_we_have["biomarker"].unique()
n_biomarkers = len(biomarkers)
n_participants = len(data_we_have["participant"].unique())

# There is one more stage than there are biomarkers; diseased_stages
# enumerates the integer stages 1 .. n_biomarkers.
n_stages = n_biomarkers + 1
diseased_stages = np.arange(1, n_stages)

# IDs of the participants whose rows carry diseased == False.
non_diseased_participant_ids = data_we_have.loc[
    data_we_have["diseased"].eq(False), "participant"
].unique()

In [19]:
# Show the distinct biomarker names extracted from data_we_have.
biomarkers

array(['FCI(HIP)', 'GMI(HIP)', 'FCI(Fusi)', 'FCI(PCC)', 'GMI(FUS)'],
      dtype=object)

In [4]:
# Show the IDs of participants flagged diseased == False.
non_diseased_participant_ids

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])

In [5]:
# Obtain the initial theta and phi estimates (mean and std per biomarker)
# using the "kmeans_only" method with two clusters.
theta_phi_estimates = utils.get_theta_phi_estimates(
    data_we_have, biomarkers, n_clusters=2, method="kmeans_only"
)

In [6]:
# Show the initial theta/phi estimates, one row per biomarker.
theta_phi_estimates

Unnamed: 0,biomarker,theta_mean,theta_std,phi_mean,phi_std
0,FCI(HIP),2.794918,2.983066,-5.546734,2.887983
1,GMI(HIP),0.341778,0.054571,0.482245,0.037704
2,FCI(Fusi),-19.767018,4.183856,-10.216922,3.021294
3,FCI(PCC),11.583134,3.358987,1.781998,3.292063
4,GMI(FUS),0.457107,0.044667,0.56915,0.035734


In [7]:
# truth = pd.read_csv("data/means_stds.csv")
# truth

In [8]:
# Start the search from a random assignment of the positions
# 1 .. n_stages - 1 to the biomarkers.
current_accepted_order = np.random.permutation(np.arange(1, n_stages))
current_accepted_order_dict = {
    name: position
    for name, position in zip(biomarkers, current_accepted_order)
}
# No ordering has been scored yet, so the baseline log-likelihood is -inf.
current_accepted_likelihood = float("-inf")

In [9]:
# Number of propose/score/accept passes performed by the sampling loop.
iterations = 100

## Iteration

In [10]:
# Stochastic search over biomarker orderings. Each pass proposes a perturbed
# ordering, scores it, refreshes the theta/phi estimates, and then accepts or
# rejects the proposal by comparing a uniform draw with the likelihood ratio.
for i in range(iterations):
    # Perturb a copy so current_accepted_order survives a rejected proposal.
    # The return value of shuffle_order is not used, so it presumably mutates
    # new_order in place — confirm against utils.
    new_order = current_accepted_order.copy()
    utils.shuffle_order(new_order, n_shuffle=2)
    current_order_dict = dict(zip(biomarkers, new_order))

    # Total log-likelihood of the proposed ordering under the current
    # theta/phi estimates, plus the hashmap consumed by the update below.
    all_ll, hashmap = utils.calculate_all_participant_ln_likelihood_and_update_hashmap(
        data_we_have,
        current_order_dict,
        n_participants,
        non_diseased_participant_ids,
        theta_phi_estimates,
        diseased_stages,
    )

    # NOTE(review): theta_phi_estimates is overwritten on every pass, before
    # the accept/reject decision, so estimates derived from a rejected
    # ordering carry over into the next pass — confirm this is intended.
    theta_phi_estimates = utils.soft_kmeans_theta_phi_estimates(
        i,
        data_we_have,
        biomarkers,
        current_order_dict,
        n_participants,
        non_diseased_participant_ids,
        hashmap,
        diseased_stages,
        seed=1234
    )

    # Acceptance probability min(1, exp(log-likelihood difference)).
    # The difference is capped at 0 *before* exponentiating: exp() of a large
    # positive difference overflows float64 (and is inf while
    # current_accepted_likelihood is still -inf), which is not a probability.
    # The accept/reject outcome is unchanged: np.random.rand() is always < 1,
    # so any proposal at least as likely as the accepted one is still taken,
    # and a NaN difference still compares False and is rejected.
    log_acceptance_ratio = all_ll - current_accepted_likelihood
    prob_of_accepting_new_order = np.exp(
        np.minimum(log_acceptance_ratio, 0.0))

    # With the -inf starting likelihood, the first finite score is always
    # accepted.
    if np.random.rand() < prob_of_accepting_new_order:
        print("change now")
        current_accepted_order = new_order
        current_accepted_likelihood = all_ll
        current_accepted_order_dict = current_order_dict
    print(f"iteration {i+1} done!")
    print(theta_phi_estimates)

change now
iteration 1 done!
   biomarker  theta_mean  theta_std   phi_mean   phi_std
0   FCI(HIP)    4.586966   3.078976  -2.816131  4.584130
1   GMI(HIP)    0.357818   0.074792   0.473351  0.046560
2  FCI(Fusi)  -15.550536   5.717119 -11.492067  4.858621
3   FCI(PCC)   11.538736   4.258356   3.482566  4.810499
4   GMI(FUS)    0.524240   0.064704   0.565867  0.050199
change now
iteration 2 done!
   biomarker  theta_mean  theta_std   phi_mean   phi_std
0   FCI(HIP)    6.468818   2.457429  -2.459345  4.659018
1   GMI(HIP)    0.363674   0.072811   0.478775  0.044267
2  FCI(Fusi)  -17.575815   5.769655 -11.667470  4.643482
3   FCI(PCC)   13.171987   3.283728   3.662199  4.777442
4   GMI(FUS)    0.524240   0.064704   0.565867  0.050199
change now
iteration 3 done!
   biomarker  theta_mean  theta_std   phi_mean   phi_std
0   FCI(HIP)    6.468818   2.457429  -2.459345  4.659018
1   GMI(HIP)    0.360686   0.069802   0.479678  0.044220
2  FCI(Fusi)  -19.847990   4.935696 -11.679288  4.517963
3

In [11]:
# new_order = current_accepted_order.copy()
# # random.shuffle(new_order)
# utils.shuffle_order(new_order, n_shuffle=2)
# current_order_dict = dict(zip(biomarkers, new_order))
# current_order_dict

In [12]:
# all_ll, hashmap = utils.calculate_all_participant_ln_likelihood_and_update_hashmap(
#         data_we_have,
#         current_order_dict,
#         n_participants,
#         non_diseased_participant_ids,
#         theta_phi_estimates,
#         diseased_stages,
# )
# all_ll, hashmap

In [13]:
# hashmap[49]

In [14]:
# theta_phi_estimates = utils.soft_kmeans_theta_phi_estimates(
#     iteration,
#     data_we_have, 
#     biomarkers, 
#     current_order_dict, 
#     n_participants, 
#     non_diseased_participant_ids, 
#     hashmap, 
#     diseased_stages, 
#     seed=1234
# )

In [15]:
# theta_phi_estimates

In [16]:
# prob_of_accepting_new_order = np.exp(
#     all_ll - current_accepted_likelihood)

# # it will definitely update at the first iteration
# if np.random.rand() < prob_of_accepting_new_order:
#     print("change now")
#     current_accepted_order = new_order
#     current_accepted_likelihood = all_ll
#     current_accepted_order_dict = current_order_dict

In [17]:
# iteration +=

In [18]:
# Load the reference theta/phi means and standard deviations for comparison
# with the estimates produced above.
# NOTE(review): the biomarker labels in this file (e.g. "HIP-FCI") follow a
# different naming scheme from those in data_we_have (e.g. "FCI(HIP)"), so the
# two tables cannot be joined on `biomarker` without a label mapping.
truth = pd.read_csv("data/means_stds.csv")
truth

Unnamed: 0,biomarker,theta_mean,theta_std,phi_mean,phi_std
0,HIP-FCI,-5.0,6.666667,5.0,1.666667
1,HIP-GMI,0.3,0.333333,0.4,0.233333
2,FUS-FCI,-20.0,6.0,-10.0,3.333333
3,PCC-FCI,5.0,3.333333,12.0,4.0
4,FUS-GMI,0.5,0.066667,0.6,0.066667
