In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_biomarker_stage_probability(df, burn_in, thining):
    """filter through all_dicts using burn_in and thining 
    and for each biomarker, get probability of being in each possible stage

    Input:
        - df: all_ordering.csv
        - burn_in
        - thinning
    Output:
        - dff: a pandas dataframe where index is biomarker name, each col is each stage
        and each cell is the probability of that biomarker indicating that stage
    """
    df = df[(df.index > burn_in) & (df.index % thining == 0)]
    # Create an empty list to hold dictionaries
    dict_list = []

    # for each biomarker
    for col in df.columns:
        dic = {"biomarker": col}
        # get the frequency of biomarkers
        # value_counts will generate a Series where index is each cell's value
        # and the value is the frequency of that value
        stage_counts = df[col].value_counts()
        # for each stage
        # not that df.shape[1] should be equal to num_biomarkers
        for i in range(1, df.shape[1] + 1):
            # get stage:prabability
            dic[i] = stage_counts.get(i, 0)/len(df)
        dict_list.append(dic)

    dff = pd.DataFrame(dict_list)
    dff.set_index(dff.columns[0], inplace=True)
    return dff 

In [3]:
df = pd.read_csv("logs/conjugate_priors/all_ordering.csv")
df.set_index(df.columns[0], inplace=True)
# df = df.set_index("iteration")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'logs/conjugate_priors/all_ordering.csv'

In [None]:
df[(df.index > 10) & (df.index % 2 == 0)].head()

Unnamed: 0_level_0,MMSE,ADAS,AB,P-Tau,HIP-FCI
iteration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12,3,5,1,2,4
14,5,4,3,2,1
16,3,5,1,4,2
18,4,3,2,5,1
20,4,2,5,1,3


In [None]:
df.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            1491, 1492, 1493, 1494, 1495, 1496, 1497, 1498, 1499, 1500],
           dtype='int64', name='iteration', length=1500)

In [None]:
dict_list = []

# for each biomarker
for col in df.columns:
    dic = {"biomarker": col}
    # get the frequency of biomarkers
    # value_counts will generate a Series where index is each cell's value
    # and the value is the frequency of that value
    stage_counts = df[col].value_counts()
    # for each stage
    for i in range(1, df.shape[1] + 1):
        # get stage:prabability
        dic[i] = stage_counts.get(i, 0)/len(df)
    dict_list.append(dic)

dff = pd.DataFrame(dict_list)
dff

Unnamed: 0,biomarker,1,2,3,4,5
0,MMSE,0.192667,0.214667,0.195333,0.196667,0.200667
1,ADAS,0.213333,0.185333,0.209333,0.187333,0.204667
2,AB,0.201333,0.19,0.206,0.200667,0.202
3,P-Tau,0.200667,0.220667,0.186667,0.213333,0.178667
4,HIP-FCI,0.192,0.189333,0.202667,0.202,0.214


In [None]:
a1 = np.array([4, 5, 6])
a2 = np.array([1, 3, 4])
a = [a1, a2]
x = np.arange(1, 4, 1)
df = pd.DataFrame(a)
df.index.name = 'iteration'
df

Unnamed: 0_level_0,0,1,2
iteration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4,5,6
1,1,3,4


In [None]:
import utils
def get_data_we_have(file):
    """Prepare data for analysis below
    """
    df = pd.read_excel(file)
    df.rename(columns={df.columns[0]: 'participant_category', df.columns[1]: 'participant'}, inplace=True)
    df['diseased'] = df.apply(lambda row: row.participant_category != 'CN', axis = 1)
    df = pd.melt(df, id_vars=['participant_category', "participant", "timestamp", 'diseased'], 
                        value_vars=["FCI(HIP)", "GMI(HIP)", "FCI(Fusi)", "FCI(PCC)", "GMI(FUS)"], 
                        var_name='biomarker', value_name='measurement')
    # convert participant id
    num_participant = len(df.participant.unique())
    participant_string_id_dic = dict(zip(df.participant.unique(), [_ for _ in range(num_participant)]))
    df['participant'] = df.apply(lambda row: participant_string_id_dic[row.participant], axis = 1 )
    return df 

In [None]:
data_we_have = get_data_we_have("data/Chen2016Data.xlsx")
biomarkers = data_we_have.biomarker.unique()
num_biomarkers = len(biomarkers)
num_participant = len(data_we_have.participant.unique())

theta_phi_kmeans = utils.get_theta_phi_kmeans(data_we_have, biomarkers, n_clusters = 2)

In [None]:
data_we_have

Unnamed: 0,participant_category,participant,timestamp,diseased,biomarker,measurement
0,CN,0,6/2/2011,False,FCI(HIP),-2.544567
1,CN,1,9/2/2011,False,FCI(HIP),-1.603212
2,CN,2,10/11/2011,False,FCI(HIP),-4.716009
3,CN,3,8/8/2011,False,FCI(HIP),-4.232625
4,CN,4,11/28/2011,False,FCI(HIP),3.628361
...,...,...,...,...,...,...
715,AD,139,10/17/2012,True,GMI(FUS),0.522749
716,AD,140,2/15/2013,True,GMI(FUS),0.526017
717,AD,141,2/6/2013,True,GMI(FUS),0.342435
718,AD,142,7/16/2013,True,GMI(FUS),0.415011


In [None]:
num_participants = len(data_we_have.participant.unique())
num_biomarkers = len(data_we_have.biomarker.unique())
n_stages = num_biomarkers + 1
biomarkers = data_we_have.biomarker.unique()
non_diseased_participants = data_we_have.loc[data_we_have.diseased == False].participant.unique()

In [None]:
non_diseased_participants

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])

## Analyze conjugate priors results

In [None]:
participant_stages_df = pd.read_csv("logs/conjugate_priors/all_current_participant_stages.csv")
participant_stages_df.head()
participant_stages = participant_stages_df.iloc[-1, 1:]
np.array(participant_stages)

array([3, 4, 5, 4, 1, 2, 1, 4, 5, 1, 0, 3, 3, 5, 2, 0, 0, 4, 5, 4, 3, 4,
       1, 0, 2, 5, 2, 4, 2, 0, 3, 0, 2, 4, 4, 5, 1, 5, 2, 2, 2, 5, 4, 1,
       5, 1, 2, 0, 5, 1, 3, 3, 3, 1, 3, 4, 2, 4, 1, 4, 1, 2, 2, 4, 1, 3,
       1, 2, 5, 0, 2, 0, 0, 3, 2, 2, 3, 3, 5, 4, 5, 2, 1, 5, 4, 0, 0, 1,
       2, 4, 2, 2, 2, 1, 0, 3, 3, 0, 3, 4])

```
array([3, 4, 5, 4, 1, 2, 1, 4, 5, 1, 0, 3, 3, 5, 2, 0, 0, 4, 5, 4, 3, 4,
       1, 0, 2, 5, 2, 4, 2, 0, 3, 0, 2, 4, 4, 5, 1, 5, 2, 2, 2, 5, 4, 1,
       5, 1, 2, 0, 5, 1, 3, 3, 3, 1, 3, 4, 2, 4, 1, 4, 1, 2, 2, 4, 1, 3,
       1, 2, 5, 0, 2, 0, 0, 3, 2, 2, 3, 3, 5, 4, 5, 2, 1, 5, 4, 0, 0, 1,
       2, 4, 2, 2, 2, 1, 0, 3, 3, 0, 3, 4])
```

In [None]:
current_best_order_dict = {'MMSE': 5, 'ADAS': 4, 'AB': 2, 'P-Tau': 3, 'HIP-FCI': 1}
real_order_dict = {'MMSE': 4, 'ADAS': 5, 'AB': 2, 'P-Tau': 3, 'HIP-FCI': 1}

In [None]:
original_data = pd.read_csv('data/participant_data.csv')
original_data['diseased'] = original_data.apply(lambda row: row.k_j > 0, axis = 1)
data = original_data.drop(['k_j', 'S_n', 'affected_or_not'], axis = 1)
theta_phi_kmeans = pd.read_csv("data/estimate_means_stds_kmeans.csv")
biomarkers = data.biomarker.unique()
num_biomarkers = len(biomarkers)

In [None]:
# now data_we_have has S_n column
data['S_n'] = data.apply(lambda row: current_best_order_dict[row['biomarker']], axis = 1)

# add kj and affected for the whole dataset based on the initial randomized participant_stages
data = utils.add_kj_and_affected(data, participant_stages, num_participants)
# print(data.head())

# get estimated_theta_phi
estimated_theta_phi = utils.get_theta_phi_conjugate_priors(biomarkers, data, theta_phi_kmeans=theta_phi_kmeans)

In [None]:
current_best_likelihood = utils.compute_ln_likelihood_assuming_ordering(
    current_best_order_dict, data, num_biomarkers, estimated_theta_phi)
current_best_likelihood

-1846.4588138048964

In [None]:
# now data_we_have has S_n column
data['S_n'] = data.apply(lambda row: real_order_dict[row['biomarker']], axis = 1)

# add kj and affected for the whole dataset based on the initial randomized participant_stages
data = utils.add_kj_and_affected(data, participant_stages, num_participants)
# print(data.head())

# get estimated_theta_phi
estimated_theta_phi = utils.get_theta_phi_conjugate_priors(biomarkers, data, theta_phi_kmeans=theta_phi_kmeans)

In [None]:
real_likelihood = utils.compute_ln_likelihood_assuming_ordering(
    current_best_order_dict, data, num_biomarkers, estimated_theta_phi)
real_likelihood

-1853.3074420390506

## new

In [None]:
from collections import Counter
import numpy as np 
import pandas as pd
a = np.random.rand(3, 2)
a

array([[0.95423325, 0.36367457],
       [0.6303242 , 0.88388457],
       [0.14316481, 0.29911683]])

In [None]:
a1 = [2, 3, 4]
a2 = [2, 4, 3]
a3 = [3, 4, 5]
a4 = [2, 4, 3]
a = np.array([a1, a2, a3, a4])
a

array([[2, 3, 4],
       [2, 4, 3],
       [3, 4, 5],
       [2, 4, 3]])

In [None]:
a[:, 0]

array([2, 2, 3, 2])

In [None]:
Counter(a[:, 0])

Counter({2: 3, 3: 1})

In [None]:
unique_elements, counts = np.unique(a[:,0], return_counts=True)
unique_elements, counts

(array([2, 3]), array([3, 1]))

In [None]:
counts.sum()

4

In [None]:
prob = counts/counts.sum()
prob

array([0.75, 0.25])

In [None]:
np.random.choice(a[:, 0])

2

In [None]:
def sampled_row_based_on_column_frequencies(a):
    """for ndarray, sample one element in each col based on elements' frequencies
    input:
        a: a numpy ndarray 
    output:
        a 1d array 
    """
    sampled_row = []
    for col in range(a.shape[1]):
        col_arr = a[:, col]
        unique_elements, counts = np.unique(col_arr, return_counts=True)
        probs = counts/counts.sum()
        sampled_element = np.random.choice(unique_elements, p=probs)
        sampled_row.append(sampled_element)
    return np.array(sampled_row)

In [None]:
sampled_row = sampled_row_based_on_column_frequencies(a)
print("Original array:\n", a)
print("Sampled row:", sampled_row)

Original array:
 [[2 3 4]
 [2 4 3]
 [3 4 5]
 [2 4 3]]
Sampled row: [2 4 3]
