In [22]:
import pandas as pd
import numpy as np

In [23]:
def get_biomarker_stage_probability(df, burn_in, thining):
    """Estimate, for each biomarker, the probability of occupying each stage.

    Rows are first filtered by burn-in and thinning: only rows whose index
    value is strictly greater than ``burn_in`` and divisible by ``thining``
    are kept. For every column (biomarker), the share of kept rows in which
    it takes each stage value ``1..n_biomarkers`` is then computed.

    Input:
        - df: pandas DataFrame of sampled orderings (e.g. all_ordering.csv),
          with one column per biomarker and each cell holding the stage
          assigned to that biomarker. The index is used for the burn-in /
          thinning filter (presumably the iteration number).
        - burn_in: rows with index <= burn_in are discarded
        - thining: thinning interval; only rows with index % thining == 0
          are kept (the parameter name keeps its original spelling so
          existing keyword callers do not break)
    Output:
        - a pandas DataFrame indexed by biomarker name ("biomarker"), with
          one column per stage (integers 1..n_biomarkers); each cell is the
          fraction of kept rows in which that biomarker has that stage
    Raises:
        - ValueError: if thining is 0, or if no rows survive the filter
          (previously this surfaced as an opaque ZeroDivisionError)
    """
    if thining == 0:
        raise ValueError("thining must be non-zero")

    kept = df[(df.index > burn_in) & (df.index % thining == 0)]
    n_kept = len(kept)
    if n_kept == 0:
        raise ValueError(
            "no rows left after applying burn_in=%r and thining=%r"
            % (burn_in, thining)
        )

    # There are as many candidate stages as biomarkers (columns).
    stages = range(1, kept.shape[1] + 1)

    records = []
    for biomarker in kept.columns:
        # value_counts yields a Series mapping each observed stage to the
        # number of kept rows in which this biomarker has that stage.
        stage_counts = kept[biomarker].value_counts()
        record = {"biomarker": biomarker}
        for stage in stages:
            # Stages never observed for this biomarker get probability 0.
            record[stage] = stage_counts.get(stage, 0) / n_kept
        records.append(record)

    # Passing the column list keeps the layout well-defined even when the
    # frame has no biomarker columns at all.
    result = pd.DataFrame(records, columns=["biomarker", *stages])
    return result.set_index("biomarker")

In [45]:
# Load the sampled orderings and use the file's first column as the index
# (the rendered output shows that column is named "iteration").
df = (
    pd.read_csv("logs/conjugate_priors/all_ordering.csv")
    .pipe(lambda raw: raw.set_index(raw.columns[0]))
)
df.head()

Unnamed: 0_level_0,MMSE,ADAS,AB,P-Tau,HIP-FCI
iteration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5,4,3,2,1
2,5,3,1,2,4
3,4,3,2,5,1
4,2,5,1,4,3
5,4,3,1,5,2


In [43]:
# Preview the row filter: keep even index values strictly greater than 10.
df.loc[(df.index > 10) & (df.index % 2 == 0)]

Unnamed: 0_level_0,MMSE,ADAS,AB,P-Tau,HIP-FCI
iteration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12,2,3,4,1,5
14,3,2,4,5,1
16,2,3,5,4,1
18,1,4,3,2,5
20,3,4,5,1,2
22,2,4,3,5,1
24,1,2,3,5,4
26,4,2,5,3,1
28,2,1,3,4,5
30,2,3,1,5,4


In [40]:
# Display the index of df; the row filters above compare against these values.
df.index

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
           dtype='int64', name='iteration')

In [35]:
# Per-biomarker stage probabilities computed over every row of df
# (no burn-in or thinning filter is applied in this cell).
stages = range(1, df.shape[1] + 1)  # one candidate stage per column
dict_list = []

for biomarker in df.columns:
    # value_counts maps each observed stage to the number of rows holding it
    counts = df[biomarker].value_counts()
    row = {"biomarker": biomarker}
    # stages never observed for this biomarker get probability 0
    row.update({stage: counts.get(stage, 0) / len(df) for stage in stages})
    dict_list.append(row)

dff = pd.DataFrame(dict_list)
dff

Unnamed: 0,biomarker,1,2,3,4,5
0,MMSE,0.12,0.34,0.2,0.24,0.1
1,ADAS,0.08,0.22,0.22,0.22,0.26
2,AB,0.26,0.16,0.18,0.16,0.24
3,P-Tau,0.18,0.18,0.14,0.2,0.3
4,HIP-FCI,0.36,0.1,0.26,0.18,0.1
