# Estimating biomarker ordering

>The sampler for the biomarker ordering can be a bit tricker. The simplest way to do it might be to do a Metropolis-Hastings step where you select two indicies and propose swapping their order. Then you can work out the relative probabilities, evaluate and then accept/reject based on that. It's not the fastest sampler, but it is a lot more straightforward than some ways of doing it.  

In the following, we assume we know the actual $\theta$ and $\phi$ values. Other than those, we know nothing except for participants' observed biomarker values. And we want to estimate the current order in which different biomarkers are affected by the disease in question. 

In [156]:
import pandas as pd 
import numpy as np 
import re 
import altair as alt 
import matplotlib.pyplot as plt 
from collections import Counter

We only have three columns: biomarker, participant, and measurement. 

In [157]:
data = pd.read_csv('data/participant_data.csv')
data.Biomarker = [re.sub("Biomarker ", "", text) for text in data.Biomarker.tolist()]
data_we_have = data.drop(['k_j', 'S_n', 'affected_or_not'], axis = 1)
data_we_have.head()

Unnamed: 0,Biomarker,participant,measurement
0,0,0,24.71492
1,0,1,32.103408
2,0,2,21.084512
3,0,3,27.019921
4,0,4,27.673772


In [158]:
theta_phi = pd.read_csv('data/means_vars.csv')
theta_phi.head()

Unnamed: 0,biomarker,theta_mean,theta_var,phi_mean,phi_var
0,0,1.0,0.3,32.0,6.3
1,1,3.0,0.5,31.0,7.4
2,2,5.0,0.2,34.0,9.4
3,3,6.0,1.3,36.0,4.9
4,4,8.0,3.3,38.0,2.5


In [159]:
type(theta_phi['biomarker'][0])

numpy.int64

In [160]:
def fill_up_pdata(pdata, k_j):
    '''Fill up participant data using k_j
    Input:
    - pdata: a dataframe of ten biomarker values for a specific participant 
    - k_j: a scalar
    '''
    data = pdata.copy()
    data['k_j'] = k_j
    data['affected'] = data.apply(lambda row: row.k_j >= row.S_n, axis = 1)
    return data 

In [161]:
# def compute_single_measurement_log_likelihood(theta_phi, biomarker, affected, measurement):
#     '''Computes the log likelihood of the measurement value of a single biomarker
#     We know the normal distribution defined by either theta or phi
#     and we know the measurement. This will give us the probability
#     of the given measurement. 

#     input:
#     - theta_phi: the dataframe containing theta and phi values for each biomarker
#     - biomarker: an integer between 0 and 9 
#     - affected: boolean 
#     - measurement: the observed value for a biomarker in a specific participant

#     output: a number 
#     '''
#     biomarker_params = theta_phi[theta_phi.biomarker == biomarker].reset_index()
#     mu = biomarker_params['theta_mean'][0] if affected else biomarker_params['phi_mean'][0]
#     var = biomarker_params['theta_var'][0] if affected else biomarker_params['phi_var'][0]
#     return -0.5*np.log(2*np.pi*var) - ((measurement - mu)**2/(2*var))

In [162]:
def compute_single_measurement_likelihood(theta_phi, biomarker, affected, measurement):
    '''Computes the likelihood of the measurement value of a single biomarker
    We know the normal distribution defined by either theta or phi
    and we know the measurement. This will give us the probability
    of this given measurement value. 

    Note that because the likelihood tends to be very very small, 
    we take the natural log of it

    input:
    - theta_phi: the dataframe containing theta and phi values for each biomarker
    - biomarker: an integer between 0 and 9 
    - affected: boolean 
    - measurement: the observed value for a biomarker in a specific participant

    output: a scalar
    '''
    biomarker_params = theta_phi[theta_phi.biomarker == biomarker].reset_index()
    mu = biomarker_params['theta_mean'][0] if affected else biomarker_params['phi_mean'][0]
    var = biomarker_params['theta_var'][0] if affected else biomarker_params['phi_var'][0]
    # sigma = np.sqrt(var)
    likelihood = np.exp(-(measurement - mu)**2/(2*var))/np.sqrt(2*np.pi*var)
    return likelihood

In [163]:
# def compute_log_likelihood(pdata, k_j):
#     '''This implementes the formula of https://ebm-book2.vercel.app/distributions.html#known-k-j
#     '''
#     data = fill_up_pdata(pdata, k_j)
#     likelihood = 0
#     for i, row in data.iterrows():
#         biomarker = int(row['Biomarker'])
#         measurement = row['measurement']
#         affected = row['affected']
#         likelihood += compute_single_measurement_log_likelihood(
#             theta_phi, biomarker, affected, measurement)
#     return likelihood

In [164]:
def compute_likelihood(pdata, k_j, theta_phi):
    '''This implementes the formula of https://ebm-book2.vercel.app/distributions.html#known-k-j
    '''
    data = fill_up_pdata(pdata, k_j)
    likelihood = 1
    for i, row in data.iterrows():
        biomarker = int(row['Biomarker'])
        measurement = row['measurement']
        affected = row['affected']
        likelihood *= compute_single_measurement_likelihood(
            theta_phi, biomarker, affected, measurement)
    return likelihood

## Testing

The above functions can compute the likelihood of a participant's sequence of biomarker data, given that we know the exact ordering and we assume a `k_j`. Next, we will test those functions by selecting a specific participant. We compute the likelihood by trying all possible `k_j` and see whether the one with the highest likelihood is the real `k_j` in the original data. 

In [165]:
p = 15 # we chose this participant
pdata = data[data.participant == p].reset_index(drop=True)
pdata

Unnamed: 0,Biomarker,participant,measurement,k_j,S_n,affected_or_not
0,0,15,36.547174,6,7,not_affected
1,1,15,2.909218,6,5,affected
2,2,15,5.042268,6,3,affected
3,3,15,37.349589,6,9,not_affected
4,4,15,9.50903,6,1,affected
5,5,15,2.335076,6,2,affected
6,6,15,4.764619,6,6,affected
7,7,15,17.633576,6,10,not_affected
8,8,15,8.029674,6,4,affected
9,9,15,35.276224,6,8,not_affected


In [166]:
# ordering of biomarker affected by the disease
real_ordering_dic = dict(zip(np.arange(10), pdata.S_n))
real_ordering_dic

{0: 7, 1: 5, 2: 3, 3: 9, 4: 1, 5: 2, 6: 6, 7: 10, 8: 4, 9: 8}

In [167]:
# get the participant data without k_j, S_n, and affected or not
pdata = data_we_have[data_we_have.participant == p].reset_index(drop=True)
# obtain real ordering:
pdata['S_n'] = pdata.apply(lambda row: real_ordering_dic[int(row['Biomarker'])], axis = 1)
pdata

Unnamed: 0,Biomarker,participant,measurement,S_n
0,0,15,36.547174,7
1,1,15,2.909218,5
2,2,15,5.042268,3
3,3,15,37.349589,9
4,4,15,9.50903,1
5,5,15,2.335076,2
6,6,15,4.764619,6
7,7,15,17.633576,10
8,8,15,8.029674,4
9,9,15,35.276224,8


In [168]:
num_biomarkers = len(pdata.Biomarker.unique())
# calculate likelihood for all possible k_j
likelihood_list = [
    compute_likelihood(pdata=pdata, k_j=x, theta_phi=theta_phi) for x in range(num_biomarkers+1)]
kjs = np.arange(11)
dic = dict(zip(kjs, likelihood_list))
df = pd.DataFrame.from_dict(dic, orient='index', columns=['likelihood']).reset_index()
df.sort_values('likelihood', ascending=False)

Unnamed: 0,index,likelihood
6,6,8.498626e-15
5,5,1.070326e-36
4,4,1.961882e-60
3,3,2.230992e-80
2,2,1.390695e-100
1,1,9.775606e-150
0,0,4.9412080000000005e-220
7,7,0.0
8,8,0.0
9,9,0.0


<!-- From the above result, we can see that the most likelihood `k_j` is 8, which is in fact the real `k_j` in the participant data.  -->

## Metropolis-Hastings Algorithm Implementation

Next, we will implement the metropolis-hastings algorithm using the above functions. 

In [169]:
def average_all_likelihood(pdata, num_biomarkers, theta_phi):
    '''This is to compute https://ebm-book2.vercel.app/distributions.html#unknown-k-j
    '''
    return np.mean([compute_likelihood(pdata=pdata, k_j=x, theta_phi=theta_phi) for x in range(num_biomarkers+1)])

In [170]:
def compute_likelihood_based_on_ordering(ordering, data, num_participants, num_biomarkers, theta_phi):
    """Compute likelihood
    Inputs:
        - ordering: an array of ordering for biomarker 1-10
        - data: data_we_have
        - num_participants
        - num_biomarkers 
    Outputs:
        - likelihood
    """
    # biomarker - order dict
    ordering_dic = dict(zip(np.arange(num_biomarkers), ordering))
    # fill up S_n column using the ordering dict
    # copy first in order not to change data_we_have
    filled_data = data.copy()
    filled_data['S_n'] = filled_data.apply(lambda row: ordering_dic[int(row['Biomarker'])], axis = 1)
    likelihood = 0 
    for p in range(num_participants):
        pdata = filled_data[filled_data.participant == p].reset_index(drop=True)
        average_likelihood = average_all_likelihood(pdata, num_biomarkers, theta_phi)
        # print(average_likelihood)
        if average_likelihood == 0:
            # this is to avoid np.log(0)
            log_likelihood = np.log(average_likelihood + 1e-20)
        else:
            log_likelihood = np.log(average_likelihood)
            # print(log_likelihood)
        likelihood += log_likelihood
    return likelihood

In [171]:
def metropolis_hastings(data, iterations, theta_phi):
    '''Implement the metropolis-hastings algorithm
    Inputs: 
        - data: data_we_have
        - iterations: number of iterations

    Outputs:
        - best_order: a numpy array
        - best_likelihood: a scalar 
    '''
    num_participants = len(data.participant.unique())
    num_biomarkers = len(data.Biomarker.unique())

    # initialize an ordering and likelihood
    best_order = np.arange(num_biomarkers)
    best_likelihood = -np.inf 
    # best_order = np.array(list(real_ordering_dic.values()))
    # best_likelihood = compute_likelihood_based_on_ordering(
    #     best_order, data, num_participants, num_biomarkers
    # )
    for _ in range(iterations):
        new_order = best_order.copy()
        # randomly select two indices
        a, b = np.random.choice(num_biomarkers, 2, replace=False)
        # swaping the order
        new_order[a], new_order[b] = new_order[b], new_order[a]
        likelihood = compute_likelihood_based_on_ordering(new_order, data, num_participants, num_biomarkers, theta_phi)
        if likelihood > best_likelihood:
            best_likelihood = likelihood 
            best_order = new_order
        else: 
            ratio = likelihood/best_likelihood
            random_number = np.random.rand()
            if random_number < ratio:
                best_likelihood = likelihood
                best_order = new_order
        print(f"iteration {_ + 1} done")
    return best_order, best_likelihood


In [172]:
best_order, best_likelihood = metropolis_hastings(data_we_have, 10, theta_phi)

iteration 1 done
iteration 2 done
iteration 3 done
iteration 4 done
iteration 5 done
iteration 6 done
iteration 7 done
iteration 8 done
iteration 9 done
iteration 10 done


In [173]:
best_order, np.array(list(real_ordering_dic.values()))

(array([3, 0, 7, 2, 5, 9, 1, 8, 6, 4]),
 array([ 7,  5,  3,  9,  1,  2,  6, 10,  4,  8]))

## Unknown theta and phi

In [174]:
data_we_have.head()

Unnamed: 0,Biomarker,participant,measurement
0,0,0,24.71492
1,0,1,32.103408
2,0,2,21.084512
3,0,3,27.019921
4,0,4,27.673772


In [None]:
def metropolis_hastings(data, iterations):
    '''Implement the metropolis-hastings algorithm
    Inputs: 
        - data: data_we_have
        - iterations: number of iterations

    Outputs:
        - best_order: a numpy array
        - best_likelihood: a scalar 
    '''
    num_participants = len(data.participant.unique())
    num_biomarkers = len(data.Biomarker.unique())

    # initialize an ordering and likelihood
    best_order = np.arange(num_biomarkers)
    best_likelihood = -np.inf 

    for _ in range(iterations):
        new_order = best_order.copy()
        # randomly select two indices
        a, b = np.random.choice(num_biomarkers, 2, replace=False)
        # swaping the order
        new_order[a], new_order[b] = new_order[b], new_order[a]

        likelihood = compute_likelihood_based_on_ordering(new_order, data, num_participants, num_biomarkers)
        
        if likelihood > best_likelihood:
            best_likelihood = likelihood 
            best_order = new_order
        else: 
            ratio = likelihood/best_likelihood
            random_number = np.random.rand()
            if random_number < ratio:
                best_likelihood = likelihood
                best_order = new_order
        print(f"iteration {_ + 1} done")
    return best_order, best_likelihood