# Estimate

## Conjugacy 

Conjugacy occurs when the posterior distribution is in the same family of distribution as the prior distribution, but with new parameter values. 

Why is conjugacy important? Because without it, one has to carry out the integration needed to obtain the posterior, which is often hard. 

Three major conjugate families:

- Beta-Binomial
- Gamma-Poisson
- Normal-Normal

In our example, we assume that the measurement data for each biomarker follows a normal distribution; however, we do not know the exact $\mu$ and $\sigma$. Our job is to estimate the two parameters for each biomarker based on the data we have. 

According to [*An Introduction to Bayesian Thinking*](https://statswithr.github.io/book/inference-and-decision-making-with-multiple-parameters.html#sec:normal-gamma) by Clyde et al. (2022), if the data come from a normal distribution with unknown $\mu$ and $\sigma$, the conjugate prior for $\mu$ (conditional on $\sigma^2$) is a normal distribution with mean $m_0$ and variance $\frac{\sigma^2}{n_0}$, and the conjugate prior for the precision $\frac{1}{\sigma^2}$ is a Gamma distribution with shape $\frac{v_0}{2}$ and rate $\frac{v_0 \sigma_0^{2}}{2}$, where

- $m_0$: prior estimate of $\mu$.
- $n_0$: how strongly the prior belief in $m_0$ is held.
- $\sigma_0^2$: prior estimate of $\sigma^2$.
- $v_0$: prior degrees of freedom, influencing the certainty of $\sigma_0^2$.

In the following, I will use the `pymc` package to estimate the mean and standard deviation for both groups (diseased and healthy) of each biomarker. 


In [64]:
import pandas as pd 
import numpy as np
import pymc as pm
import logging
# Raise the threshold of the 'pymc' logger to WARNING so that its
# lower-severity records (e.g. INFO) are not emitted.
logger = logging.getLogger('pymc')
logger.setLevel(logging.WARNING)
import seaborn as sns 
import altair as alt 
import plotly.graph_objects as go

import matplotlib.pyplot as plt
from scipy.stats import norm

# Report the installed PyMC version.
print(f"Running on PyMC v{pm.__version__}")

Running on PyMC v5.6.1


In [65]:
def estimate_params(m0, n0, s0_sq, v0, data, draws=1000, random_seed=None):
    """Estimate the mean and variance of normally distributed data with PyMC.

    The model uses a Normal-Gamma prior on the mean ``mu`` and the
    precision ``lambda = 1 / sigma^2``:

        lambda          ~ Gamma(alpha=v0 / 2, beta=v0 * s0_sq / 2)
        mu | lambda     ~ Normal(mu=m0, tau=n0 * lambda)
        data | mu, lambda ~ Normal(mu=mu, sigma=1 / sqrt(lambda))

    Parameters
    ----------
    m0 : prior estimate of mu.
    n0 : how strongly the prior belief in m0 is held; must be > 0.
    s0_sq : prior estimate of the variance sigma^2; must be > 0.
    v0 : prior degrees of freedom for the precision; must be > 0.
    data : a non-empty vector of measurements.
    draws : number of posterior draws requested from ``pm.sample``
        (default 1000, the value that was previously hard-coded).
    random_seed : forwarded to ``pm.sample``; the default ``None`` leaves
        the sampler unseeded, as before.

    Returns
    -------
    (mu_estimate, var_estimate)
        ``mu_estimate`` is the mean of the posterior draws of ``mu``.
        ``var_estimate`` is the reciprocal of the mean of the posterior
        draws of ``lambda``. Note that this is a plug-in estimate, not the
        mean of the draws of ``1 / lambda``.

    Raises
    ------
    ValueError
        If ``data`` is empty, or if ``n0``, ``s0_sq`` or ``v0`` is not
        strictly positive.
    """
    # Without observations the sampler would just draw from the prior and the
    # "estimates" would silently be prior-driven, so fail loudly instead.
    if np.size(data) == 0:
        raise ValueError("data must contain at least one measurement")
    # The Gamma shape/rate and the Normal precision are only valid when
    # these hyperparameters are strictly positive.
    if not (n0 > 0 and s0_sq > 0 and v0 > 0):
        raise ValueError("n0, s0_sq and v0 must all be strictly positive")

    with pm.Model():
        # Prior for the precision of the normal distribution (1/sigma^2)
        lambda_ = pm.Gamma('lambda', alpha=v0 / 2, beta=v0 * s0_sq / 2)

        # Prior for the mean; its precision scales with the data precision
        mu = pm.Normal('mu', mu=m0, tau=n0 * lambda_)

        # Likelihood (sampling distribution) of observations
        pm.Normal('Y_obs', mu=mu, sigma=1 / pm.math.sqrt(lambda_), observed=data)

        # Draw posterior samples
        trace = pm.sample(draws, progressbar=False, random_seed=random_seed)

    mu_estimate = np.mean(trace.posterior['mu'].values)
    lambda_estimate = np.mean(trace.posterior['lambda'].values)
    var_estimate = 1 / lambda_estimate  # Convert precision to variance
    return mu_estimate, var_estimate

In [66]:
# Ground-truth parameters; only used later to validate the estimates.
truth = pd.read_csv(filepath_or_buffer='data/means_vars.csv')
# Measurements the model is fitted to.
observed_data = pd.read_csv(filepath_or_buffer='data/participant_data.csv')
# Preview the first five rows.
observed_data.head(n=5)

Unnamed: 0,Biomarker,participant,measurement,k_j,k_n,drawn_from
0,Biomarker 0,0,3.492852,2,9,healthy
1,Biomarker 0,1,2.287075,3,9,healthy
2,Biomarker 0,2,2.428878,2,9,healthy
3,Biomarker 0,3,2.518235,8,9,healthy
4,Biomarker 0,4,3.370019,8,9,healthy


In [67]:
# One dictionary of estimates per biomarker, filled in by the loop below.
means_vars_estimate_dict_list = []
biomarkers = list(range(10))
for biomarker in biomarkers:
    # Row mask for this biomarker; it does not depend on the group.
    is_this_biomarker = observed_data.Biomarker == f"Biomarker {biomarker}"
    dic = {'biomarker': biomarker}
    for i in ['healthy', 'diseased']:
        data_full = observed_data[is_this_biomarker & (observed_data.drawn_from == i)]
        data = data_full.measurement
        mu_estimate, var_estimate = estimate_params(
            m0=0, n0=1, s0_sq=1, v0=1, data=data)
        # Diseased estimates are stored as theta_*, everything else as phi_*.
        mean_key, var_key = (
            ('theta_mean', 'theta_var') if i == 'diseased'
            else ('phi_mean', 'phi_var')
        )
        dic[mean_key] = mu_estimate
        dic[var_key] = var_estimate
        print(f"biomarker {biomarker}, {i} group is done!")
    means_vars_estimate_dict_list.append(dic)

In [68]:
# Tabulate the collected estimates, write them to disk, then display them.
estimate_df = pd.DataFrame(data=means_vars_estimate_dict_list)
estimate_df.to_csv(path_or_buf="data/estimate_means_vars.csv", index=False)
estimate_df

Unnamed: 0,biomarker,phi_mean,phi_var,theta_mean,theta_var
0,0,3.037711,0.496365,1.647786,0.744325
1,1,5.969125,0.726479,0.003735,0.02768
2,2,6.826521,1.309947,-0.030133,0.664226
3,3,4.940239,0.393609,3.966239,4.19158
4,4,1.063836,0.310465,0.045811,0.037642
5,5,4.928492,0.45204,2.933975,0.740712
6,6,5.389798,2.397884,3.034067,0.765651
7,7,5.89574,0.542444,4.883195,1.569774
8,8,0.928472,0.103274,7.851681,1.728183
9,9,1.032412,0.30658,4.933967,1.061399


In [89]:
# Spot check: estimated theta mean stored for biomarker 1.
biomarker_data_est = estimate_df.loc[estimate_df.biomarker == 1].reset_index()
biomarker_data_est.loc[0, 'theta_mean']

0.003735091654693489

In [91]:
def _pdf_comparison_chart(actual_mean, actual_var, est_mean, est_var, n, label):
    """Overlay the actual and the estimated normal density in one Altair chart.

    actual_mean, actual_var: parameters taken from the truth table.
    est_mean, est_var: parameters taken from the estimate table.
    n: biomarker number, used in the chart title.
    label: 'Theta' or 'Phi'; used in the legend entries, legend title and
        chart title.
    """
    actual_std = np.sqrt(actual_var)
    est_std = np.sqrt(est_var)

    # Shared x grid spanning +/- 3 standard deviations of both curves
    x = np.linspace(min(actual_mean - 3*actual_std, est_mean - 3*est_std),
                    max(actual_mean + 3*actual_std, est_mean + 3*est_std), 1000)

    # Long-format table: one block of rows per curve, told apart by 'Distribution'
    curves = pd.concat([
        pd.DataFrame({'x': x, 'pdf': norm.pdf(x, actual_mean, actual_std),
                      'Distribution': f'Actual {label}'}),
        pd.DataFrame({'x': x, 'pdf': norm.pdf(x, est_mean, est_std),
                      'Distribution': f'Estimated {label}'}),
    ])

    return alt.Chart(curves).mark_line().encode(
        x='x',
        y='pdf',
        color=alt.Color('Distribution:N', legend=alt.Legend(title=f"{label} Distribution"))
    ).properties(title=f'Comparison of Two Normal Distributions for Biomarker {n}, {label}')


biomarkers = list(range(10))
charts = []
for n in biomarkers:
    biomarker_data_est = estimate_df[estimate_df.biomarker == n].reset_index()
    biomarker_data = truth[truth.biomarker == n].reset_index()

    # theta for diseased
    chart_theta = _pdf_comparison_chart(
        biomarker_data.theta_mean[0], biomarker_data.theta_var[0],
        biomarker_data_est.theta_mean[0], biomarker_data_est.theta_var[0],
        n, 'Theta')

    # phi for healthy
    chart_phi = _pdf_comparison_chart(
        biomarker_data.phi_mean[0], biomarker_data.phi_var[0],
        biomarker_data_est.phi_mean[0], biomarker_data_est.phi_var[0],
        n, 'Phi')

    # Concatenate theta and phi charts horizontally; each keeps its own
    # colour scale and therefore its own legend
    hconcat_chart = alt.hconcat(chart_theta, chart_phi).resolve_scale(color="independent")

    # Append the concatenated chart to the list of charts
    charts.append(hconcat_chart)

# Concatenate all the charts vertically
final_chart = alt.vconcat(*charts)

# Display the final chart
final_chart.display()