<a href="https://colab.research.google.com/github/kslote1/HW-SIAM/blob/main/HW_Reporduction_plus_Online.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hui-Walter

Below is a reproduction of the HW paper results using MLE estimates.

In [1]:
import numpy as np
from scipy.optimize import minimize

# Define the likelihood function
def likelihood(params, data):
    o1, o2, a1, a2, b1, b2 = params
    X1, Y1, Z1, W1, X2, Y2, Z2, W2 = data

    tot1 = X1 + Y1 + Z1 + W1
    tot2 =  X2 + Y2 + Z2 + W2

    p1 = (o1 * (1-a1) * (1-b1) + (1-o1) * a1 * b1)
    p2 = (o1 * (1-a1) * b1 + (1-o1) * a1 * (1-b1))
    p3 = (o1 * a1 * (1-b1) + (1-o1) * (1-a1) * b1)
    p4 = (o1 * a1 * b1 + (1-o1) * (1-a1) * (1-b1))
    p5 = (o2 * (1-a2) * (1-b2) + (1-o2) * a2 * b2)
    p6 = (o2 * (1-a2) * b2 + (1-o2) * a2 * (1-b2))
    p7 = (o2 * a2 * (1-b2) + (1-o2) * (1-a2) * b2)
    p8 = (o2 * a2 * b2 + (1-o2) * (1-a2) * (1-b2))
    eps = 1e-10
    L = (
        (p1 + eps)**(X1/tot1) *
        (p2 + eps)**(Y1/tot1) *
        (p3 + eps)**(Z1/tot1) *
        (p4 + eps)**(W1/tot1) *
        (p5 + eps)**(X2/tot2) *
        (p6 + eps)**(Y2/tot2) *
        (p7 + eps)**(Z2/tot2) *
        (p8 + eps)**(W2/tot2)
    )

    return -np.log(L) # Return the negative log-likelihood for minimization


# Observed data: X1, Y1, Z1, W1, X2, Y2, Z2, W2
#data = (25, 30, 35, 60, 45, 50, 55, 80)
data = (14, 4, 9, 528, 887, 21, 37, 367)

# Initial parameter guesses: o1, o2, a1, a2, b1, b2
init_params = [0.5, 0.5, 0.1, 0.1, 0.1, 0.1]

# Parameter bounds
bounds = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1)]

# Minimize the negative log-likelihood
result = minimize(likelihood, init_params, args=(data,), bounds=bounds)

# Get the maximum likelihood estimates for the parameters
o1_mle, o2_mle, a1_mle, a2_mle, b1_mle, b2_mle = result.x

print("Maximum Likelihood Estimates:")
print(f"o1: {o1_mle:.4f}, o2: {o2_mle:.4f}, a1: {a1_mle:.4f}, a2: {a2_mle:.4f}, b1: {b1_mle:.4f}, b2: {b2_mle:.4f}")

Maximum Likelihood Estimates:
o1: 0.0257, o2: 0.7075, a1: 0.0071, a2: 0.0371, b1: 0.0166, b2: 0.0077


# Online Version

Below is an online version that uses MLE esitmate again.
It seems to work.

Plan is to change the estimator, expand to n-classes and n-populations.

In [None]:
from scipy.optimize import minimize
import numpy as np


class OnlineEMHuiWalter:
    def __init__(self, params, step_size=0.1):
        self.params = params
        self.step_size = step_size
        self.alpha1 = []
        self.alpha2 = []
        self.beta1 = []
        self.beta2 = []
        self.theta1 = []
        self.theta2 = []


    def calculate_probabilities(self, alpha1, alpha2, beta1, beta2, theta1, theta2):
        # Calculate probabilities for each cell in the first population
        p1 = theta1 * (1 - beta1) * (1 - beta2) + (1 - theta1) * alpha1 * alpha2
        p2 = theta1 * (1 - beta1) * beta2 + (1 - theta1) * alpha1 * (1 - alpha2)
        p3 = theta1 * beta1 * (1 - beta2) + (1 - theta1) * (1 - alpha1) * alpha2
        p4 = theta1 * beta1 * beta2 + (1 - theta1) * (1 - alpha1) * (1 - alpha2)

        # Calculate probabilities for each cell in the second population
        p5 = theta2 * (1 - beta1) * (1 - beta2) + (1 - theta2) * alpha1 * alpha2
        p6 = theta2 * (1 - beta1) * beta2 + (1 - theta2) * alpha1 * (1 - alpha2)
        p7 = theta2 * beta1 * (1 - beta2) + (1 - theta2) * (1 - alpha1) * alpha2
        p8 = theta2 * beta1 * beta2 + (1 - theta2) * (1 - alpha1) * (1 - alpha2)

        return p1, p2, p3, p4, p5, p6, p7, p8

    def negative_log_likelihood(self, params, pop_one_data, pop_two_data):
        # Unpack the parameters
        alpha1, alpha2, beta1, beta2, theta1, theta2 = params
        p1, p2, p3, p4, p5, p6, p7, p8 = self.calculate_probabilities(alpha1, alpha2, beta1, beta2, theta1, theta2)

        X1, X2, X3, X4, tot1 = self.table_counts(pop_one_data)
        X5, X6, X7, X8, tot2 = self.table_counts(pop_two_data)

        eps = 1e-10
        L = (
            (p1 + eps)**(X1/tot1) *
            (p2 + eps)**(X2/tot1) *
            (p3 + eps)**(X3/tot1) *
            (p4 + eps)**(X4/tot1) *
            (p5 + eps)**(X5/tot2) *
            (p6 + eps)**(X6/tot2) *
            (p7 + eps)**(X7/tot2) *
            (p8 + eps)**(X8/tot2)
        )

        return -np.log(L) # Return the negative log-likelihood for minimization

    def mle(self, pop_one_data, pop_two_data):

        # Initial guess
        initial_guess = self.params #[0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

        bounds = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0, 1)]

        # Perform minimization
        result = minimize(self.negative_log_likelihood, initial_guess, args=(pop_one_data, pop_two_data,), bounds=bounds)

        return result.x

    def table_counts(self, data):
        X1 = np.sum((data[:, 0] == 1) & (data[:, 1] == 1))  # Both classifiers predict 1
        X2 = np.sum((data[:, 0] == 1) & (data[:, 1] == 0))  # First predicts 1, second predicts 0
        X3 = np.sum((data[:, 0] == 0) & (data[:, 1] == 1))  # First predicts 0, second predicts 1
        X4 = np.sum((data[:, 0] == 0) & (data[:, 1] == 0))  # Both classifiers predict 0

        tot1 = X1 + X2 + X3 + X4  # Total counts
        return X1, X2, X3, X4, tot1

    def update(self, pop_one_data, pop_two_data):

         # Unpack the parameters
        alpha1, alpha2, beta1, beta2, theta1, theta2 = self.params

        a1, a2, b1, b2, t1, t2 = self.mle(pop_one_data, pop_two_data)

        self.alpha1.append(a1)
        self.alpha2.append(a2)
        self.beta1.append(b1)
        self.beta2.append(b2)
        self.theta1.append(t1)
        self.theta2.append(t2)

        alpha1 =  np.mean(self.alpha1)
        alpha2 = np.mean(self.alpha2)
        beta1  = np.mean(self.beta1)
        beta2  = np.mean(self.beta2)
        theta1 = np.mean(self.theta1)
        theta2 = np.mean(self.theta2)

        # Ensure parameters remain within valid range
        self.params = [
            max(0, min(1, alpha1)), max(0, min(1, alpha2)),
            max(0, min(1, beta1)), max(0, min(1, beta2)),
            max(0, min(1, theta1)), max(0, min(1, theta2))
        ]


def generate_data(n_samples, n_classes, prior, se, sp):
    n_tests = len(se)

    data = np.zeros((n_samples, n_tests))
    true_classes = np.random.choice(n_classes, n_samples, p=[prior, 1 - prior])

    for i, c in enumerate(true_classes):
        for j in range(n_tests):
            if c == 1:
                data[i, j] = np.random.choice([0, 1], p=[1-se[j], se[j]])
            else:
                data[i, j] = np.random.choice([0, 1], p=[sp[j], 1-sp[j]])

    return data

# True parameters for generating synthetic data
true_priors = [0.6, 0.4]
true_se = [0.9, 0.7]
true_sp = [0.8, 0.6]
n_classes = 2

# Generate synthetic data
n_samples = 100000
pop_one_data = generate_data(n_samples, n_classes, true_priors[0], true_se, true_sp)
pop_two_data = generate_data(n_samples, n_classes, true_priors[1], true_se, true_sp)

# Initialize the OnlineEMHuiWalter class
init_params = [0.6, 0.4, 0.9, 0.7, 0.8, 0.6]
step_size = 0.001
em = OnlineEMHuiWalter(init_params, step_size)
for i in range(n_samples):
    em.update(pop_one_data[:i], pop_two_data[:i])

print(em.params)