### AB test calculator explainer
I will just outline the main functions and the formulas I am using. 

In [1]:
import math
from scipy.stats import norm, beta
import numpy as np

### The confidence interval
Quite straightforward: this is the normal-approximation (Wald) interval for a proportion.


\begin{equation}
CI = \left( \hat{p} - z_{\alpha} \times \sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}, \hat{p} + z_{\alpha} \times \sqrt{\frac{\hat{p}(1 - \hat{p})}{n}} \right)
\end{equation}

- p_hat is the observed conversion rate,
- z_alpha is the z-score for the desired confidence level,
- n is the number of visitors.


In [None]:
def calculate_confidence_interval(conversion_rate: float,
                                  visitors: int,
                                  z_alpha: float) -> tuple[float, float]:
    """
    Calculate the normal-approximation confidence interval for a conversion rate.

    The interval is ``conversion_rate ± z_alpha * sqrt(p * (1 - p) / visitors)``.

    Parameters:
        conversion_rate (float): The conversion rate, as a proportion.
        visitors (int): The number of visitors (the sample size).
        z_alpha (float): The z-score corresponding to the desired confidence level.
    Returns:
        tuple[float, float]: The lower and upper bounds of the confidence interval.
            The bounds are returned as computed; they are not clipped to [0, 1].
    Raises:
        ZeroDivisionError: If ``visitors`` is 0.
        ValueError: If the variance term is negative (``math.sqrt`` domain error),
            e.g. when ``conversion_rate`` lies outside [0, 1].
    """
    # Standard error of a proportion, scaled by the z-score.
    margin_of_error: float = z_alpha * math.sqrt(
        (conversion_rate * (1 - conversion_rate)) / visitors
    )
    return (conversion_rate - margin_of_error, conversion_rate + margin_of_error)

### Calculate sample size

I think this is quite standard.

The pooled standard deviation
\begin{equation}
\sigma_{\text{pooled}} = \sqrt{2 \cdot p_{\text{avg}} \cdot (1 - p_{\text{avg}})}
\end{equation}

where p_avg is the average of the control and variant conversion rates, p_avg = (p_1 + p_2) / 2.

The standard error of proportions:
\begin{equation}
\text{standard\_error} = \sqrt{p_1 \cdot (1 - p_1) + p_2 \cdot (1 - p_2)}
\end{equation}

Then the numerator and denominator
\begin{equation}
\text{numerator} = \left( z_{\alpha} \cdot \sigma_{\text{pooled}} + z_{\beta} \cdot \text{standard\_error} \right)^2
\end{equation}

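where z_alpha and z_beta are standard-normal quantiles (ppf): z_alpha = ppf(1 - alpha) for a one-sided test or ppf(1 - alpha / 2) for a two-sided test, with alpha = 1 - confidence level, and z_beta = ppf(power).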

\begin{equation}
\text{denominator} = (\Delta p)^2, \quad \Delta p = p_2 - p_1
\end{equation}

And finally the sample size per group (rounded up to the next integer):
\begin{equation}
n = \frac{\text{numerator}}{\text{denominator}}
\end{equation}


In [None]:
def calculate_sample_size(
    p1: float,
    lift_percentage: float,
    confidence_level: float,
    power: float,
    num_variants: int = 2,
    is_one_sided: bool = True
) -> int:
    """
    Calculate the required sample size per group for an A/B test.

    I have compared this method with the ones described here:
    https://towardsdatascience.com/required-sample-size-for-a-b-testing-6f6608dd330a

    And I get the same results.

    Parameters:
        p1 (float): The baseline conversion rate (proportion) for the control group.
        lift_percentage (float): The expected percentage increase in conversion rate for the treatment group.
        confidence_level (float): The desired confidence level (e.g., 95 for 95% confidence).
        power (float): The desired statistical power (e.g., 80 for 80% power).
        num_variants (int, optional): The number of variants being tested (default is 2 for control and one variant).
            For more than 2 variants, alpha is Bonferroni-corrected by dividing it by (num_variants - 1).
        is_one_sided (bool, optional): Whether the test is one-sided (default is True).
            This is honoured for any number of variants.
    Returns:
        int: The required sample size per group for the A/B test, rounded up.
    Raises:
        ZeroDivisionError: If the lift is 0, i.e. the treatment rate equals the baseline rate.
    """
    # Treatment conversion rate implied by the relative lift.
    p2 = p1 * (1 + lift_percentage / 100)

    # Ensure p2 does not exceed 1
    if p2 >= 1:
        p2 = 0.9999

    # Significance level, Bonferroni-corrected for multiple variants if needed.
    # This might be too conservative; however, the rest of the code isn't
    # handling multiple variants correctly yet either.
    alpha = 1 - confidence_level / 100
    if num_variants > 2:
        alpha = alpha / (num_variants - 1)

    # The correction is applied BEFORE choosing the quantile so that a
    # two-sided request stays two-sided when there are more than 2 variants.
    if is_one_sided:
        z_alpha = norm.ppf(1 - alpha)  # should be one-sided for AB testing
    else:
        z_alpha = norm.ppf(1 - alpha / 2)
    z_beta = norm.ppf(power / 100)

    # Pooled standard deviation under the null (both groups share p_avg).
    p_avg = (p1 + p2) / 2
    sigma_pooled = math.sqrt(2 * p_avg * (1 - p_avg))

    # Unpooled standard deviation of the difference under the alternative.
    standard_error = math.sqrt(p1 * (1 - p1) + p2 * (1 - p2))

    numerator = (z_alpha * sigma_pooled + z_beta * standard_error) ** 2
    delta_p = p2 - p1
    denominator = delta_p ** 2
    sample_size_per_group = math.ceil(numerator / denominator)

    return sample_size_per_group

### Calculate relative MDE

Adjusted standard error for unequal sample sizes
\begin{equation}
\text{se} = \sqrt{ \frac{\hat{p}(1 - \hat{p})}{n_{\text{control}}} + \frac{\hat{p}(1 - \hat{p})}{n_{\text{variant}}} }
\end{equation}

The absolute MDE:
\begin{equation}
\text{absolute\_mde} = (z_{\alpha} + z_{\beta}) \cdot \text{se}
\end{equation}

where z_alpha and z_beta are the standard-normal quantiles (ppf) of the confidence level (one-tailed) and the statistical power, respectively

The relative MDE:
\begin{equation}
\text{relative\_mde\_percentage} = \left( \frac{\text{absolute\_mde}}{\hat{p}} \right) \times 100
\end{equation}

where p_hat is the baseline (control) conversion rate, which is used for both groups in the standard error

In [None]:
def calculate_relative_mde(control_sample_size, variant_sample_size, baseline_conversion_rate, confidence_level, statistical_power):
    """
    Compute the relative minimum detectable effect (MDE) as a percentage.

    Parameters:
        control_sample_size: Number of observations in the control group.
        variant_sample_size: Number of observations in the variant group.
        baseline_conversion_rate: Baseline conversion rate as a proportion;
            it is used as the rate of both groups in the standard error.
        confidence_level: Confidence level in percent (e.g. 95).
        statistical_power: Statistical power in percent (e.g. 80).
    Returns:
        The absolute MDE divided by the baseline conversion rate, times 100.
    """
    # Percentages -> proportions.
    confidence_fraction = confidence_level / 100
    power_fraction = statistical_power / 100

    # Sum of the one-tailed significance quantile and the power quantile.
    z_total = norm.ppf(confidence_fraction) + norm.ppf(power_fraction)

    # Standard error of the difference, allowing unequal group sizes.
    baseline_variance = baseline_conversion_rate * (1 - baseline_conversion_rate)
    standard_error = math.sqrt(
        baseline_variance / control_sample_size
        + baseline_variance / variant_sample_size
    )

    # Absolute MDE first, then expressed relative to the baseline in percent.
    mde_absolute = z_total * standard_error
    return (mde_absolute / baseline_conversion_rate) * 100

In [3]:
import math
from scipy.stats import norm

def calculate_relative_difference_metrics_v2(
    control_rate: float,
    variant_rate: float,
    control_size: int,
    variant_size: int,
    confidence_level: float = 0.95,
) -> dict:
    """
    Summarise the relative difference between a variant and a control rate.

    Parameters:
        control_rate (float): Conversion rate of the control, as a proportion.
        variant_rate (float): Conversion rate of the variant, as a proportion.
        control_size (int): Sample size of the control group.
        variant_size (int): Sample size of the variant group.
        confidence_level (float): Confidence level as a proportion (default 0.95).
            NOTE: the keys of the returned dict always say "95%", whatever
            value is passed here.
    Returns:
        dict: Formatted strings for the relative difference, the two-sided and
            one-sided intervals and the "± SE" summary, plus the numeric
            one-sided p-value (H0: B ≤ A) and the z-score.
    Raises:
        ZeroDivisionError: If ``control_rate`` is 0, a sample size is 0, or the
            standard error is 0.
    """
    # Absolute difference, and the same difference relative to control (in %).
    absolute_diff = variant_rate - control_rate
    relative_diff = (absolute_diff / control_rate) * 100  # Expressed as a percentage

    # Per-observation Bernoulli variances.
    control_variance = control_rate * (1 - control_rate)
    variant_variance = variant_rate * (1 - variant_rate)

    # Standard error of the difference, rescaled to percent of the control rate.
    standard_error = math.sqrt((control_variance / control_size) + (variant_variance / variant_size)) / control_rate * 100

    # Z-scores for confidence intervals
    z_alpha_two_sided = norm.ppf(1 - (1 - confidence_level) / 2)
    z_alpha_one_sided = norm.ppf(confidence_level)

    # Confidence intervals for the relative difference
    lower_ci_relative = relative_diff - z_alpha_two_sided * standard_error
    upper_ci_relative = relative_diff + z_alpha_two_sided * standard_error
    right_sided_interval = relative_diff - z_alpha_one_sided * standard_error
    left_sided_interval = relative_diff + z_alpha_one_sided * standard_error

    # Half-width of the two-sided interval ("value ± ..." presentation).
    se_value = z_alpha_two_sided * standard_error

    # Z-score and one-sided p-value (H0: B ≤ A). The two-sided p-value was
    # previously computed here but never reported, so it has been dropped.
    z_score = relative_diff / standard_error
    one_sided_p_value = 1 - norm.cdf(z_score)

    return {
        "Relative Difference": f"{relative_diff:.2f}%",
        "95% Confidence Interval": [f"{lower_ci_relative:.3f}%", f"{upper_ci_relative:.3f}%"],
        "95% Right-Sided Interval": [f"{right_sided_interval:.3f}%", "∞"],
        "95% Left-Sided Interval": ["-∞", f"{left_sided_interval:.3f}%"],
        "Relative difference ± 95% SE": f"{relative_diff:.2f} ±{se_value:.3f}%",
        "P-value (H0: B ≤ A)": one_sided_p_value,
        "Z-score": z_score
    }

# Test the function with sample values
import pandas as pd

# Example inputs: 10% control rate vs. 14% variant rate, 1000 visitors each.
control_rate, variant_rate = 0.10, 0.14
control_size = variant_size = 1000

# Run the calculator with its default confidence level.
results_v2 = calculate_relative_difference_metrics_v2(
    control_rate=control_rate,
    variant_rate=variant_rate,
    control_size=control_size,
    variant_size=variant_size,
)

# Show one metric per row in a single "Value" column.
df = pd.DataFrame.from_dict(results_v2, orient="index", columns=["Value"])
print(df)

                                            Value
Relative Difference                         0.40%
95% Confidence Interval       [-28.030%, 28.830%]
95% Right-Sided Interval            [-23.459%, ∞]
95% Left-Sided Interval             [-∞, 24.259%]
Relative difference ± 95% SE        0.40 ±28.430%
P-value (H0: B ≤ A)                         0.489
Z-score                                  0.027576
