# Packages

# Power Analysis

## Treatment Effect
- A laboratory experiment (Kaiser et al (2022)) asked users to imagine that somebody the participant followed posted misinformation. Then the study asked participant's to rate their intentions to unfollow that individual on a scale of 1-6, with 1 being unfollowing "does not at all apply" and 6 being "does fully apply". Summing the percentage of respondents who answered 4-6 is a rough measure of the percentage of respondent's who would be more likely than not to unfollow. That sum is ~19%.

- One study (Lin et al 2024) compared the effect size of accuracy nudges in the lab to the effect size of accurarcy nudges when used in a digital ad experiment similar to ours. We use this ratio to inform how much to discount Kaiser et al. (2022). From page 11, we use the lower bound of the ad efficacy (2.6%) and the upper bound of the lab estimate (10%) to arrive at a base discount factor of (1 - 2.6/10) 74%. Doing so would yield 26% of 19%  = 5%

## Control Rate
- Ashkinaze et al (2024) looked at the unfollowing rate of health misinformation spreaders and found it was 0.52% per month
- We perturn this amount by +- 20%

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
from statsmodels.stats.proportion import proportions_ztest
from tqdm.auto import tqdm

# Constants 
NOISE = 0.2

# Treatment assumptions
TREATMENT_FX = 0.19
BASE_TREATMENT_DISCOUNT = 1 - (2.6 / 10)
TREATMENT_DISCOUNT_RANGE = [(1 - NOISE) * BASE_TREATMENT_DISCOUNT, (1 + NOISE) * BASE_TREATMENT_DISCOUNT]

# Control assumptions
CONTROL_MONTHLY_UNFOLLOW = 0.0052
CONTROL_MONTHLY_UNFOLLOW_RANGE = [(1 - NOISE) * CONTROL_MONTHLY_UNFOLLOW, (1 + NOISE) * CONTROL_MONTHLY_UNFOLLOW]

def process_task(control_prop, treatment_prop, n_days, N, treat_eligible, n_simulations, df):
    adj_control_prop = control_prop * (n_days / 30)
    power = estimate_power(df, N, adj_control_prop, treatment_prop, treat_eligible, n_simulations)
    return {'control_prop': control_prop, 'treatment_prop': treatment_prop, 'n_days': n_days, 'N': N, 'power': power}


def run_power_analysis(params, df):
    tasks = [(control_prop, treatment_prop, n_days, N, params['treat_eligible'][0], params['n_simulations'], df)
             for control_prop in params['control_prop']
             for treatment_prop in params['treatment_prop']
             for n_days in params['n_days']
             for N in params['N_values']]
    
    results = Parallel(n_jobs=-1)(delayed(process_task)(*task) for task in tqdm(tasks, desc='Running simulations'))
    
    df_results = pd.DataFrame(results)
    return df_results

def estimate_power(df, N, control_prop, treatment_prop, treat_eligible, alpha=0.05, n_simulations=100):
    df_sample = df.sample(N, replace=False)
    N_control = df_sample[df_sample['treated'] == 0].shape[0]
    N_treatment = df_sample[df_sample['treated'] == 1].shape[0]
    N_eligible_treatment = int(N_treatment * treat_eligible)  # Calculate the number of eligible subjects in the treatment group
    N_effective_treatment = int(np.floor(N_eligible_treatment))  # Take the integer part of the eligible subjects

    significant_count = 0
    for _ in range(n_simulations):
        is_significant = difference_in_proportions_test(N_control, N_effective_treatment, control_prop, treatment_prop, alpha)
        if is_significant:
            significant_count += 1
    return significant_count / n_simulations

def difference_in_proportions_test(N_control, N_treatment, control_prop, treatment_prop, alpha=0.05):

    control_outcomes = np.random.binomial(1, control_prop, size=N_control)
    treatment_outcomes = np.random.binomial(1, treatment_prop, size=N_treatment)

    success_control =  control_outcomes.sum()
    success_treatment = treatment_outcomes.sum()


    success_control = max(success_control, 1)
    success_treatment = max(success_treatment, 1)
    
    count = np.array([success_treatment, success_control])
    nobs = np.array([N_treatment, N_control])
    z_stat, p_value = proportions_ztest(count, nobs)
    #print(f"Control successes: {success_control}, Treatment successes: {success_treatment}, p-value: {p_value}")  # Debug line
    return (p_value <= alpha)
def make_graph(df_results):
    df_results['treatment_prop_label'] = df_results['treatment_prop'].apply(lambda x: f"{x:.3f}")
    plt.figure(figsize=(10, 6))
    title = "Power Analysis Graph"
    plt.title(title)
    sns.lineplot(data=df_results, x="N", y="power", hue="treatment_prop_label")
    plt.axhline(y=0.8, linestyle='dashed', color='red')
    plt.legend(title="Treatment Prop.")
    plt.xlabel("Sample Size")
    plt.ylabel("Power")
    sns.despine()
    plt.show()

def find_required_N(df_results):
    df_filtered = df_results[df_results['power'] >= 0.8]
    df_required_N = df_filtered.groupby('treatment_prop', as_index=False)['N'].min()
    df_required_N['treatment_prop_label'] = df_required_N['treatment_prop'].apply(lambda x: f"{x:.3f}")
    return df_required_N



df = pd.read_csv("treat_status_MINIMAL_FOLLOWERS_03.04.2024__17.11.03__START0_END-1.csv")
params = {
    'control_prop': [0.005],
    'treatment_prop': np.linspace(0.005, 0.007,10),
    'N_values': np.linspace(500, 10000, 50, dtype=int),
    'n_days':[30],
    'n_simulations': 1000,
    "treat_eligible":[1]
}

df_results = run_power_analysis(params, df)
make_graph(df_results)
print(find_required_N(df_results))

Running simulations:   0%|          | 0/500 [00:00<?, ?it/s]

Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment successes: 1, p-value: 0.2356799134290376
Control successes: 1, Treatment succes

In [3]:
df_results

Unnamed: 0,control_prop,treatment_prop,n_days,N,power,treatment_prop_label
0,0.5,0.5,30,10,0.99,0.5
1,0.5,0.5,30,500,1.0,0.5
2,0.5,0.7,30,10,0.99,0.7
3,0.5,0.7,30,500,1.0,0.7
