In [1]:
"""
Author: Joshua Ashkinaze

Description: Downsample treatment and control users based on power analysis; we do not need ALL followers.

Date: 2024-04-15 17:08:50
"""


import math
import pandas as pd
import random
import numpy as np
# Seed both the standard-library and NumPy global random number generators with a
# fixed value so that any code drawing from them behaves the same on every run.
random.seed(42)
np.random.seed(42)

def make_group_files(tdf):
    """Write one treatment and one control follower-id file per spreader.

    For every distinct value in the ``main`` column, the ``followers_id``
    values of rows with ``treated == 1`` are written to
    ``final_treat_twit_<name>.txt`` and those with ``treated == 0`` to
    ``final_ctrl_twit_<name>.txt`` (``<name>`` is the lower-cased spreader
    name). Files go to the current working directory, one id per line, with
    no header and no index column.

    Args:
        tdf: DataFrame with ``main``, ``treated`` and ``followers_id`` columns.
    """
    for name in tdf['main'].unique():
        rows = tdf[tdf['main'] == name]
        slug = name.lower()

        # Select only the id column so each output line is a single follower id.
        treated_ids = rows.loc[rows['treated'] == 1, ['followers_id']]
        control_ids = rows.loc[rows['treated'] == 0, ['followers_id']]

        treated_ids.to_csv(f"final_treat_twit_{slug}.txt", index=False, header=None)
        control_ids.to_csv(f"final_ctrl_twit_{slug}.txt", index=False, header=None)
        
def print_stats(df):
    """Print the total row count and a per-spreader table of non-null counts.

    The table has one row per distinct ``main`` value, holds the non-null
    count of every other column, and is sorted ascending by the
    ``followers_id`` count.

    Args:
        df: DataFrame with at least ``main`` and ``followers_id`` columns.
    """
    print("PRINTING STATS")
    print("Total N edges:", len(df))
    print("Followers by spreader:")

    per_spreader = df.groupby(by=['main']).count()
    # Turn the group key back into a regular column before sorting and printing.
    per_spreader = per_spreader.reset_index()
    print(per_spreader.sort_values(by=['followers_id']))
    

def assign_participants(total_n, n_blocks):
    """Compute an 80/20 treatment/control allocation across equal-sized blocks.

    ``total_n`` is rounded up to the nearest multiple of ``5 * n_blocks`` so
    that (1) the total divides evenly into ``n_blocks`` blocks and (2) each
    block divides evenly by 5, giving an exact 4:1 treatment:control ratio.
    With 5 blocks this is the nearest multiple of 25.

    The per-block allocation is printed, one line per block, followed by a
    confirmation line once the sanity checks pass.

    Args:
        total_n: Minimum total number of participants required.
        n_blocks: Number of blocks (e.g. spreaders) to split participants over.

    Returns:
        dict with keys ``'new_n'`` (the adjusted total), ``'ctrl'`` (control
        participants per block) and ``'treat'`` (treatment participants per
        block).

    Raises:
        ValueError: If ``n_blocks`` is smaller than 1.
    """
    if n_blocks < 1:
        raise ValueError(f"n_blocks must be at least 1, got {n_blocks}")

    # Smallest total that can be split evenly into n_blocks blocks, each of
    # which can in turn be split 4:1 without a remainder.
    rounding_unit = 5 * n_blocks
    new_n = math.ceil(total_n / rounding_unit) * rounding_unit

    # Calculate the number of participants per block
    participants_per_block = new_n // n_blocks

    # Integer arithmetic keeps the split exact; multiplying by 0.8 and
    # truncating could lose a participant to floating-point rounding.
    treat_per_block = participants_per_block * 4 // 5  # 80% treatment
    ctrl_per_block = participants_per_block - treat_per_block  # 20% control

    for block in range(1, n_blocks + 1):
        print(f"Block {block}: Treatment = {treat_per_block}, Control = {ctrl_per_block}")

    # AUTOMATED TESTS TO MAKE SURE MATH WORKS
    assert new_n == (treat_per_block + ctrl_per_block) * n_blocks, f"Failed test: Total N is {n_blocks} times treat and ctrl"
    assert new_n >= total_n, "Failed test: New N >= original N"
    # Exact integer form of treat / (treat + ctrl) == 0.8; also safe when the total is 0.
    assert treat_per_block * 5 == participants_per_block * 4, "Failed test: treat not 80%"
    print("CHECK: Passed automated tests to make sure math works out")
    return {'new_n': new_n, 'ctrl': ctrl_per_block, 'treat': treat_per_block}


def downsample_df(df, n_treat, n_control):
    """Sample a fixed number of treated and control rows for every spreader.

    Rows are grouped by the ``main`` column. Within each group, ``n_treat``
    rows with ``treated == 1`` and ``n_control`` rows with ``treated == 0``
    are drawn without replacement using a fixed ``random_state`` of 42, so
    the result is reproducible. Groups appear in sorted ``main`` order, with
    the treated sample before the control sample inside each group.

    The groups are iterated explicitly instead of going through
    ``groupby(...).apply(...)``: newer pandas releases warn about (and
    eventually stop) passing the grouping column to the applied function,
    which would drop ``main`` from the result.

    Args:
        df: DataFrame with at least ``main`` and ``treated`` columns.
        n_treat: Number of treated rows to keep per ``main`` value.
        n_control: Number of control rows to keep per ``main`` value.

    Returns:
        A new DataFrame with the sampled rows, all original columns, and a
        fresh 0..N-1 index.

    Raises:
        ValueError: If any group has fewer treated or control rows than
            requested.
    """
    sampled_parts = []

    # groupby sorts the keys by default, matching the ordering that
    # groupby(...).apply(...) would have produced.
    for spreader, group in df.groupby('main'):
        treated_group = group[group['treated'] == 1]
        control_group = group[group['treated'] == 0]

        # Fail with a message that names the offending group instead of
        # pandas' generic "cannot take a larger sample" error.
        if len(treated_group) < n_treat or len(control_group) < n_control:
            raise ValueError(
                f"Not enough rows to sample for '{spreader}': "
                f"need {n_treat} treated / {n_control} control, "
                f"have {len(treated_group)} treated / {len(control_group)} control"
            )

        sampled_parts.append(treated_group.sample(n=n_treat, replace=False, random_state=42))
        sampled_parts.append(control_group.sample(n=n_control, replace=False, random_state=42))

    if not sampled_parts:
        # No groups at all (empty input): return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_parts, ignore_index=True)


# Minimum number of follower edges to keep before rounding.
# NOTE(review): presumably the target from the power analysis mentioned in the
# file header -- confirm the source of this number.
TOTAL_N = 367301

# followers_id is loaded as a string ('object') column rather than parsed as a number.
df = pd.read_csv(
    "treat_status_MINIMAL_FOLLOWERS_03.04.2024__17.11.03__START0_END-1.csv",
    dtype={'followers_id': 'object'},
)

# ---- Step 1: work out the adjusted total and the per-spreader treat/control sizes ----
results = assign_participants(total_n=TOTAL_N, n_blocks=5)
print(f"Adjusted Total Participants: {results['new_n']}")

# ---- Step 2: downsample every spreader to those sizes ----
print("=" * 25)
new_df = downsample_df(
    df,
    n_treat=results['treat'],
    n_control=results['ctrl'],
)
assert len(new_df) == results['new_n'], "Error: New dataframe is not expected length"
print("CHECK: New dataframe is expected length")

# ---- Step 3: summary statistics for the downsampled frame ----
print("=" * 25)
print_stats(new_df)
print("=" * 25)

# ---- Step 4: re-check the split per spreader, first as proportions, then as counts ----
print("Verify logic works again:")
print(
    new_df.groupby(by=['main'])['group'].value_counts(normalize=True)
)
print("=" * 25)
print("\nView raw counts")
print(
    new_df.groupby(by=['main'])['group'].value_counts(normalize=False)
)

# ---- Step 5: write the per-spreader id files and the full downsampled table ----
make_group_files(new_df)
# NOTE(review): the DataFrame index is written as an extra unnamed column here
# (no index=False) -- confirm whether downstream readers rely on that.
new_df.to_csv("final_treat_status_MINIMAL_FOLLOWERS_03.04.2024__17.11.03__START0_END-1.csv")

Block 1: Treatment = 58772, Control = 14693
Block 2: Treatment = 58772, Control = 14693
Block 3: Treatment = 58772, Control = 14693
Block 4: Treatment = 58772, Control = 14693
Block 5: Treatment = 58772, Control = 14693
CHECK: Passed automated tests to make sure math works out
Adjusted Total Participants: 367325


  sampled_df = df.groupby('main').apply(sample_group).reset_index(drop=True)


CHECK: New dataframe is expected length
PRINTING STATS
Total N edges: 367325
Followers by spreader:
            main  Unnamed: 0  followers_id  group  treated
0   JackPosobiec       73465         73465  73465    73465
1   RealCandaceO       73465         73465  73465    73465
2  charliekirk11       73465         73465  73465    73465
3  gatewaypundit       73465         73465  73465    73465
4       stkirsch       73465         73465  73465    73465
Verify logic works again:
main           group    
JackPosobiec   treatment    0.8
               control      0.2
RealCandaceO   treatment    0.8
               control      0.2
charliekirk11  treatment    0.8
               control      0.2
gatewaypundit  treatment    0.8
               control      0.2
stkirsch       treatment    0.8
               control      0.2
Name: proportion, dtype: float64

View raw counts
main           group    
JackPosobiec   treatment    58772
               control      14693
RealCandaceO   treatment    5877