In [2]:
import pandas as pd
import numpy as np
import random

In [4]:
# Default probability with which assign_group labels a row 'treatment';
# the complementary probability (1 - ASSIGN_TREAT) is used for 'control'.
ASSIGN_TREAT = 0.8

def make_group_files(tdf):
    """Write per-spreader follower-ID lists to text files in the working directory.

    For every distinct value in ``tdf['main']`` two files are produced, named
    after the lower-cased value: ``treat_<name>.txt`` holds the ``followers_id``
    of rows where ``treated == 1`` and ``ctrl_<name>.txt`` those where
    ``treated == 0``. Each file has one ID per line, with no header and no index.

    Parameters
    ----------
    tdf : pandas.DataFrame
        Must contain the columns ``main``, ``followers_id`` and ``treated``.
    """
    for name in tdf['main'].unique():
        rows_for_name = tdf.loc[tdf['main'] == name]
        stem = name.lower()

        is_treated = rows_for_name['treated'] == 1
        is_control = rows_for_name['treated'] == 0

        # Treatment file first, then control, one pair per spreader.
        rows_for_name.loc[is_treated, ['followers_id']].to_csv(
            f"treat_{stem}.txt", index=False, header=None
        )
        rows_for_name.loc[is_control, ['followers_id']].to_csv(
            f"ctrl_{stem}.txt", index=False, header=None
        )

def assign_group(sub_df, treat_prob=ASSIGN_TREAT, seed=42):
    """Return a copy of ``sub_df`` with a random 'treatment'/'control' label per row.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Rows to label. The frame passed in is left unmodified.
    treat_prob : float, optional
        Probability of drawing 'treatment' for a row; 'control' is drawn with
        probability ``1 - treat_prob``. Defaults to ``ASSIGN_TREAT``.
    seed : int, optional
        Seed for the random draw. Every call with the same seed and the same
        number of rows yields the same label sequence.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``sub_df`` plus a ``group`` column.
    """
    # A private RandomState seeded with `seed` produces the same draw sequence
    # as np.random.seed(seed) followed by np.random.choice, but leaves the
    # process-wide NumPy and stdlib random generators untouched.
    rng = np.random.RandomState(seed)
    labels = rng.choice(
        ['treatment', 'control'],
        size=len(sub_df),
        p=[treat_prob, 1 - treat_prob],
    )

    # Write to a copy: the caller's frame (e.g. a group handed over by
    # groupby.apply) should not be mutated from inside this function.
    labelled = sub_df.copy()
    labelled['group'] = labels
    return labelled
    
def print_stats(df):
    """Print the number of rows in ``df`` and a per-spreader count table.

    The table groups ``df`` by the ``main`` column, counts the values in the
    remaining columns, and lists the spreaders in ascending order of their
    ``followers_id`` count. Nothing is returned.
    """
    n_edges = len(df)
    print("PRINTING STATS")
    print("Total N edges:", n_edges)
    print("Followers by spreader:")
    counts_by_spreader = df.groupby('main').count().reset_index()
    print(counts_by_spreader.sort_values('followers_id'))
    


# Load the follower edges; followers_id is read as a string (object dtype)
# rather than being parsed as a number.
df = pd.read_csv("MINIMAL_FOLLOWERS_03.04.2024__17.11.03__START0_END-1_uofmisinfowatch_acresearcher.csv", dtype={'followers_id':'object'})
# Shuffle with a fixed seed. Because drop_duplicates below keeps the first
# occurrence, this decides which spreader a follower listed under several
# spreaders stays with.
df = df.sample(frac = 1, random_state=42)

print("STATS BEFORE DE-DUPE")
print_stats(df)
df = df.drop_duplicates(subset=['followers_id'])

print("STATS AFTER DE-DUPE")
print_stats(df)

# Label each spreader's followers separately. Iterating over the groups and
# concatenating gives the same rows, in the same order, as
# groupby('main').apply(assign_group).reset_index(drop=True), but does not
# depend on apply() passing the grouping column to the function (deprecated
# in recent pandas). Each group is copied so assign_group never touches `df`.
df = pd.concat(
    [assign_group(spreader_df.copy()) for _, spreader_df in df.groupby('main')],
    ignore_index=True,
)
# Vectorised 0/1 flag instead of a Python-level lambda per row.
df['treated'] = (df['group'] == 'treatment').astype('int64')

print("\nSplitting into treatment and control with treat prob = {}".format(ASSIGN_TREAT))
print("Verify logic works:")
print(df.groupby(by=['main'])['group'].value_counts(normalize=True))

print("\nRaw counts")
print(df.groupby(by=['main'])['group'].value_counts(normalize=False))

make_group_files(df)

# The default index is written as the first, unnamed column of this file.
df.to_csv("treat_status_MINIMAL_FOLLOWERS_03.04.2024__17.11.03__START0_END-1.csv")

STATS BEFORE DE-DUPE
PRINTING STATS
Total N edges: 2250330
Followers by spreader:
            main  followers_id
0   JackPosobiec        450066
1   RealCandaceO        450066
2  charliekirk11        450066
3  gatewaypundit        450066
4       stkirsch        450066
STATS AFTER DE-DUPE
PRINTING STATS
Total N edges: 1762288
Followers by spreader:
            main  followers_id
2  charliekirk11        333970
0   JackPosobiec        342556
3  gatewaypundit        345211
1   RealCandaceO        370233
4       stkirsch        370318


  df = df.groupby('main').apply(assign_group).reset_index(drop=True)



Splitting into treatment and control with treat prob = 0.8
Verify logic works:
main           group    
JackPosobiec   treatment    0.799846
               control      0.200154
RealCandaceO   treatment    0.799921
               control      0.200079
charliekirk11  treatment    0.799757
               control      0.200243
gatewaypundit  treatment    0.799914
               control      0.200086
stkirsch       treatment    0.799926
               control      0.200074
Name: proportion, dtype: float64

Raw counts
main           group    
JackPosobiec   treatment    273992
               control       68564
RealCandaceO   treatment    296157
               control       74076
charliekirk11  treatment    267095
               control       66875
gatewaypundit  treatment    276139
               control       69072
stkirsch       treatment    296227
               control       74091
Name: count, dtype: int64
