<h1>Combining Interventions to reduce the spread of misinformation online: Data</h1>

In [57]:
%%capture
!pip install fastparquet
!pip install pandarallel
!pip install pyarrow

In [58]:
#Set up environment
from tqdm.notebook import trange, tqdm
import pickle
import src.utils as srcu
import src.segmentation as srcseg
import pandas as pd
from pandarallel import pandarallel
import numpy as np
#Set up parallel processing
pandarallel.initialize(nb_workers=8,
                       verbose=True,progress_bar=True,use_memory_fs=None)
tqdm.pandas()

#Make sure things reload
%load_ext autoreload
%autoreload 2

#Set up directories
root = '.'
srcu.create_output_directories(root)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<h2>Pull data and segment events</h2>

<h3>Pull</h3>

In [59]:
#Connect to the database and gather the list of incidents
# NOTE(review): absolute, user-specific credentials path. Later cells call
# get_engine with '/home/joebak/venus_cred.txt' or with no argument at all;
# confirm which location is intended.
import src.database as sdb
cred_file = '/home/joebak/Misc/venus_cred.txt'
engine = sdb.get_engine(cred_file)
incidents = sdb.list_incidents(engine)

In [60]:
# Distinct incident labels. Labels are compared case-sensitively, so
# 'ballot harvesting: ...' and 'Ballot harvesting: ...' are listed separately.
incidents['incident'].unique()

array(['ballot box: CA Key in Box',
       'ballot box: Collecting After Election',
       'ballot box: Fire Baldwin Park',
       'ballot box: Franklin County Police', 'ballot box: MA fire',
       'ballot box: Observe Boxes', 'ballot box: PA Civilian Removal',
       'ballot box: Trash Can Disguise',
       'ballot box: Unofficial Boxes CA',
       'ballot box: WA Start Fire Suggestion',
       'ballot harvesting: Baltimore elderly votes',
       'Ballot harvesting: Baltimore elderly votes',
       'ballot harvesting: Biden Texas Director',
       'Ballot harvesting: Biden Texas Director',
       'ballot harvesting: Granny farming ',
       'ballot harvesting: Ilhan Omar Project Veritas Video ',
       'ballot harvesting: Lebron/Bloomberg',
       'ballot harvesting: Michigan Union',
       'Ballot harvesting: Minnesota Somali voters',
       'ballot harvesting: new Michigan law',
       'ballot harvesting: Paid 500 to vote',
       'Ballot harvesting: Reseda, California',
       'Ba

In [61]:
#We need something without spaces and / to call each incident.
def fix_name(name):
    """Return `name` with every space and '/' replaced by '_'."""
    return name.replace(' ','_').replace('/','_')

#Dominion is high volume, very noisy, has daily patterns, is prolonged
#and doesn't conform to our notion of "events", so it is dropped.
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not raise SettingWithCopyWarning.
incidents = incidents[incidents['incident']!='tech: dominion'].copy()
incidents['incident_name'] = incidents['incident'].apply(fix_name)

In [62]:
# Cache the filtered incident table on disk so later sections can reload it
# without querying the database. The frame's index is written too, because
# index=False is not passed.
incidents.to_csv(root + '/data/incidents.csv')

<h3>Aggregate</h3>

In [63]:
# Reload the cached incident table; this replaces the in-memory `incidents`.
# NOTE(review): the file was written with its index, so expect an extra
# unnamed leading column after this read -- confirm downstream code ignores it.
incidents = pd.read_csv(root + '/data/incidents.csv')

In [64]:
# Number of incident rows loaded (462 in the recorded run).
len(incidents)

462

In [65]:
# Trial run: aggregate only the first incident before processing all of them.
import src.database as sdb
engine = sdb.get_engine('/home/joebak/Misc/venus_cred.txt')

def agg_save(row):
    """Run sdb.aggregate_and_save on one incident row with keep=False."""
    return sdb.aggregate_and_save(row,engine,keep=False)

# Transposing turns each incident row into a column, so apply() hands
# agg_save one incident at a time.
first_incident = incidents.head(1)
first_incident.T.apply(agg_save)

                           user_followers_count  user_verified  total_tweets  \
created_at                                                                     
2020-11-01 07:35:00+00:00                   482              0           2.0   
2020-11-01 07:40:00+00:00                     0              0           0.0   
2020-11-01 07:45:00+00:00                     0              0           0.0   
2020-11-01 07:50:00+00:00                     0              0           0.0   
2020-11-01 07:55:00+00:00                     0              0           0.0   
...                                         ...            ...           ...   
2020-11-06 23:10:00+00:00                     0              0           0.0   
2020-11-06 23:15:00+00:00                     0              0           0.0   
2020-11-06 23:20:00+00:00                     0              0           0.0   
2020-11-06 23:25:00+00:00                     0              0           0.0   
2020-11-06 23:30:00+00:00               

0    True
dtype: bool

In [None]:
# Aggregate and save every incident, spread across the pandarallel workers.
import src.database as sdb
engine = sdb.get_engine('/home/joebak/Misc/venus_cred.txt')
# No removed/floc/keep arguments are passed, so aggregate_and_save runs with
# its own defaults here.
agg_save = lambda row: sdb.aggregate_and_save(row,engine)
# Transposing turns each incident row into a column, so each worker call
# receives one incident.
# NOTE(review): `engine` is captured by the lambda and therefore shared with
# the worker processes -- confirm the engine is safe to use that way.
incidents.T.parallel_apply(agg_save)

<h3>Repeat Offenders</h3>

In [None]:
# Load the pickled `removed` collection; the context manager closes the file
# handle, which the bare open() call left open.
# NOTE: pickle.load can execute arbitrary code -- only load a file you trust.
with open(root + '/data/removed.p','rb') as removed_file:
    removed = pickle.load(removed_file)


In [None]:
import src.database as sdb
# NOTE(review): get_engine is called without a credentials path here, unlike
# the other cells in this notebook -- confirm the default is intended.
engine = sdb.get_engine()

# Inner query: one row per (user, incident). The ORDER BY makes DISTINCT ON
# keep the user's earliest tweet in each incident; without it PostgreSQL may
# keep any row of the group.
# Outer query: number each user's incidents chronologically, so row_number is
# the count of distinct incidents the user has appeared in so far.
ro_query_template = '''SELECT user_screen_name, incident, user_followers_count,
            row_number() OVER (PARTITION BY user_screen_name ORDER BY created_at) AS row_number
            FROM (SELECT DISTINCT ON (user_screen_name, incident) user_screen_name, user_followers_count,
            incident, created_at
            FROM public.incident_tweets
            WHERE incident IS NOT NULL AND {condition}
            ORDER BY user_screen_name, incident, created_at) AS nested'''

# Accounts whose tweets were posted with more than 10000 followers.
query_ro = ro_query_template.format(condition='user_followers_count > 10000')
# Verified accounts, regardless of follower count.
query_v = ro_query_template.format(condition='user_verified')

query_all = "SELECT * FROM incident_tweets LIMIT 10;"
ro_all =pd.read_sql(query_ro, con=engine)
ro_verified =pd.read_sql(query_v, con=engine)


In [None]:
# Cache both repeat-offender query results so the database queries do not
# have to be repeated. The index is written as well (no index=False).
ro_all.to_csv(root + '/data/ro_all.csv')
ro_verified.to_csv(root + '/data/ro_verified.csv')

In [None]:
# Reload the cached repeat-offender tables, replacing the in-memory frames.
ro_all = pd.read_csv(root+'/data/ro_all.csv')
ro_verified=pd.read_csv(root + '/data/ro_verified.csv')

In [None]:
def get_repeat_offenders_dict(df, incidents, follower_thresh=1,strikes=3):
    """Map each incident to the screen names of its repeat offenders.

    Keyword arguments:
    df -- frame with 'incident', 'row_number', 'user_followers_count' and
          'user_screen_name' columns
    incidents -- iterable of incident labels to look up
    follower_thresh -- keep rows with strictly more followers than this
    strikes -- keep rows whose row_number is strictly greater than this

    Returns a dict {incident: array of unique user_screen_name values}; an
    incident with no qualifying rows maps to an empty array.
    """
    # The two thresholds do not depend on the incident, so apply them once.
    past_strikes = df['row_number'] > strikes
    enough_followers = df['user_followers_count'] > follower_thresh
    flagged = df[past_strikes & enough_followers]
    return {
        label: flagged.loc[flagged['incident'] == label, 'user_screen_name'].unique()
        for label in incidents
    }

# Repeat-offender lookups at increasing follower-count thresholds.
ro_dict_10k, ro_dict_50k, ro_dict_100k, ro_dict_500k = [
    get_repeat_offenders_dict(ro_all, incidents['incident'], follower_thresh=min_followers)
    for min_followers in (10000, 50000, 100000, 500000)
]
# Verified accounts use the function's default follower threshold.
ro_dict_v = get_repeat_offenders_dict(ro_verified, incidents['incident'])

In [None]:
# "Modest" policy: per incident, the union of the 100K-follower repeat
# offenders, the verified repeat offenders and the `removed` collection.
ro_dict_modest = {
    incident: np.unique(np.hstack([ro_dict_100k[incident],
                                   ro_dict_v[incident],
                                   removed])).tolist()
    for incident in ro_dict_100k.keys()
}

# "Aggressive" policy: same union, but starting from the 50K-follower set.
ro_dict_aggressive = {
    incident: np.unique(np.hstack([ro_dict_50k[incident],
                                   ro_dict_v[incident],
                                   removed])).tolist()
    for incident in ro_dict_100k.keys()
}


In [None]:
import src.database as sdb
# NOTE(review): this credentials path differs from the
# '/home/joebak/Misc/venus_cred.txt' used in earlier cells -- confirm.
engine = sdb.get_engine('/home/joebak/venus_cred.txt')

def agg_save(row):
    """Aggregate one incident with the 10K repeat offenders passed as `removed`."""
    return sdb.aggregate_and_save(row,engine,removed=ro_dict_10k,floc='/data/timeseries/10K/')

# Spot-check on a single incident (positional row 45) before the full run.
row = incidents.iloc[45]
agg_save(row)

In [None]:
import src.database as sdb
# NOTE(review): this credentials path differs from the
# '/home/joebak/Misc/venus_cred.txt' used in earlier cells -- confirm.
engine = sdb.get_engine('/home/joebak/venus_cred.txt')

def agg_save(row):
    """Aggregate one incident under the 10K policy, writing to the 10K folder."""
    return sdb.aggregate_and_save(row,engine,removed=ro_dict_10k,floc='/data/timeseries/10K/',keep=True)

# One call per incident; the returned Series is discarded to keep output short.
_ = incidents.T.apply(agg_save)

In [None]:
def agg_save(row):
    """Aggregate one incident under the 50K policy, writing to the 50K folder."""
    return sdb.aggregate_and_save(row,engine,removed=ro_dict_50k,floc='/data/timeseries/50K/',keep=True)

# Relies on `sdb` and `engine` from an earlier cell.
_ = incidents.T.apply(agg_save)

In [None]:
def agg_save(row):
    """Aggregate one incident under the 100K policy, writing to the 100K folder."""
    return sdb.aggregate_and_save(row,engine,removed=ro_dict_100k,floc='/data/timeseries/100K/',keep=True)

# Discard the returned Series, as the other policy cells do, so the cell does
# not dump one result per incident into the notebook output.
_ = incidents.T.apply(agg_save)

In [None]:
def agg_save(row):
    """Aggregate one incident under the 500K policy, writing to the 500K folder."""
    return sdb.aggregate_and_save(row,engine,removed=ro_dict_500k,floc='/data/timeseries/500K/',keep=True)

# Relies on `sdb` and `engine` from an earlier cell.
_ = incidents.T.apply(agg_save)

In [None]:
def agg_save(row):
    """Aggregate one incident under the Verified policy, writing to the Verified folder."""
    return sdb.aggregate_and_save(row,engine,removed=ro_dict_v,floc='/data/timeseries/Verified/',keep=True)

# Relies on `sdb` and `engine` from an earlier cell.
_ = incidents.T.apply(agg_save)

In [None]:
def agg_save(row):
    """Aggregate one incident with the pickled `removed` collection, writing to the currently folder."""
    # NOTE(review): the other policy cells pass a dict keyed by incident as
    # `removed`; here the collection loaded from removed.p is passed directly
    # -- confirm aggregate_and_save accepts both forms.
    return sdb.aggregate_and_save(row,engine,removed=removed,floc='/data/timeseries/currently/',keep=True)

_ = incidents.T.apply(agg_save)

In [None]:
# Combined policies: the same aggregation, once per (removal dict, folder) pair.
for policy_removed, policy_floc in ((ro_dict_modest, '/data/timeseries/modest/'),
                                    (ro_dict_aggressive, '/data/timeseries/aggressive/')):
    agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=policy_removed,floc=policy_floc,keep=True)
    _ = incidents.T.apply(agg_save)

In [None]:
#Get totals banned by policy
ro_dicts = {'10K':ro_dict_10k,
            '50K':ro_dict_50k,
            '100K':ro_dict_100k,
            '500K':ro_dict_500k,
            'Verified':ro_dict_v,
            'Modest':ro_dict_modest,
            'Aggressive':ro_dict_aggressive}

# Collect one record per policy and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
ban_records = []
for policy in ro_dicts:
    # Distinct accounts removed across all incidents under this policy.
    N_banned = np.unique(np.hstack([ro_dicts[policy][item] for item in ro_dicts['10K'].keys()])).size
    ban_records.append({'Total removed':N_banned,
                        'Policy':policy})
# Baseline row: the accounts in the pickled `removed` collection.
ban_records.append({'Total removed':np.unique(removed).size,
                    'Policy':'Currently'})
ban_df = pd.DataFrame(ban_records)

In [None]:
# Save the per-policy removal totals uncompressed. The original call was
# spelled `to_cs`, which raises AttributeError.
ban_df.to_csv(root + '/data/ban_df_counts.csv',compression=None)

In [None]:
# Reload the cached incident table, building the path from `root` like every
# other cell in the notebook instead of hard-coding './'.
incidents = pd.read_csv(root + '/data/incidents.csv')

In [None]:
def get_incident_count(incident,engine):
    """Return the number of tweets stored for an incident.

    Keyword arguments:
    incident -- the name of an incident, as identified in our database
    engine -- postgres engine created with src.database.get_engine

    Returns the result of the count(*) query as a DataFrame.
    """
    # The incident name is passed as a bound parameter, not formatted into the SQL.
    count_sql = "SELECT  count(*) FROM incident_tweets WHERE incident=(%(incident)s)"
    return pd.read_sql(count_sql, params={'incident':incident},con=engine)

In [None]:
# Accumulator for per-incident totals; nothing is added to it in this cell.
totals = []
# Try the count query on the incident stored under index label 0.
first_incident_label = incidents['incident'][0]
get_incident_count(first_incident_label, engine)