<h1>Combining Interventions to reduce the spread of misinformation online: Data</h1>

In [1]:
%%capture
!pip install fastparquet
!pip install pandarallel

In [2]:
#Set up environment
from tqdm.notebook import trange, tqdm
import pickle
import src.utils as srcu
import src.segmentation as srcseg
import pandas as pd
from pandarallel import pandarallel
import numpy as np
#Set up parallel processing
pandarallel.initialize(nb_workers=8,verbose=True,progress_bar=True)
tqdm.pandas()

#Make sure things reload
%load_ext autoreload
%autoreload 2

#Set up directories
root = '.'
srcu.create_output_directories(root)

<h2>Pull data and segment events</h2>

<h3>Pull</h3>

In [7]:
#Gather list of incidents 
import src.database as sdb
engine = sdb.get_engine('/home/joebak/venus_cred.txt')
incidents = sdb.list_incidents(engine)

In [8]:
incidents

Unnamed: 0,incident,count
0,bad_statistics_1,185193
1,bad statistics 3,60909
2,bad statistics 4,78951
3,bad statistics 5,101674
4,bad statistics 6,4736
...,...,...
148,whistleblower 1,415614
149,whistleblower 2,591838
150,whistleblower 3,498366
151,whistleblower 4,40534


In [9]:
#We need something without spaces and / to call each incident. 
fix_name = lambda name: name.replace(' ','_').replace('/','_')
incidents = incidents[incidents['incident']!='Dominion1']

#Dominion is high volume, very noisy, has daily patterns, is prolonged
#and doesn't conform to our notion of "events". 
incidents['incident_name'] = incidents['incident'].apply(fix_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incidents['incident_name'] = incidents['incident'].apply(fix_name)


In [10]:
incidents.to_parquet(root + '/data/incidents.parquet', compression=None)

<h3>Aggregate</h3>

In [9]:
incidents = pd.read_parquet(root + '/data/incidents.parquet')

In [10]:
len(incidents)

152

In [11]:
import src.database as sdb
engine = sdb.get_engine('/home/joebak/venus_cred.txt')
agg_save = lambda row: sdb.aggregate_and_save(row,engine)
incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

index
0      True
1      True
2      True
3      True
4      True
       ... 
148    True
149    True
150    True
151    True
152    True
Length: 152, dtype: bool

<h3>Repeat Offenders</h3>`

In [12]:
removed = pickle.load(open(root + '/data/removed.p','rb'))


In [13]:
import src.database as sdb
engine = sdb.get_engine()

query_ro = '''SELECT user_screen_name,incident, user_followers_count,row_number() over (partition by user_screen_name order by created_at) 
            FROM (SELECT DISTINCT ON (user_screen_name, incident) user_screen_name, user_followers_count,
            incident, created_at, row_number() over (partition by user_screen_name order by created_at)
            FROM public.all_ticket_tweets WHERE user_followers_count > 10000 AND incident IS NOT NULL) AS nested'''

query_v = '''SELECT user_screen_name,incident, user_followers_count,row_number() over (partition by user_screen_name order by created_at) 
            FROM (SELECT DISTINCT ON (user_screen_name, incident) user_screen_name, user_followers_count,
            incident, created_at, row_number() over (partition by user_screen_name order by created_at)
            FROM public.all_ticket_tweets 
            WHERE incident IS NOT NULL AND user_verified) AS nested'''

query_all = "SELECT * FROM all_ticket_tweets LIMIT 10;"
ro_all =pd.read_sql(query_ro, con=engine)
ro_verified =pd.read_sql(query_v, con=engine)


In [14]:
ro_all.to_csv(root + '/data/ro_all.csv')
ro_verified.to_csv(root + '/data/ro_verified.csv')

In [15]:
ro_all = pd.read_csv(root+'/data/ro_all.csv')
ro_verified=pd.read_csv(root + '/data/ro_verified.csv')

In [16]:
def get_repeat_offenders_dict(df, incidents, follower_thresh=1,strikes=3):
    repeat_offenders = {}
    for incident in incidents:
        temp = df[df['incident']==incident]
        temp = temp[temp['row_number'] > strikes]
        temp = temp[temp['user_followers_count'] > follower_thresh]
        repeat_offenders[incident] = temp['user_screen_name'].unique()
    return repeat_offenders

ro_dict_10k = get_repeat_offenders_dict(ro_all, incidents['incident'],follower_thresh=10000)
ro_dict_50k = get_repeat_offenders_dict(ro_all, incidents['incident'],follower_thresh=50000)
ro_dict_100k = get_repeat_offenders_dict(ro_all,incidents['incident'], follower_thresh=100000)
ro_dict_500k = get_repeat_offenders_dict(ro_all, incidents['incident'],follower_thresh=500000)
ro_dict_v = get_repeat_offenders_dict(ro_verified, incidents['incident'])

In [17]:
ro_dict_modest = {}
for item in ro_dict_100k.keys():
    temp = np.unique(np.hstack([ro_dict_100k[item], 
              ro_dict_v[item],
              removed])).tolist()
    ro_dict_modest[item] = temp

ro_dict_aggressive = {}
for item in ro_dict_100k.keys():
    temp = np.unique(np.hstack([ro_dict_50k[item], 
              ro_dict_v[item],
              removed])).tolist()
    ro_dict_aggressive[item] = temp


In [18]:
import src.database as sdb
engine = sdb.get_engine('/home/joebak/venus_cred.txt')

agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_10k,floc='/data/timeseries/10K/')
row = incidents.iloc[45]
agg_save(row)

True

In [19]:
import src.database as sdb
engine = sdb.get_engine('/home/joebak/venus_cred.txt')
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_10k,floc='/data/timeseries/10K/')
_ = incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [20]:
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_50k,floc='/data/timeseries/50K/',keep=False)
_ = incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [21]:
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_100k,floc='/data/timeseries/100K/',keep=False)
incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

index
0      True
1      True
2      True
3      True
4      True
       ... 
148    True
149    True
150    True
151    True
152    True
Length: 152, dtype: bool

In [22]:
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_500k,floc='/data/timeseries/500K/',keep=False)
_ = incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [23]:
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_v,floc='/data/timeseries/Verified/',keep=False)
_ = incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [24]:
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=removed,floc='/data/timeseries/currently/',keep=False)
_ = incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [25]:
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_modest,floc='/data/timeseries/modest/',keep=False)
_ = incidents.T.parallel_apply(agg_save)
agg_save = lambda row: sdb.aggregate_and_save(row,engine,removed=ro_dict_aggressive,floc='/data/timeseries/aggressive/',keep=False)
_ = incidents.T.parallel_apply(agg_save)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

In [26]:
#Get totals banned by policy
ro_dicts = {'10K':ro_dict_10k, 
             '50K':ro_dict_50k,
              '100K':ro_dict_100k, 
              '500K':ro_dict_500k, 
                'Verified':ro_dict_v,
               'Modest':ro_dict_modest, 
               'Aggressive':ro_dict_aggressive}
ban_df = pd.DataFrame()
for policy in ['10K','50K', '100K','500K', 'Verified','Modest','Aggressive']:
    N_banned = np.unique(np.hstack([ro_dicts[policy][item] for item in ro_dicts['10K'].keys()])).size
    ban_df =ban_df.append({'Total removed':N_banned, 
                   'Policy':policy},ignore_index=True)
ban_df=ban_df.append({'Total removed':np.unique(removed).size, 
                'Policy':'Currently'},ignore_index=True)

In [None]:
ban_df.to_parquet(root + '/data/ban_df_counts.parquet',compression=None)

In [28]:
incidents = pd.read_parquet('./data/incidents.parquet')

In [35]:
def get_incident_count(incident,engine):
    """Return user_followers_count, user_screen_name, created_at, and user_verified 
     for and incident.
    Keyword arguments:
    incident -- the name of an incident, as identified in our database
    engine -- postgres engine created with src.database.get_engine
    """
    query = "SELECT  count(*) FROM all_ticket_tweets WHERE incident=(%(incident)s)"
    incident_df = pd.read_sql(query, params={'incident':incident},con=engine)
    return incident_df

In [None]:
totals = []
get_incident_count(incidents['incident'][0], engine)