In [1]:
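# Once the users have been identified, collect their Twitter activity during the 30 days
# preceding and following each suspended user's suspension, then process the data.
# Note: the collection cells below request a 32-day window on each side of the suspension date.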

In [2]:
import csv
import json
import os
from glob import glob

import pandas as pd

from src.DBtools import copy_from_db, connect_to_db, run_query, insert_dataframe
from src.fetch_user_tweets import fetch_from_list
from src.data_processing import load_data, get_counts

In [3]:
# Suspended accounts, one row per account; the suspension date is parsed to a
# pandas datetime so later cells can do date arithmetic on it.
su_df = pd.read_csv("data/suspended_users.csv")
su_df["suspension_date"] = pd.to_datetime(su_df["suspension_date"])

# Database settings read by the collection cell below (keys used there:
# "dbs", "host", "username", "tweet_table"). The context manager makes sure
# the file handle is closed instead of being left to the garbage collector.
with open("data/databases.json", "r") as config_file:
    databases = json.load(config_file)

In [4]:
# Retweeter tables for the two comparison groups. Both files are read the same
# way, with the first CSV column used as the DataFrame index.
top_retweeters, moderate_retweeters = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/top_retweeters.csv", "data/moderate_retweeters.csv")
)

## Get data from internal databases

In [None]:
# Query template. Placeholders, in order: tweet table, temporary table holding
# the retweeter ids, suspended user's screen name, window start, window end.
# Both date bounds are exclusive.
# NOTE(review): the statement is assembled with str.format rather than bound
# parameters; that is only acceptable while every value comes from the trusted
# local CSV/JSON files loaded above.
base_query = """SELECT * FROM {} WHERE user_id IN (SELECT retweeter_user_id FROM {} WHERE suspended_user = '{}') AND created_at > '{}' AND created_at < '{}'"""

# Create the output folder up front so a fresh checkout does not fail on the
# first export (the Twitter API cell creates its folders the same way).
os.makedirs("data/database_retweeter_activity", exist_ok=True)

for db_name in databases["dbs"]:
    for group,df in [("top",top_retweeters),("moderate",moderate_retweeters)]:
        # One connection per (database, group) pair. It is now always closed,
        # even when a query fails, so sessions no longer pile up across iterations.
        connection = connect_to_db(host=databases["host"], user=databases["username"], db=db_name)
        try:
            cur = connection.cursor()
            try:
                # Session-scoped lookup table mapping each suspended user to its retweeters.
                cur.execute(f"CREATE TEMPORARY TABLE {group} (suspended_user text not null, retweeter_user_id bigint not null)")
                connection.commit()

                #insert the list of user ids for each suspended user
                insert_dataframe(connection, df[["suspended_user","retweeter_user_id"]], group)

                for row in su_df.itertuples():
                    su = row.user_screen_name
                    print(f"Collecting {group} retweeter activity for {su} from {db_name} database...")
                    suspension_date = row.suspension_date
                    # 32 days on either side of the suspension date.
                    start_date = suspension_date - pd.Timedelta("32 d")
                    end_date = suspension_date + pd.Timedelta("32 d")
                    outfile = "data/database_retweeter_activity/{}_{}_retweeter_activity_{}.csv".format(su,group,db_name)
                    query = base_query.format(databases["tweet_table"],
                                            group,
                                            su,
                                            str(start_date),
                                            str(end_date))
                    copy_from_db(query,connection,outfile)
            finally:
                cur.close()
        finally:
            # Ending the session is presumably also what discards the TEMPORARY
            # table - confirm for the database backend in use.
            connection.close()

Collecting top retweeter activity for AlexBerenson from vaccines database...


## Get tweets from the Twitter API

In [None]:
# For every group and every suspended user, download the retweeters' tweets
# from the Twitter API into a dedicated folder per (user, group) pair.
api_timestamp_format = '%Y-%m-%dT%H:%M:00Z'
half_window = pd.Timedelta("32 d")

for group,df in [("top",top_retweeters),("moderate",moderate_retweeters)]:
    for row in su_df.itertuples():
        su = row.user_screen_name
        suspension_date = row.suspension_date

        # Window bounds as timestamp strings; the format pins seconds to "00".
        start_date = (suspension_date - half_window).strftime(api_timestamp_format)
        end_date = (suspension_date + half_window).strftime(api_timestamp_format)

        # One output directory per suspended user and group, created on demand.
        outdir = "data/twitter_api_retweeter_activity/{}_{}_retweeter_activity".format(su,group)
        os.makedirs(outdir, exist_ok=True)

        print(f"Collecting {group} retweeter activity for {su} from the Twitter API between {start_date} and {end_date}")

        # Only the retweeters recorded for this suspended user, with ids passed as strings.
        retweeter_ids = df.loc[df.suspended_user == su, "retweeter_user_id"].astype(str).values
        fetch_args = dict(credentials="data/tw_academic_api_credentials.json",
                          account_list=retweeter_ids,
                          output=outdir,
                          starting=start_date,
                          stopping=end_date)

        fetch_from_list(**fetch_args)

## Process the data

In [None]:
covid_keyword_file = "data/updated_covid_keywords.txt"

# Make sure both output folders exist before the loop writes into them;
# DataFrame.to_csv does not create missing parent directories.
for out_dir in ("data/tweet_data", "data/count_data"):
    os.makedirs(out_dir, exist_ok=True)

# Collects one counts frame per (group, suspended user) pair, each tagged
# with the user and group it belongs to.
count_list = []
for group in ["top","moderate"]:
    for su in su_df.user_screen_name:
        print(f"Processing data for {su}....")
        # Files written by the two collection cells above: one CSV per database
        # for the database export, and the contents of the per-user API folder.
        db_data_files = glob(f"data/database_retweeter_activity/{su}_{group}_retweeter_activity*")
        tw_data_files = glob(f"data/twitter_api_retweeter_activity/{su}_{group}_retweeter_activity/*")

        # load_data is a project helper; presumably it merges both sources and
        # applies the keyword file - confirm in src.data_processing.
        data = load_data(su,
                         su_df,
                         covid_keyword_file,
                         db_data_files,
                         tw_data_files)

        # Every non-numeric field is quoted in the combined tweet export.
        data.to_csv(f"data/tweet_data/{su}_{group}_retweeter_combined_api_db_tweets.csv",
                    index=False,
                    quoting=csv.QUOTE_NONNUMERIC)

        print("Data loaded, getting counts...")
        daily_counts = get_counts(su, su_df,data)

        # The per-user CSV is written before the identifying columns are added,
        # so those columns only appear in the frames kept in count_list.
        daily_counts.to_csv(f"data/count_data/{su}_{group}_retweeter_tweet_counts.csv",index=False)
        daily_counts["suspended_user"]=su
        daily_counts["group"]=group

        count_list.append(daily_counts)