In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re

from tqdm import tqdm
from collections import defaultdict, Counter
from html import unescape
#import fasttext

#Note: please refer to "Model set 2 - fixed effect - daily/10-day aggregation" for the current tables.

## Sentiment Classification

In [None]:
#Use Existing tool for pilot & building pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Single shared VADER analyzer instance; get_sentiment() reads it as a module-level global.
sid_obj = SentimentIntensityAnalyzer()

In [None]:
def get_sentiment(text):
    """Return the VADER compound polarity score for ``text``.

    Parameters
    ----------
    text
        The text handed to ``sid_obj.polarity_scores``.

    Returns
    -------
    float or None
        The ``'compound'`` entry of the analyzer's score dict, or ``None``
        when scoring raises (best-effort: callers get ``None`` instead of
        an exception for inputs the analyzer cannot process).
    """
    try:
        return sid_obj.polarity_scores(text)['compound']
    except Exception:
        # Deliberately best-effort, but only for ordinary errors:
        # KeyboardInterrupt / SystemExit must still be able to stop a long run.
        return None

In [None]:
#Getting matched users
#matched_user = "./xr_sample_aggregated.csv"
#matched_nobot = "./xr_matching_aggregated.csv"

#matched_user = "./0422_rf_edition/user_aggregated.csv"
#matched_nobot = "./0422_rf_edition/matching_aggregated.csv"

#matched_user = "./xgb_matching/user_aggregated_xgb.csv"
#matched_nobot = "./xgb_matching/matching_aggregated_xgb.csv"

In [None]:
# Paths of the two input CSVs that the next cell loads into `df` and `df_matched`.
matched_user, matched_nobot = (
    "user_aggregated_0.5.csv",
    "sample_botometer_0.5.csv",
)

In [None]:
# Load the two tweet-level tables selected by `matched_user` / `matched_nobot`.
# Later cells treat `df` as the bot-interaction group (bot_interaction = 1)
# and `df_matched` as the matched comparison group (bot_interaction = 0).
df = pd.read_csv(matched_user)
df_matched = pd.read_csv(matched_nobot)

In [None]:
# Get user-level aggregation
def _add_time_gap_features(tweets):
    """Return a filtered copy of ``tweets`` with the time-gap helper columns.

    Adds:
      * ``time_gap_unified`` - ``time_gap`` parsed with ``pd.to_timedelta``;
      * ``after``            - 1 when the gap, rounded to whole seconds, is positive, else 0;
      * ``after_timegap``    - the gap rounded to whole days (daily aggregation key).

    Rows whose gap is exactly zero are dropped.

    NOTE(review): ``round`` rounds halves to even and gaps shorter than half a
    day round to day 0 on either side of the event, so ``after_timegap == 0``
    can contain both "before" and "after" rows - confirm that is intended.
    """
    tweets = tweets.assign(time_gap_unified=pd.to_timedelta(tweets.time_gap))
    # Explicit copy: the columns added below must not be written onto a slice
    # of the unfiltered frame (SettingWithCopyWarning / silently lost writes).
    tweets = tweets[tweets.time_gap_unified != "0 days"].copy()

    gap_seconds = [gap.total_seconds() for gap in tweets.time_gap_unified]
    tweets['after'] = [1 if round(seconds) > 0 else 0 for seconds in gap_seconds]
    tweets['after_timegap'] = [round(seconds / 86400) for seconds in gap_seconds]  # get daily aggregation
    return tweets


# Same preparation for the bot-interaction group and the matched group.
df = _add_time_gap_features(df)
df_matched = _add_time_gap_features(df_matched)


In [None]:
# Load the remaining inputs: per-user classifier output, the replies users sent to bots,
# and the user <-> matched-user pairing table.
# NOTE(review): the three files live in different directories ("./0422_rf_edition/", "./", "./botometer/")
# - confirm they all belong to the same run.
bot_probs = pd.read_csv("./0422_rf_edition/xr_users_classified_0704_four_methods_wprob.csv") #user profile data
bot_replies = pd.read_csv("./bot_interaction_tweet_updated_botometer_0.5.csv") #users and their reply to bots
#bot_original_text = pd.read_csv("./botometer/bot_original_tweet_updated_botometer.csv") #the original tweets of bots...note this is quite a limited sample.
# NOTE(review): later cells read both `uid`/`matched_uid` and `user`/`matched_user` from this frame
# - confirm the CSV really carries both column pairs.
matched_users = pd.read_csv("./botometer/xr2019_user_matched_fifths_botometer_available_0.5.csv")

In [None]:
# Inspect the available columns before choosing which ones to merge.
bot_probs.columns

#### Generate table of bot probabilities for users and their matched users

In [None]:
# Columns pulled from `bot_probs` in every merge below: the join key, the
# classifier labels/probabilities, then the seven "*_display_english" scores.
_classifier_cols = [
    "is_bot_rf20_prob", "is_bot_rf20",
    "is_bot_dl_rough", "is_bot_dl_prob",
    "is_bot_xgb", "is_bot_xgb_prob",
    "is_bot_botometer_prob",
]
_display_score_cols = [
    kind + "_display_english"
    for kind in ("astroturf", "fake_follower", "financial", "other",
                 "overall", "self_declared", "spammer")
]
bot_probs_to_merge = ["id"] + _classifier_cols + _display_score_cols

In [None]:
# Attach the bot_probs columns twice: first for the user (`uid`), then for the
# matched user (`matched_uid`). On the second pass every bot_probs column already
# exists on the left side, so the right-hand copies receive the "_matched" suffix.
users_and_matched = matched_users[["uid", "matched_uid"]]

for _key_col, _right_suffix in (("uid", "_humans"), ("matched_uid", "_matched")):
    users_and_matched = users_and_matched.merge(
        bot_probs[bot_probs_to_merge],
        how = 'inner',
        left_on = _key_col,
        right_on = "id",
        copy = False,
        suffixes = ("", _right_suffix),
    )

In [None]:
# Inspect the merged column names/order before they are renamed in the next cell.
users_and_matched.columns

In [None]:
# Drop the two copies of the bot_probs join key left behind by the merges.
users_and_matched.drop(columns = ['id', "id_matched"], inplace = True)

# Rename by SOURCE column name rather than by position. After the two merges the
# frame is ordered [user classifier cols, user display cols, matched classifier
# cols, matched display cols]; the previous positional list assumed
# [user classifier, matched classifier, user display, matched display], which
# labelled the user display scores as "*_matched" probabilities and the matched
# probabilities as "*_display_user".
_classifier_stems = {
    "is_bot_rf20_prob": "RF_bot_prob",
    "is_bot_rf20": "RF_bot_pred",
    "is_bot_dl_rough": "DL_bot_pred",
    "is_bot_dl_prob": "DL_bot_prob",
    "is_bot_xgb": "XGB_bot_pred",
    "is_bot_xgb_prob": "XGB_bot_prob",
    "is_bot_botometer_prob": "BOTOMETER_bot_prob",
}
_display_stems = {
    "astroturf_display_english": "astroturf_display",
    "fake_follower_display_english": "fake_follower_display",
    "financial_display_english": "financial_display",
    "other_display_english": "other_display",
    "overall_display_english": "overall_display",
    "self_declared_display_english": "self_declared_display",
    "spammer_display_english": "spammer_display",
}

_rename_map = {}
for _source, _stem in {**_classifier_stems, **_display_stems}.items():
    _rename_map[_source] = _stem + "_user"                   # unsuffixed copy = the user
    _rename_map[_source + "_matched"] = _stem + "_matched"   # suffixed copy = the matched user

users_and_matched = users_and_matched.rename(columns = _rename_map)

# Keep the exported header order identical to the previous column list; selecting by
# name also fails loudly (KeyError) if an expected column is missing.
users_and_matched = users_and_matched[
    ['uid', 'matched_uid']
    + [_stem + "_user" for _stem in _classifier_stems.values()]
    + [_stem + "_matched" for _stem in _classifier_stems.values()]
    + [_stem + "_user" for _stem in _display_stems.values()]
    + [_stem + "_matched" for _stem in _display_stems.values()]
]

In [None]:
#users_and_matched.to_csv("./ctrls/RF_based_user_and_matched_bot_prob.csv", index = False)
#users_and_matched.to_csv("./ctrls/DL_based_user_and_matched_bot_prob.csv", index = False)
#users_and_matched.to_csv("./ctrls/XGB_based_user_and_matched_bot_prob.csv", index = False)

In [None]:
# Export the user / matched-user bot-probability table (row index not written).
users_and_matched.to_csv("./botometer_ctrl_table/BOTOMETER_based_user_and_matched_bot_prob_0.5.csv", index = False)

#### Generate other control variables

In [None]:
# Number of reply rows before the merges below (both are inner joins and may drop rows).
bot_replies.shape[0]

In [None]:
# Inspect the columns available on the reply table.
bot_replies.columns

In [None]:
# Parse `created_at` into datetimes; a later cell floors these to the hour.
# NOTE(review): `infer_datetime_format` is deprecated since pandas 2.0 (it only emits a warning there)
# - drop the argument once the pandas version in use is confirmed.
bot_replies['created_at_dt'] = pd.to_datetime(bot_replies.created_at, infer_datetime_format=True)

In [None]:
# Attach the bot_probs columns twice, both as inner joins:
#   1. for the account being replied to (`in_reply_to_user_id`) - these keep their plain names;
#   2. for the reply's author (`author_id`) - these clash with pass 1 and get "_original_human".
_profile_cols = bot_probs[bot_probs_to_merge]

for _key_col, _right_suffix in (("in_reply_to_user_id", "_y"),
                                ("author_id", "_original_human")):
    bot_replies = bot_replies.merge(
        _profile_cols,
        how = 'inner',
        left_on = _key_col,
        right_on = "id",
        copy = False,
        suffixes = ("", _right_suffix),
    )

In [None]:
import math


def _senti_category(score):
    """Bucket a compound sentiment score into 1 / 0 / -1 using the +-0.05 cut-offs.

    ``None`` (what ``get_sentiment`` returns when scoring fails) is passed through
    as ``None`` instead of being compared with a float, which would raise
    ``TypeError``.
    """
    if score is None:
        return None
    if score > 0.05:
        return 1
    if score < -0.05:
        return -1
    return 0


# Sentiment of the user's reply text.
all_text = list(bot_replies.text)
sentiment_score = [get_sentiment(i) for i in all_text]
bot_replies['interaction_senti'] = sentiment_score
# Previously this line had no None guard (unlike the bot-side line below) and
# crashed as soon as one reply could not be scored.
bot_replies['interaction_senti_category'] = [_senti_category(i) for i in sentiment_score]

# Sentiment of the text being replied to (`original_text`).
bot_all_text = list(bot_replies.original_text)
bot_sentiment_score = [get_sentiment(i) for i in bot_all_text]
bot_replies['bot_senti'] = bot_sentiment_score
bot_replies['bot_senti_category'] = [_senti_category(i) for i in bot_sentiment_score]


In [None]:
# Load the topic/time burstiness table used to flag whether a reply falls in a cascade,
# and parse its `created_at_dt` values into datetimes under the join-key name `created_at_hr`.
topic_cascade = pd.read_csv("topic_time_burstiness.csv")
topic_cascade['created_at_hr'] = [pd.to_datetime(i) for i in topic_cascade.created_at_dt]

In [None]:
# Floor each reply timestamp to the start of its hour so it can be joined to `topic_cascade`.
# NOTE(review): the 'H' frequency alias is deprecated in pandas 2.2 in favour of 'h' - update when upgrading.
# NOTE(review): the join only matches if both `created_at_hr` columns agree on timezone-awareness - confirm.
bot_replies['created_at_hr'] = [pd.to_datetime(i).floor('H').to_pydatetime() for i in bot_replies.created_at_dt]

In [None]:
# Left join: every reply is kept; burstiness / cascade / pos_num stay NaN where the
# (hour, topic) pair has no row in `topic_cascade`.
_cascade_cols = ["created_at_hr", "topic", "burstiness", "cascade", "pos_num"]
bot_replies = bot_replies.merge(
    topic_cascade[_cascade_cols],
    how = 'left',
    on = ['created_at_hr', "topic"],
)

In [None]:
# Get control vars: number of interactions, sentiment of the interaction.
# One row per replying author. Key order matters: the next cell renames the
# resulting columns positionally.
_display_score_cols = [
    kind + "_display_english"
    for kind in ("astroturf", "fake_follower", "financial", "other",
                 "overall", "self_declared", "spammer")
]
_author_classifier_cols = [
    "is_bot_rf20_prob", "is_bot_rf20",
    "is_bot_dl_rough", "is_bot_dl_prob",
    "is_bot_xgb", "is_bot_xgb_prob",
    "is_bot_botometer_prob",
]

_user_ctrl_agg = {
    # interaction volume, sentiment and engagement
    "text": "count",
    "bot_senti": "mean",
    "interaction_senti": "mean",
    "retweet_count_original": "sum",
    "like_count_original": "sum",
    "retweet_count_interaction": "sum",
    "like_count_interaction": "sum",
    # topic / cascade context of the first reply in the group
    "topic": "first",
    "burstiness": "first",
    "cascade": "first",
    "pos_num": "first",
    # classifier output for the replied-to account (unsuffixed columns)
    "is_bot_rf20": "first",
    "is_bot_rf20_prob": "mean",
    "is_bot_dl_rough": "first",
    "is_bot_dl_prob": "mean",
    "is_bot_xgb": "first",
    "is_bot_xgb_prob": "mean",
    "is_bot_botometer_prob": "mean",
}
# display scores of the replied-to account
_user_ctrl_agg.update({col: "first" for col in _display_score_cols})
# classifier output and display scores of the replying author ("_original_human" columns)
_user_ctrl_agg.update({
    col + "_original_human": "first"
    for col in _author_classifier_cols + _display_score_cols
})

user_ctrl = bot_replies.groupby(by = 'author_id').agg(_user_ctrl_agg).reset_index()


In [None]:
# Positional rename of the aggregated columns; the order below must mirror the
# aggregation spec of the previous cell exactly.
_score_kinds = ("astroturf", "fake_follower", "financial", "other",
                "overall", "self_declared", "spammer")

_interaction_names = [
    'author_id', 'interaction_times', 'bot_senti', 'interaction_senti',
    'retweet_count_original', 'like_count_original',
    'retweet_count_interaction', 'like_count_interaction',
    'topic', 'burstiness', 'cascade', 'pos_num',
]
_bot_names = [
    'RF_bot_pred_bot', 'RF_bot_prob_bot',
    'DL_bot_pred_bot', 'DL_bot_prob_bot',
    'XGB_bot_pred_bot', 'XGB_bot_prob_bot',
    "BOTOMETER_bot_prob_bot",
] + [kind + "_score_bot" for kind in _score_kinds]
_author_names = [
    'RF_bot_prob_user', 'RF_bot_pred_bot_user',
    'DL_bot_pred_user', 'DL_bot_prob_user',
    'XGB_bot_pred_user', 'XGB_bot_prob_user',
    "BOTOMETER_bot_prob_user",
] + [kind + "_score_human" for kind in _score_kinds]

user_ctrl.columns = _interaction_names + _bot_names + _author_names

In [None]:
# For the original tweets and for the interactions alike: log(count + 1) of the
# retweet / like totals, plus a 0/1 flag for authors whose like total is zero.
for _kind in ("original", "interaction"):
    _retweets = user_ctrl["retweet_count_" + _kind]
    _likes = user_ctrl["like_count_" + _kind]
    user_ctrl["retweet_log_" + _kind] = [np.log(n + 1) for n in _retweets]
    user_ctrl["like_log_" + _kind] = [np.log(n + 1) for n in _likes]
    user_ctrl["without_like_" + _kind] = [int(n == 0) for n in _likes]

In [None]:
# Binary indicators derived from fixed cut-offs.
_FEW_LIKES_MAX = 6          # "few likes": like total of at most 6
_FREQUENT_INTERACT_MIN = 5  # "frequent": at least 5 interactions

for _kind in ("original", "interaction"):
    user_ctrl["few_like_" + _kind] = [
        int(n <= _FEW_LIKES_MAX) for n in user_ctrl["like_count_" + _kind]
    ]
user_ctrl['frequent_interact'] = [
    int(n >= _FREQUENT_INTERACT_MIN) for n in user_ctrl.interaction_times
]

In [None]:
#user_ctrl.to_csv("./ctrls/RF_based_user_and_bot_prob_and_ctrls.csv", index = False)
#user_ctrl.to_csv("./ctrls/DL_based_user_and_bot_prob_and_ctrls.csv", index = False)
#user_ctrl.to_csv("./ctrls/XGB_based_user_and_bot_prob_and_ctrls.csv", index = False)

In [None]:
# Export the per-author control-variable table (row index not written).
user_ctrl.to_csv("./botometer_ctrl_table/BOTOMETER_based_user_and_bot_prob_and_ctrls_0.5.csv", index = False)

## Generate regression tables

In [None]:
def _activity_counts(tweets, tweet_type, count_col):
    """Count rows of one `referenced_tweets_0_type` per (author_id, after_timegap).

    Returns a frame with columns ``author_id_``, ``after_timegap_`` and
    ``count_col``; the trailing underscores match the flattened key names of
    the daily table these counts are merged into.
    """
    of_type = tweets[tweets.referenced_tweets_0_type == tweet_type]
    counts = of_type.groupby(['author_id', 'after_timegap']).agg({"senti": 'size'}).reset_index()
    counts.columns = ['author_id_', "after_timegap_", count_col]
    return counts


df_did2_retweet = _activity_counts(df, "retweeted", "retweet_size")  # generate activity on RTs
df_did2_reply = _activity_counts(df, "replied_to", "reply_size")     # generate activity on replies

In [None]:
import math

# ---- Bot-interaction group: one row per (author, day offset) ----
df_did2 = df.groupby(['author_id', 'after_timegap']).agg({"senti":['mean','size'], 'senti_category':'mean',
                                                         }).reset_index()
# Flatten the two-level column index; the group keys come out with a trailing
# underscore ("author_id_", "after_timegap_") because their second level is empty.
df_did2.columns = ["_".join(a) for a in df_did2.columns.to_flat_index()]
df_did2 = df_did2.merge(df_did2_retweet, on = ["author_id_", "after_timegap_"],how = "left", copy = False)
df_did2 = df_did2.merge(df_did2_reply, on = ["author_id_", "after_timegap_"],how = "left", copy = False)

df_did2['bot_interaction'] = 1
# Lagged mean sentiment: value from two ROWS earlier within the same author.
# NOTE(review): rows exist only for days with tweets, so this is "two active days
# earlier", not necessarily two calendar days - confirm that is the intended lag.
shifted = df_did2[['author_id_','senti_mean']].groupby("author_id_").shift(2)
df_did2['senti_mean_t2'] = shifted.senti_mean
# Days without retweets / replies have no row in the count tables -> 0.
df_did2[['reply_size', 'retweet_size']] = df_did2[['reply_size', 'retweet_size']].fillna(0)


# ---- Matched comparison group: same daily aggregation ----
# NOTE(review): no retweet_size / reply_size is computed for this group, so those
# columns are NaN on all matched rows after the concat below - confirm intended.
matched_did2 = df_matched.groupby(['author_id', 'after_timegap']).agg({"senti":['mean','size'], 'senti_category':'mean'}).reset_index()
matched_did2.columns = ["_".join(a) for a in matched_did2.columns.to_flat_index()]
matched_did2['bot_interaction'] = 0
shifted_match = matched_did2[['author_id_','senti_mean']].groupby("author_id_").shift(2)
matched_did2['senti_mean_t2'] = shifted_match.senti_mean

# Attach the pairing so each matched row records which sampled user it belongs to.
matched_did2 = matched_did2.merge(matched_users[["user", "matched_user"]], how = 'left', left_on = "author_id_", right_on = "matched_user", copy = False)
#matched_did2 = matched_did2.merge(user_ctrl, how = 'inner', left_on = "user", right_on = "author_id", copy = False)
matched_did2['sample_uid'] = matched_did2.user
#matched_did2.drop(columns = ['user','matched_user', 'author_id'], inplace = True)


df_did2 = df_did2.merge(matched_users[["user", "matched_user"]], how = 'left', left_on = "author_id_", right_on = "user", copy = False)
#df_did2 = df_did2.merge(user_ctrl, how = 'inner', left_on = "author_id_", right_on = "author_id", copy = False)
df_did2['sample_uid'] = None
#df_did2.drop(columns = ['user','matched_user','author_id'], inplace = True)

# Stack both groups; ignore_index gives the combined table unique row labels
# instead of two overlapping 0..n ranges.
df_did2 = pd.concat([df_did2,matched_did2], ignore_index = True)

# ---- Derived regression variables ----
df_did2['log_daily_tweet'] = [np.log(i) for i in df_did2.senti_size]
df_did2['senti_abs'] = [abs(i) for i in df_did2.senti_mean]
df_did2['senti_abs_t2'] = [abs(i) for i in df_did2.senti_mean_t2]

df_did2['tg1'] = df_did2.after_timegap_ * df_did2.bot_interaction
df_did2['senti_abs_sqrt'] = [math.sqrt(abs(i)) for i in df_did2.senti_abs]
# Fixed: this column previously applied abs() only, making it an exact copy of
# senti_abs_t2; it now takes the square root like senti_abs_sqrt above.
# NaN lags (first two rows of each author) stay NaN.
df_did2['senti_abs_sqrt_t2'] = [math.sqrt(abs(i)) for i in df_did2.senti_abs_t2]


In [None]:
# Post-event window dummies on the day offset: (0, 10], (10, 20] and everything above 20.
for _column, _lower, _upper in (
    ("after_10days", 0, 10),
    ("after_20days", 10, 20),
    ("after_30days", 20, float("inf")),   # open-ended: any offset greater than 20
):
    df_did2[_column] = [int(_lower < gap <= _upper) for gap in df_did2.after_timegap_]


In [None]:
# 10-day gap based interaction terms: window dummy x bot-interaction indicator.
for _window in ("10days", "20days", "30days"):
    df_did2["tg1_" + _window] = df_did2["after_" + _window] * df_did2.bot_interaction

In [None]:
#df_did2.to_csv("./ctrls/RF_table_for_reg_no_ctrls.csv", index = False)
#df_did2.to_csv("./ctrls/DL_table_for_reg_no_ctrls.csv", index = False)
# NOTE(review): this is the only "./ctrls/" export still active in the notebook, while the inputs
# loaded at the top are the "*_0.5" Botometer files - confirm it should still overwrite the XGB table.
df_did2.to_csv("./ctrls/XGB_table_for_reg_no_ctrls.csv", index = False)

In [None]:
# Export the regression table without user-level controls (row index not written).
df_did2.to_csv("./botometer_ctrl_table/BOTOMETER_table_for_reg_no_ctrls_0.5.csv", index = False)

In [None]:
# NOTE(review): this writes the same `df_did2` as the "no_ctrls" export - the merges with
# `user_ctrl` in the table-building cell are commented out, so no control columns are present.
df_did2.to_csv("./botometer_ctrl_table/BOTOMETER_table_for_reg_with_ctrls_0.5.csv", index = False)