In [2]:
import os
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from scipy.spatial import distance

In [None]:
# Input files used by this notebook (all paths are relative to the working directory).
# User table; loaded into `users`.
user_path = "./0422_rf_edition/xr_users_classified_0422_wprob.csv"
# Per-user classifier labels/probabilities; loaded into `users_prob` and merged onto `users` by "id".
user_prob_path = "./0422_rf_edition/xr_users_classified_0704_four_methods_wprob.csv"
# Tweet table; loaded into `df`.
tweet_path = "tweet_table_xr_2019_0621_fixed.csv"
# Classified tweet text; loaded into `tweet_classified` and merged onto the tweets by "text".
tweet_topic_modelled = "./NLP/text_classified.csv"

In [31]:
# Name of the column in `users` that is compared against 1 (bot) and 0 (human) in the cells below.
# NOTE(review): this column is not among those merged in from `users_prob`, so it is
# presumably already present in the user CSV - confirm.
bot_classification_col = "is_bot_botometer_0.5"
# Suffix embedded in the names of the CSV files this notebook writes.
suffix = "_botometer_0.5"

## Generate users w/ bot interaction

### Filtering communications

In [4]:
# Preprocessing: load the four input tables configured above.
# lineterminator='\n' makes the parser split rows on '\n' only.
users = pd.read_csv(user_path, lineterminator='\n')
users_prob = pd.read_csv(user_prob_path, lineterminator='\n')
# NOTE(review): the saved cell output echoes this exact line, which looks like a pandas
# warning raised for this read - confirm and set explicit dtypes if so.
df = pd.read_csv(tweet_path)
tweet_classified = pd.read_csv(tweet_topic_modelled, lineterminator='\n')

  df = pd.read_csv(tweet_path)


In [None]:
# Attach the XGB and Botometer labels/probabilities to every user row (left join on "id").
classifier_columns = ["id","is_bot_xgb","is_bot_xgb_prob","is_bot_botometer", "is_bot_botometer_prob"]
users = users.merge(users_prob[classifier_columns], how="left", on="id")

In [None]:
# Drop tweets that are not about XR (e.g. iPhone sellers): any tweet whose text
# matches one of these case-insensitive patterns is removed.
# NOTE(review): the patterns are unanchored substrings - "(?i)phone" already covers
# "(?i)iphone", and "max", "11" and "vr" also match inside longer words/numbers.
# Confirm this breadth is intended.
unrelated_patterns = [
    "(?i)iphone",
    "(?i)phone",
    "(?i)max",
    "(?i)11",
    "(?i)vr",
    "(?i)xpel",
    "(?i)tint",
]
for pattern in unrelated_patterns:
    # na=False treats a missing text as "no match", so the mask stays boolean and
    # the `~` negation cannot fail on NaN entries.
    df = df[~df["text"].str.contains(pattern, na=False)]

In [None]:
# Snapshot of the keyword-filtered tweets; a later cell resets `df` from this snapshot.
df_original = df.copy() # keep a copy for further analysis

In [None]:
# Select the interaction records in which a human-labelled author replies to a
# bot-labelled account.

# 1) Discard retweets and add parsed timestamps to the tweets and to the users.
not_retweet = df["referenced_tweets_0_type"] != "retweeted"
df = df[not_retweet].assign(
    created_at_dt=lambda frame: pd.to_datetime(frame["created_at"], infer_datetime_format=True)
)
users['created_at_dt'] = pd.to_datetime(users['created_at'], infer_datetime_format=True)

# 2) Label each tweet with its author's bot flag, then attach the classified-text columns.
author_labels = users[[bot_classification_col, 'id']]
df = df.merge(author_labels, how='left', left_on='author_id', right_on='id', copy=False)
df = df.merge(tweet_classified, how='left', left_on="text", right_on="text", copy=False)

# 3) Label the account being replied to; its columns receive the "_reply" suffix.
replied_labels = users[[bot_classification_col, 'id']]
df_comm = df.merge(replied_labels, how='left', left_on='in_reply_to_user_id', right_on='id', suffixes=('', '_reply'))

# 4) Keep rows where the replied-to account is a bot (1) and the author is a human (0).
target_is_bot = df_comm[bot_classification_col + "_reply"] == 1
author_is_human = df_comm[bot_classification_col] == 0
reply_df = df_comm[target_is_bot & author_is_human]


In [None]:
# Persist the human-replies-to-bot rows and the full labelled communication table.
reply_df.to_csv(f"./xr2019_user_reply_bots{suffix}.csv", index = False)
df_comm.to_csv(f'./df_comm_edges{suffix}.csv', index = False)

In [None]:
# Ids of the bot posts that humans replied to: take referenced_tweets_0_id and fall
# back to referenced_tweets_1_id where the first reference is missing.
# The original read from `df_bot_reply`, which is never defined in this notebook;
# `reply_df` (human replies to bots) is the frame built above.
# NOTE(review): confirm `reply_df` is the intended source.
# Missing values in pandas are NaN rather than None, so the fallback is done with
# fillna instead of an `is not None` test (which never triggered).
# NOTE(review): if these id columns were parsed as float64, ids above 2**53 have
# already lost precision before int() is applied - consider reading them as strings.
referenced_ids = reply_df["referenced_tweets_0_id"].fillna(reply_df["referenced_tweets_1_id"])
bot_post_id = [str(int(tweet_id)) for tweet_id in referenced_ids.dropna()]  # Saving the ids of bot posts

In [None]:
# Write the bot post ids to a text file, one id per line.
with open('./bot_conversation_id.txt', 'w') as id_file:
    id_file.writelines(f"{post_id}\n" for post_id in bot_post_id)

### Generating sample for matching

In [None]:
# Build, for every human who replied to a bot, a collection window of 30 days
# before and 30 days after the interaction (60 days in total).
time_window = timedelta(days = 30)

interactions = reply_df[["author_id", "created_at", "text"]]
user_time = (
    users
    .merge(interactions, how = 'inner', left_on = "id", right_on = "author_id", copy = False)
    # created_at_x comes from `users`, created_at_y from the reply rows
    [['author_id', 'created_at_x', 'created_at_y', 'text']]
    .rename(columns = {'created_at_x': 'user_created_at', 'created_at_y': 'interaction_at'})
    # one row per author: the first interaction in row order is kept
    .drop_duplicates(subset = ['author_id'])
    .assign(
        user_created_at = lambda frame: pd.to_datetime(frame['user_created_at'], infer_datetime_format=True),
        interaction_at = lambda frame: pd.to_datetime(frame['interaction_at'], infer_datetime_format=True),
        time_start = lambda frame: frame['interaction_at'] - time_window,
        time_end = lambda frame: frame['interaction_at'] + time_window,
    )
)

In [None]:
# Render the window bounds as strings; the trailing "Z" is a literal character of the format.
timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
user_time['time_start_str'] = user_time['time_start'].map(lambda stamp: stamp.strftime(timestamp_format))
user_time['time_end_str'] = user_time['time_end'].map(lambda stamp: stamp.strftime(timestamp_format))

In [None]:
# Persist the per-user collection windows (one row per author who replied to a bot).
user_time.to_csv(f"./matching_user_data_collection_xr2019{suffix}.csv", index = False)

### Generating matched users who do not have direct interaction with bots

In [None]:
# Restart from the keyword-filtered tweets saved earlier.
# NOTE(review): this binds `df` to the same object as `df_original` (no copy), so the
# column added to `df` in the next cell is also added to `df_original`.
df = df_original

In [None]:
# Author, conversation and timestamp of each human reply to a bot.
# .copy() makes this an independent frame, so the columns added to it later do not
# trigger pandas' chained-assignment (SettingWithCopy) warning.
reply_df_short = reply_df[["author_id", "conversation_id", "created_at"]].copy()
# Parse tweet timestamps and label every tweet with its author's bot flag.
df['created_at_dt'] = pd.to_datetime(df.created_at, infer_datetime_format=True)
df = df.merge(users[[bot_classification_col,'id']], how = 'left', left_on = 'author_id', right_on = 'id', copy = False)

In [None]:
# Candidate matches: tweets by human-labelled authors that are not replies.
# .copy() detaches the filtered rows so adding a column below is not a chained assignment.
is_human_author = df[bot_classification_col] == 0
is_not_reply = df.referenced_tweets_0_type != "replied_to"
df = df[is_human_author & is_not_reply].copy()

# "Rough" timestamp = the created_at string with its last 9 characters removed.
# It is the join key used to pair tweets posted at about the same time.
# NOTE(review): how coarse the resulting bucket is depends on the created_at string
# format - confirm.
df['created_at_rough'] = [stamp[:-9] for stamp in df.created_at]

# assign() returns a new frame, so `reply_df_short` is rebound rather than written
# into through a slice.
reply_df_short = reply_df_short.assign(
    created_at_dt = pd.to_datetime(reply_df_short.created_at, infer_datetime_format=True),
    created_at_rough = [stamp[:-9] for stamp in reply_df_short.created_at],
)

In [None]:
# Pair every human reply to a bot (right side, "_y" columns) with the non-reply human
# tweets (left side, "_x" columns) that share its rough timestamp.
# NOTE(review): with how='right', replies without any same-bucket tweet are kept with
# missing "_x" values, and they also pass the inequality filter below - confirm intended.
matched_records = df.merge(reply_df_short, how = 'right', on = "created_at_rough", copy = False)

# A user must not be matched with themselves.
is_other_account = matched_records["author_id_x"] != matched_records["author_id_y"]
matched_records = matched_records[is_other_account]

matched_uids = matched_records[["author_id_x", "author_id_y", "created_at_x", "created_at_y"]].rename(
    columns = {
        "author_id_x": 'matched_user',
        "author_id_y": "user",
        "created_at_x": "interaction_matched",
        "created_at_y": "interaction",
    }
)

In [None]:
# All matched uids, before any Euclidean-distance-based filtering.
matched_uids

In [None]:
# Persist the timestamp-only ("rough") matches.
matched_uids.to_csv(f"./xr2019_user_matched_rough{suffix}.csv", index = False)

#### Further filtering based on Euclidean distance

In [None]:
# Drop user rows whose created_at is the literal string "0".
users = users[users.created_at != "0"]
# Disabled alternatives (kept for reference): drop rows with a missing created_at,
# and re-parse created_at with an explicit format.
#users.dropna(subset = ['created_at'], inplace = True)
#users['created_at_dt'] = pd.to_datetime(users.created_at, format = "%a %b %d %H:%M:%S +0000 %Y ", errors = 'coerce')

In [None]:
# Keep one row per matched user (the first occurrence wins).
matched_uids = matched_uids.drop_duplicates(subset = ['matched_user'])

# Empty frame with the distance-table column names.
euclid = pd.DataFrame(columns = ['uid', "matched_uid", "euclid"])

# Account features that enter the Euclidean distance between a user and a candidate match.
cols = [
    'statuses_count',
    'followers_count',
    'friends_count',
    'favourites_count',
    'listed_count',
    'followers_growth',
    'friends_growth',
    'favourites_growth',
    'listed_growth',
    'follower_friend_ratio',
]


In [None]:
def _log_features(profile):
    """Return log(x + 1) for every feature value of a single-row feature frame."""
    return [np.log(float(value) + 1) for value in profile.iloc[0]]


# Euclidean distance, in log(x + 1) feature space, between each user and the account
# matched to them. Pairs are skipped unless both ids resolve to exactly one row of `users`.
uids = []
matched_user = []
eculid = []
for user_id, candidate_id in zip(matched_uids["user"], matched_uids["matched_user"]):
    user_profile = users.loc[users.id == user_id, cols]
    candidate_profile = users.loc[users.id == candidate_id, cols]
    if len(user_profile) != 1 or len(candidate_profile) != 1:
        continue

    feature_pair = np.vstack([_log_features(user_profile), _log_features(candidate_profile)])
    # pdist on two rows returns a single pairwise distance
    pair_distance = distance.pdist(feature_pair)[0]

    uids.append(user_id)
    eculid.append(pair_distance)
    matched_user.append(candidate_id)



In [None]:
# Collect the per-pair results into one table: user id, distance, matched user id.
eculid_df = pd.DataFrame({
    'uid': uids,
    'eculid': eculid,
    'matched_uid': matched_user,
})

In [None]:
# Order the candidates so that, within each uid, the closest match (smallest
# Euclidean distance) comes first. Later cells take the first row(s) per uid as the
# best match(es), so the distance must be sorted ascending - the previous descending
# order selected the farthest candidate instead.
eculid_df = eculid_df.sort_values(by = ['uid', 'eculid'], ascending = [True, True])

In [None]:
# Attach the interaction timestamps of each matched user to the distance table and
# derive a collection window of 30 days on each side of the matched tweet.
time_window = timedelta(days = 30)
matched_with_euclid = (
    eculid_df
    .merge(matched_uids, right_on = 'matched_user', left_on = 'matched_uid', how = 'left', copy = False)
    # rows without a counterpart in matched_uids carry no timestamps
    .dropna(subset = ['matched_user'])
    .assign(
        interaction_matched = lambda frame: pd.to_datetime(frame['interaction_matched'], infer_datetime_format=True),
        time_start = lambda frame: frame['interaction_matched'] - time_window,
        time_end = lambda frame: frame['interaction_matched'] + time_window,
    )
)
# The path previously began with ".xr2019..." (missing slash), which wrote a dot-file
# instead of a file named like the other outputs.
matched_with_euclid.to_csv(f"./xr2019_user_matched_with_euclid{suffix}.csv", index = False)

In [None]:
# Getting first best match: the first row per uid in the current ordering of
# matched_with_euclid.
# .copy() detaches the subset so the column assignments below are not chained assignments.
firsts = matched_with_euclid.drop_duplicates(subset=['uid'], keep='first').copy()
firsts['time_start'] = pd.to_datetime(firsts.time_start, infer_datetime_format=True)
firsts['time_end'] = pd.to_datetime(firsts.time_end, infer_datetime_format=True)
# The string columns are built from `firsts` itself. The original read them from
# `seconds`, which is only defined in the following cell and holds up to two rows
# per uid, so it is not the same shape as `firsts`.
firsts['time_start_str'] = [i.strftime("%Y-%m-%dT%H:%M:%SZ") for i in firsts.time_start]
firsts['time_end_str'] = [i.strftime("%Y-%m-%dT%H:%M:%SZ") for i in firsts.time_end]
firsts.to_csv(f"./xr2019_user_matched_firsts{suffix}.csv", index = False)

In [None]:
## Getting the best two matches: the first two rows per uid in the current ordering.
timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
seconds = matched_with_euclid.groupby('uid').head(2)
seconds = seconds.assign(
    time_start = lambda frame: pd.to_datetime(frame['time_start'], infer_datetime_format=True),
    time_end = lambda frame: pd.to_datetime(frame['time_end'], infer_datetime_format=True),
    time_start_str = lambda frame: frame['time_start'].map(lambda stamp: stamp.strftime(timestamp_format)),
    time_end_str = lambda frame: frame['time_end'].map(lambda stamp: stamp.strftime(timestamp_format)),
)
seconds.to_csv(f"./xr2019_user_matched_seconds{suffix}.csv", index = False)

## Collecting bot original tweets

In [24]:
# Human replies to bots, as saved earlier in this notebook. The file name is built
# from `suffix` so it follows the configured classifier, like the other outputs.
# NOTE(review): the file is written to "./" but read from "./botometer/" - presumably
# it is moved there by hand; confirm.
bot_replies = pd.read_csv(f"./botometer/xr2019_user_reply_bots{suffix}.csv")

In [11]:
from TwitterAPI import TwitterAPI, TwitterOAuth, HydrateType, TwitterPager
import json

In [12]:
def load_token(textfile):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Parameters
    ----------
    textfile : path of the file to read (one value per line).

    Returns
    -------
    list of str, one entry per line of the file (empty lines yield empty strings).
    If the file cannot be opened or read, a message is printed and None is returned.
    """
    try:
        with open(textfile, 'r') as handle:
            return [str(raw_line).strip() for raw_line in handle]
    except OSError:
        print('Error loading access token from file')

In [13]:
# Location of the credential file (five lines: consumer key, consumer secret, access
# token key, access token secret, bearer token). It can be overridden through the
# TWITTER_AUTH_FILE environment variable so the notebook is not tied to one machine;
# the original absolute path remains the default.
twitter_auth_file = os.environ.get(
    "TWITTER_AUTH_FILE",
    '/home/lindali/Documents/DPhil studies/thesis_work/twitter_auth',
)
consumer_key, consumer_secret, access_token_key, access_token_secret, bearer_token = load_token(twitter_auth_file)
# OAuth1 client for the v1.1 API; used below to request individual tweets.
api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret, auth_type='oAuth1', api_version='1.1')

In [14]:
# Load the bot post ids, one per line, reusing load_token as a generic line reader.
# NOTE(review): the id file is written to "./bot_conversation_id.txt" earlier in this
# notebook but read from "./botometer/" here - confirm the file is moved in between.
conversation = load_token("./botometer/bot_conversation_id.txt")

In [15]:
# Probe request: fetch the first bot post id to inspect the response.
r = api.request('statuses/show/:' + conversation[0])

In [16]:
# Decode the probe response body as JSON.
item = r.json()

In [18]:
# Show the decoded probe response (the saved output is an API error payload, code 144).
item

{'errors': [{'code': 144, 'message': 'No status found with that ID.'}]}

In [21]:
import pandas as pd

# Request every bot post by id and keep its text, ids and engagement counts.
# Any failure while decoding or reading the response (e.g. an error payload without a
# 'text' key) is recorded in `error_ids` as the position of the id in `conversation`.
bot_original_tweet = []
error_ids = []
for position, tweet_id in enumerate(conversation):
    response = api.request('statuses/show/:' + tweet_id)
    try:
        item = response.json()
        bot_original_tweet.append({
            'text': item['text'],
            'id': item['id'],
            'author_id': item['user']['id_str'],
            'retweet_count': item['retweet_count'],
            'like_count': item['favorite_count'],
        })
        # progress is reported only when the tweet at a multiple of 200 was collected
        if position % 200 == 0:
            print(f"{position} conversation collected")
    except Exception:
        error_ids.append(position)
    

    

200 conversation collected
400 conversation collected
800 conversation collected
1800 conversation collected
2000 conversation collected
2200 conversation collected


In [22]:
# One row per successfully collected bot post (text, id, author_id, retweet_count, like_count).
bot_original_tweet_df = pd.DataFrame.from_records(bot_original_tweet)

In [27]:
# Join each human reply with the bot post it references, then drop exact duplicate rows.
# The merge must run before drop_duplicates: the two cells were stored in the opposite
# order, so a top-to-bottom run failed on an undefined name.
# Overlapping column names get "_original" on the bot_replies (left) side and "_reply"
# on the collected bot-post (right) side.
reply_to_bot_and_original_post = (
    bot_replies
    .merge(bot_original_tweet_df, left_on = 'referenced_tweets_0_id', right_on = 'id', suffixes = ("_original", "_reply"), how = 'left', copy = False)
    .drop_duplicates()
)

In [28]:
# Restrict the joined table to the columns needed downstream, in this order.
kept_columns = [
    'text_original', 'text_reply', "created_at",
    'author_id_original', 'author_id_reply',
    "id_original", 'id_reply',
    'retweet_count', 'like_count',
    'public_metrics_retweet_count', 'public_metrics_like_count',
    "in_reply_to_user_id", 'topic',
]
reply_to_bot_and_original_post = reply_to_bot_and_original_post[kept_columns]

# If a column name occurs more than once, keep only its first occurrence.
repeated_name = reply_to_bot_and_original_post.columns.duplicated()
reply_to_bot_and_original_post = reply_to_bot_and_original_post.loc[:, ~repeated_name].copy()



In [29]:
# Give the columns their final names. "_original"-suffixed columns come from the human
# reply table and "_reply"-suffixed ones from the collected bot posts, so they are
# renamed to describe the reply and the bot's post respectively.
final_column_names = {
    'text_original': 'text',
    'text_reply': 'original_text',
    'author_id_original': 'author_id',
    'author_id_reply': 'bot_id',
    'id_original': 'id',
    'id_reply': 'original_tweet_id',
    'retweet_count': "retweet_count_original",
    'like_count': "like_count_original",
    'public_metrics_retweet_count': 'retweet_count_interaction',
    'public_metrics_like_count': 'like_count_interaction',
}
reply_to_bot_and_original_post = reply_to_bot_and_original_post.rename(columns = final_column_names)

In [30]:
import math


def _id_or_nan(value):
    """Return a numeric id as a digit string; NaN is passed through unchanged."""
    return value if math.isnan(value) else str(int(value))


# Store ids as digit strings; original_tweet_id may be missing, so NaN is preserved there.
# NOTE(review): if original_tweet_id became float64 through the left merge, ids above
# 2**53 were already rounded before this conversion - confirm against the raw ids.
reply_to_bot_and_original_post['id'] = reply_to_bot_and_original_post['id'].map(lambda value: str(int(value)))
reply_to_bot_and_original_post['original_tweet_id'] = reply_to_bot_and_original_post['original_tweet_id'].map(_id_or_nan)

In [32]:
# Persist the reply / bot-post pairs.
reply_to_bot_and_original_post.to_csv(f"./botometer/bot_original_tweet_updated{suffix}.csv", index = False)