## Notebook: initialization.ipynb
This notebook is used for scraping data and storing it in a directory for use in our classifier.

In [1]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize


Load each dataset, normalize its JSON into a flat table, and combine it with the bot/human identification labels.

The datasets are from the [OSoMe Botometer bot repository](https://botometer.osome.iu.edu/bot-repository/datasets.html).

| Dataset               | Bots  | Humans | Notes                  |
|-----------------------|-------|--------|------------------------|
| cresci-rtbust-2019    | 353   | 339    |                        |
| midterm-2018          | 42445 | 8092   |                        |
| gilani-2017           | 1089  | 1413   |                        |
| pronbots-2019         | 17881 | 0      | Spam bots              |
| vendor-purchased-2019 | 1086  | 0      | Fake follower accounts |

In [2]:
# Load the records in cresci-rtbust-2019_tweets.json and flatten the nested
# JSON into one column per field (nested keys are joined with ".").
with open('../datasets/cresci-rtbust-2019_tweets.json', encoding='utf-8') as jsfile:
    cresci_rtbust_2019_records = json.load(jsfile)

# pd.json_normalize is the supported entry point; the pandas.io.json alias has
# been deprecated since pandas 1.0.
# The top-level 'created_at' is renamed first, presumably so it cannot collide
# with 'user.created_at' once the prefix is stripped below — confirm.
cresci_rtbust_2019_data = pd.json_normalize(cresci_rtbust_2019_records).rename(
    columns={'created_at': 'probe_timestamp', 'user.id': 'user-id'}
)
# Strip the leading "user." from the remaining flattened columns. regex=True is
# explicit: pandas >= 2.0 treats the pattern literally by default, which would
# silently leave every column name unchanged.
cresci_rtbust_2019_data.columns = cresci_rtbust_2019_data.columns.str.replace(
    r'^user\.', '', regex=True
)

# NOTE(review): read_csv consumes the first line as a header by default. If the
# TSV has no header row, its first record is lost here — confirm, and pass
# header=None if so.
cresci_rtbust_2019_identification = pd.read_csv('../datasets/cresci-rtbust-2019.tsv', sep='\t')
cresci_rtbust_2019_identification.columns = ['user-id', 'identification']

# Inner join keeps only accounts present in both the labels and the data.
cresci_rtbust_2019 = pd.merge(
    cresci_rtbust_2019_identification, cresci_rtbust_2019_data, how='inner', on='user-id'
)
cresci_rtbust_2019_bots = cresci_rtbust_2019[cresci_rtbust_2019['identification'] == 'bot']
cresci_rtbust_2019_humans = cresci_rtbust_2019[cresci_rtbust_2019['identification'] == 'human']


In [3]:
# Show only the numeric/profile columns of the cresci-rtbust-2019 bot accounts.
cresci_rtbust_2019_bots.loc[
    :,
    [
        'followers_count',
        'friends_count',
        'favourites_count',
        'verified',
        'statuses_count',
        'listed_count',
    ],
]

Unnamed: 0,followers_count,friends_count,favourites_count,verified,statuses_count,listed_count
0,289,401,213,False,3210,1
3,9,26,140,False,2581,0
5,15,24,52,False,496,0
7,2,1,15,False,1641,0
10,27,16,153,False,1750,0
...,...,...,...,...,...,...
681,4,8,4851,False,4853,15
682,9,8,4023,False,4028,0
684,2201,1113,128007,False,130575,12
688,1041,1380,2301,False,28522,4


In [4]:
# midterm-2018 ships as already-processed user objects, so it is read directly
# with read_json instead of being flattened like the other datasets.
midterm_2018_data = pd.read_json('../datasets/midterm-2018_processed_user_objects.json')

# NOTE(review): read_csv consumes the first line as a header by default. If the
# TSV has no header row, its first record is lost here — confirm.
midterm_2018_identification = pd.read_csv('../datasets/midterm-2018.tsv', sep='\t')
# This dataset keys on 'user_id' (underscore), unlike the 'user-id' used by the
# other datasets in this notebook.
midterm_2018_identification.columns = ['user_id', 'identification']

# Inner join keeps only accounts present in both the data and the labels.
midterm_2018 = midterm_2018_data.merge(
    midterm_2018_identification,
    how='inner',
    on='user_id',
)

# Split the labelled accounts by class.
midterm_2018_bots = midterm_2018[midterm_2018['identification'].eq('bot')]
midterm_2018_humans = midterm_2018[midterm_2018['identification'].eq('human')]

In [5]:
# Load the records in gilani-2017_tweets.json and flatten the nested JSON into
# one column per field (nested keys are joined with ".").
with open('../datasets/gilani-2017_tweets.json', encoding='utf-8') as jsfile:
    gilani_2017_records = json.load(jsfile)

# pd.json_normalize is the supported entry point; the pandas.io.json alias has
# been deprecated since pandas 1.0.
# The top-level 'created_at' is renamed first, presumably so it cannot collide
# with 'user.created_at' once the prefix is stripped below — confirm.
gilani_2017_data = pd.json_normalize(gilani_2017_records).rename(
    columns={'created_at': 'probe_timestamp', 'user.id': 'user-id'}
)
# Strip the leading "user." from the remaining flattened columns. regex=True is
# explicit: pandas >= 2.0 treats the pattern literally by default, which would
# silently leave every column name unchanged.
gilani_2017_data.columns = gilani_2017_data.columns.str.replace(
    r'^user\.', '', regex=True
)

# NOTE(review): read_csv consumes the first line as a header by default. If the
# TSV has no header row, its first record is lost here — confirm, and pass
# header=None if so.
gilani_2017_identification = pd.read_csv('../datasets/gilani-2017.tsv', sep='\t')
gilani_2017_identification.columns = ['user-id', 'identification']

# Inner join keeps only accounts present in both the labels and the data.
gilani_2017 = pd.merge(
    gilani_2017_identification, gilani_2017_data, how='inner', on='user-id'
)
gilani_2017_bots = gilani_2017[gilani_2017['identification'] == 'bot']
gilani_2017_humans = gilani_2017[gilani_2017['identification'] == 'human']

In [6]:
# Load the records in pronbots-2019_tweets.json and flatten the nested JSON
# into one column per field (nested keys are joined with ".").
with open('../datasets/pronbots-2019_tweets.json', encoding='utf-8') as jsfile:
    pronbots_2019_records = json.load(jsfile)

# pd.json_normalize is the supported entry point; the pandas.io.json alias has
# been deprecated since pandas 1.0.
# The top-level 'created_at' is renamed first, presumably so it cannot collide
# with 'user.created_at' once the prefix is stripped below — confirm.
pronbots_2019_data = pd.json_normalize(pronbots_2019_records).rename(
    columns={'created_at': 'probe_timestamp', 'user.id': 'user-id'}
)
# Strip the leading "user." from the remaining flattened columns. regex=True is
# explicit: pandas >= 2.0 treats the pattern literally by default, which would
# silently leave every column name unchanged.
pronbots_2019_data.columns = pronbots_2019_data.columns.str.replace(
    r'^user\.', '', regex=True
)

# NOTE(review): read_csv consumes the first line as a header by default. If the
# TSV has no header row, its first record is lost here — confirm, and pass
# header=None if so.
pronbots_2019_identification = pd.read_csv('../datasets/pronbots-2019.tsv', sep='\t')
pronbots_2019_identification.columns = ['user-id', 'identification']

# Inner join keeps only accounts present in both the labels and the data.
pronbots_2019 = pd.merge(
    pronbots_2019_identification, pronbots_2019_data, how='inner', on='user-id'
)
pronbots_2019_bots = pronbots_2019[pronbots_2019['identification'] == 'bot']
# The dataset table in this notebook lists 0 humans for pronbots-2019, so this
# frame is expected to be empty; it is kept so every dataset exposes the same
# *_bots / *_humans pair.
pronbots_2019_humans = pronbots_2019[pronbots_2019['identification'] == 'human']

In [7]:
# Load the records in vendor-purchased-2019_tweets.json and flatten the nested
# JSON into one column per field (nested keys are joined with ".").
with open('../datasets/vendor-purchased-2019_tweets.json', encoding='utf-8') as jsfile:
    vendor_purchased_2019_records = json.load(jsfile)

# pd.json_normalize is the supported entry point; the pandas.io.json alias has
# been deprecated since pandas 1.0.
# The top-level 'created_at' is renamed first, presumably so it cannot collide
# with 'user.created_at' once the prefix is stripped below — confirm.
vendor_purchased_2019_data = pd.json_normalize(vendor_purchased_2019_records).rename(
    columns={'created_at': 'probe_timestamp', 'user.id': 'user-id'}
)
# Strip the leading "user." from the remaining flattened columns. regex=True is
# explicit: pandas >= 2.0 treats the pattern literally by default, which would
# silently leave every column name unchanged.
vendor_purchased_2019_data.columns = vendor_purchased_2019_data.columns.str.replace(
    r'^user\.', '', regex=True
)

# NOTE(review): read_csv consumes the first line as a header by default. If the
# TSV has no header row, its first record is lost here — confirm, and pass
# header=None if so.
vendor_purchased_2019_identification = pd.read_csv('../datasets/vendor-purchased-2019.tsv', sep='\t')
vendor_purchased_2019_identification.columns = ['user-id', 'identification']

# Inner join keeps only accounts present in both the labels and the data.
vendor_purchased_2019 = pd.merge(
    vendor_purchased_2019_identification, vendor_purchased_2019_data, how='inner', on='user-id'
)
vendor_purchased_2019_bots = vendor_purchased_2019[vendor_purchased_2019['identification'] == 'bot']
# The dataset table in this notebook lists 0 humans for vendor-purchased-2019,
# so this frame is expected to be empty; it is kept so every dataset exposes
# the same *_bots / *_humans pair.
vendor_purchased_2019_humans = vendor_purchased_2019[vendor_purchased_2019['identification'] == 'human']