In [1]:
# Automatically load changes in dependency files (may be unnecessary here, but useful tool in case you're modifying packages that this file relies on)
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


import os

In [17]:
# Recursive find for bot data from current directory
path = None
for dirpath, dirnames, filenames in os.walk("."):
    for dirname in dirnames:
        if dirname == "botometer-feedback-2019":
            path = os.path.join(dirpath, dirname)

assert path is not None, "botometer-feedback directory not found"
print(f"Path to csv file: {path}")

node_path = os.path.join(path, "node.json")
label_path = os.path.join(path, "label.csv")


Path to csv file: .\data\TwiBot22-Other-Datasets\Other-Dataset-TwiBot22-Format\botometer-feedback-2019


In [18]:
# Read in data into 2 dataframes and merge on the user id column
df_node = pd.read_json(node_path)
df_label = pd.read_csv(label_path)

df = pd.merge(df_node, df_label, on='id')

df

Unnamed: 0,created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld,label
0,2012-06-07 22:16:27+00:00,Strategic Creative Social Media & Community En...,{'url': {'urls': [{'url': 'https://t.co/e5t6p9...,u602249341,London - mostly,Emma Dingle🐧🏳️‍🌈🇪🇺🇬🇧🏴🤓,,http://pbs.twimg.com/profile_images/9239243429...,False,"{'followers_count': 790, 'following_count': 32...",https://t.co/e5t6p9w7D8,EmmaDingle,False,,human
1,2016-11-13 01:48:58+00:00,Injustice and corruption make me really mad. I...,{'description': {'urls': []}},u797617218511060992,All over North America,Wolverine 2018💥,,http://pbs.twimg.com/profile_images/8552445716...,False,"{'followers_count': 16039, 'following_count': ...",,Wolv_2018,False,,bot
2,2012-10-18 23:19:38+00:00,Whistler REALTOR® specializing in mountain lif...,{'url': {'urls': [{'url': 'http://t.co/7gh2Iu1...,u889925474,"Whistler, BC CANADA",Rob Palm Whistler,,http://pbs.twimg.com/profile_images/9640798322...,False,"{'followers_count': 7618, 'following_count': 7...",http://t.co/7gh2Iu1AT4,RobPalmWhistler,False,,human
3,2009-12-12 22:53:04+00:00,O scrivi Italia o scrivi libertà. Due termini ...,{'description': {'urls': []}},u96435556,"Roma, Lazio",Mariano,,http://pbs.twimg.com/profile_images/3114299697...,False,"{'followers_count': 388, 'following_count': 67...",,Marianocrt,False,,bot
4,2008-10-22 13:43:42+00:00,"Education, Flaneur, Digitalisation—Tweets Germ...",{'url': {'urls': [{'url': 'https://t.co/VRgsX8...,u16905397,"Frankfurt am Main, Deutschland",Torsten Larbig,,http://pbs.twimg.com/profile_images/9697051416...,False,"{'followers_count': 19677, 'following_count': ...",https://t.co/VRgsX8eVR2,herrlarbig,False,,human
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,2009-12-14 14:01:49+00:00,Drinker of coffee and player of games. Now als...,{'url': {'urls': [{'url': 'https://t.co/BgFtjC...,u96766049,"West Cork, Ireland",Cat Tobin,,http://pbs.twimg.com/profile_images/7831096645...,False,"{'followers_count': 1151, 'following_count': 6...",https://t.co/BgFtjCD2q9,CatTHM,False,,human
514,2009-01-30 12:31:25+00:00,InfoSci PhD student. I research how we embed o...,{'url': {'urls': [{'url': 'https://t.co/80qIwR...,u19765236,"Chapel Hill, NC",The third John D. Martin from the sun,,http://pbs.twimg.com/profile_images/9638442277...,False,"{'followers_count': 2172, 'following_count': 1...",https://t.co/80qIwR30Z7,jdmar3,False,,human
515,2013-01-08 04:36:06+00:00,"Sports, Politics and Video Games. Bringing the...",{'description': {'urls': []}},u1069984033,,Rusty Colbert,,http://pbs.twimg.com/profile_images/7971210462...,False,"{'followers_count': 168, 'following_count': 73...",,RustyColbert,False,,human
516,2012-12-03 14:58:40+00:00,it's hard being a single mother when you don't...,{'description': {'urls': []}},u986789138,,austin,,http://pbs.twimg.com/profile_images/9327526221...,False,"{'followers_count': 193, 'following_count': 66...",,Aschmigel,False,,human


In [19]:
df.dtypes

created_at           datetime64[ns, UTC]
description                       object
entities                          object
id                                object
location                          object
name                              object
pinned_tweet_id                  float64
profile_image_url                 object
protected                           bool
public_metrics                    object
url                               object
username                          object
verified                            bool
withheld                         float64
label                             object
dtype: object

In [21]:
# Change created at to Unix epoch time in seconds
df["created_at"] = pd.to_datetime(df["created_at"]).astype("int64") // 10**9 # Convert to seconds

# Drop columns with urls or identifying information (username, id, etc)
df.drop(columns=['entities', 'id', 'profile_image_url', 'url'])
df.head()

Unnamed: 0,created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld,label
0,1,Strategic Creative Social Media & Community En...,{'url': {'urls': [{'url': 'https://t.co/e5t6p9...,u602249341,London - mostly,Emma Dingle🐧🏳️‍🌈🇪🇺🇬🇧🏴🤓,,http://pbs.twimg.com/profile_images/9239243429...,False,"{'followers_count': 790, 'following_count': 32...",https://t.co/e5t6p9w7D8,EmmaDingle,False,,human
1,1,Injustice and corruption make me really mad. I...,{'description': {'urls': []}},u797617218511060992,All over North America,Wolverine 2018💥,,http://pbs.twimg.com/profile_images/8552445716...,False,"{'followers_count': 16039, 'following_count': ...",,Wolv_2018,False,,bot
2,1,Whistler REALTOR® specializing in mountain lif...,{'url': {'urls': [{'url': 'http://t.co/7gh2Iu1...,u889925474,"Whistler, BC CANADA",Rob Palm Whistler,,http://pbs.twimg.com/profile_images/9640798322...,False,"{'followers_count': 7618, 'following_count': 7...",http://t.co/7gh2Iu1AT4,RobPalmWhistler,False,,human
3,1,O scrivi Italia o scrivi libertà. Due termini ...,{'description': {'urls': []}},u96435556,"Roma, Lazio",Mariano,,http://pbs.twimg.com/profile_images/3114299697...,False,"{'followers_count': 388, 'following_count': 67...",,Marianocrt,False,,bot
4,1,"Education, Flaneur, Digitalisation—Tweets Germ...",{'url': {'urls': [{'url': 'https://t.co/VRgsX8...,u16905397,"Frankfurt am Main, Deutschland",Torsten Larbig,,http://pbs.twimg.com/profile_images/9697051416...,False,"{'followers_count': 19677, 'following_count': ...",https://t.co/VRgsX8eVR2,herrlarbig,False,,human


In [31]:
print(df[df['withheld'].notna()])
print(df[df['pinned_tweet_id'].notna()])

Empty DataFrame
Columns: [created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld, label]
Index: []
Empty DataFrame
Columns: [created_at, description, entities, id, location, name, pinned_tweet_id, profile_image_url, protected, public_metrics, url, username, verified, withheld, label]
Index: []


In [None]:
twi_path = os.path.join(os.getcwd(), "label.csv")

test_df = pd.read_csv(twi_path)
test_df.shape

# We'll narrow the dataset down to 5000 bots and 5000 normal users, the dataset file should be ~ 1 GB

(1000000, 2)