In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import random
import matplotlib.pyplot as plt
import seaborn as sns

Now, let's look at a sample from our database

In [2]:
sample = pd.read_csv('../data/data.csv')
sample.dropna(inplace=True)
print(sample.shape)
print(sample.dtypes)

(39522, 10)
id               int64
name            object
screen_name     object
description     object
date_created    object
followers        int64
following        int64
last_updated    object
image_url       object
is_bot          object
dtype: object


Populate `is_bot` with random values

In [3]:
import random
vals = ['NOT', 'BOT', 'UNKNOWN']
sample['category'] = [random.choice(vals) for k in sample.index]

In [4]:
sample.head(10)

Unnamed: 0,id,name,screen_name,description,date_created,followers,following,last_updated,image_url,is_bot,category
0,74013,CJ n TX,ccjones,Nothing is as it seems.,2006-12-16 18:21:51,3918,3398,2019-04-13 08:10:57,http://pbs.twimg.com/profile_images/1060243585...,unknown,NOT
1,681403,Jason Benway,BenwayNet,"Christian,husband,father,Computer geek, and xb...",2007-01-22 16:42:28,1068,1560,2019-04-15 00:32:18,http://pbs.twimg.com/profile_images/1111705638...,unknown,BOT
2,736463,Won Peace of a Hole,dtweete,probably a bot. ✝,2007-01-31 07:20:08,624,1610,2019-04-14 06:21:22,http://pbs.twimg.com/profile_images/149195479/...,unknown,UNKNOWN
3,755703,Dean Roth,deanroth,See that name up there? That's me. \n\nAlso av...,2007-02-07 04:13:21,649,790,2019-04-13 16:11:00,http://pbs.twimg.com/profile_images/4706719918...,unknown,BOT
4,759066,David Neuland,Galloway,"Engineer, writer, Mac-addict. I. Am. Not. Geor...",2007-02-08 23:18:36,173,458,2019-04-14 04:50:49,http://pbs.twimg.com/profile_images/22917772/I...,unknown,UNKNOWN
5,759251,CNN,CNN,It’s our job to #GoThere & tell the most diffi...,2007-02-09 00:35:02,41555368,1110,2019-03-20 14:42:04,http://pbs.twimg.com/profile_images/5089607618...,unknown,NOT
6,796409,Steve Spinks,StevenMSpinks,Article V of the Constitution supplies the rem...,2007-02-26 23:55:57,3150,4126,2019-04-14 02:24:58,http://pbs.twimg.com/profile_images/1108452822...,unknown,BOT
7,810600,mzchief,mzchief,"Everyone has some issue, it is how we deal wit...",2007-03-04 19:28:27,65,160,2019-04-14 13:41:10,http://pbs.twimg.com/profile_images/191610065/...,unknown,BOT
8,813286,Barack Obama,BarackObama,"Dad, husband, President, citizen.",2007-03-05 22:08:25,105228666,615149,2019-03-20 04:05:08,http://pbs.twimg.com/profile_images/8225477323...,unknown,UNKNOWN
9,1053931,Roy Bragg,roybragg,I do nothing & try to avoid anything b/c it mi...,2007-03-12 23:47:18,3281,153,2019-04-15 14:34:32,http://pbs.twimg.com/profile_images/1039925325...,unknown,BOT


In [5]:
# calculate follower-following ratio
def calculate_f2f(row):
    try:
        val = row['followers'] / row['following']
    except ZeroDivisionError:
        val = 9999999999999
    return val

sample['f2f'] = sample.apply (lambda row: calculate_f2f(row), axis=1)

In [6]:
sample.head(4)

Unnamed: 0,id,name,screen_name,description,date_created,followers,following,last_updated,image_url,is_bot,category,f2f
0,74013,CJ n TX,ccjones,Nothing is as it seems.,2006-12-16 18:21:51,3918,3398,2019-04-13 08:10:57,http://pbs.twimg.com/profile_images/1060243585...,unknown,NOT,1.153031
1,681403,Jason Benway,BenwayNet,"Christian,husband,father,Computer geek, and xb...",2007-01-22 16:42:28,1068,1560,2019-04-15 00:32:18,http://pbs.twimg.com/profile_images/1111705638...,unknown,BOT,0.684615
2,736463,Won Peace of a Hole,dtweete,probably a bot. ✝,2007-01-31 07:20:08,624,1610,2019-04-14 06:21:22,http://pbs.twimg.com/profile_images/149195479/...,unknown,UNKNOWN,0.387578
3,755703,Dean Roth,deanroth,See that name up there? That's me. \n\nAlso av...,2007-02-07 04:13:21,649,790,2019-04-13 16:11:00,http://pbs.twimg.com/profile_images/4706719918...,unknown,BOT,0.821519


In [7]:
# check if description for account contains 'bot', 'parody' or 'fake'
def guess_if_bot(row):
    _str = row['description'].lower() + row['screen_name'].lower()
    flag = 0
    bot_words = ['bot', 'paroody', 'fake']
    if any(word in _str for word in bot_words):
        flag = 1
    return flag

sample['desc_bot'] = sample.apply(lambda row: guess_if_bot(row), axis = 1)
sample.head(5)

Unnamed: 0,id,name,screen_name,description,date_created,followers,following,last_updated,image_url,is_bot,category,f2f,desc_bot
0,74013,CJ n TX,ccjones,Nothing is as it seems.,2006-12-16 18:21:51,3918,3398,2019-04-13 08:10:57,http://pbs.twimg.com/profile_images/1060243585...,unknown,NOT,1.153031,0
1,681403,Jason Benway,BenwayNet,"Christian,husband,father,Computer geek, and xb...",2007-01-22 16:42:28,1068,1560,2019-04-15 00:32:18,http://pbs.twimg.com/profile_images/1111705638...,unknown,BOT,0.684615,0
2,736463,Won Peace of a Hole,dtweete,probably a bot. ✝,2007-01-31 07:20:08,624,1610,2019-04-14 06:21:22,http://pbs.twimg.com/profile_images/149195479/...,unknown,UNKNOWN,0.387578,1
3,755703,Dean Roth,deanroth,See that name up there? That's me. \n\nAlso av...,2007-02-07 04:13:21,649,790,2019-04-13 16:11:00,http://pbs.twimg.com/profile_images/4706719918...,unknown,BOT,0.821519,0
4,759066,David Neuland,Galloway,"Engineer, writer, Mac-addict. I. Am. Not. Geor...",2007-02-08 23:18:36,173,458,2019-04-14 04:50:49,http://pbs.twimg.com/profile_images/22917772/I...,unknown,UNKNOWN,0.377729,0


In [8]:
# build random forest model
clf2 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
feats = ['f2f', 'desc_bot']
labs = ['category']
features = sample[feats]
labels = sample[labs]
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, labels, test_size=0.33, random_state=42)
clf2.fit(X_train2, y_train2)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
pred = clf2.predict(X_test2)
accuracy_score(y_test2, pred)

0.33289887295867515

This is probably pretty good for us since the values for the category were just randomly assigned. In the future, we will need to do this kind of identification based on the account's tweets

In [10]:
tweets = pd.read_csv('../data/tweets.csv')
tweets.dropna(inplace=True)
print(tweets.dtypes)
print(tweets.shape)
tweets.head(5)

created_at    object
id_str         int64
text          object
user_id        int64
dtype: object
(145436, 4)


Unnamed: 0,created_at,id_str,text,user_id
0,2018-06-04 10:51:53,1003590157038182400,#WhereAreTheChildren #MAGA #QAnon #WWG1WGA @re...,943544870857256961
1,2018-06-06 16:28:26,1004399626852724738,@realDonaldTrump God let's hope so!,741279278407417856
2,2018-06-08 11:22:34,1005047428876292096,@realDonaldTrump That's a fact!!!!! #MAGA,741279278407417856
3,2018-06-12 22:30:23,1006665040777314304,@realDonaldTrump Lol!,741279278407417856
4,2018-06-19 05:51:23,1008950350181617664,That's what all this traffic and cop cars and ...,65847626


In [11]:
import string
def remove_punct(row):
    s = row['text']
    return s.translate(str.maketrans('', '', string.punctuation))
tweets['text'] = tweets.apply(lambda row: remove_punct(row), axis = 1)

In [12]:
all_df = pd.merge(tweets, sample, left_on='user_id', right_on='id', how = 'left').drop('user_id', axis=1)
all_df.dropna(inplace=True)

# don't really care about when the status of an account is UNKNOWN
all_df = all_df[all_df['category'] != 'UNKNOWN']
print(all_df.shape)
all_df.head(5)

(75827, 16)


Unnamed: 0,created_at,id_str,text,id,name,screen_name,description,date_created,followers,following,last_updated,image_url,is_bot,category,f2f,desc_bot
0,2018-06-04 10:51:53,1003590157038182400,WhereAreTheChildren MAGA QAnon WWG1WGA realDon...,9.435449e+17,KeenWitGreen,KeenwithGreen,#Grateful to have been to #hell and back so th...,2017-12-20 18:13:22,1334.0,3404.0,2019-04-11 01:41:14,http://pbs.twimg.com/profile_images/9874906210...,unknown,NOT,0.391892,0.0
12,2018-07-12 16:21:40,1017443888666218496,RT charliekirk11 I am now convinced that Democ...,1183037000.0,lovey,travelingthroug,#MAGA #Trump2020 #MICHIGAN #DemocratsHateAmer...,2013-02-15 15:57:58,612.0,785.0,2019-04-15 12:26:27,http://pbs.twimg.com/profile_images/9655721204...,unknown,NOT,0.779618,0.0
16,2018-08-13 22:32:27,1029133610995961856,Venezuela isnt this the poster child the ANC i...,995149100.0,Warren,DebruinWarren,International gypsy relations,2012-12-07 14:10:59,19.0,50.0,2019-04-14 14:04:17,http://pbs.twimg.com/profile_images/2943694984...,unknown,BOT,0.38,0.0
20,2018-09-04 13:43:46,1036973093485301760,FoxNews DRUDGE ABC CBSNews NBCNews BreitbartNe...,1337672000.0,Campaign4America2016,Campain4America,2run as a Independent Presidential Candidate4 ...,2013-04-08 21:26:34,1044.0,1895.0,2019-04-13 19:17:31,http://pbs.twimg.com/profile_images/3496126894...,unknown,BOT,0.550923,0.0
22,2018-09-17 21:25:45,1041800399555112960,RT TomBradysEgo Josh Gordon pulling up to Gill...,553068700.0,Logan moore,L0ganmoore_,Dammit Jim,2012-04-13 20:27:11,308.0,212.0,2019-04-14 02:20:21,http://pbs.twimg.com/profile_images/1008854461...,unknown,NOT,1.45283,0.0


In [13]:
# add some weak NLP stuff 
from textblob import TextBlob
all_df[['polarity', 'subjectivity']] = all_df['text'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

In [14]:
# build random forest model
clf3 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
feats = ['polarity', 'subjectivity', 'f2f', 'desc_bot']
labs = ['category']
features = all_df[feats]
labels = all_df[labs]
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, labels, test_size=0.33, random_state=42)
clf3.fit(X_train2, y_train2)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [15]:
pred = clf3.predict(X_test2)
accuracy_score(y_test2, pred)

0.5341885465371858