## Dataset Builder
This will build a dataset which can then be utilized to build machine learning models. It will essentially modify the accounts table by adding a few useful values such as average tweet sentiment, average tweet length, average number of hashtags used, and some other data

In [21]:
import numpy as np
import pandas as pd

In [24]:
verified = pd.read_csv('../data/verified_accounts.csv')
verified.dropna(inplace=True)
print(verified.shape)
print(verified.dtypes)
verified.head(5)

(404, 10)
id               int64
name            object
screen_name     object
description     object
created_at      object
followers        int64
following        int64
last_updated    object
image_url       object
is_bot           int64
dtype: object


Unnamed: 0,id,name,screen_name,description,created_at,followers,following,last_updated,image_url,is_bot
1,11489,Brian Solis,briansolis,"Digital Analyst and Anthropologist, Best-Selli...",2006-11-04 15:36:18,281706,2875,2019-05-05 19:27:43,http://pbs.twimg.com/profile_images/2536862457...,0
2,12925,janina gavankar,Janina,actor. musician. geek.,2006-11-18 07:00:35,179683,1121,2019-05-05 19:27:42,http://pbs.twimg.com/profile_images/1074201164...,0
3,78453,Khoi Vinh,khoi,Principal designer at @adobe working on @adobe...,2006-12-18 22:14:59,326623,2883,2019-05-05 19:27:42,http://pbs.twimg.com/profile_images/1775709982...,0
5,740983,Loic Le Meur,loic,entrepreneur - building a service for event or...,2007-02-01 00:10:23,125582,1233,2019-05-05 19:27:41,http://pbs.twimg.com/profile_images/1031552782...,0
6,749963,Pierre Omidyar,pierre,"Be you. Be cool. eBay, Omidyar Network, Peer N...",2007-02-03 02:41:31,488554,458,2019-05-05 19:27:40,http://pbs.twimg.com/profile_images/860459345/...,0


In [25]:
bots = pd.read_csv('../data/known_bots.csv')
bots.drop('political_leaning', inplace = True, axis=1)
bots.drop('account_category', inplace = True, axis=1)
print(bots.shape)
print(bots.dtypes)
bots.head(5)

(2075, 10)
date_created     object
description     float64
followers         int64
following         int64
id                int64
image_url       float64
is_bot            int64
last_updated     object
name            float64
screen_name      object
dtype: object


Unnamed: 0,date_created,description,followers,following,id,image_url,is_bot,last_updated,name,screen_name
0,0000-00-00 00:00:00,,5246,105,131812518,,1,0000-00-00 00:00:00,,PRIBYSHIN
1,0000-00-00 00:00:00,,58,35,189295762,,1,0000-00-00 00:00:00,,KRUTKOT
2,0000-00-00 00:00:00,,148,1025,189659120,,1,0000-00-00 00:00:00,,PERESPAL
3,0000-00-00 00:00:00,,16153,1951,201334945,,1,0000-00-00 00:00:00,,MANZAL_
4,0000-00-00 00:00:00,,15,156,204279384,,1,0000-00-00 00:00:00,,MUZAAMURA


In [27]:
frames = [verified, bots]
accounts = pd.concat(frames, sort = False)
# accounts = pd.read_csv('../data/data.csv')
# accounts.dropna(inplace=True)
# print(accounts.shape)
# print(accounts.dtypes)

In [36]:
accounts.drop('date_created', inplace = True, axis=1)
accounts.head(5)

Unnamed: 0,id,name,screen_name,description,created_at,followers,following,last_updated,image_url,is_bot,category,bot_ratio,f2f,bot_guess
1,11489,Brian Solis,briansolis,"Digital Analyst and Anthropologist, Best-Selli...",2006-11-04 15:36:18,281706,2875,2019-05-05 19:27:43,http://pbs.twimg.com/profile_images/2536862457...,0,0,0.423008,97.984696,0
2,12925,janina gavankar,Janina,actor. musician. geek.,2006-11-18 07:00:35,179683,1121,2019-05-05 19:27:42,http://pbs.twimg.com/profile_images/1074201164...,0,0,0.987474,160.288136,0
3,78453,Khoi Vinh,khoi,Principal designer at @adobe working on @adobe...,2006-12-18 22:14:59,326623,2883,2019-05-05 19:27:42,http://pbs.twimg.com/profile_images/1775709982...,0,0,0.85797,113.292751,0
5,740983,Loic Le Meur,loic,entrepreneur - building a service for event or...,2007-02-01 00:10:23,125582,1233,2019-05-05 19:27:41,http://pbs.twimg.com/profile_images/1031552782...,0,0,0.942654,101.85077,0
6,749963,Pierre Omidyar,pierre,"Be you. Be cool. eBay, Omidyar Network, Peer N...",2007-02-03 02:41:31,488554,458,2019-05-05 19:27:40,http://pbs.twimg.com/profile_images/860459345/...,0,0,0.727868,1066.71179,0


In [29]:
# TODO GET RID OF THIS WHEN WE HAVE ACTUAL LABELS
# import random
# vals = ['NOT', 'BOT', 'UNKNOWN']
# accounts['category'] = [random.choice(vals) for k in accounts.index]
accounts['bot_ratio']= [random.uniform(0,1) for k in accounts.index]

In [30]:
# calculate follower-following ratio
def calculate_f2f(row):
    try:
        val = row['followers'] / row['following']
    except ZeroDivisionError:
        val = 9999999999999
    return val

accounts['f2f'] = accounts.apply (lambda row: calculate_f2f(row), axis=1)

In [31]:
# check if description for account contains 'bot', 'parody' or 'fake'
def guess_if_bot(row):
    _str = row['screen_name'].lower()
    flag = 0
    bot_words = ['bot', 'parody', 'fake']
    if any(word in _str for word in bot_words):
        flag = 1
    return flag

accounts['bot_guess'] = accounts.apply(lambda row: guess_if_bot(row), axis = 1)

In [52]:
bot_tweets = pd.read_csv('../data/known_bots_tweets.csv')
verified_tweets = pd.read_csv('../data/verified_accounts_tweets.csv')
verified_tweets.head(4)

Unnamed: 0,id_str,created_at,text,user_id
0,1000001565959380992,2018-05-25 13:12:07,I’ve joined @missingpeople’s #TeamBigTweet Hel...,24447643
1,1000013625854054402,2018-05-25 14:00:02,We just added some Meet &amp; Greet VIP ticket...,50750339
2,1000026257680142336,2018-05-25 14:50:14,This Sausage Pancake Egg Sandwich combines all...,14880616
3,1000034322152280064,2018-05-25 15:22:16,YIKES!\nAn Amazon Echo recorded a family’s con...,36370563


In [53]:
bot_tweets.head(4)

Unnamed: 0,created_at,id_str,text,user_id,updates,post_type,retweet,region,language
0,2016-01-01 00:36:00,,"71-year-old driver OK after car flips, falls 1...",3091936475,3069,,0,United States,English
1,2016-01-01 00:36:00,,2 from North Carolina caught with counterfeit ...,3091936475,3070,,0,United States,English
2,2016-01-01 13:02:00,,Newark pastor leaves church with legacy of soc...,3091936475,3077,,0,United States,English
3,2016-01-01 13:13:00,,Top 10 Essex court cases and decisions to look...,3091936475,3078,,0,United States,English


In [60]:
frames = [verified_tweets, bot_tweets]
tweets = pd.concat(frames, sort = False)
tweets.drop('id_str', inplace=True, axis=1)
tweets.drop('post_type', inplace=True, axis=1)
tweets.drop('retweet', inplace=True, axis=1)
tweets.drop('region', inplace=True, axis=1)
tweets.drop('language', inplace=True, axis=1)
tweets.drop('updates', inplace=True, axis=1)
tweets.dropna(inplace=True)
tweets.head(79)

Unnamed: 0,created_at,text,user_id
0,2018-05-25 13:12:07,I’ve joined @missingpeople’s #TeamBigTweet Hel...,24447643
1,2018-05-25 14:00:02,We just added some Meet &amp; Greet VIP ticket...,50750339
2,2018-05-25 14:50:14,This Sausage Pancake Egg Sandwich combines all...,14880616
3,2018-05-25 15:22:16,YIKES!\nAn Amazon Echo recorded a family’s con...,36370563
4,2018-05-25 15:23:26,My new song Coming From Afar featuring @Mavado...,18022904
5,2018-05-25 15:43:41,@Walkin_Louie @OriginalFunko @AMCTalkingDead @...,15666380
6,2018-05-25 15:47:37,Now that’s an outfit I can get with,35982046
7,2018-05-25 16:00:20,@Daryle_Dobos @NikkiGlaser Transcendental Medi...,15666380
8,2018-05-25 16:05:19,"Sorry to be negative, but it's a bummer that @...",15666380
9,2018-05-25 17:04:58,RT @juliepilat: “I know I talk a lot but we do...,7119102


In [61]:
import string
import re    
def remove_punct(row):
    s = row['text']
    remove = string.punctuation
    remove = remove.replace("#", "")
    remove = remove.replace("@", "")
    pattern = r"[{}]".format(remove)
    s = re.sub(pattern, "", s)
    return s

tweets['text'] = tweets.apply(lambda row: remove_punct(row), axis = 1)

In [62]:
# add some weak NLP stuff 
from textblob import TextBlob
tweets[['polarity', 'subjectivity']] = tweets['text'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

Need to write a function that gathers stats about tweets from a given account. We will want to track the average tweet length, average number of hashtags used and average tweet sentiment (polarity & subjectivity).

In [63]:
def get_tweet_info(row):
    row = row[1]
    tweet = row['text']
    pol = row['polarity']
    sub = row['subjectivity']
    length = len(tweet)
    hashtags = len(re.findall("#", tweet))
    mentions = len(re.findall("@", tweet))
    return (pol, sub, length, hashtags, mentions)

In [64]:
from statistics import mean
def get_tweets_for_user(row):
    uid = row['id']
    user_tweets = tweets[tweets['user_id'] == uid]
    if not user_tweets.empty:
        pols = []
        subs = []
        lengths = []
        hashtags = []
        mentions = []
        n_tweets = user_tweets.shape[0]
        for row in user_tweets.iterrows():
            
            pol, sub, length, hashtag, mention = get_tweet_info(row)
            pols.append(pol)
            subs.append(sub)
            lengths.append(length)
            hashtags.append(hashtag)
            mentions.append(mention)
    
        ave_pol = mean(pols)
        ave_sub = mean(subs)
        ave_length = mean(lengths)
        ave_hashtags = mean(hashtags)
        ave_mentions = mean(mentions)
        return [ave_pol, ave_sub, ave_length, ave_hashtags, ave_mentions, n_tweets]
    else:
        return 0

In [65]:
cols = ['ave_polarity', 'ave_subjectivity', 'ave_length', 'ave_hashtags', 'ave_mentions', 'n_tweets']
for name in cols:
    accounts[name] = random.random()

In [66]:
for i, row in accounts.iterrows():
    l = get_tweets_for_user(row)
    if l:
        j = 0
        for name in cols:
            accounts.at[i, name] = l[j]
            j += 1

In [67]:
accounts.head(5)

Unnamed: 0,id,name,screen_name,description,created_at,followers,following,last_updated,image_url,is_bot,category,bot_ratio,f2f,bot_guess,ave_polarity,ave_subjectivity,ave_length,ave_hashtags,ave_mentions,n_tweets
1,11489,Brian Solis,briansolis,"Digital Analyst and Anthropologist, Best-Selli...",2006-11-04 15:36:18,281706,2875,2019-05-05 19:27:43,http://pbs.twimg.com/profile_images/2536862457...,0,0,0.423008,97.984696,0,0.117515,0.309054,111.635838,0.404624,0.982659,173.0
2,12925,janina gavankar,Janina,actor. musician. geek.,2006-11-18 07:00:35,179683,1121,2019-05-05 19:27:42,http://pbs.twimg.com/profile_images/1074201164...,0,0,0.987474,160.288136,0,0.100108,0.279754,91.586735,0.153061,1.214286,196.0
3,78453,Khoi Vinh,khoi,Principal designer at @adobe working on @adobe...,2006-12-18 22:14:59,326623,2883,2019-05-05 19:27:42,http://pbs.twimg.com/profile_images/1775709982...,0,0,0.85797,113.292751,0,0.16082,0.437966,90.585,0.015,0.5,200.0
5,740983,Loic Le Meur,loic,entrepreneur - building a service for event or...,2007-02-01 00:10:23,125582,1233,2019-05-05 19:27:41,http://pbs.twimg.com/profile_images/1031552782...,0,0,0.942654,101.85077,0,0.138902,0.330753,88.35,0.115,1.125,200.0
6,749963,Pierre Omidyar,pierre,"Be you. Be cool. eBay, Omidyar Network, Peer N...",2007-02-03 02:41:31,488554,458,2019-05-05 19:27:40,http://pbs.twimg.com/profile_images/860459345/...,0,0,0.727868,1066.71179,0,0.077174,0.431647,147.975,0.0,0.21,200.0


In [68]:
accounts.to_csv(path_or_buf='../data/accounts_processed.csv')