## Dataset Builder
This notebook builds a dataset that can then be used to train machine learning models. It extends the accounts table with a few useful per-account features: the follower-to-following ratio, a keyword-based bot guess, the average tweet sentiment (polarity and subjectivity), the average tweet length, and the average number of hashtags and mentions used.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw accounts table, then discard (in place) every row that has a
# missing value in any column.
accounts = pd.read_csv('../data/data.csv')
accounts.dropna(inplace=True)
# Sanity check: (rows, columns) followed by the dtype of each column.
print(accounts.shape, accounts.dtypes, sep='\n')

(39522, 10)
id               int64
name            object
screen_name     object
description     object
date_created    object
followers        int64
following        int64
last_updated    object
image_url       object
is_bot          object
dtype: object


In [3]:
# TODO GET RID OF THIS WHEN WE HAVE ACTUAL LABELS
import random
# Placeholder labels: one randomly chosen category and one random ratio
# between 0 and 1 for every account row.
vals = ['NOT', 'BOT', 'UNKNOWN']
n_accounts = len(accounts.index)
accounts['category'] = [random.choice(vals) for _ in range(n_accounts)]
accounts['bot_ratio'] = [random.uniform(0, 1) for _ in range(n_accounts)]

In [4]:
# calculate follower-following ratio
def calculate_f2f(row):
    """Return the follower-to-following ratio for one account row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the numeric fields 'followers' and 'following'.

    Returns
    -------
    number
        followers / following, or the sentinel 9999999999999 when the account
        follows nobody (following == 0).
    """
    no_following_sentinel = 9999999999999
    following = row['following']
    # Test for zero explicitly rather than catching ZeroDivisionError: numpy
    # integer scalars do not raise on division by zero (they yield inf/nan
    # with a RuntimeWarning), so an except clause would silently be skipped.
    if following == 0:
        return no_following_sentinel
    return row['followers'] / following

# Row-wise: store each account's follower-to-following ratio in a new 'f2f' column.
accounts['f2f'] = accounts.apply(calculate_f2f, axis=1)

In [5]:
# check if the account's description or screen name contains 'bot', 'parody' or 'fake'
def guess_if_bot(row, bot_words=('bot', 'parody', 'fake')):
    """Keyword heuristic: does this account describe itself as a bot?

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the string fields 'description' and 'screen_name'.
    bot_words : iterable of str, optional
        Lower-case substrings to look for (case-insensitive match).

    Returns
    -------
    int
        1 if any keyword occurs in the description or the screen name,
        otherwise 0.
    """
    # Join the two fields with a space so a keyword cannot match across the
    # boundary between them (e.g. 'jumbo' + 'tiger' must not match 'bot').
    searchable = row['description'].lower() + ' ' + row['screen_name'].lower()
    return int(any(word in searchable for word in bot_words))

# Row-wise: store the 0/1 keyword heuristic in a new 'bot_guess' column.
accounts['bot_guess'] = accounts.apply(guess_if_bot, axis=1)

In [6]:
# Read the raw tweets table, then discard (in place) every row that has a
# missing value in any column.
tweets = pd.read_csv('../data/tweets.csv')
tweets.dropna(inplace=True)
# Sanity check: (rows, columns) followed by the dtype of each column.
print(tweets.shape, tweets.dtypes, sep='\n')

(145436, 4)
created_at    object
id_str         int64
text          object
user_id        int64
dtype: object


In [7]:
import string
import re    
# Translation table that deletes every ASCII punctuation character except
# '#' and '@', which are kept so hashtags and mentions can still be counted.
_PUNCT_DELETE_TABLE = str.maketrans(
    '', '', string.punctuation.replace("#", "").replace("@", "")
)


def remove_punct(row):
    """Return the row's tweet text with punctuation stripped.

    Every character in string.punctuation is removed except '#' and '@'.

    A translation table is used instead of a regex character class: placing
    string.punctuation unescaped inside '[...]' turns the backslash into an
    escape for the following ']', so backslashes were never removed.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the string field 'text'.

    Returns
    -------
    str
        The cleaned text.
    """
    return row['text'].translate(_PUNCT_DELETE_TABLE)

# Row-wise: overwrite each tweet's text with its punctuation-stripped version.
tweets['text'] = tweets.apply(remove_punct, axis=1)

In [8]:
# add some weak NLP stuff 
from textblob import TextBlob
# Run TextBlob on every tweet and store the two components of its
# `sentiment` result in the new 'polarity' and 'subjectivity' columns.
tweets[['polarity', 'subjectivity']] = tweets['text'].apply(
    lambda tweet_text: pd.Series(TextBlob(tweet_text).sentiment)
)

We need a function that gathers statistics about the tweets from a given account. We want to track the average tweet length, the average number of hashtags and mentions used, and the average tweet sentiment (polarity and subjectivity).

In [9]:
def get_tweet_info(row):
    """Extract the per-tweet statistics from one ``DataFrame.iterrows()`` item.

    Parameters
    ----------
    row : tuple
        An (index, record) pair; only the record at position 1 is used. It
        must provide 'text', 'polarity' and 'subjectivity'.

    Returns
    -------
    tuple
        (polarity, subjectivity, text length, number of '#', number of '@').
    """
    record = row[1]
    text = record['text']
    polarity = record['polarity']
    subjectivity = record['subjectivity']
    # Every '#' / '@' character counts as one hashtag / mention.
    return (polarity, subjectivity, len(text), text.count("#"), text.count("@"))

In [10]:
from statistics import mean
def get_tweets_for_user(row):
    """Average the tweet statistics of one account.

    Looks up, in the module-level ``tweets`` frame, every tweet whose
    'user_id' equals ``row['id']`` and averages the values returned by
    ``get_tweet_info`` across those tweets.

    Parameters
    ----------
    row : mapping or pandas.Series
        An account record providing 'id'.

    Returns
    -------
    list or int
        [mean polarity, mean subjectivity, mean length, mean hashtags,
        mean mentions] when the account has at least one tweet, otherwise 0.
    """
    matching = tweets[tweets['user_id'] == row['id']]
    if matching.empty:
        return 0

    # One (pol, sub, length, hashtags, mentions) tuple per tweet ...
    per_tweet = [get_tweet_info(item) for item in matching.iterrows()]
    # ... transposed so that each statistic is averaged over all tweets.
    return [mean(values) for values in zip(*per_tweet)]

In [11]:
# Names of the per-account tweet features, in the order in which
# get_tweets_for_user returns them.
cols = ['ave_polarity', 'ave_subjectivity', 'ave_length', 'ave_hashtags', 'ave_mentions']
# Create the feature columns as float columns filled with NaN. Accounts that
# have no tweets are never overwritten later, so the initial value must mark
# the feature as missing; a random number would look like real data and
# would change on every run.
for name in cols:
    accounts[name] = np.nan

In [12]:
# Fill in the tweet features account by account. Accounts for which
# get_tweets_for_user returns a falsy value (no tweets) are left untouched.
for idx, account in accounts.iterrows():
    stats = get_tweets_for_user(account)
    if not stats:
        continue
    # `stats` is ordered like `cols`, so pair them up positionally.
    for col_name, value in zip(cols, stats):
        accounts.at[idx, col_name] = value

In [13]:
# Preview the first five rows of the enriched accounts table.
accounts.head()

Unnamed: 0,id,name,screen_name,description,date_created,followers,following,last_updated,image_url,is_bot,category,bot_ratio,f2f,bot_guess,ave_polarity,ave_subjectivity,ave_length,ave_hashtags,ave_mentions
0,74013,CJ n TX,ccjones,Nothing is as it seems.,2006-12-16 18:21:51,3918,3398,2019-04-13 08:10:57,http://pbs.twimg.com/profile_images/1060243585...,unknown,UNKNOWN,0.954182,1.153031,0,0.0,0.25,136.0,0.0,1.0
1,681403,Jason Benway,BenwayNet,"Christian,husband,father,Computer geek, and xb...",2007-01-22 16:42:28,1068,1560,2019-04-15 00:32:18,http://pbs.twimg.com/profile_images/1111705638...,unknown,BOT,0.448861,0.684615,0,0.0,0.0,122.0,2.0,3.0
2,736463,Won Peace of a Hole,dtweete,probably a bot. ✝,2007-01-31 07:20:08,624,1610,2019-04-14 06:21:22,http://pbs.twimg.com/profile_images/149195479/...,unknown,BOT,0.195416,0.387578,1,0.0625,0.21875,137.5,0.0,1.5
3,755703,Dean Roth,deanroth,See that name up there? That's me. \n\nAlso av...,2007-02-07 04:13:21,649,790,2019-04-13 16:11:00,http://pbs.twimg.com/profile_images/4706719918...,unknown,BOT,0.391906,0.821519,0,0.0,0.625,138.0,0.0,2.0
4,759066,David Neuland,Galloway,"Engineer, writer, Mac-addict. I. Am. Not. Geor...",2007-02-08 23:18:36,173,458,2019-04-14 04:50:49,http://pbs.twimg.com/profile_images/22917772/I...,unknown,NOT,0.578891,0.377729,0,0.0,0.066667,123.0,0.0,1.0


In [14]:
# Persist the enriched accounts table. to_csv writes the DataFrame index as
# the first (unnamed) column by default.
accounts.to_csv('../data/accounts_processed.csv')