Goal: Build a machine-learning algorithm that can predict whether a tweet is more likely to interact with (i.e. retweet at) @BarackObama or @realDonaldTrump, based on the text of the tweet.

In [294]:
#! pip install --user emoji

In [295]:
import json
import emoji
from random import randint
import oauth2 as oauth
import pandas as pd
import numpy as np
from credentials import *

In [296]:
def oauth_twitter_search(query, consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET):
    """Search recent English-language tweets matching ``query``, excluding retweets.

    Sends one request to the Twitter v1.1 ``search/tweets`` endpoint asking
    for up to 100 recent results in extended tweet mode.

    Args:
        query: Search term inserted verbatim into the URL's ``q`` parameter
            (it is not URL-encoded here).
        consumer_key: OAuth consumer key used to build the client.
        consumer_secret: OAuth consumer secret used to build the client.

    Returns:
        The response body parsed from JSON. The HTTP status is not checked,
        so whatever JSON the server sends back is returned as-is.

    Note:
        Tweets with "truncated": true could pose a problem.
    """
    search_endpoint = "https://api.twitter.com/1.1/search/tweets.json"
    compiled_search_endpoint = (
        "{}?q={}+-filter:retweets&count=100&result_type=recent&lang=en&tweet_mode=extended"
    ).format(search_endpoint, query)
    # Honor the credentials passed by the caller rather than always falling
    # back to the module-level constants.
    consumer = oauth.Consumer(key=consumer_key, secret=consumer_secret)
    client = oauth.Client(consumer)
    _response, data = client.request(compiled_search_endpoint)
    return json.loads(data)

In [297]:
# Load the CSV of previously fetched tweets.
# id_str is read as text on purpose: left to type inference, all-digit IDs
# come back as integers and never compare equal to the string IDs returned
# by the API, so de-duplicating on id_str would miss tweets already on disk.
df = pd.read_csv('44v45tweets.csv', index_col=0, dtype={'id_str': str})

In [298]:
# Query each handle in turn and keep only the list of matching statuses.
tweets_44, tweets_45 = (
    oauth_twitter_search(handle)['statuses']
    for handle in ("@BarackObama", "@realDonaldTrump")
)
print(len(tweets_44), len(tweets_45))

78 93


In [299]:
# Combine both result lists into one, @BarackObama results first.
tweets_all = [*tweets_44, *tweets_45]
print(len(tweets_all))

171


In [300]:
# Reduce every fetched status to the handful of fields kept for modelling.
pruned_tweets = []
for status in tweets_all:
    text = status['full_text']
    lowered = text.lower()
    pruned_tweets.append({
        # which of the two handles appear in the (case-folded) text
        'at_44': '@barackobama' in lowered,
        'at_45': '@realdonaldtrump' in lowered,
        'id_str': status['id_str'],
        'full_text': text,
        # lower-cased text with both handles stripped out
        'pruned_text': lowered.replace('@barackobama', '').replace('@realdonaldtrump', ''),
    })

In [301]:
# Name the columns up front: this fixes the column order and, unlike selecting
# the columns afterwards, still yields a frame with the expected columns when
# nothing was fetched (selecting from an empty frame raises KeyError).
df_tweets = pd.DataFrame(
    pruned_tweets,
    columns=['id_str', 'at_44', 'at_45', 'full_text', 'pruned_text'],
)

In [302]:
# Shape of the subset of newly fetched rows that mention both handles.
mentions_both = df_tweets['at_44'].eq(True) & df_tweets['at_45'].eq(True)
df_tweets[mentions_both].shape

(35, 5)

In [303]:
#df.head()

In [304]:
# Append the new tweets to the stored ones. Rows are renumbered because
# df_tweets has a fresh 0..n-1 index that can collide with the labels loaded
# from the CSV, which would leave duplicate index labels in the combined frame.
compiled = pd.concat([df, df_tweets], ignore_index=True)
compiled.shape

(1662, 5)

In [305]:
# Show the column dtypes of the combined frame.
compiled.dtypes

id_str         object
at_44            bool
at_45            bool
full_text      object
pruned_text    object
dtype: object

In [306]:
# Shape of the subset of combined rows that mention both handles.
mentions_both = compiled['at_44'].eq(True) & compiled['at_45'].eq(True)
compiled[mentions_both].shape

(202, 5)

In [307]:
# Preview the shape after keeping only the first row for each id_str.
is_repeat = compiled.duplicated(subset=['id_str'])
compiled.loc[~is_repeat].shape

(1599, 5)

In [308]:
# Keep only the first occurrence of each id_str.
compiled = compiled.drop_duplicates(subset=['id_str'])

In [309]:
# Write the combined frame back to the same CSV file that was loaded earlier.
compiled.to_csv('44v45tweets.csv')

In [310]:
# Number of rows gained relative to the previously stored frame.
print(len(compiled) - len(df))

108
