### Goal: 

Build a machine-learning algorithm that can predict whether a tweet is more likely to interact with (i.e. @mention) @BarackObama or @realDonaldTrump, based on the text of the tweet.

In [None]:
#! pip install --user emoji

In [None]:
import json
import re
from urllib.parse import quote

# import emoji
# from random import randint
import oauth2 as oauth
import pandas as pd
# import numpy as np
from credentials import *

In [None]:
def oauth_twitter_search(query, consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET):
    """Search Twitter's v1.1 search endpoint and return the decoded JSON reply.

    The request URL asks for up to 100 recent, English-language results in
    extended tweet mode, and appends ``-filter:retweets`` to the query.

    Parameters
    ----------
    query : str
        Raw (unencoded) search text, e.g. ``"@BarackObama"``. It is
        percent-encoded here so that characters such as ``#``, ``&`` or
        spaces cannot corrupt the request URL.
    consumer_key, consumer_secret : str
        OAuth consumer credentials used to sign the request. They default to
        the CONSUMER_KEY / CONSUMER_SECRET names in scope at definition time.

    Returns
    -------
    The object produced by ``json.loads`` on the response body. The HTTP
    status is not inspected, so an error reply from the API is returned
    as-is and may lack the keys a successful search would have.
    """
    search_endpoint = "https://api.twitter.com/1.1/search/tweets.json"
    # safe="" also encodes "/" so the whole query stays inside the q= value.
    encoded_query = quote(query, safe="")
    compiled_search_endpoint = "{}?q={}+-filter:retweets&count=100&result_type=recent&lang=en&tweet_mode=extended".format(search_endpoint, encoded_query)
    # Use the credentials passed by the caller rather than the globals, so
    # overriding the defaults actually takes effect.
    consumer = oauth.Consumer(key=consumer_key, secret=consumer_secret)
    client = oauth.Client(consumer)
    _response, data = client.request(compiled_search_endpoint)
    tweets = json.loads(data)
    return tweets

In [None]:
# Run one search per account, then keep only the 'statuses' entry of each reply.
search_44 = oauth_twitter_search("@BarackObama")
tweets_44 = search_44['statuses']
search_45 = oauth_twitter_search("@realDonaldTrump")
tweets_45 = search_45['statuses']

In [None]:
# Combine the two lists of fetched statuses and display the count.
# Each search requests at most 100 recent tweets, so expect at most 200 here.
tweets_all = tweets_44 + tweets_45
len(tweets_all)

In [None]:
# Reduce each fetched status (a dictionary) to the fields of interest:
# the unique id of the tweet (id_str), the full text of the tweet
# (full_text), and two flags saying which account(s) it mentions.
#
# Mentions are matched on the lowercased text. The negative lookahead
# makes sure the handle ends there, so a longer handle that merely starts
# with the same characters (e.g. "@barackobamafan") is neither counted as a
# mention nor partially stripped from the text.
MENTION_44 = re.compile(r'@barackobama(?![a-z0-9_])')
MENTION_45 = re.compile(r'@realdonaldtrump(?![a-z0-9_])')

pruned_tweets = []
for tweet in tweets_all:
    full_text = tweet['full_text']
    lowered = full_text.lower()  # lowercase once instead of once per use
    d = {}
    d['at_44'] = MENTION_44.search(lowered) is not None
    d['at_45'] = MENTION_45.search(lowered) is not None
    d['id_str'] = tweet['id_str']
    d['full_text'] = full_text
    # since we are trying to predict the presence of such an @mention,
    # we remove them from the (lowercased) text
    d['pruned_text'] = MENTION_45.sub('', MENTION_44.sub('', lowered))
    pruned_tweets.append(d)

In [None]:
# Build the data frame from the list of dicts.
# Passing `columns=` fixes the column order explicitly instead of relying on
# the key order of the dictionaries. It also yields an empty frame with the
# expected columns when no tweets were fetched; selecting the columns after
# construction would raise a KeyError in that case.
df_tweets = pd.DataFrame(
    pruned_tweets,
    columns=['id_str', 'at_44', 'at_45', 'full_text', 'pruned_text'],
)

In [None]:
# Curiosity check: how many of the newly fetched tweets mention both
# @44 and @45? (The list may also contain duplicates.)
mentions_both = df_tweets['at_44'].eq(True) & df_tweets['at_45'].eq(True)
int(mentions_both.sum())

In [None]:
# Load the csv of previously fetched tweets.
# `id_str` is read back as text: left to type inference it is parsed as an
# integer, which never compares equal to the string ids of freshly fetched
# tweets, so de-duplicating on `id_str` would miss tweets already on file.
df = pd.read_csv('44v45tweets.csv', index_col=0, dtype={'id_str': str})

In [None]:
# Stack the previously saved tweets and the newly fetched ones
# into a single data frame.
frames = [df, df_tweets]
compiled = pd.concat(frames)
# Total rows collected so far (may still include repeated tweets).
len(compiled.index)

In [None]:
# Duplicates are expected: the same tweet can be returned by both searches,
# or by two consecutive runs of this notebook. Keep the first row seen for
# each tweet id and drop the rest, modifying `compiled` in place.
compiled.drop_duplicates(subset='id_str', keep='first', inplace=True)

In [None]:
# Save the compiled data frame (index included) back to the csv file
# that the earlier tweets were loaded from.
compiled.to_csv(path_or_buf='44v45tweets.csv', index=True)

In [None]:
# Number of new, unique tweets added by this run.
# Anecdotally, tweets matching at least one of the queries arrive at
# roughly 10 per minute, so this notebook has been run by hand every
# 10-or-so minutes on the evening of Saturday 4/13/19, with a few more
# runs each day since.
new_unique_count = len(compiled.index) - len(df.index)
print(new_unique_count)

In [None]:
# Of all tweets collected so far, how many are flagged as mentioning @44?
int(compiled['at_44'].eq(True).sum())

In [None]:
# Of all tweets collected so far, how many are flagged as mentioning @45?
int(compiled['at_45'].eq(True).sum())