In [15]:
%load_ext autoreload
%autoreload 2
import re

import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Extract Trump Tweet IDs

In [None]:
from src.data.trump_tweet_ids import get_trump_tweet_ids

In [4]:
# Load the raw Trump tweet exports and stack them into a single frame.
# NOTE(review): both reads point at 'trump_tweets_1st.csv', so the second frame
# is a copy of the first — presumably a different export file was intended
# for df_trump_tweets2; confirm the correct filename.
df_trump_tweets1 = pd.read_csv("../Data/raw/tweets/trump_tweets_1st.csv")
df_trump_tweets2 = pd.read_csv("../Data/raw/tweets/trump_tweets_1st.csv")
df_trump = pd.concat(objs=[df_trump_tweets1, df_trump_tweets2])

In [6]:
# Save the tweet ids found in df_trump to a text file using the project helper.
# NOTE(review): the output format is defined in src.data.trump_tweet_ids;
# presumably one id per line, since the file is later read with np.loadtxt — confirm.
filepath = "../Data/raw/tweets/trump_id.txt"
get_trump_tweet_ids(df_trump, filepath)

11326 tweet ids saved


## Hydrate Tweets

In [24]:
import tweepy
from src.tools.twitter_api_credentials import api_key, api_secret_key, access_token, access_token_secret
from src.data.hydrate import hydrate_tweets

In [26]:
# Build the OAuth handler from the app credentials and attach the access token.
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
# Request an authorization URL; the result (redirect_url) is not used by any
# other cell visible in this notebook.
# NOTE(review): on tweepy.TweepError only a message is printed and execution
# continues, so later cells still run with `auth` as configured above.
try:
    redirect_url = auth.get_authorization_url()
except tweepy.TweepError:
    print('Error! Failed to get request token.')


In [28]:
# Load the tweet-id lists for each chamber/congress and for Trump.
# The dtype is pinned to np.int64 because plain `int` is platform dependent
# (it can resolve to a 32-bit integer), which is too small for large ids
# and would silently corrupt them.
representatives115 = np.loadtxt(
    "../Data/Raw/Tweets/representatives115.txt", dtype=np.int64
)
representatives116 = np.loadtxt(
    "../Data/Raw/Tweets/representatives116.txt", dtype=np.int64
)
senators115 = np.loadtxt(
    "../Data/Raw/Tweets/senators115.txt", dtype=np.int64
)
senators116 = np.loadtxt(
    "../Data/Raw/Tweets/senators116.txt", dtype=np.int64
)
trump = np.loadtxt(
    "../Data/Raw/Tweets/trump_id.txt", dtype=np.int64
)

Note: running the cell below takes $24 \pm 6$ hours, because the Twitter API limits how much data can be extracted.

In [None]:
# Combine every id list into one array and hydrate all tweets in a single run.
congress = np.concatenate(
    (representatives115, representatives116, senators115, senators116, trump)
)
filepath = "../Data/interim/congress.pkl"

# wait_on_rate_limit=True is passed so the client waits when the rate limit is hit.
api = tweepy.API(auth, wait_on_rate_limit=True)

hydrate_tweets(tweet_ids=congress, filepath=filepath, api=api)

## Cleanup

In [33]:
# Load the hydrated tweets and the table of known Twitter handles.
congress = pd.read_pickle("../Data/Interim/congress.pkl")
twitter_handles = pd.read_csv("../Data/Processed/Twitter_Handles_updated.csv", sep=",")

# s1: display names listed in the handles table; s2: user names seen in the tweets.
s1 = set(twitter_handles["twitter_display_name"])
s2 = set(congress["user_name"].unique())

In [38]:
# Names that appear in exactly one of the two sets (symmetric difference).
non_overlapping_twitter_profiles = s1.symmetric_difference(s2)

In [None]:
# Keep only tweets from users whose name appears in the Twitter handles table.
congress = congress.loc[congress["user_name"].isin(s1)]

In [None]:
# Keep only the periods from Harvard (bounds are exclusive on both ends).
# January 27, 2017 to January 2, 2019
in_first_period = (
    (congress.created_at > '2017-1-27 00:00:00')
    & (congress.created_at < '2019-1-2 00:00:00')
)
# January 27, 2019 to May 7, 2020
in_second_period = (
    (congress.created_at > '2019-1-27 00:00:00')
    & (congress.created_at < '2020-5-7 00:00:00')
)
mask = in_first_period | in_second_period
congress = congress[mask]

In [None]:
# Drop exact duplicate rows, order the tweets chronologically and renumber the index.
congress = (
    congress
    .drop_duplicates(keep="first")
    .sort_values(by="created_at")
    .reset_index(drop=True)
)
# Save next to the other interim files. The path needs the "../" prefix used by
# every other path in this notebook; without it the file is written relative to
# the notebook directory and the later read of congress_cleaned.pkl cannot find it.
congress.to_pickle("../Data/interim/congress_cleaned.pkl")

In [None]:
# Extract the tweet ids as 64-bit integers. Plain `int` is platform dependent
# and can resolve to a 32-bit integer, which is too small for large ids.
ids = congress.id.astype("int64").tolist()

# Write one id per line.
filepath = "../Data/raw/tweets/Cleaned_tweet_id.txt"
with open(filepath, "w") as output:
    output.writelines(f"{tweet_id}\n" for tweet_id in ids)

# Report only after the file has been closed (and therefore flushed).
print(f'{len(ids)} tweet ids saved.')

## Shortcut to extract the data

After cleanup, the dataframe of tweets from Congress contains 60 % of the rows.

Note: running the cell below takes $10 \pm 2.5$ hours, because the Twitter API limits how much data can be extracted.

In [None]:
# Hydrate only the cleaned tweet ids.
# The dtype is pinned to np.int64 because plain `int` is platform dependent
# and can resolve to a 32-bit integer, which is too small for large ids.
congress_tweet_id = np.loadtxt("../Data/Raw/Tweets/Cleaned_tweet_id.txt", dtype=np.int64)
filepath = "../Data/interim/congress_cleaned.pkl"

# Build the API client here so this shortcut does not depend on the
# long-running full-hydration cell having been executed first.
api = tweepy.API(auth, wait_on_rate_limit=True)

hydrate_tweets(
    tweet_ids=congress_tweet_id,
    filepath=filepath,
    api=api,
)

## Preprocess the twitter data

In [None]:
# Load the cleaned tweets. The preprocessing cells below operate on
# `congress_tweets`, which was never defined, so bind that name here.
# A copy is used so the text cleaning does not modify `congress` in place.
congress = pd.read_pickle('../Data/Interim/congress_cleaned.pkl')
congress_tweets = congress.copy()

In [None]:
# Symbols allowed in addition to lowercase letters and digits.
special_characters = ",._´&’%':€$£!?#"

# Allowed characters grouped by role; the groups are concatenated in
# insertion order to form the full alphabet.
character_set = dict(
    characters="abcdefghijklmnopqrstuvwxyz" + "0123456789" + special_characters,
    space=" ",
)
alphabet = character_set["characters"] + character_set["space"]

In [None]:
# Raw strings are used so that \S and \s are not treated as (invalid)
# Python string escape sequences.
# Matches "http" followed by any run of non-whitespace characters.
regex_links = re.compile(r"http\S+")
# Matches runs of whitespace, "|" and "-". Inside a character class "|" is a
# literal pipe, not an alternation; that behaviour is kept as-is.
regex_whitespace = re.compile(r"[\s|-]+")
# Matches runs of characters outside the allowed alphabet. The alphabet is
# escaped so that no character in it can be interpreted as character-class
# syntax (e.g. "-", "]", "^" or a backslash) if the alphabet is ever extended.
regex_unknown = re.compile(f"[^{re.escape(alphabet)}]+")

In [None]:
# Maps HTML entity prefixes (written without the trailing ";") to the text
# that should replace them.
regex_html_tags = dict(
    [
        ("&amp", "and"),
        ("&lt", "<"),
        ("&gt", ">"),
        ("&quot", '"'),
        ("&apos", "'"),
    ]
)

In [None]:
# Replace HTML entities (e.g. "&amp") with their plain-text equivalents.
# The keys are literal strings, so regex=False is passed explicitly instead of
# relying on the default, which differs between pandas versions.
for pattern_string, char in regex_html_tags.items():
    congress_tweets["text"] = congress_tweets["text"].str.replace(
        pattern_string, char, regex=False
    )

In [None]:
# Normalise the tweet text: lowercase, drop links, collapse whitespace/hyphen
# runs into a single space, remove characters outside the allowed alphabet and
# trim the ends.
# regex=True is required because the patterns are compiled regular
# expressions; with regex=False (the default in pandas >= 2.0) pandas rejects
# a compiled pattern.
congress_tweets["text"] = (
    congress_tweets["text"]
    .str.lower()
    .str.replace(regex_links, "", regex=True)
    .str.replace(regex_whitespace, character_set["space"], regex=True)
    .str.replace(regex_unknown, "", regex=True)
    .str.strip()
)

In [None]:
# Persist the preprocessed tweets.
congress_tweets.to_pickle(
    "../Data/Processed/congress_cleaned_processed.pkl"
)