In [24]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib.request
import camelot
import tweepy
import tqdm
import re

In [25]:
# Load the two interim tweet frames from disk.
congress = pd.read_pickle("../Data/Interim/congress.pkl")
trump = pd.read_pickle("../Data/Interim/trump.pkl")

# Stack the Congress rows and the Trump rows into a single frame
# (row labels from both inputs are kept as they are).
congress_tweets = pd.concat((congress, trump))

In [26]:
# Drop fully duplicated rows, keeping the first occurrence of each.
congress_tweets = congress_tweets.drop_duplicates(keep="first")

In [27]:
# Display (rows, columns) of the combined frame at this point.
congress_tweets.shape

(2192037, 10)

## Overlap between Congress members and Twitter handles

In [28]:
# Comma-separated table of Twitter handles.
twitter_handles = pd.read_table(
    "../Data/Processed/Twitter_Handles_updated.csv",
    sep=",",
)

# s1: display names listed in the handle table.
# s2: distinct user names present in the tweet frame.
s1 = set(twitter_handles["twitter_display_name"])
s2 = set(congress_tweets["user_name"].unique())

In [29]:
# Names that occur in exactly one of the two sets (symmetric difference).
non_overlapping_twitter_profiles = s1.symmetric_difference(s2)

In [30]:
# Keep only the tweets whose user_name is one of the display names in s1.
# .copy() makes the filtered result an independent frame, so the column
# assignments performed on it later do not raise SettingWithCopyWarning.
congress_tweets = congress_tweets[congress_tweets["user_name"].isin(s1)].copy()

In [31]:
# Display (rows, columns) after restricting to the names in s1.
congress_tweets.shape

(1637891, 10)

## Cleanup

In [32]:
# Parse the creation timestamps into datetimes.
congress_tweets["created_at"] = pd.to_datetime(congress_tweets["created_at"])

# Order rows by creation time and replace the index with a fresh 0..n-1 range.
congress_tweets = (
    congress_tweets
    .sort_values(by="created_at")
    .reset_index(drop=True)
)

In [33]:
# Persist the filtered, date-sorted frame (text not yet preprocessed).
congress_tweets.to_pickle('../Data/Processed/congress.pkl')

## Preprocess

congress_tweets

In [34]:
special_characters = ",._´&’%':€$£!?#"
character_set = {
    "characters": "abcdefghijklmnopqrstuvwxyz0123456789" + special_characters,
    "space": " ",
}
alphabet = "".join(character_set.values())

In [36]:
# Compiled patterns used when cleaning tweet text.
# Raw strings are used so that "\S" and "\s" reach the regex engine as written
# instead of being (invalid) Python string escapes.

# "http" followed by any run of non-whitespace characters.
regex_links = re.compile(r"http\S+")

# Runs of whitespace, "|" or "-". Inside [...] the "|" is a literal pipe, not
# alternation. NOTE(review): confirm pipes are meant to be treated as separators.
regex_whitespace = re.compile(r"[\s|-]+")

# Runs of characters that are not in `alphabet`. re.escape keeps every
# alphabet character literal inside the class, so adding e.g. "-", "]" or "^"
# to the alphabet later cannot change the meaning of the pattern.
regex_unknown = re.compile(f"[^{re.escape(alphabet)}]+")

In [44]:
# Literal substrings to search for in the tweet text, paired with the text
# that replaces each one. The keys carry no trailing ";".
regex_html_tags = dict(
    [
        ("&amp", "and"),
        ("&lt", "<"),
        ("&gt", ">"),
        ("&quot", '"'),
        ("&apos", "'"),
    ]
)

In [45]:
# Replace each substring listed in regex_html_tags with its plain-text value.
# regex=False is passed explicitly: the keys are literal substrings, and the
# default of `regex` differs between pandas versions.
for pattern_string, char in regex_html_tags.items():
    congress_tweets["text"] = congress_tweets["text"].str.replace(
        pattern_string, char, regex=False
    )

In [38]:
# Normalise the tweet text in one pass:
#   1. lowercase everything,
#   2. remove links,
#   3. collapse separator runs (see regex_whitespace) into a single space,
#   4. remove every character outside the allowed alphabet,
#   5. trim leading/trailing whitespace.
# regex=True is required when the pattern is a compiled regex; with the
# regex=False default of recent pandas the call raises ValueError.
congress_tweets["text"] = (
    congress_tweets["text"]
    .str.lower()
    .str.replace(regex_links, "", regex=True)
    .str.replace(regex_whitespace, character_set["space"], regex=True)
    .str.replace(regex_unknown, "", regex=True)
    .str.strip()
)

In [46]:
# Spot-check the cleaned text of the row at position 100.
congress_tweets["text"].iloc[100]

"don't miss 2nd opportunity to watch aptv's inside congress: alabama to dc tonight at 8:30. features a veteran story near and dear to my ."

In [47]:
# Persist the frame with the preprocessed text column.
congress_tweets.to_pickle('../Data/Processed/congress_cleaned_processed.pkl')

In [42]:
# Read the processed file back into a separate variable
# (presumably a sanity check that the write succeeded).
tmp = pd.read_pickle('../Data/Processed/congress_cleaned_processed.pkl')

In [43]:
# Display (rows, columns) of the reloaded frame.
tmp.shape

(1637891, 10)