In [80]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib.request
import camelot
import tweepy
import tqdm
from src.tools.twitter_api import auth
import re

In [61]:
# Load the two interim pickles (congress and trump) in a fixed order.
interim_paths = ['../Data/Interim/congress.pkl', '../Data/Interim/trump.pkl']
congress, trump = map(pd.read_pickle, interim_paths)

# Concatenate the congress frame with the trump frame, row-wise.
# The original row labels of each frame are kept as-is by pd.concat.
congress_tweets = pd.concat([congress, trump])

In [62]:
# Removing duplicates
# Remove rows that are exact duplicates, keeping the first occurrence.
# Re-binding the name (rather than mutating in place) gives the same frame.
congress_tweets = congress_tweets.drop_duplicates(keep='first')

In [63]:
# (rows, columns) of the combined frame after duplicate rows were dropped.
congress_tweets.shape

(2192037, 10)

## Overlap between congress members and twitter handles

In [64]:
# Read the comma-separated handle list (read_csv defaults to sep=',').
twitter_handles = pd.read_csv('../Data/Processed/Twitter_Handles_updated.csv')

# s1: display names listed in the handle file.
s1 = set(twitter_handles['twitter_display_name'])
# s2: distinct user names that actually occur in the tweet frame.
s2 = set(congress_tweets['user_name'].unique())

In [65]:
# Names that appear in exactly one of the two sets (handle file vs. tweets).
non_overlapping_twitter_profiles = s1.symmetric_difference(s2)

In [66]:
# Keep only tweets whose user_name is present in the handle file (s1).
has_listed_handle = congress_tweets['user_name'].isin(s1)
congress_tweets = congress_tweets[has_listed_handle]

In [67]:
# (rows, columns) after keeping only rows whose user_name is in the handle list.
congress_tweets.shape

(1637891, 10)

## Cleanup

In [68]:
# Parse `created_at` into datetimes, order the rows by it, and renumber the
# index from 0.
#
# `congress_tweets` is the result of a boolean-mask selection at this point, so
# assigning a column on it directly can emit SettingWithCopyWarning. Building a
# new frame with `assign` avoids that and keeps the step in one expression.
congress_tweets = (
    congress_tweets
    .assign(created_at=pd.to_datetime(congress_tweets['created_at']))
    .sort_values(by='created_at')
    .reset_index(drop=True)
)

In [69]:
# Persist the filtered, chronologically sorted frame.
processed_pickle_path = '../Data/Processed/congress.pkl'
congress_tweets.to_pickle(processed_pickle_path)

## Preprocess

congress_tweets

In [119]:
# Punctuation and symbols that are allowed to remain in the text.
special_characters = ",._´&’%':€$£!?#"

# Building blocks of the allowed alphabet:
#   characters - lower-case letters, digits and the punctuation above
#   space      - the single separator character
#   unknown    - replacement for anything else (empty, i.e. dropped)
character_set = dict(
    characters="abcdefghijklmnopqrstuvwxyz0123456789" + special_characters,
    space=" ",
    unknown="",
)

# Every allowed character as one string, in dict insertion order; the empty
# "unknown" entry contributes nothing.
alphabet = "".join(character_set[key] for key in character_set)

In [86]:
# NOTE(review): this cell re-binds `special_characters` and `character_set`,
# which are already defined in the cell above (there with '&' and without '"').
# `alphabet` is NOT rebuilt here, so patterns derived from it are unaffected.
# The cell's execution count is lower than its neighbours', so it looks like a
# leftover from an earlier iteration - confirm whether it is still needed.
special_characters = ",._´’%'\":€$£!?#"

character_set = {
    "characters": "abcdefghijklmnopqrstuvwxyz0123456789" + special_characters,
    "space": " ",
    # The clean-up step further down reads character_set["unknown"]; without
    # this key a top-to-bottom run fails with KeyError after this cell.
    "unknown": "",
}

In [121]:
# Compiled patterns used by the text clean-up step.
# Raw string literals are used so that "\S" and "\s" reach the regex engine
# without Python flagging them as invalid string escapes.

# Anything starting with "http" up to the next whitespace character.
regex_links = re.compile(r"http\S+")

# Runs of whitespace, "|" or "-". Inside a character class "|" is a literal
# pipe, not alternation.
# NOTE(review): confirm that treating "|" as a separator is intended.
regex_whitespace = re.compile(r"[\s|-]+")

# Runs of characters that are not part of `alphabet`. The alphabet is escaped
# so none of its punctuation can be read as character-class syntax if the
# allowed set is ever extended (e.g. with "-", "]" or "^").
regex_unknown = re.compile(f"[^{re.escape(alphabet)}]+")

In [120]:
# HTML entity -> plain-text replacement, applied to the tweet text.
# The keys are literal strings (despite the "regex_" prefix in the name).
regex_html_tags = dict([
    ("&amp;", "and"),
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&quot;", '"'),
    ("&apos;", "'"),
])

In [123]:
# Normalise the tweet text: lower-case, decode HTML entities, drop links,
# collapse separators to a single space, remove characters outside the allowed
# alphabet, and trim the ends.
tweet_text = congress_tweets["text"].str.lower()

# Decode HTML entities BEFORE filtering characters: ";" is not in the allowed
# alphabet, so once `regex_unknown` has run "&amp;" has become "&amp" and can no
# longer be matched. The keys are literal strings, hence regex=False.
for entity, replacement in regex_html_tags.items():
    tweet_text = tweet_text.str.replace(entity, replacement, regex=False)

# regex=True is passed explicitly: with regex=False (the default in pandas 2.x)
# Series.str.replace rejects compiled patterns.
congress_tweets["text"] = (
    tweet_text
    .str.replace(regex_links, "", regex=True)
    .str.replace(regex_whitespace, character_set["space"], regex=True)
    # Fall back to "" so a `character_set` re-bound without an "unknown" key
    # (as the second definition cell does) does not raise KeyError here.
    .str.replace(regex_unknown, character_set.get("unknown", ""), regex=True)
    .str.strip()
)

In [122]:
## Replace HTML entities (e.g. "&amp;") with their plain-text equivalents
# The keys of `regex_html_tags` are literal strings, so they are matched
# literally (regex=False) instead of relying on the pandas default, which
# differs between pandas versions.
# NOTE(review): the allowed alphabet does not contain ";", so this only has an
# effect on text that has not yet been through the `regex_unknown` filter.
decoded_text = congress_tweets["text"]
for pattern_string, char in regex_html_tags.items():
    decoded_text = decoded_text.str.replace(pattern_string, char, regex=False)
congress_tweets["text"] = decoded_text

In [126]:
# Spot-check the cleaned text of the row at position 100.
congress_tweets["text"].iloc[100]

"don't miss 2nd opportunity to watch aptv's inside congress: alabama to dc tonight at 8:30. features a veteran story near and dear to my ."

In [127]:
# Persist the frame with the cleaned `text` column.
cleaned_pickle_path = '../Data/Processed/congress_cleaned_processed.pkl'
congress_tweets.to_pickle(cleaned_pickle_path)