In [221]:
from csv import DictReader
from datetime import date
from io import StringIO
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup as Soup
from requests import get

In [251]:
# Organization pages on politicalemails.org whose message listings are scraped
# below (117 is used for Warren, 415 for Trump).
dtrump_url = "https://politicalemails.org/organizations/415"
ewarren_url = "https://politicalemails.org/organizations/117"

# Headers attached to every request made in this notebook. The User-Agent is a
# desktop Chrome string -- presumably so the site serves the same HTML a browser
# would get (TODO confirm whether all of these are still required).
hdr = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Referer', 'https://cssspritegenerator.com'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
    ('Accept-Encoding', 'none'),
    ('Accept-Language', 'en-US,en;q=0.8'),
    ('Connection', 'keep-alive'),
])

In [246]:
def get_individual_message_links(base_url, timeout=30):
    '''
    Get the links to individual messages from a candidate's webpage

    Parameters
    ----------
    base_url : str
        URL of the organization page that lists the messages.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        The href of every message-teaser anchor found on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    '''
    # without a timeout, requests waits indefinitely on a stalled connection
    response = get(base_url, headers=hdr, timeout=timeout)
    # fail loudly on an error page instead of silently returning no links
    response.raise_for_status()

    # get the webpage content
    soup = Soup(response.content, 'html.parser')

    # filter to the anchors that link to individual messages; href=True skips
    # any matching anchor that carries no href instead of raising KeyError
    message_anchors = soup.find_all(
        'a', {'class': 'message-tease message-tease--list'}, href=True
    )

    # get the urls for the individual messages
    return [anchor['href'] for anchor in message_anchors]

In [270]:
def _extract_message_metadata(page_content):
    '''Pair each metadata key cell of a message page with its value cell.'''
    key_cells = page_content.find_all('td', {'class': 'content-box-meta__key'})
    value_cells = page_content.find_all('td', {'class': 'content-box-meta__value'})

    # one key per cell, so a key cell holding several text nodes cannot shift
    # the key/value pairing of the cells that follow it
    keys = [
        ' '.join(filter(None, (text.strip() for text in cell.strings)))
        for cell in key_cells
    ]
    values = [[text.strip() for text in cell.strings] for cell in value_cells]

    # The first value is flattened and cut at the obfuscated-address
    # placeholder (presumably this is the sender field -- verify on the site).
    if len(values) > 0:
        values[0] = ''.join(values[0]).split('<[email\xa0protected]>')[0].strip()
    # The second and third values are reduced to their first text node. Pages
    # with fewer cells, or empty cells, are tolerated rather than raising
    # IndexError and aborting the whole scrape.
    for position in (1, 2):
        if len(values) > position:
            values[position] = values[position][0] if values[position] else ''

    return dict(zip(keys, values))


def _extract_message_body(page_content):
    '''Return the message text as one string, or an error marker if absent.'''
    message_text = page_content.find('div', {'class': 'message-text'})
    if message_text is None:
        return "FATAL ERROR: Failed to parse this email"
    # trim whitespace and underscore rules, then drop fragments left empty
    fragments = (text.strip().strip('_') for text in message_text.strings)
    return ' '.join(fragment for fragment in fragments if fragment)


def get_message_contents(message_links, export=True, timeout=30):
    '''
    Get the contents of messages, given links to the messages

    Parameters
    ----------
    message_links : iterable of str
        URLs of the individual message pages.
    export : bool, optional
        When True (default) return a DataFrame with one row per URL and a
        ``message_url`` column; when False return the raw dict keyed by URL.
    timeout : float, optional
        Seconds to wait for each page before giving up (default 30).

    Returns
    -------
    pandas.DataFrame or dict
        For every URL, the page's metadata fields plus a ``body`` entry holding
        the message text. A page that cannot be parsed still gets an entry,
        with whatever metadata was found and an error marker as its body.
    '''
    all_text = {}
    for url in message_links:
        # without a timeout, one stalled connection would hang the whole scrape
        response = get(url, headers=hdr, timeout=timeout)
        page_content = Soup(response.content, 'html.parser')

        # save it all off
        record = _extract_message_metadata(page_content)
        record['body'] = _extract_message_body(page_content)
        all_text[url] = record

    if not export:
        return all_text

    df = pd.DataFrame(all_text).T
    return df.reset_index().rename({'index': 'message_url'}, axis=1)

# Get 50 of Warren's most recent messages

In [271]:
# Scrape the messages listed on Warren's organization page
message_links = get_individual_message_links(ewarren_url)
warren_contents = get_message_contents(message_links)

# to_csv does not create missing parent directories, so make sure data/ exists
# before writing -- otherwise the finished scrape is lost to an OSError
Path('data').mkdir(parents=True, exist_ok=True)
warren_contents.to_csv('data/elizabeth_warren_50.csv')

# Get 50 of Trump's most recent messages

In [273]:
# Scrape the messages listed on Trump's organization page
message_links = get_individual_message_links(dtrump_url)
trump_contents = get_message_contents(message_links)

# to_csv does not create missing parent directories, so make sure data/ exists
# before writing -- otherwise the finished scrape is lost to an OSError
Path('data').mkdir(parents=True, exist_ok=True)
trump_contents.to_csv('data/donald_trump_50.csv')