## Putting data into MongoDB in AWS

In [10]:
import os
import re
import sys
from datetime import datetime
from getpass import getpass
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

import requests
from pymongo import MongoClient

## Download the data

In [24]:
# Download links for the two corpora hosted on thegrammarlab.com: the
# presidential speeches corpus and the Clinton/Trump corpus.
zipurl = (
    "http://www.thegrammarlab.com/"
    "?wpdmpro=corpus-of-presidential-speeches"
    "&wpdmdl=595"
    "&refresh=5ec70e29da09f1590103593"
)
zipurl2 = (
    "http://www.thegrammarlab.com/"
    "?wpdmpro=clintontrump-corpus"
    "&wpdmdl=597"
    "&refresh=5ec70e03344501590103555"
)

In [26]:
def save_zip(zipurl, name):
    """
    Download a zip archive and unpack it on disk.

    The archive is read fully into memory; no .zip file is written to disk.

    zipurl: URL the archive is fetched from
    name: directory the archive's contents are extracted into
    """
    with urlopen(zipurl) as response:
        payload = response.read()
    with ZipFile(BytesIO(payload)) as archive:
        archive.extractall(name)

In [27]:
# Fetch and unpack each archive into its own directory.
save_zip(zipurl=zipurl, name="speeches")
save_zip(zipurl=zipurl2, name="trump_clinton_speeches")

In [29]:
# Copy the Trump speeches into the presidential corpus directory so they sit
# alongside the other per-speaker folders, then delete the extracted
# Clinton-Trump download, which is no longer needed.
!cp -r "trump_clinton_speeches/Clinton-Trump Corpus/Trump" "speeches/Corpus of Presential Speeches/"
!rm -r "trump_clinton_speeches/"

## Upload to Mongo

In [15]:
# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Host and username fall back to the values this
# notebook was written against; the password has no default and is prompted
# for (without echo) when MONGO_PASSWORD is not set.
config = {
    'host': os.environ.get('MONGO_HOST', '13.56.124.215:27017'),
    'username': os.environ.get('MONGO_USERNAME', 'fisher'),
    'password': os.environ.get('MONGO_PASSWORD') or getpass('MongoDB password: '),
    'authSource': 'speeches'
}

client = MongoClient(**config)
db = client.speeches

db.list_collection_names() # check the connection

['speeches']

In [16]:
# Peek at one stored document to see the fields each speech carries.
db["speeches"].find_one()

{'_id': ObjectId('5ec4c64db0366fe6fd550001'),
 'speaker': 'coolidge',
 'title': 'Address at the Opening of Work on Mount Rushmore in Black Hills, SD',
 'date': datetime.datetime(1927, 8, 10, 0, 0),
 'content': 'We have come here to dedicate a cornerstone that was laid by the hand of the Almighty. On this towering wall of Rushmore, in the heart of the Black Hills, is to be inscribed a memorial which will represent some of the outstanding features of four of our Presidents, laid on by the hand of a great artist in sculpture. This memorial will crown the height of land between the Rocky Mountains and the Atlantic Seaboard, where coming generations may view it for all time.\nIt is but natural that such a design should begin with George Washington, for with him begins that which is truly characteristic of America. He represents our independence, our Constitution, our liberty. He formed the highest aspirations that were entertained by any people into the permanent institutions of our Governm

In [30]:
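# Uncomment and run to drop the existing collection before reloading;
# the load loop below inserts every speech again, so re-running it without
# dropping first leaves duplicate documents.
# db.drop_collection("speeches")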

{'nIndexesWas': 1, 'ns': 'speeches.speeches', 'ok': 1.0}

In [18]:
def get_title(line):
    """
    Extract the text enclosed in double quotes from a line such as title="[title]".

    The line must contain exactly two double-quote characters; any other
    count trips the assertion.
    """
    quote = '"'
    assert line.count(quote) == 2, "unexpected number of quote characters (expected 2): {}".format(line)
    opening = line.find(quote)
    closing = line.rfind(quote)
    return line[opening + 1:closing]

In [19]:
def get_date(line):
    """
    Parse the first date found in a string into a datetime.

    Two date formats are used in the dataset: "Month D, YYYY"
    (e.g. "August 10, 1927") and ISO "YYYY-MM-DD".

    line: text expected to contain a date in one of the two formats
    returns: datetime for the first date-like substring in the line
    raises: ValueError if the line holds no date-like substring, or if the
        substring matches neither format (e.g. an abbreviated month name)
    """
    date_re = r"([a-zA-Z]+ \d{1,2}, \d{4}|\d{4}-\d{2}-\d{2})"
    match = re.search(date_re, line)
    if match is None:
        raise ValueError("no date found in line: {!r}".format(line))
    date_str = match.group(1)
    # Try each known format in turn. Only ValueError (a format mismatch) moves
    # on to the next one, so interrupts and unrelated errors are not swallowed.
    for fmt in ("%B %d, %Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    raise ValueError("unrecognised date format {!r} in line: {!r}".format(date_str, line))

In [31]:
# Walk the corpus (one sub-directory per speaker, one file per speech), parse
# each file's title/date header, and insert the speech into db.speeches.
path = "speeches/Corpus of Presential Speeches"
for pres in os.listdir(path):
    speeches_dir = os.path.join(path, pres)
    # Only per-speaker directories hold speeches; skip stray files such as
    # macOS's .DS_Store instead of failing when trying to list them.
    if pres == ".DS_Store" or not os.path.isdir(speeches_dir):
        continue
    print("loading {}'s speeches".format(pres.lower()))
    for speech_file in os.listdir(speeches_dir):
        if speech_file.startswith("."):
            continue  # hidden files (e.g. .DS_Store) are not speeches
        speech = {"speaker": pres.lower()}
        speech_file_path = os.path.join(speeches_dir, speech_file)
        with open(speech_file_path, "r") as f:

            # files have irregular format for the starting lines.
            # read lines until the date line has been parsed
            while "date" not in speech:
                line = f.readline()
                if not line:
                    # readline() returns "" at end of file; without this check
                    # a file that has no date line would loop forever.
                    raise ValueError(
                        "no title/date header found in {}".format(speech_file_path)
                    )
                if "title" in line or "date" in line:
                    if "title" in speech:
                        # the title is already known, so this is the date line
                        speech["date"] = get_date(line)
                    else:
                        # the first header line encountered is taken as the title
                        speech["title"] = get_title(line)

            speech["content"] = f.read()  # read remaining text
        db.speeches.insert_one(speech)

loading coolidge's speeches
loading tyler's speeches
loading wilson's speeches
loading ford's speeches
loading pierce's speeches
loading lincoln's speeches
loading washington's speeches
loading reagan's speeches
loading hoover's speeches
loading jefferson's speeches
loading bharrison's speeches
loading monroe's speeches
loading carter's speeches
loading taft's speeches
loading madison's speeches
loading roosevelt's speeches
loading eisenhower's speeches
loading buchanan's speeches
loading lbjohnson's speeches
loading adams's speeches
loading arthur's speeches
loading fillmore's speeches
loading kennedy's speeches
loading fdroosevelt's speeches
loading hayes's speeches
loading obama's speeches
loading bush's speeches
loading johnson's speeches
loading cleveland's speeches
loading nixon's speeches
loading harrison's speeches
loading taylor's speeches
loading clinton's speeches
loading trump's speeches
loading truman's speeches
loading gwbush's speeches
loading garfield's speeches
loading

In [32]:
# Confirm that the Trump speeches were loaded by fetching one of them.
db["speeches"].find_one({"speaker": "trump"})

{'_id': ObjectId('5ec71104fc4ce11ff2e0d195'),
 'speaker': 'trump',
 'title': 'Donald Trump, Republican Presidential Candidate, delivers remarks at a campaign event in Ambridge, Pennsylvania (SD/24)',
 'date': datetime.datetime(2016, 10, 10, 0, 0),
 'content': '<TRUMP:> Thank you. Thank you. Thank you. And we love Big Ben. Big Ben\'s a friend. <APPLAUSE> We play golf together. He once hit a shot that went head <ph> right into a tree. It was a massive tree like this. The ball hit the tree so hard, I swear I said, Ben, within two years, that\'s tree\'s going to be dead. And you know what? Two years later, he\'s shooting the same. It\'s because we have <inaudible> here. <LAUGHTER> But two years later, that tree was dead. <LAUGHTER> He is a strong guy, and he\'s good guy too. Big Ben. <APPLAUSE> So I\'m thrilled to be back in Pennsylvania. Great place. I went to school in Pennsylvania. November 8th, we\'re going to win this state and we\'re going to win back the White House. <APPLAUSE> We\'