In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
base_url = 'https://www.presidency.ucsb.edu'

init_urls = ['https://www.presidency.ucsb.edu/documents/app-categories/elections-and-transitions/campaign-documents?items_per_page=250&field_docs_start_date_time_value%5Bvalue%5D%5Bdate%5D=2015',
             'https://www.presidency.ucsb.edu/documents/app-categories/elections-and-transitions/campaign-documents?items_per_page=250&field_docs_start_date_time_value%5Bvalue%5D%5Bdate%5D=2016']

In [4]:
sdf = pd.DataFrame(columns=['title', 'date', 'person', 'link'])
for u in init_urls:
    r = requests.get(u)
    s = BeautifulSoup(r.text, 'html.parser')
    dates = [d.find("span",attrs={'class':'date-display-single'})['content'] for d in s.find_all("div",attrs={'class':'views-row'})]
    doc_links = [d.find("div",attrs={'class':'field-title'}).find("a") for d in s.find_all("div",attrs={'class':'views-row'})]
    titles = [d.string for d in doc_links]
    links = [d['href'] for d in doc_links]
    persons = [d.find("div",attrs={'class':'col-sm-4'}).find("a").string for d in s.find_all("div",attrs={'class':'views-row'})]
    sdf = sdf.append(pd.DataFrame({"title": titles, "date": dates, "person": persons, "link": links}))

In [8]:
# at this point it includes speeches by other people - Obama, Sanders
# it also includes dates outside of the campaign season
# to match Liu and Lei, dates should be from "April 1, 2015 for Clinton and June 16th, 2015 for Trump"
# to Trump’s victory and Clinton’s concession speeches on November 9, 2016
sdf = sdf[sdf['person'].isin(['Donald J. Trump','Hillary Clinton']) & sdf['date'].between('2015-04-01','2016-11-09')]

In [9]:
sdf["transcript"] = ""

In [10]:
# This will take a few minutes to download them all.
sdf = sdf.set_index('link')
for link in sdf.index:
    #link = '/documents/remarks-new-york-city-accepting-election-the-45th-president-the-united-states'
    #fname = link[11:]
    r = requests.get(base_url + link)
    s = BeautifulSoup(r.text, 'html.parser')
    transcript = s.find("div",attrs={'class':'field-docs-content'}).get_text()
    sdf.at[link, "transcript"] = transcript
    
sdf = sdf.reset_index()

In [11]:
sdf.to_json("data/ucsb_speeches_2016.json")

In [12]:
sdf

Unnamed: 0,link,title,date,person,transcript
0,/documents/remarks-town-hall-meeting-portsmout...,"Remarks at a Town Hall Meeting in Portsmouth, ...",2015-12-29T00:00:00+00:00,Hillary Clinton,\nCLINTON: Wow. Thank you. Thank you all. Than...
1,/documents/remarks-the-university-minnesota-mi...,Remarks at the University of Minnesota in Minn...,2015-12-15T00:00:00+00:00,Hillary Clinton,\nThank you. Thank you all very much. Thank yo...
2,/documents/interview-with-george-stephanopoulo...,Interview with George Stephanopoulos of ABC Ne...,2015-12-06T00:00:00+00:00,Hillary Clinton,\nSTEPHANOPOULOS: And we'll hear more on that ...
3,/documents/interview-with-charlie-rose,Interview with Charlie Rose,2015-12-01T00:00:00+00:00,Hillary Clinton,"\nROSE: She is a former first lady, a former s..."
4,/documents/remarks-and-question-and-answer-ses...,Remarks and a Question and Answer Session at t...,2015-11-19T00:00:00+00:00,Hillary Clinton,\nCLINTON: Thank you. Thank you very much. [ap...
...,...,...,...,...,...
158,/documents/interview-with-chuck-todd-nbc-news-...,"Interview with Chuck Todd of NBC News ""Meet th...",2016-01-17T00:00:00+00:00,Hillary Clinton,\nTODD: But we start with the Democrats and a ...
159,/documents/interview-with-jake-tapper-cnns-sta...,"Interview with Jake tapper of CNN's ""State of ...",2016-01-17T00:00:00+00:00,Hillary Clinton,\nTAPPER: We're joined right off the bat by De...
160,/documents/interview-with-alisyn-camerota-cnn,Interview with Alisyn Camerota of CNN,2016-01-13T00:00:00+00:00,Hillary Clinton,"\nCAMEROTA: Secretary Clinton, thanks so much ..."
161,/documents/interview-with-john-dickerson-cbs-n...,"Interview with John Dickerson of CBS News ""Fac...",2016-01-10T00:00:00+00:00,Hillary Clinton,\nDICKERSON: We're going to get right to our l...


In [1]:
#Liu and Lei: 
"""Specifically, the Clinton corpus contains 89 scripts with a total of 286,899 words while the
Trump corpus includes 74 scripts with a total of 276,212 words."""

'Specifically, the Clinton cor-\npus contains 89 scripts with a total of 286,899 words while the\nTrump corpus includes 74 scripts with a total of 276,212 words.'

In [13]:
sdf.groupby('person').count()

Unnamed: 0_level_0,link,title,date,transcript
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Donald J. Trump,75,75,75,75
Hillary Clinton,88,88,88,88


In [None]:
# Not identical, but close enough that we should get similar results.