In [1]:
import requests, lxml.html
import pandas as pd

# Let long text columns (debate titles, links) show up to 100 characters before truncation.
pd.options.display.max_colwidth = 100

In [2]:
# Fetch the index page that lists the debate transcripts.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being scraped into an
# empty or misleading table.
response = requests.get("http://www.presidency.ucsb.edu/debates.php", timeout=30)
response.raise_for_status()
doc = lxml.html.fromstring(response.content)

# Build one [date, debate, link] record per "td.doctext" cell, reading the
# values from the cell's parent element (the enclosing table row).
rows = []
for el in doc.cssselect("td.doctext"):
    table_row = el.getparent()
    date = table_row.cssselect("td.docdate")[0].text_content()
    debate = table_row.cssselect("td.doctext")[0].text_content()
    # Entries without a transcript anchor keep an empty link so they can be
    # filtered out below.
    anchors = table_row.cssselect("td.doctext a")
    link = anchors[0].get('href') if anchors else ""
    rows.append([date, debate, link])

df = pd.DataFrame(rows, columns=['date', 'debate', 'link'])
df['date'] = pd.to_datetime(df.date)
df['year'] = df.date.dt.year

# Preview only the debates that actually link to a transcript.
df[df.link != ""].head()

Unnamed: 0,date,debate,link,year
4,2016-04-14,"Democratic Candidates Debate in Brooklyn, New York",http://www.presidency.ucsb.edu/ws/index.php?pid=116995,2016
5,2016-03-09,"Democratic Candidates Debate in Miami, Florida",http://www.presidency.ucsb.edu/ws/index.php?pid=112719,2016
6,2016-03-06,"Democratic Candidates Debate in Flint, Michigan",http://www.presidency.ucsb.edu/ws/index.php?pid=112718,2016
7,2016-02-11,"Democratic Candidates Debate in Milwaukee, Wisconsin",http://www.presidency.ucsb.edu/ws/index.php?pid=111520,2016
8,2016-02-04,"Democratic Candidates Debate in Durham, New Hampshire",http://www.presidency.ucsb.edu/ws/index.php?pid=111471,2016


In [3]:
# Pick one debate from 2015 or later: order the matching rows by year and keep the first.
# Which of the rows sharing the lowest year comes first is left to the sort's tie handling.
df.loc[df['year'] >= 2015].sort_values(by='year').iloc[:1]

Unnamed: 0,date,debate,link,year
31,2015-08-06,"Republican Candidates ""Undercard"" Debate in Cleveland, Ohio",http://www.presidency.ucsb.edu/ws/index.php?pid=110757,2015


In [4]:
# Pick one debate from 2012 or later: order the matching rows by year and keep the first.
# Which of the rows sharing the lowest year comes first is left to the sort's tie handling.
df.loc[df['year'] >= 2012].sort_values(by='year').iloc[:1]

Unnamed: 0,date,debate,link,year
42,2012-01-07,"Republican Candidates Debate in Manchester, New Hampshire",http://www.presidency.ucsb.edu/ws/index.php?pid=98813,2012


In [5]:
# Pick one debate from 2011 or later: order the matching rows by year and keep the first.
# Which of the rows sharing the lowest year comes first is left to the sort's tie handling.
df.loc[df['year'] >= 2011].sort_values(by='year').iloc[:1]

Unnamed: 0,date,debate,link,year
55,2011-06-13,"Republican Candidates Debate in Manchester, New Hampshire",http://www.presidency.ucsb.edu/ws/index.php?pid=90513,2011


In [6]:
# Pick one debate from 2008 or later: order the matching rows by year and keep the first.
# Which of the rows sharing the lowest year comes first is left to the sort's tie handling.
df.loc[df['year'] >= 2008].sort_values(by='year').iloc[:1]

Unnamed: 0,date,debate,link,year
83,2008-01-05,"Republican Candidates Debate in Manchester, New Hampshire",http://www.presidency.ucsb.edu/ws/index.php?pid=76223,2008


# Parsing the debate transcripts (keep scrolling below at your own peril!)

In [7]:
class Utterance(object):
    """A stretch of debate text attributed to a single speaker.

    Attributes:
        speaker: The speaker label, or None when the text has no attributed
            speaker (parse_debate starts out with no current speaker).
        text: What was said. parse_debate appends further paragraphs by the
            same speaker to this string, separated by newlines.
    """

    def __init__(self, speaker, text):
        self.speaker = speaker
        self.text = text

    def __str__(self):
        """Return "SPEAKER: text", or just the text when there is no speaker."""
        # speaker may be None; concatenating None with a str raises TypeError.
        if self.speaker is None:
            return self.text
        return self.speaker + ": " + self.text

    def __repr__(self):
        """Return the string form of the (speaker, text) tuple."""
        return str((self.speaker, self.text))

def parse_debate(url, timeout=30):
    """Download a debate transcript page and split it into speaker turns.

    The transcript is read from the first ``span.displaytext`` element of the
    page. Starting at its first ``<p>`` child, paragraphs are walked in order:
    a paragraph containing a ``<b>`` tag switches the current speaker to that
    bold text (minus trailing colons), and a paragraph without one continues
    the current speaker. Consecutive paragraphs by the same speaker are merged
    into a single Utterance, joined by newlines.

    Args:
        url: Address of the transcript page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of Utterance objects in transcript order. Text that appears
        before any bold speaker label is stored with ``speaker`` set to None.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the page has no ``span.displaytext`` element, or that
            element has no ``<p>`` children.
        AssertionError: if a non-``<p>`` element follows the first paragraph.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    doc = lxml.html.fromstring(response.content)

    containers = doc.cssselect("span.displaytext")
    if not containers:
        raise ValueError("no span.displaytext element found at " + url)
    elements = containers[0].getchildren()

    # Anything before the first <p> is skipped.
    first_p_tag_index = next(
        (i for i, el in enumerate(elements) if el.tag == "p"), None
    )
    if first_p_tag_index is None:
        raise ValueError("no <p> elements found in the transcript at " + url)

    # The parsing below relies on the rest of the transcript being only <p> tags.
    assert all(el.tag == "p" for el in elements[first_p_tag_index:]), \
        "unexpected non-<p> element after the first paragraph"

    # NOTE(review): presumably a leading paragraph that lists the moderators
    # rather than spoken text, so it is skipped -- confirm against the pages.
    if "MODERATOR" in elements[first_p_tag_index].text_content():
        first_p_tag_index += 1

    utterances = []
    current_speaker = None

    for p_tag in elements[first_p_tag_index:]:
        b_tags = p_tag.cssselect("b")
        if b_tags:
            # The first bold run is the speaker label; remove its first
            # occurrence from the paragraph so only the speech remains.
            label = b_tags[0].text_content()
            current_text = p_tag.text_content().replace(label, "", 1).strip()
            current_speaker = label.rstrip(":")
        else:
            # No label: the paragraph continues the current speaker.
            current_text = p_tag.text_content()

        if utterances and utterances[-1].speaker == current_speaker:
            utterances[-1].text += "\n" + current_text
        else:
            utterances.append(Utterance(current_speaker, current_text))

    return utterances

In [8]:
# 2016-04-14 - Democratic Candidates Debate in Brooklyn, New York
# Parse the transcript and preview its first three speaker turns.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=116995")
for turn in utterances[:3]:
    print("{!r}".format(turn))
    print("------------------")

('BLITZER', "Secretary Clinton and Senator Sanders, you can now move to your lecterns while I explain a few ground rules. As moderator, I'll guide the discussion, asking questions and follow-ups. You'll also get questions from Dana Bash and Errol Louis. You'll each have one minute and 15 seconds to answer questions, 30 seconds for follow- ups. Timing lights will signal when your time is up. Both candidates have agreed to these rules now. Opening statements, you'll each have two minutes.\nLet's begin with Senator Sanders. [applause] ")
------------------
('SANDERS', "Wolf, thank you very much. CNN, thank you very much. Secretary Clinton, thank you very much.\nWhen we began this campaign almost a year ago, we started off at 3 percent in the polls. We were about 70 points behind Secretary Clinton. In the last couple of weeks, there were two polls out there that had us ahead. [applause] \nOf the last nine caucuses and primaries, we have won eight of them, many of them by landslide victorie

In [9]:
# 2015-08-06 - Republican Candidates "Undercard" Debate in Cleveland, Ohio
# Parse the transcript and preview its first three speaker turns.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=110757")
for turn in utterances[:3]:
    print("{!r}".format(turn))
    print("------------------")

('HEMMER', "This is first official event in the campaign for the Republican nomination for president. Welcome to Cleveland Ohio. It is debate night.\nI'm Bill Hemmer. ")
------------------
('MacCALLUM', "And I'm Martha MacCallum.\nIt all starts here. We are ready, the candidates are ready. We're live at the Quicken Loans Arena, where we have partnered with Facebook to bring you, the voter, into today's debate. ")
------------------
('HEMMER', "So you will hear from all 17 candidates tonight, and you'll meet seven of them right now, starting with three-time governor in the state of Texas, Rick Perry. [applause]")
------------------


In [10]:
# 2012-01-07 - Republican Candidates Debate in Manchester, New Hampshire
# Parse the transcript and preview its first three speaker turns.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=98813")
for turn in utterances[:3]:
    print("{!r}".format(turn))
    print("------------------")

('SAWYER', 'And good evening to all of you. Welcome to Saint Anselm College and the first debate of the year, 2012. The voting is underway. And, George, those eight votes in Iowa reminded us on Tuesday every vote counts.')
------------------
('STEPHANOPOULOS', "No question about it, we are off and running. Great to be here with you, Josh. And now let's introduce the candidates: former Governor Jon Huntsman; Texas Congressman Ron Paul; former Governor of Massachusetts Mitt Romney; former Senator from Pennsylvania Rick Santorum; the former speaker of the House, Newt Gingrich; and Texas Governor Rick Perry.")
------------------
('SAWYER', "And it is time to remind everyone again of the rules, which are pretty straightforward, and we remind you again, they were negotiated and agreed to by the candidates themselves. So let's take you through them.\nOne-minute responses to the question, with 30 seconds for rebuttal. And we're showing everybody at home that the candidates will see green, and 

In [11]:
# 2011-06-13 - Republican Candidates Debate in Manchester, New Hampshire
# Parse the transcript and preview its first three speaker turns.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=90513")
for turn in utterances[:3]:
    print("{!r}".format(turn))
    print("------------------")

('JOHN KING', "Welcome to Saint Anselm College in Manchester, New Hampshire, and the first Republican presidential debate in this first-in-the-nation primary state. Behind me on this stage, the Republican candidates for president appearing together on the same stage for the first time tonight.\nAnd tonight's debate will be different than any presidential debate you've ever seen. Over the course of the next two hours, in addition to questions from myself and journalists from our partners, WMUR and the New Hampshire Union Leader, the candidates will take questions directly from voters right here in Manchester, as well as from voters at town meetings taking place tonight all across New Hampshire. \nSo let's get right to it and meet the candidates. Now, we've asked for no opening statements. However, we will continue a tradition from our past New Hampshire debates, to ask each candidate in one short sentence -- hopefully, five, maybe six or seven seconds -- to introduce themselves to the v

In [12]:
# 2008-01-05 - Republican Candidates Debate in Manchester, New Hampshire
# Parse the transcript and preview its first three speaker turns.
utterances = parse_debate("http://www.presidency.ucsb.edu/ws/index.php?pid=76223")
for turn in utterances[:3]:
    print("{!r}".format(turn))
    print("------------------")

('DIANE SAWYER', "Well, it is time for the great debate to begin. Let's go to Charlie Gibson at St. Anselm College in Manchester, New Hampshire.\nCharlie? ")
------------------
('CHARLIE GIBSON', 'Thanks very much, Diane.\nAnd we have been joined on the stage by the six leading Republican candidates for the Republican nomination for president. And I want to introduce them to you from left to right. \nThe positions in which they sit were drawn by lot, and so let me introduce them from left to right: Senator John McCain from the state of Arizona, former Senator Fred Thompson from Tennessee, Congressman Ron Paul from Texas, former Governor Mitt Romney of Massachusetts, former Governor Mike Huckabee from Arkansas, and former Mayor Rudy Giuliani from the city of New York. \nAnd, gentlemen, just at the risk of being repetitive, I hope you will take the questions posed in these first 45 minutes and I hope, to the extent we can, discuss them among yourselves. ')
------------------
('MR. GIBSON

Similar Analysis: http://www.realclearpolitics.com/articles/2016/04/14/debate_text_analysis_trump_is_the_moderate_130275.html