In [1]:
# Listing page fetched by get_debate_dict(); its table rows are parsed by
# debate_processing() to obtain the links to the individual debate pages.
DEBATE_URL = 'http://www.presidency.ucsb.edu/debates.php'
# time.time() stamp of the most recent network request issued by fetch();
# stays None until the first uncached download.
last_fetched_at = None
import json
import urllib.request, time, re, random, hashlib
import bs4
import time
import sys
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import FreqDist

In [2]:
def fetch(url):
    """Load the url compassionately.

    The response is cached on disk under ``cache/cache-file-<sha1 of url>``.
    A non-empty cache file is returned without touching the network. Otherwise
    the request is delayed so that a random 3-10 second gap separates it from
    the previous download, the page is fetched, cached and returned.

    Args:
        url: Address of the page to load.

    Returns:
        The page as a string. Note that this is ``str()`` of the raw response
        bytes (i.e. text of the form ``b'...'`` with escaped newlines), which
        is also what the cache files contain.

    Raises:
        urllib.error.URLError: If the download fails.
    """
    # Local import: ``os`` is not among the notebook's top-level imports.
    import os

    global last_fetched_at

    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache/cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
        if len(result) > 0:
            return result
    except (OSError, UnicodeError):
        # Missing or unreadable cache entry: fall through and download it.
        pass

    # Politeness delay. Both values are in seconds; the delay is drawn in
    # milliseconds and converted once so it can be compared with time.time().
    wait_seconds = random.randint(3000, 10000) / 1000
    if last_fetched_at is not None:
        elapsed = time.time() - last_fetched_at
        if elapsed < wait_seconds:
            time.sleep(wait_seconds - elapsed)

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = {'User-Agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        # NOTE(review): str(bytes) yields the repr "b'...'" rather than decoded
        # text. Kept on purpose: existing cache files and get_soup_text()
        # (which strips the backslashes) rely on this format.
        result = str(response.read())

    # Create the cache directory on demand so a fresh checkout does not lose
    # the download to a FileNotFoundError.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        f.write(result)

    return result

In [3]:
def debate_processing(soup):
    """Extract the debate rows from the parsed listing page.

    The listing is taken from the last ``<table>`` whose ``width`` is ``'700'``
    and whose ``bgcolor`` is ``"#FFFFFF"``. Every row of that table that
    contains a link contributes one entry.

    Args:
        soup: Parsed page (BeautifulSoup object) of the debate listing.

    Returns:
        A list of rows. Each row is the list of stripped ``<td>`` texts of a
        ``<tr>`` followed by the ``href`` of the first ``<a>`` in that row.
        Rows without a link are skipped; an empty list is returned when no
        matching table exists.
    """
    actual_table = None
    for table in soup.find_all('table'):
        # .get() yields None for an absent attribute; table['width'] would
        # raise KeyError on any table that lacks it.
        if table.get('width') == '700' and table.get('bgcolor') == "#FFFFFF":
            actual_table = table  # keep overwriting: the last match wins
    if actual_table is None:
        return []

    return_list = []
    for row in actual_table.find_all('tr'):
        anchor = row.find('a')
        link = anchor.get('href') if anchor is not None else None
        if link is None:
            # Header/spacer rows carry no link and are not debates.
            continue
        cols = [ele.text.strip() for ele in row.find_all('td')]
        cols.append(link)
        return_list.append(cols)

    return return_list

In [4]:
def get_words_from_speech(link):
    """Return the page at ``link`` (via fetch) parsed by BeautifulSoup with lxml."""
    return bs4.BeautifulSoup(fetch(link), 'lxml')

In [5]:
def _parse_debate_date(date_text):
    """Parse a date such as 'August 21st, 2007' into a struct_time, or None."""
    # Strip only ordinal suffixes that directly follow a digit; a blanket
    # replace of 'st'/'th' would also mangle month names such as "August".
    cleaned = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', date_text)
    try:
        return time.strptime(cleaned, '%B %d, %Y')
    except ValueError:
        return None


def get_debate_dict():
    """Download the debate listing and every debate page it links to.

    Returns:
        A dict keyed by debate id (the date text and the title joined by a
        space). Each value is a dict with:
            'link': address of the debate page,
            'time': struct_time of the debate date, or None if unparseable,
            'soup': parsed debate page, or None if it could not be loaded.
    """
    result = fetch(DEBATE_URL)
    soup = bs4.BeautifulSoup(result, 'lxml')
    debate_list = debate_processing(soup)
    final_list = {}
    for debate in debate_list:
        # Some rows start with an extra single-word cell; drop it so the date
        # text is always first.
        if debate and ' ' not in debate[0]:
            debate = debate[1:]
        if len(debate) < 3:
            # Not a (date, title, link) row; nothing usable to record.
            continue

        debate_id = ' '.join(debate[:2])
        entry = {
            'link': debate[2],
            'time': _parse_debate_date(debate[0]),
        }

        try:
            entry['soup'] = get_words_from_speech(debate[2])
        except Exception:
            # Best effort: one unreachable page must not abort the whole crawl.
            entry['soup'] = None

        final_list[debate_id] = entry

    return final_list

In [6]:
def _is_plausible_name(candidate):
    """Return True when the candidate is between 3 and 24 characters long."""
    return 2 < len(candidate) < 25


def find_politician_names(debate_soup_dict):
    """Guess the speaker labels used in each debate transcript.

    Two heuristics are evaluated per debate:
      * "colon names": the text before the first ':' of a sentence,
      * "period names": whole sentences of fewer than five tokens.
    Bracketed/parenthesised parts are removed and only candidates of 3-24
    characters are counted. If the most frequent colon name occurs more than
    20 times, the colon names seen more than 5 times are used; otherwise the
    period names seen more than 15 times are used (top 15 at most either way).

    The transcript text is obtained through get_soup_text(key), which reads
    the module-level ``debate_dict``.

    Args:
        debate_soup_dict: Mapping of debate id to the per-debate dict.

    Returns:
        The same dict, with a 'names' list added to every debate. Debates
        whose 'soup' is None get an empty list.
    """
    sent_detector = None  # loaded lazily, once, instead of once per debate
    for key in debate_soup_dict.keys():
        entry = debate_soup_dict[key]
        if 'soup' in entry and entry['soup'] is None:
            # The page could not be downloaded; there is nothing to analyse.
            entry['names'] = []
            continue

        raw = get_soup_text(key)
        # NOTE(review): get_soup_text() already rewrites '-' as '- ', so a
        # literal '--' probably never reaches this point -- confirm intent.
        raw = raw.replace("--", ". ")
        if sent_detector is None:
            sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())

        # Candidates taken from the text in front of a colon.
        colon_names = []
        for sent in sents:
            if ':' in sent:
                candidate = remove_paren(sent.split(':')[0] + ":").strip()
                if _is_plausible_name(candidate):
                    colon_names.append(candidate)

        fdist1 = FreqDist(colon_names)
        fdist1_above_5 = [name for name, count in fdist1.most_common(15) if count > 5]

        # Candidates that are a very short sentence on their own.
        period_names = []
        for sent in sents:
            if len(nltk.word_tokenize(sent)) < 5:
                candidate = remove_paren(sent).strip()
                if _is_plausible_name(candidate):
                    period_names.append(candidate)

        fdist2 = FreqDist(period_names)
        fdist2_above_15 = [name for name, count in fdist2.most_common(15) if count > 15]

        # most_common(1) is empty when no colon candidate was found at all.
        top_colon = fdist1.most_common(1)
        colon_name_highest_freq = top_colon[0][1] if top_colon else 0
        if colon_name_highest_freq > 20:
            entry['names'] = fdist1_above_5
        else:
            entry['names'] = fdist2_above_15

    return debate_soup_dict

In [7]:
def get_soup_text(key, debates=None):
    """Return the transcript text of one debate, normalised for tokenizing.

    Backslashes are removed, ellipses become a single sentence end, a space is
    inserted after '.', '?' and '!', double spaces are collapsed once, and a
    space is inserted after every '-'.

    Args:
        key: Debate id to look up.
        debates: Mapping of debate id to per-debate dict (with a 'soup'
            entry). Defaults to the module-level ``debate_dict``.

    Returns:
        The normalised text as a string.
    """
    source = debate_dict if debates is None else debates
    raw = source[key]['soup'].get_text()
    raw = raw.replace("\\", "")
    # Ellipses must be handled before single periods: once '.' has become
    # '. ' the literal '...' no longer exists and can never be matched.
    raw = raw.replace("…", ". ")
    raw = raw.replace("...", ". ")
    for mark in ".?!":
        raw = raw.replace(mark, mark + " ")
    raw = raw.replace("  ", " ")
    raw = raw.replace("-", "- ")
    return raw
    

In [8]:
def remove_paren(name):
    """Drop everything enclosed in [...] or (...) from ``name``.

    Nesting is tracked separately for each bracket type. Opening brackets are
    always dropped; a closing bracket without a matching opener is treated as
    an ordinary character.
    """
    kept = []
    square_depth = 0
    round_depth = 0
    for ch in name:
        if ch == '[':
            square_depth += 1
            continue
        if ch == '(':
            round_depth += 1
            continue
        if ch == ']' and square_depth > 0:
            square_depth -= 1
            continue
        if ch == ')' and round_depth > 0:
            round_depth -= 1
            continue
        # Ordinary character: keep it only when outside every bracket.
        if not (square_depth or round_depth):
            kept.append(ch)
    return ''.join(kept)

In [9]:
 def translate_names(names):
    
    lookup_dict = {}
    
    for name in names:
        clean_name = name.split()[-1].upper().replace('.','').replace(')','').replace(';','').replace(':','')
        lookup_dict[name] = clean_name
    
    return lookup_dict

In [10]:
def get_election_year(year, key):
    """Return the first multiple of four that is greater than or equal to ``year``.

    Args:
        year: Calendar year of the debate. When None, the year is read from
            the module-level ``debate_dict[key]['time']`` instead.
        key: Debate id; only used for the fallback lookup above.

    Returns:
        ``year`` itself if it is divisible by four, otherwise the next year
        that is.
    """
    if year is None:
        year = debate_dict[key]['time'].tm_year
    # -year % 4 is the distance up to the next multiple of four (0 if on one).
    return year + (-year % 4)

In [11]:
def clean_names_years(debate_dict):
    """Attach election year, name lookup and canonical names to every debate.

    For each debate an 'election_year' is stored ('Uncertain Year' when the
    debate has no time). The raw names of all debates of the same election
    year are pooled and passed to translate_names; the resulting lookup dict
    is shared by those debates as 'lookup', and its values are exposed as
    'clean_names'. The input dict is modified and returned.
    """
    pooled = {}

    # Pass 1: assign the election year and pool the raw names per year.
    for debate_id, entry in debate_dict.items():
        debate_time = entry['time']
        if debate_time:
            entry['election_year'] = get_election_year(debate_time.tm_year, debate_id)
        else:
            entry['election_year'] = 'Uncertain Year'

        bucket = pooled.setdefault(entry['election_year'], {'names': set()})
        bucket['names'] = bucket['names'].union(set(entry["names"]))

    # Pass 2: one raw-name -> canonical-name lookup per election year.
    for bucket in pooled.values():
        bucket['lookup'] = translate_names(bucket['names'])

    # Pass 3: hand the shared lookup back to every debate of that year.
    for entry in debate_dict.values():
        entry['lookup'] = pooled[entry['election_year']]['lookup']
        entry['clean_names'] = entry['lookup'].values()

    return debate_dict

In [24]:
def attribute_text(debate_dict):
    """Collect, per election year and candidate, everything that candidate said.

    Each transcript is split into sentences. A sentence containing one of the
    debate's raw speaker names switches the current speaker (the first
    matching name in the debate's 'names' order wins); the sentence, with that
    raw name removed, and all following sentences are credited to the speaker
    until the next switch. Sentences before the first speaker are ignored.

    The transcript text comes from get_soup_text(key), which reads the
    module-level ``debate_dict``. Debates whose 'soup' is None are skipped.
    The input dict is not modified.

    Args:
        debate_dict: Mapping of debate id to per-debate dict providing
            'election_year', 'names', 'lookup' and 'clean_names'.

    Returns:
        Dict of election year -> canonical candidate name -> text. Each
        credited sentence is preceded by a single space; candidates without
        any credited sentence map to ''.
    """
    # Sentence fragments are gathered in lists and joined once at the end,
    # which avoids re-copying an ever-growing string for every sentence.
    fragments = {}
    for entry in debate_dict.values():
        # setdefault: debates of the same year must not wipe each other's
        # candidates.
        year_bucket = fragments.setdefault(entry['election_year'], {})
        for candidate in entry["clean_names"]:
            year_bucket.setdefault(candidate, [])

    sent_detector = None  # loaded lazily, once, instead of once per debate
    for key, entry in debate_dict.items():
        if 'soup' in entry and entry['soup'] is None:
            # The page could not be downloaded; nothing to attribute.
            continue

        election_year = entry["election_year"]
        # 'write' is a known false positive among the detected names. Filter
        # into a new list so the caller's 'names' list stays untouched.
        names = [name for name in entry["names"] if name != 'write']

        raw = get_soup_text(key)

        # Force a sentence break in front of every speaker label.
        for name in names:
            raw = raw.replace(name, ". " + name)
        if sent_detector is None:
            sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())

        current_speaker = None
        current_speaker_dirty = None
        for sent in sents:
            mentioned = next((name for name in names if name in sent), None)
            if mentioned is not None:
                current_speaker_dirty = mentioned
                current_speaker = entry["lookup"][mentioned]

            if current_speaker_dirty is None:
                # Preamble before anybody has been identified.
                continue

            sent_no_name = sent.replace(current_speaker_dirty, "")
            fragments[election_year].setdefault(current_speaker, []).append(sent_no_name)

    return {
        year: {
            candidate: "".join(" " + piece for piece in pieces)
            for candidate, pieces in year_bucket.items()
        }
        for year, year_bucket in fragments.items()
    }

In [25]:
# main
# Run the whole pipeline in this cell so it works after "Restart & Run All"
# instead of relying on a `debate_dict` left over from an earlier session.
# fetch() serves pages from the on-disk cache, so re-runs avoid re-downloading.
# `debate_dict` must be a module-level name: get_soup_text() reads it.
debate_dict = get_debate_dict()
debate_dict = find_politician_names(debate_dict)
debate_dict = clean_names_years(debate_dict)
candidate_text_dict = attribute_text(debate_dict)
print("end")

end


In [26]:
# Spot check: the text attribute_text() credited to the speaker key "OBAMA"
# under election year 2008.
print(candidate_text_dict[2008]["OBAMA"])

  Thank you very much, Charlie and George, and thanks to all in the audience and who are out there. You know, Senator Clinton and I have been running for 15 months now. We've been traveling across Pennsylvania for at least the last five weeks. And everywhere I go, what I've been struck by is the core decency and generosity of people of Pennsylvania and the American people. But what I've also been struck by is the frustration. You know, I met a gentleman in Latrobe who had lost his job and was trying to figure out how he could find the gas money to travel to find a job. And that story, I think, is typical of what we're seeing all across the country. People are frustrated not only with jobs moving and incomes being flat, health care being too expensive, but also that special interests have come to dominate Washington, and they don't feel like they're being listened to. I think this election offers us an opportunity to change that, to transform that frustration into something more hopeful