In [1]:
DEBATE_URL = 'http://www.presidency.ucsb.edu/debates.php'
last_fetched_at = None
import json
import urllib.request, time, re, random, hashlib
import bs4
import time
import sys
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import FreqDist

In [2]:
def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache/cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                #print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    #print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
    
        return result

In [3]:
def debate_processing(soup):
    return_list = []
    tables = soup.find_all('table')
    
    for table in tables:
        if table['width'] == '700' and table['bgcolor'] == "#FFFFFF":
            actual_table = table
    rows = actual_table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        try:
            link = row.find('a')['href']
            cols.append(link)
            return_list.append(cols)
        except:
            pass

    return return_list

In [4]:
def get_words_from_speech(link):
    result = fetch(link)
    soup = bs4.BeautifulSoup(result,'lxml')
    return soup

In [5]:
def get_debate_dict():
    result = fetch(DEBATE_URL)
    soup = bs4.BeautifulSoup(result,'lxml')
    debate_list = debate_processing(soup)
    final_list = {}
    for debate in debate_list:

        if ' ' not in debate[0]:
            debate = debate[1:]
        debate_id = ' '.join(debate[:2])
        try:
            debate_datetime = time.strptime(debate[0].replace('th','').replace('st',''),'%B %d, %Y')
        except:
            debate_datetime = None

        final_list[debate_id] = {}
        final_list[debate_id]['link'] = debate[2]
        final_list[debate_id]['time'] = debate_datetime 
        
        try:
            final_list[debate_id]['soup'] = get_words_from_speech(debate[2])
        except:
            final_list[debate_id]['soup'] = None
        
    return final_list

In [76]:
def find_politician_names(debate_soup_dict):
    for debate in debate_soup_dict.keys():
        raw = debate_soup_dict[debate]['soup'].get_text()
        raw = raw.replace("\\", "")
        raw = raw.replace(".", ". ")
        raw = raw.replace("?", "? ")
        raw = raw.replace("!", "! ")
        raw = raw.replace("  ", " ")
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())

        #find candidate names, most commonly repeated first words of sentences, not common words
        colon_names = []
        period_names = []
        dumbWords = stopwords.words('english')

        #get names from before colons
        for sent in sents:
            if ':' in sent:
                sent = sent.split(':')
                possible_name = sent[0]
                if (len(possible_name)<25) & (len(possible_name)>1):
                    possible_name_no_paren = remove_paren(possible_name).strip()
                    colon_names.append(possible_name_no_paren)

        fdist1 = FreqDist(colon_names)
        fdist1_above_5 = [name[0] for name in fdist1.most_common(10) if name[1]>5]
        
        #getnames before periods
        for sent in sents:
            if len(nltk.word_tokenize(sent))<5:
                possible_name = sent
                if (len(possible_name)<25) & (len(possible_name)>1):
                    possible_name_no_paren = remove_paren(possible_name).strip()
                    period_names.append(possible_name_no_paren)
        fdist2 = FreqDist(period_names)
        fdist2_above_15 = [name for name in fdist2.most_common(10) if name[1]>15 and name[0]]
    
        #add names to dict
        mostFreq = fdist1.most_common(1)[0][1]
        
        if mostFreq > 20 :
            debate_soup_dict[debate]['names'] = fdist1_above_5
        else:
            debate_soup_dict[debate]['names'] = fdist2_above_15
            
    return debate_soup_dict

In [77]:
def remove_paren(name):
    return_name = ''
    skip1c = 0
    skip2c = 0
    for i in name:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            return_name += i
    return return_name

In [78]:
#debate_dict = get_debate_dict()
debate_dict2 = find_politician_names(debate_dict)
for key in debate_dict.keys():
    print(debate_dict[key]["names"])


['MR. GIBSON', 'SEN. CLINTON', 'SEN. OBAMA', 'SEN. EDWARDS', 'RICHARDSON', 'MR. SPRADLING']
['Griffith', 'Woodruff', 'Forbes', 'McCain', 'Hatch', 'Bauer', 'Keyes']
['STEPHANOPOULOS', 'CLINTON', 'OBAMA', 'EDWARDS', 'BIDEN', 'RICHARDSON', 'GRAVEL', 'KUCINICH', 'DODD', 'YEPSEN']
['WOODRUFF', 'QUAYLE', 'BENTSEN', 'MARGOLIS', 'BROKAW']
['DICKERSON', 'SANDERS', 'CLINTON', "O'MALLEY", 'CORDES', 'COONEY']
['COOPER', 'CLINTON', 'SANDERS', "O'MALLEY", 'WEBB', 'CHAFEE', 'BASH', 'LOPEZ']
['MR. HUME', 'MR. WALLACE', 'MR. GOLER', 'MR. ROMNEY', 'MR. GIULIANI', 'SEN. MCCAIN', 'REP. PAUL', 'SEN. BROWNBACK', 'MR. HUCKABEE', 'REP. HUNTER']
[('THE MODERATOR.', 30), ('THE PRESIDENT.', 20), ('MR. CARTER.', 17)]
['DEMINT', 'S. KING', 'BACHMANN', 'KING', 'GEORGE', 'CAIN', 'PAUL', 'ROMNEY', 'GINGRICH', 'UNIDENTIFIED MALE']
[('Senator Kerry.', 39), ('Mr. Schieffer.', 39), ('President Bush.', 30)]
[('Mr. Newman.', 27), ('The President.', 26), ('Mr. Mondale.', 23)]
[('Mr. Lehrer.', 75), ('President Bush.', 36), (

In [None]:
'''  
    #keep all names above avg freq
        sum_freq = 0
        for name_freq in fdist1.most_common(10):
            if name_freq[0].upper() != "MODERATOR":
                sum_freq += name_freq[1]
        avg_freq = (sum_freq/10)
        
        fdist1_above_avg = [name[0] for name in fdist1.most_common(10) if name[1]>avg_freq]
'''