In [4]:
DEBATE_URL = 'http://www.presidency.ucsb.edu/debates.php'
last_fetched_at = None
import json
import urllib.request, time, re, random, hashlib
import bs4
import time
import sys
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.corpus import wordnet
from nltk import bigrams

In [5]:
def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache/cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                #print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    #print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
    
        return result

In [6]:
def debate_processing(soup):
    return_list = []
    tables = soup.find_all('table')
    
    for table in tables:
        if table['width'] == '700' and table['bgcolor'] == "#FFFFFF":
            actual_table = table
    rows = actual_table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        try:
            link = row.find('a')['href']
            cols.append(link)
            return_list.append(cols)
        except:
            pass

    return return_list

In [7]:
def get_words_from_speech(link):
    result = fetch(link)
    soup = bs4.BeautifulSoup(result,'lxml')
    return soup

In [8]:
def get_debate_dict():
    result = fetch(DEBATE_URL)
    soup = bs4.BeautifulSoup(result,'lxml')
    debate_list = debate_processing(soup)
    debate_dict = {}
    for debate in debate_list:

        if ' ' not in debate[0]:
            debate = debate[1:]
        debate_id = ' '.join(debate[:2])
        try:
            debate_datetime = time.strptime(debate[0].replace('th','').replace('st',''),'%B %d, %Y')
        except:
            debate_datetime = None

        debate_dict[debate_id] = {}
        debate_dict[debate_id]['link'] = debate[2]
        debate_dict[debate_id]['time'] = debate_datetime 
        
        try:
            debate_dict[debate_id]['soup'] = get_words_from_speech(debate[2])
        except:
            debate_dict[debate_id]['soup'] = None
        
    return debate_dict

In [9]:
def find_politician_names(debate_dict):
    for key in debate_dict.keys():
        raw = get_soup_text(key)
        raw = raw.replace("--", ". ")
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())

        #find candidate names, most commonly repeated first words of sentences, not common words
        colon_names = []
        period_names = []

        #get names from before colons
        for sent in sents:
            if ':' in sent:
                sent = sent.split(':')
                possible_name = sent[0] + ":"
                possible_name_no_paren = remove_paren(possible_name).strip()
                if (len(possible_name_no_paren)<25) & (len(possible_name_no_paren)>2):
                    colon_names.append(possible_name_no_paren)

        fdist1 = FreqDist(colon_names)
        fdist1_above_5 = [name[0] for name in fdist1.most_common(15) if name[1]>5]
        
        #get names before periods
        for sent in sents:
            if len(nltk.word_tokenize(sent))<5:
                possible_name = sent
                possible_name_no_paren = remove_paren(possible_name).strip()
                if (len(possible_name_no_paren)<25) & (len(possible_name_no_paren)>2):
                    period_names.append(possible_name_no_paren)
                    
        fdist2 = FreqDist(period_names)
        fdist2_above_15 = [name[0] for name in fdist2.most_common(15) if name[1]>15]
    
        #add names to dict
        colon_name_highest_freq = fdist1.most_common(1)[0][1]
        if colon_name_highest_freq > 20 :
            debate_dict[key]['names'] = fdist1_above_5
        else:
            debate_dict[key]['names'] = fdist2_above_15
            
    return debate_dict

In [10]:
def get_soup_text(dbt):
    raw = debate_dict[dbt]['soup'].get_text()
    raw = raw.replace("\\", "")
    raw = raw.replace(".", ". ")
    raw = raw.replace("?", "? ")
    raw = raw.replace("!", "! ")
    raw = raw.replace("  ", " ")
    raw = raw.replace("-", "- ")
    raw = raw.replace("…", ". ")
    raw = raw.replace("...", ". ")
    return raw
    

In [11]:
def remove_paren(name):
    return_name = ''
    skip1c = 0
    skip2c = 0
    for i in name:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            return_name += i
    return return_name

In [12]:
 def clean_dirty_name_lookup(names):
    
    lookup_dict = {}
    
    for name in names:
        clean_name = name.split()[-1].upper().replace('.','').replace(')','').replace(';','').replace(':','')
        lookup_dict[name] = clean_name
    
    return lookup_dict

In [13]:
def get_election_year(year, dbt):
    year = debate_dict[dbt]['time'].tm_year
    year_mod = year % 4
    if year_mod == 0:
        election_year = year
    else:
        election_year = year + (4 - year_mod)
    return election_year

In [14]:
def clean_names(debate_dict):
    # Add debate year
    name_years = {}
    for dbt in debate_dict.keys():
        time = debate_dict[dbt]['time']

        # Get election year
        if time:
            election_year = get_election_year(time.tm_year, dbt)
        else:
            election_year = 'Uncertain Year'
        debate_dict[dbt]['election_year'] = election_year

        # Add new set of names from debate to name_years dict
        if election_year not in name_years:
            name_years[election_year] = {'names':set()}

        names = set(debate_dict[dbt]["names"])
        name_years[election_year]['names'] = name_years[election_year]['names'].union(names)

    # Reduce all names in one year to a single name
    for year in name_years:
        name_years[year]['lookup'] = clean_dirty_name_lookup(name_years[year]['names'])

    # Add lookup dictionary to debate dictionary
    for dbt in debate_dict.keys():
        election_year = debate_dict[dbt]['election_year']
        debate_dict[dbt]['lookup'] = name_years[election_year]['lookup']
        debate_dict[dbt]['clean_names'] = debate_dict[dbt]['lookup'].values()
    
    return debate_dict

In [15]:
def attribute_text(debate_dict):
    #make year/candidate dictionary for text
    cand_text_dict = {}
    for dbt in debate_dict.keys():
        year = debate_dict[dbt]['election_year']
        cand_text_dict[year] = {}
        for cand in debate_dict[dbt]["clean_names"]:
            cand_text_dict[year][cand] = {}
            cand_text_dict[year][cand]['full_text'] = ""
    
    #fill year/candidate dictionary
    for dbt in debate_dict.keys():
        #set variables
        year = debate_dict[dbt]["election_year"]
        names = debate_dict[dbt]["names"]
        if "write" in names:
            names.remove('write')
        
        #get debate soup
        raw = get_soup_text(dbt)
        
        #tokenize sents
        for name in names:
            raw = raw.replace(name, ". " + name)
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())
        
        #loop through sents
        current_speaker = ""
        got_first_speaker = False
        for sent in sents:
            new_speaker = (len([name for name in names if name in sent])>0)
            if(new_speaker):
                got_first_speaker = True
                current_speaker_dirty = [name for name in names if name in sent][0]
                current_speaker = debate_dict[dbt]["lookup"][current_speaker_dirty]
            
            if(got_first_speaker):
                sent_no_name = sent.replace(current_speaker_dirty, "")
                cand_text_dict[year][current_speaker]['full_text'] = (cand_text_dict[year][current_speaker]['full_text'] + " " + sent_no_name)

    return cand_text_dict

In [56]:
def similarity_model(cand_text_dict):
    dumbWords = stopwords.words('english')
    political_positions = ['Governor', 'Senator', 'President']
    
    
    #loop through election years
    for year in cand_text_dict.keys():
        #loop through candidates
        for cand in cand_text_dict[year].keys():
            #print(year, cand)
        
            tokens = nltk.word_tokenize(cand_text_dict[year][cand]['full_text'])
            text = nltk.Text(tokens)
            fdist_tokens = FreqDist(tokens)
            
            special_words = [word for word in tokens if len(word)>4 and fdist_tokens[word]>=5 
                             and wordnet.synsets(word) and word not in political_positions]
            cand_text_dict[year][cand]["special_words"] = special_words
            
            special_words_no_caps = [word for word in tokens if len(word)>4 and fdist_tokens[word]>=5 
                             and wordnet.synsets(word) and word[0].islower()]
            cand_text_dict[year][cand]["special_words_no_caps"] = special_words_no_caps
            
            if len(text)>0:
                #avg word len
                sum_len = sum([len(word) for word in text])
                cand_text_dict[year][cand]["avg_word_len"] = sum_len/len(text)
                
                #avg word len, no stopwords
                text_no_dumbWords = [word for word in text if word not in dumbWords]
                sum_len = sum([len(word) for word in text_no_dumbWords])
                cand_text_dict[year][cand]["avg_word_len_no_stopword"] = sum_len/len(text_no_dumbWords)
                
                #lex diversity                
                cand_text_dict[year][cand]["lex_diversity_no_stopword"] = (len(set(text_no_dumbWords)) / len(text_no_dumbWords))
            
            bgrms = list(bigrams(text))
            fdist_bgrms = FreqDist(bgrms)
            special_bgrms = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1])]
            cand_text_dict[year][cand]["special_bgrms"] = special_bgrms
            
            special_bgrms_no_caps = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1]) 
                                     and bgm[0][0].islower() and bgm[1][0].islower()]
            cand_text_dict[year][cand]["special_bgrms_no_caps"] = special_bgrms_no_caps
            
            special_bgrms_no_caps_stopwords = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1]) 
                                     and bgm[0][0].islower() and bgm[1][0].islower()
                                              and bgm[0] not in dumbWords and bgm[1] not in dumbWords]
            cand_text_dict[year][cand]["special_bgrms_no_caps_stopwords"] = special_bgrms_no_caps_stopwords
            
    return cand_text_dict
        

In [57]:
#main
'''#make a dictionary with debate info
debate_dict = get_debate_dict()

#find the names of the participants
debate_dict = find_politician_names(debate_dict)

#clean names and years for comparison within electoral years
debate_dict = clean_names(debate_dict)

#compile all text by candidate-year
cand_text_dict = attribute_text(debate_dict)'''

#create a model of text similarity
cand_text_dict2 = similarity_model(cand_text_dict)

print("end")

end


In [58]:
print(cand_text_dict2[2016]['TRUMP'].keys())

dict_keys(['special_bgrms_no_caps', 'avg_word_len', 'special_bgrms', 'special_words_no_caps', 'special_words', 'full_text', 'lex_diversity_no_stopword', 'special_bgrms_no_caps_stopwords', 'avg_word_len_no_stopword'])


In [62]:
print(FreqDist(cand_text_dict2[1960]['KENNEDY']['special_words_no_caps']).most_common(10))

[('think', 73), ('believe', 50), ('years', 42), ('going', 37), ('there', 31), ('country', 28), ('about', 27), ('people', 26), ('president', 25), ('dollars', 22)]


In [66]:
print(cand_text_dict2[1960]['KENNEDY']['lex_diversity_no_stopword'])

0.2170036418435263


In [71]:
for yr in cand_text_dict2.keys():
    for cand in cand_text_dict2[yr].keys():
        if len(cand_text_dict2[yr][cand]['full_text'])>3:
            print(cand_text_dict2[yr][cand]['avg_word_len_no_stopword'])

4.104138851802404
4.573079145002889
4.284582571602681
4.414370078740157
4.446633154079962
4.421238938053097
4.5636363636363635
4.0596617898536955
4.136774880255455
4.461883408071749
4.236434108527132
4.280459770114943
4.345758354755784
4.2677150435771125
4.176753121998079
3.9336343115124155
4.229874776386405
4.396773773903994
3.1315789473684212
4.086842105263158
4.194939620471535
3.932107496463932
4.475577654284878
4.6120959332638165
3.8774193548387097
4.308727034120735
3.949882645312716
4.171428571428572
4.776061776061776
3.9394625176803393
4.124902419984387
4.129302325581396
4.2719836400818
4.164976133651551
4.371165644171779
4.141197592142782
4.248967454154965
4.2532836516068935
4.00862911103875
4.533333333333333
4.529345372460496
4.6440677966101696
4.432336584186347
4.12488928255093
4.246820349761526
4.2598208132322535
3.8880733944954127
3.6653089724194112
4.71256038647343
4.12039312039312
4.191732629727353
2.30188679245283
4.264834478450968
4.427848101265822
4.278528855938363
4.46

NameError: name 'lookup_dict' is not defined