In [62]:
DEBATE_URL = 'http://www.presidency.ucsb.edu/debates.php'
last_fetched_at = None
import json
import urllib.request, time, re, random, hashlib
import bs4
import time
import sys
import nltk
import nltk.data
from itertools import combinations
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.corpus import wordnet
from nltk import bigrams
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np
from scipy.spatial import distance
import time
SAVE_FILE = 'text_dict.json'

In [20]:
def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache/cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                #print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    #print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
    
        return result

In [21]:
def debate_processing(soup):
    return_list = []
    tables = soup.find_all('table')
    
    for table in tables:
        if table['width'] == '700' and table['bgcolor'] == "#FFFFFF":
            actual_table = table
    rows = actual_table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        try:
            link = row.find('a')['href']
            cols.append(link)
            return_list.append(cols)
        except:
            pass

    return return_list

In [22]:
def get_words_from_speech(link):
    result = fetch(link)
    soup = bs4.BeautifulSoup(result,'lxml')
    return soup

In [23]:
def get_debate_dict():
    result = fetch(DEBATE_URL)
    soup = bs4.BeautifulSoup(result,'lxml')
    debate_list = debate_processing(soup)
    debate_dict = {}
    for debate in debate_list:

        if ' ' not in debate[0]:
            debate = debate[1:]
        debate_id = ' '.join(debate[:2])
        try:
            debate_datetime = time.strptime(debate[0].replace('th','').replace('st',''),'%B %d, %Y')
        except:
            debate_datetime = None

        debate_dict[debate_id] = {}
        debate_dict[debate_id]['link'] = debate[2]
        debate_dict[debate_id]['time'] = debate_datetime 
        
        try:
            debate_dict[debate_id]['soup'] = get_words_from_speech(debate[2])
        except:
            debate_dict[debate_id]['soup'] = None
        
    return debate_dict

In [25]:
def find_politician_names(debate_dict):
    for key in debate_dict.keys():
        raw = get_soup_text(debate_dict[key])
        raw = raw.replace("--", ". ")
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())

        #find candidate names, most commonly repeated first words of sentences, not common words
        colon_names = []
        period_names = []

        #get names from before colons
        for sent in sents:
            if ':' in sent:
                sent = sent.split(':')
                possible_name = sent[0] + ":"
                possible_name_no_paren = remove_paren(possible_name).strip()
                if (len(possible_name_no_paren)<25) & (len(possible_name_no_paren)>2):
                    colon_names.append(possible_name_no_paren)

        fdist1 = FreqDist(colon_names)
        fdist1_above_5 = [name[0] for name in fdist1.most_common(15) if name[1]>5]
        
        #get names before periods
        for sent in sents:
            if len(nltk.word_tokenize(sent))<5:
                possible_name = sent
                possible_name_no_paren = remove_paren(possible_name).strip()
                if (len(possible_name_no_paren)<25) & (len(possible_name_no_paren)>2):
                    period_names.append(possible_name_no_paren)
                    
        fdist2 = FreqDist(period_names)
        fdist2_above_15 = [name[0] for name in fdist2.most_common(15) if name[1]>15]
    
        #add names to dict
        colon_name_highest_freq = fdist1.most_common(1)[0][1]
        if colon_name_highest_freq > 20 :
            debate_dict[key]['names'] = fdist1_above_5
        else:
            debate_dict[key]['names'] = fdist2_above_15
            
    return debate_dict

In [26]:
def get_soup_text(dbt):
    raw = dbt['soup'].get_text()
    raw = raw.replace("\\", "")
    raw = raw.replace(".", ". ")
    raw = raw.replace("?", "? ")
    raw = raw.replace("!", "! ")
    raw = raw.replace("  ", " ")
    raw = raw.replace("-", "- ")
    raw = raw.replace("…", ". ")
    raw = raw.replace("...", ". ")
    return raw
    

In [27]:
def remove_paren(name):
    return_name = ''
    skip1c = 0
    skip2c = 0
    for i in name:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            return_name += i
    return return_name

In [28]:
 def clean_dirty_name_lookup(names):
    
    lookup_dict = {}
    
    for name in names:
        clean_name = name.split()[-1].upper().replace('.','').replace(')','').replace(';','').replace(':','')
        lookup_dict[name] = clean_name
    
    return lookup_dict

In [29]:
def get_election_year(year, dbt):
    year = dbt['time'].tm_year
    year_mod = year % 4
    if year_mod == 0:
        election_year = year
    else:
        election_year = year + (4 - year_mod)
    return election_year

In [30]:
def clean_names(debate_dict):
    # Add debate year
    name_years = {}
    for dbt in debate_dict.keys():
        time = debate_dict[dbt]['time']

        # Get election year
        if time:
            election_year = get_election_year(time.tm_year, debate_dict[dbt])
        else:
            election_year = 'Uncertain Year'
        debate_dict[dbt]['election_year'] = election_year

        # Add new set of names from debate to name_years dict
        if election_year not in name_years:
            name_years[election_year] = {'names':set()}

        names = set(debate_dict[dbt]["names"])
        name_years[election_year]['names'] = name_years[election_year]['names'].union(names)

    # Reduce all names in one year to a single name
    for year in name_years:
        name_years[year]['lookup'] = clean_dirty_name_lookup(name_years[year]['names'])

    # Add lookup dictionary to debate dictionary
    for dbt in debate_dict.keys():
        election_year = debate_dict[dbt]['election_year']
        debate_dict[dbt]['lookup'] = name_years[election_year]['lookup']
        debate_dict[dbt]['clean_names'] = debate_dict[dbt]['lookup'].values()
    
    return debate_dict

In [31]:
def attribute_text(debate_dict):
    #make year/candidate dictionary for text
    cand_text_dict = {}
    for dbt in debate_dict.keys():
        year = debate_dict[dbt]['election_year']
        cand_text_dict[year] = {}
        for cand in debate_dict[dbt]["clean_names"]:
            cand_text_dict[year][cand] = {}
            cand_text_dict[year][cand]['full_text'] = ""
    
    #fill year/candidate dictionary
    for dbt in debate_dict.keys():
        #set variables
        year = debate_dict[dbt]["election_year"]
        names = debate_dict[dbt]["names"]
        if "write" in names:
            names.remove('write')
        
        #get debate soup
        raw = get_soup_text(debate_dict[dbt])
        
        #tokenize sents
        for name in names:
            raw = raw.replace(name, ". " + name)
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())
        
        #loop through sents
        current_speaker = ""
        got_first_speaker = False
        for sent in sents:
            new_speaker = (len([name for name in names if name in sent])>0)
            if(new_speaker):
                got_first_speaker = True
                current_speaker_dirty = [name for name in names if name in sent][0]
                current_speaker = debate_dict[dbt]["lookup"][current_speaker_dirty]
            
            if(got_first_speaker):
                sent_no_name = sent.replace(current_speaker_dirty, "")
                cand_text_dict[year][current_speaker]['full_text'] = (cand_text_dict[year][current_speaker]['full_text'] + " " + sent_no_name)

    return cand_text_dict

In [32]:
def similarity_model(cand_text_dict):
    dumbWords = stopwords.words('english')
    political_positions = ['Governor', 'Senator', 'President']
    
    
    #loop through election years
    for year in cand_text_dict.keys():
        #loop through candidates
        for cand in cand_text_dict[year].keys():
            #print(year, cand)
        
            tokens = nltk.word_tokenize(cand_text_dict[year][cand]['full_text'])
            text = nltk.Text(tokens)
            fdist_tokens = FreqDist(tokens)
            
            special_words = [word for word in tokens if len(word)>4 and fdist_tokens[word]>=5 
                             and wordnet.synsets(word) and word not in political_positions]
            cand_text_dict[year][cand]["special_words"] = special_words
            
            special_words_no_caps = [word for word in tokens if len(word)>4 and fdist_tokens[word]>=5 
                             and wordnet.synsets(word) and word[0].islower()]
            cand_text_dict[year][cand]["special_words_no_caps"] = special_words_no_caps
            
            if len(text)>0:
                #avg word len
                sum_len = sum([len(word) for word in text])
                cand_text_dict[year][cand]["avg_word_len"] = sum_len/len(text)
                
                #avg word len, no stopwords
                text_no_dumbWords = [word for word in text if word not in dumbWords]
                sum_len = sum([len(word) for word in text_no_dumbWords])
                cand_text_dict[year][cand]["avg_word_len_no_stopword"] = sum_len/len(text_no_dumbWords)
                
                #lex diversity                
                cand_text_dict[year][cand]["lex_diversity_no_stopword"] = (len(set(text_no_dumbWords)) / len(text_no_dumbWords))
            
            bgrms = list(bigrams(text))
            fdist_bgrms = FreqDist(bgrms)
            special_bgrms = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1])]
            cand_text_dict[year][cand]["special_bgrms"] = special_bgrms
            
            special_bgrms_no_caps = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1]) 
                                     and bgm[0][0].islower() and bgm[1][0].islower()]
            cand_text_dict[year][cand]["special_bgrms_no_caps"] = special_bgrms_no_caps
            
            special_bgrms_no_caps_stopwords = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1]) 
                                     and bgm[0][0].islower() and bgm[1][0].islower()
                                              and bgm[0] not in dumbWords and bgm[1] not in dumbWords]
            cand_text_dict[year][cand]["special_bgrms_no_caps_stopwords"] = special_bgrms_no_caps_stopwords
            
    return cand_text_dict
        

In [33]:
def main_function(filename):
    debate_dict = get_debate_dict()

    #find the names of the participants
    debate_dict = find_politician_names(debate_dict)

    #clean names and years for comparison within electoral years
    debate_dict = clean_names(debate_dict)

    #compile all text by candidate-year
    cand_text_dict = attribute_text(debate_dict)

    #create a model of text similarity
    cand_text_dict = similarity_model(cand_text_dict)

    with open(filename, 'w') as fp:
        json.dump(cand_text_dict, fp)
    return cand_text_dict





In [34]:
cand_text_dict = main_function(SAVE_FILE)

In [38]:
print(cand_text_dict[1984].keys())

dict_keys(['WRITE', 'WHITE', 'PRESIDENT', 'MASHEK', 'NEWMAN', 'VANOCUR', 'WALTERS', 'BUSH', 'BOYD', 'MONDALE', 'FERRARO', 'QUARLES'])


In [39]:
FreqDist(cand_text_dict[1960]['NIXON']['special_bgrms_no_caps_stopwords']).most_common(1)

[(('federal', 'government'), 13)]

In [43]:
FreqDist(cand_text_dict[1976]['PRESIDENT']['special_bgrms_no_caps_stopwords']).most_common(20)

[(('foreign', 'policy'), 12),
 (('tax', 'reduction'), 9),
 (('present', 'time'), 8),
 (('tax', 'bill'), 7),
 (('best', 'interests'), 6),
 (('defense', 'budget'), 6),
 (('make', 'certain'), 5),
 (('good', 'job'), 5),
 (('year', 'ago'), 4),
 (('mining', 'bill'), 4),
 (('cruise', 'missiles'), 4),
 (('million', 'metric'), 4),
 (('made', 'available'), 4),
 (('billion', 'tax'), 4),
 (('strip', 'mining'), 4),
 (('metric', 'tons'), 4),
 (('net', 'result'), 4),
 (('constitutional', 'amendment'), 4),
 (('million', 'people'), 4),
 (('tax', 'relief'), 4)]

In [53]:
cand_text_dict[1984]['BUSH'].keys()

dict_keys(['special_bgrms_no_caps_stopwords', 'lex_diversity_no_stopword', 'avg_word_len_no_stopword', 'special_bgrms_no_caps', 'special_words', 'avg_word_len', 'special_words_no_caps', 'full_text', 'special_bgrms'])

In [127]:
def knn_direct_compare(debate_dict,var_list,distance_method,weights=None):
    
    # locations of id and text in score dataframe
    cand_scores = []
    for year in debate_dict:
        # Build each row of dataframe
        for candidate in debate_dict[year]:
            cand_year_dict = debate_dict[year][candidate]
            cand_id = candidate + '_' + str(year)
            var_score_list = [cand_year_dict[x] for x in cand_year_dict if x in var_list]
            cand_scores.append([cand_id,cand_year_dict['full_text']] + var_score_list)
            
    # build normalized dataframe with name and text as first columns, var_list as col keys
    cand_df = normalize_scores(cand_scores,var_list)   

    
    tfidf_freq = get_tfidf_vectors(cand_df['full_text'])
    tf_cols = list(tfidf_freq.columns)
    df = pd.concat([cand_df,tfidf_freq],axis=1)

    
    num_rows = len(df)
    dist_dict = {}
    # Loop over all combinations and calculate distances
    for i in range(num_rows-1):
        print(i)
        for j in range(i+1,num_rows):
            
            # default to equal weights
            if weights == None:
                weights = [1]*(len(var_list) + 1)
            
            name1 = df.ix[i]['cand_name']
            name2 = df.ix[j]['cand_name']
            score = calculate_score(df.ix[i],df.ix[j],weights,var_list,tf_cols,distance_method)
            

            # fill both sides of dictionary with relative distances

            if name1 not in dist_dict:
                dist_dict[name1] = {}
            if name2 not in dist_dict:
                dist_dict[name2] = {}
            dist_dict[name1][name2] = score
            dist_dict[name2][name1] = score


    return df,dist_dict        
    

In [141]:
def calculate_score(row1,row2,weights,var_list,tfidf_cols,distance_method):
    
    tfidf_weight = weights[-1]
    var_weights = weights[:-1]
    
    
    # equal weight to all tfidf features
    tfidf_weight_list = [tfidf_weight / len(tfidf_cols)]*len(tfidf_cols)
    
    weights = [tfidf_weight_list,var_weights]
    columns = [tfidf_cols,var_list]
    r1_weighted = weight_rows(row1,columns,weights)
    r2_weighted = weight_rows(row2,columns,weights)

    return distance_method(r1_weighted,r2_weighted)
    
    
    

In [46]:
def weight_rows(row,col_list,weight_list):
    rv = np.array(row[col_list[0]]) * weight_list[0]
    for i in range(1,len(col_list)):
        rv = np.hstack([rv,np.array(row[col_list[0]]*weight_list[0])])
    
    return rv

In [47]:
def fill_dist_dict(dist_dict,name1,name2,var_dist,tfdif_dist):
    if name1 not in dist_dict:
        dist_dict[name1] = {}
    if name2 not in dist_dict:
        dist_dict[name2] = {}
    
    cur_dict = dist_dict[name1]
    cur_dict[name2] = {'gen_distance':var_dist,'tfdif_dist':tfdif_dist}

In [50]:
def normalize_scores(list_of_scores,var_list):
    
    df = pd.DataFrame(list_of_scores)
    
    # Rename columns
    col_dict = {0:'cand_name',1:'full_text'}
    for i, var in enumerate(var_list):
        col_dict[i+2] = var
    df = df.rename(columns=col_dict)
    df.fillna(0,inplace=True) # none existent should be 0
    
    
    df[var_list] = normalize(df[var_list],'l1',axis=0)
    return df
            

In [51]:
def get_tfidf_vectors(list_of_texts):
    # Used basis of code from hw 1
    vectorizer = TfidfVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = stopwords.words('english'),
                                 lowercase= True,
                                 max_features = 1000,
                                 smooth_idf = True) # Enable smoothing
    compressed_vectors = vectorizer.fit_transform(list_of_texts)
    df = pd.DataFrame(compressed_vectors.toarray())
    df.columns = vectorizer.get_feature_names()
    return df 

In [159]:
raw_scores,dist_dict = knn_direct_compare(cand_text_dict,['lex_diversity_no_stopword','avg_word_len_no_stopword'],cosine,[1,1,1])

0
1
2
3
4
5


In [171]:
def build_distance_matrix(raw_scores,dist_dict):
    names = list(raw_scores['cand_name'])
    score_compare = pd.DataFrame(index=names,columns=names)
    for name in names:
        new_col = []
        for compare_name in names:
            try:
                new_col.append(dist_dict[name][compare_name])
            except:
                new_col.append(-1)
        score_compare[name] = new_col
    score_compare.fillna(-1,inplace=True)
    
    rv = score_compare.merge(raw_scores,left_index=True,right_on='cand_name')
    
    return rv.set_index('cand_name')
    
    

In [172]:
build_distance_matrix(raw_scores,dist_dict)

Unnamed: 0_level_0,HARWOOD_2016,REGAN_2016,HUCKABEE_2016,MACCALLUM_2016,FIORINA_2016,CLINTON_2016,WRITE_2016,full_text,lex_diversity_no_stopword,avg_word_len_no_stopword,...,wrong,wrote,www,yeah,year,years,yes,yet,york,young
cand_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HARWOOD_2016,-1.0,0.487423,0.582325,0.685687,0.667461,0.588589,-1,We're going to pose this question to all can...,0.002714,0.002883,...,0.035381,0.034616,0.0,0.0,0.0,0.037513,0.010568,0.013081,0.015531,0.0
REGAN_2016,0.487423,-1.0,0.698662,0.30939,0.707876,0.665506,-1,"In Tuesday's State of the Union Address, the...",0.003328,0.002907,...,0.0,0.0,0.038879,0.0,0.022723,0.0,0.0,0.0,0.0,0.0
HUCKABEE_2016,0.582325,0.698662,-1.0,0.74161,0.341592,0.232728,-1,I wish I saw the country in the same place t...,0.001824,0.002818,...,0.003794,0.0,0.0,0.011801,0.025915,0.066378,0.0102,0.004208,0.009994,0.017561
MACCALLUM_2016,0.685687,0.30939,0.74161,-1.0,0.714529,0.733591,-1,So it will come as no surprise that there is...,0.003791,0.002807,...,0.0,0.0,0.056597,0.0,0.033078,0.0,0.0,0.0,0.076535,0.0
FIORINA_2016,0.667461,0.707876,0.341592,0.714529,-1.0,0.359843,-1,"Well, thank you. Good evening. If I may begi...",0.001809,0.002932,...,0.027213,0.0,0.0,0.006045,0.033189,0.083464,0.055735,0.043116,0.005119,0.02249
CLINTON_2016,0.588589,0.665506,0.232728,0.733591,0.359843,-1.0,-1,"Well, I'm happy to be here in New Hampshire ...",0.001205,0.003016,...,0.008963,0.002192,0.0,0.009292,0.011478,0.023757,0.052203,0.006627,0.015737,0.057036
WRITE_2016,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [173]:
raw_scores

Unnamed: 0,cand_name,full_text,lex_diversity_no_stopword,avg_word_len_no_stopword,000,10,100,11,12,15,...,wrong,wrote,www,yeah,year,years,yes,yet,york,young
0,HARWOOD_2016,We're going to pose this question to all can...,0.002714,0.002883,0.011532,0.025803,0.0,0.014048,0.0,0.014048,...,0.035381,0.034616,0.0,0.0,0.0,0.037513,0.010568,0.013081,0.015531,0.0
1,REGAN_2016,"In Tuesday's State of the Union Address, the...",0.003328,0.002907,0.013012,0.0,0.0,0.015851,0.017525,0.0,...,0.0,0.0,0.038879,0.0,0.022723,0.0,0.0,0.0,0.0,0.0
2,HUCKABEE_2016,I wish I saw the country in the same place t...,0.001824,0.002818,0.051941,0.024904,0.024198,0.00452,0.0,0.009039,...,0.003794,0.0,0.0,0.011801,0.025915,0.066378,0.0102,0.004208,0.009994,0.017561
3,MACCALLUM_2016,So it will come as no surprise that there is...,0.003791,0.002807,0.018942,0.021192,0.0,0.0,0.0,0.0,...,0.0,0.0,0.056597,0.0,0.033078,0.0,0.0,0.0,0.076535,0.0
4,FIORINA_2016,"Well, thank you. Good evening. If I may begi...",0.001809,0.002932,0.072222,0.008505,0.0,0.013892,0.005119,0.013892,...,0.027213,0.0,0.0,0.006045,0.033189,0.083464,0.055735,0.043116,0.005119,0.02249
5,CLINTON_2016,"Well, I'm happy to be here in New Hampshire ...",0.001205,0.003016,0.013146,0.011439,0.013337,0.02491,0.007869,0.005338,...,0.008963,0.002192,0.0,0.009292,0.011478,0.023757,0.052203,0.006627,0.015737,0.057036
6,WRITE_2016,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
