In [216]:
DEBATE_URL = 'http://www.presidency.ucsb.edu/debates.php'
last_fetched_at = None
import json
import urllib.request, time, re, random, hashlib
import bs4
import time
import sys
import nltk
import nltk.data
from itertools import combinations
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.corpus import wordnet
from nltk import bigrams
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np
from scipy.spatial import distance
SAVE_FILE = 'text_dict.json'

In [2]:
def fetch(url):
    """Load the url compassionately."""
    
    global last_fetched_at
    
    url_hash = hashlib.sha1(url.encode()).hexdigest()
    filename = 'cache/cache-file-{}'.format(url_hash)
    try:
        with open(filename, 'r') as f:
            result = f.read()
            if len(result) > 0:
                #print("Retrieving from cache:", url)
                return result
    except:
        pass
    
    #print("Loading:", url)
    wait_interval = random.randint(3000,10000)
    if last_fetched_at is not None:
        now = time.time()
        elapsed = now - last_fetched_at
        if elapsed < wait_interval:
            time.sleep((wait_interval - elapsed)/1000)
        
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    headers = { 'User-Agent' : user_agent }
    req = urllib.request.Request(url, headers = headers)
    last_fetched_at = time.time()
    with urllib.request.urlopen(req) as response:
        result = str(response.read())
        with open(filename, 'w') as f:
            f.write(result)
    
        return result

In [3]:
def debate_processing(soup):
    return_list = []
    tables = soup.find_all('table')
    
    for table in tables:
        if table['width'] == '700' and table['bgcolor'] == "#FFFFFF":
            actual_table = table
    rows = actual_table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        try:
            link = row.find('a')['href']
            cols.append(link)
            return_list.append(cols)
        except:
            pass

    return return_list

In [4]:
def get_words_from_speech(link):
    result = fetch(link)
    soup = bs4.BeautifulSoup(result,'lxml')
    return soup

In [5]:
def get_debate_dict():
    result = fetch(DEBATE_URL)
    soup = bs4.BeautifulSoup(result,'lxml')
    debate_list = debate_processing(soup)
    debate_dict = {}
    for debate in debate_list:

        if ' ' not in debate[0]:
            debate = debate[1:]
        debate_id = ' '.join(debate[:2])
        try:
            debate_datetime = time.strptime(debate[0].replace('th','').replace('st',''),'%B %d, %Y')
        except:
            debate_datetime = None

        debate_dict[debate_id] = {}
        debate_dict[debate_id]['link'] = debate[2]
        debate_dict[debate_id]['time'] = debate_datetime 
        
        try:
            debate_dict[debate_id]['soup'] = get_words_from_speech(debate[2])
        except:
            debate_dict[debate_id]['soup'] = None
        
    return debate_dict

In [6]:
def find_politician_names(debate_dict):
    for key in debate_dict.keys():
        raw = get_soup_text(debate_dict[key])
        raw = raw.replace("--", ". ")
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())

        #find candidate names, most commonly repeated first words of sentences, not common words
        colon_names = []
        period_names = []

        #get names from before colons
        for sent in sents:
            if ':' in sent:
                sent = sent.split(':')
                possible_name = sent[0] + ":"
                possible_name_no_paren = remove_paren(possible_name).strip()
                if (len(possible_name_no_paren)<25) & (len(possible_name_no_paren)>2):
                    colon_names.append(possible_name_no_paren)

        fdist1 = FreqDist(colon_names)
        fdist1_above_5 = [name[0] for name in fdist1.most_common(15) if name[1]>5]
        
        #get names before periods
        for sent in sents:
            if len(nltk.word_tokenize(sent))<5:
                possible_name = sent
                possible_name_no_paren = remove_paren(possible_name).strip()
                if (len(possible_name_no_paren)<25) & (len(possible_name_no_paren)>2):
                    period_names.append(possible_name_no_paren)
                    
        fdist2 = FreqDist(period_names)
        fdist2_above_15 = [name[0] for name in fdist2.most_common(15) if name[1]>15]
    
        #add names to dict
        colon_name_highest_freq = fdist1.most_common(1)[0][1]
        if colon_name_highest_freq > 20 :
            debate_dict[key]['names'] = fdist1_above_5
        else:
            debate_dict[key]['names'] = fdist2_above_15
            
    return debate_dict

In [7]:
def get_soup_text(dbt):
    raw = dbt['soup'].get_text()
    raw = raw.replace("\\", "")
    raw = raw.replace(".", ". ")
    raw = raw.replace("?", "? ")
    raw = raw.replace("!", "! ")
    raw = raw.replace("  ", " ")
    raw = raw.replace("-", "- ")
    raw = raw.replace("…", ". ")
    raw = raw.replace("...", ". ")
    return raw
    

In [8]:
def remove_paren(name):
    return_name = ''
    skip1c = 0
    skip2c = 0
    for i in name:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            return_name += i
    return return_name

In [9]:
 def clean_dirty_name_lookup(names):
    
    lookup_dict = {}
    
    for name in names:
        clean_name = name.split()[-1].upper().replace('.','').replace(')','').replace(';','').replace(':','')
        lookup_dict[name] = clean_name
    
    return lookup_dict

In [10]:
def get_election_year(year, dbt):
    year = dbt['time'].tm_year
    year_mod = year % 4
    if year_mod == 0:
        election_year = year
    else:
        election_year = year + (4 - year_mod)
    return election_year

In [11]:
def clean_names(debate_dict):
    # Add debate year
    name_years = {}
    for dbt in debate_dict.keys():
        time = debate_dict[dbt]['time']

        # Get election year
        if time:
            election_year = get_election_year(time.tm_year, debate_dict[dbt])
        else:
            election_year = 'Uncertain Year'
        debate_dict[dbt]['election_year'] = election_year

        # Add new set of names from debate to name_years dict
        if election_year not in name_years:
            name_years[election_year] = {'names':set()}

        names = set(debate_dict[dbt]["names"])
        name_years[election_year]['names'] = name_years[election_year]['names'].union(names)

    # Reduce all names in one year to a single name
    for year in name_years:
        name_years[year]['lookup'] = clean_dirty_name_lookup(name_years[year]['names'])

    # Add lookup dictionary to debate dictionary
    for dbt in debate_dict.keys():
        election_year = debate_dict[dbt]['election_year']
        debate_dict[dbt]['lookup'] = name_years[election_year]['lookup']
        debate_dict[dbt]['clean_names'] = debate_dict[dbt]['lookup'].values()
    
    return debate_dict

In [12]:
def attribute_text(debate_dict):
    #make year/candidate dictionary for text
    cand_text_dict = {}
    for dbt in debate_dict.keys():
        year = debate_dict[dbt]['election_year']
        cand_text_dict[year] = {}
        for cand in debate_dict[dbt]["clean_names"]:
            cand_text_dict[year][cand] = {}
            cand_text_dict[year][cand]['full_text'] = ""
    
    #fill year/candidate dictionary
    for dbt in debate_dict.keys():
        #set variables
        year = debate_dict[dbt]["election_year"]
        names = debate_dict[dbt]["names"]
        if "write" in names:
            names.remove('write')
        
        #get debate soup
        raw = get_soup_text(debate_dict[dbt])
        
        #tokenize sents
        for name in names:
            raw = raw.replace(name, ". " + name)
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(raw.strip())
        
        #loop through sents
        current_speaker = ""
        got_first_speaker = False
        for sent in sents:
            new_speaker = (len([name for name in names if name in sent])>0)
            if(new_speaker):
                got_first_speaker = True
                current_speaker_dirty = [name for name in names if name in sent][0]
                current_speaker = debate_dict[dbt]["lookup"][current_speaker_dirty]
            
            if(got_first_speaker):
                sent_no_name = sent.replace(current_speaker_dirty, "")
                cand_text_dict[year][current_speaker]['full_text'] = (cand_text_dict[year][current_speaker]['full_text'] + " " + sent_no_name)

    return cand_text_dict

In [13]:
def similarity_model(cand_text_dict):
    dumbWords = stopwords.words('english')
    political_positions = ['Governor', 'Senator', 'President']
    
    
    #loop through election years
    for year in cand_text_dict.keys():
        #loop through candidates
        for cand in cand_text_dict[year].keys():
            #print(year, cand)
        
            tokens = nltk.word_tokenize(cand_text_dict[year][cand]['full_text'])
            text = nltk.Text(tokens)
            fdist_tokens = FreqDist(tokens)
            
            special_words = [word for word in tokens if len(word)>4 and fdist_tokens[word]>=5 
                             and wordnet.synsets(word) and word not in political_positions]
            cand_text_dict[year][cand]["special_words"] = special_words
            
            special_words_no_caps = [word for word in tokens if len(word)>4 and fdist_tokens[word]>=5 
                             and wordnet.synsets(word) and word[0].islower()]
            cand_text_dict[year][cand]["special_words_no_caps"] = special_words_no_caps
            
            if len(text)>0:
                #avg word len
                sum_len = sum([len(word) for word in text])
                cand_text_dict[year][cand]["avg_word_len"] = sum_len/len(text)
                
                #avg word len, no stopwords
                text_no_dumbWords = [word for word in text if word not in dumbWords]
                sum_len = sum([len(word) for word in text_no_dumbWords])
                cand_text_dict[year][cand]["avg_word_len_no_stopword"] = sum_len/len(text_no_dumbWords)
                
                #lex diversity                
                cand_text_dict[year][cand]["lex_diversity_no_stopword"] = (len(set(text_no_dumbWords)) / len(text_no_dumbWords))
            
            bgrms = list(bigrams(text))
            fdist_bgrms = FreqDist(bgrms)
            special_bgrms = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1])]
            cand_text_dict[year][cand]["special_bgrms"] = special_bgrms
            
            special_bgrms_no_caps = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1]) 
                                     and bgm[0][0].islower() and bgm[1][0].islower()]
            cand_text_dict[year][cand]["special_bgrms_no_caps"] = special_bgrms_no_caps
            
            special_bgrms_no_caps_stopwords = [bgm for bgm in bgrms if fdist_bgrms[bgm]>2 
                             and wordnet.synsets(bgm[0]) and wordnet.synsets(bgm[1]) 
                                     and bgm[0][0].islower() and bgm[1][0].islower()
                                              and bgm[0] not in dumbWords and bgm[1] not in dumbWords]
            cand_text_dict[year][cand]["special_bgrms_no_caps_stopwords"] = special_bgrms_no_caps_stopwords
            
    return cand_text_dict
        

In [14]:
def main_function(filename):
    debate_dict = get_debate_dict()

    #find the names of the participants
    debate_dict = find_politician_names(debate_dict)

    #clean names and years for comparison within electoral years
    debate_dict = clean_names(debate_dict)

    #compile all text by candidate-year
    cand_text_dict = attribute_text(debate_dict)

    #create a model of text similarity
    cand_text_dict = similarity_model(cand_text_dict)

    with open(filename, 'w') as fp:
        json.dump(cand_text_dict, fp)
    return cand_text_dict





In [15]:
#main
#make a dictionary with debate info
try:
    with open(SAVE_FILE, 'r') as fp:
        cand_text_dict = json.load(fp)
except:
   cand_text_dict = main_function(SAVE_FILE)



In [16]:
cand_text_dict = main_function(SAVE_FILE)

In [17]:
print(cand_text_dict[1980].keys())

dict_keys(['SMITH', 'ANDERSON', 'WRITE', 'PRESIDENT', 'MOYERS', 'YES', 'REAGAN'])


In [18]:
FreqDist(cand_text_dict[1960]['NIXON']['special_bgrms_no_caps_stopwords']).most_common(20)

[(('federal', 'government'), 13),
 (('half', 'years'), 7),
 (('economic', 'assistance'), 6),
 (('technical', 'assistance'), 6),
 (('special', 'session'), 5),
 (('billion', 'dollars'), 5),
 (('national', 'product'), 5),
 (('standing', 'still'), 5),
 (('summit', 'conference'), 5),
 (('medical', 'care'), 5),
 (('made', 'recommendations'), 4),
 (('press', 'conference'), 4),
 (('depletion', 'allowance'), 4),
 (('seven', 'years'), 4),
 (('gross', 'national'), 4),
 (('cold', 'war'), 4),
 (('moment', 'ago'), 4),
 (('bed', 'hungry'), 4),
 (('last', 'seven'), 3),
 (('school', 'construction'), 3)]

In [76]:
FreqDist(cand_text_dict[1972]['NIXON']['special_bgrms_no_caps_stopwords']).most_common(20)

KeyError: 1972

In [53]:
cand_text_dict[1984]['BUSH'].keys()

dict_keys(['special_bgrms_no_caps_stopwords', 'lex_diversity_no_stopword', 'avg_word_len_no_stopword', 'special_bgrms_no_caps', 'special_words', 'avg_word_len', 'special_words_no_caps', 'full_text', 'special_bgrms'])

In [28]:
print(cand_text_dict[1960]['KENNEDY']['lex_diversity_no_stopword'])

0.2170036418435263


In [71]:
for yr in cand_text_dict2.keys():
    for cand in cand_text_dict2[yr].keys():
        if len(cand_text_dict2[yr][cand]['full_text'])>3:
            print(cand_text_dict2[yr][cand]['avg_word_len_no_stopword'])

4.104138851802404
4.573079145002889
4.284582571602681
4.414370078740157
4.446633154079962
4.421238938053097
4.5636363636363635
4.0596617898536955
4.136774880255455
4.461883408071749
4.236434108527132
4.280459770114943
4.345758354755784
4.2677150435771125
4.176753121998079
3.9336343115124155
4.229874776386405
4.396773773903994
3.1315789473684212
4.086842105263158
4.194939620471535
3.932107496463932
4.475577654284878
4.6120959332638165
3.8774193548387097
4.308727034120735
3.949882645312716
4.171428571428572
4.776061776061776
3.9394625176803393
4.124902419984387
4.129302325581396
4.2719836400818
4.164976133651551
4.371165644171779
4.141197592142782
4.248967454154965
4.2532836516068935
4.00862911103875
4.533333333333333
4.529345372460496
4.6440677966101696
4.432336584186347
4.12488928255093
4.246820349761526
4.2598208132322535
3.8880733944954127
3.6653089724194112
4.71256038647343
4.12039312039312
4.191732629727353
2.30188679245283
4.264834478450968
4.427848101265822
4.278528855938363
4.46

In [273]:
def knn_direct_compare(debate_dict,var_list,distance_method,weights=None):
    
    # locations of id and text in score dataframe
    cand_id_loc = 0
    full_text_loc = 1
    dist_dict = {}
    cand_scores = []
    for year in debate_dict:
        # Build each row of dataframe
        for candidate in debate_dict[year]:
            cand_year_dict = debate_dict[year][candidate]
            cand_id = candidate + '_' + str(year)
            var_score_list = [cand_year_dict[x] for x in cand_year_dict if x in var_list]
            cand_scores.append([cand_id,cand_year_dict['full_text']] + var_score_list)
            
    # build normalized dataframe with name and text as first columns, var_list as col keys
    cand_df = normalize_scores(cand_scores,var_list)
    

    
    tfidf_freq = get_tfidf_vectors(cand_df['full_text'])
    feature_names = tfidf_freq.columns
    return df
    
    num_rows = len(df)
    # Loop over all combinations and calculate distances
    for i in range(num_rows-1):
        for j in range(i+1,num_rows):
            
            # default to equal weights
            if weights == None:
                weights = [1]*(len(cand1_var_list) + 1)
            
            df_row = df.ix[i]
            weighted1_vect = 
                
            

            # weight variables
    
            # fill both sides of dictionary with relative distances
            fill_dist_dict(dist_dict,cand1_name,cand2_name,var_distance,tfdif_dist)         
    
    return dist_dict              
    

In [None]:
def calculate_score(row1,row2,weights,var_list)

In [198]:
def fill_dist_dict(dist_dict,name1,name2,var_dist,tfdif_dist):
    if name1 not in dist_dict:
        dist_dict[name1] = {}
    if name2 not in dist_dict:
        dist_dict[name2] = {}
    
    cur_dict = dist_dict[name1]
    cur_dict[name2] = {'gen_distance':var_dist,'tfdif_dist':tfdif_dist}

In [199]:
def get_row_data_w_tfidf(df,row_num,var_list,word_features,weights):
    # returns canidates name, a tfdif matrix and a list of other scores in the matrix
    
    df_row = df.ix[row_num]
    name = df_row['cand_name']
    scores = np.array(df_row[var_list])*weights[:-1]
    tfidf = 0
    return name, tfidf, scores
    

In [150]:
def normalize_scores(list_of_scores,var_list):
    
    df = pd.DataFrame(list_of_scores)
    
    # Rename columns
    col_dict = {0:'cand_name',1:'full_text'}
    for i, var in enumerate(var_list):
        col_dict[i+2] = var
    df = df.rename(columns=col_dict)
    df.fillna(0,inplace=True) # none existant should be 0
    
    
    df[var_list] = normalize(df[var_list],'l1',axis=0)
    return df
            

In [270]:
def get_tfidf_vectors(list_of_texts):
    # Used basis of code from hw 1
    vectorizer = TfidfVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = stopwords.words('english'),
                                 lowercase= True,
                                 max_features = 1000,
                                 smooth_idf = True)
    compressed_vectors = vectorizer.fit_transform(list_of_texts)
    df = pd.DataFrame(compressed_vectors.toarray())
    df.columns = vectorizer.get_feature_names()
    return df 

In [274]:
cand = knn_direct_compare(cand_text_dict,['lex_diversity_no_stopword','avg_word_len_no_stopword'],cosine)

In [278]:
cand.columns

Index(['cand_name', 'full_text', 'lex_diversity_no_stopword',
       'avg_word_len_no_stopword', '000', '10', '100', '11', '12', '15',
       ...
       'wrong', 'wrote', 'www', 'yeah', 'year', 'years', 'yes', 'yet', 'york',
       'young'],
      dtype='object', length=1004)

In [192]:
cosine(np.array(cand1_var_list),np.array(cand2_var_list))

0.047374615866792591

In [91]:
var_list = ['lex_diversity_no_stopword','avg_word_len_no_stopword']

In [221]:
cand.ix[0]['full_text']

'  Thanks, David. Good evening, guys. .  Mr. Trump. .  In the Democratic primary, Hillary Clinton has criticized Bernie Sanders\' plan for single payer government health care, noting it would require big, across the board tax increases for Americans. In doing so, she\'s doubling down on Obamacare, despite its persistent unpopularity. Mr. Trump, you have said you want to appeal Obamacare. You have also said, quote, "Everybody\'s got to be covered," adding, quote, "The government\'s going to pay for it. " Are you closer to Bernie Sanders\' vision for health care than Hillary Clinton\'s? .  Dr. Carson, you have some experience with this matter. In the past, you have said that Obamacare should be replaced before it\'s repealed. How and why? .  Thank you, Dr. Carson. David, Martha, back to you. .  Thanks, David. Senator Cruz, on the campaign trail you\'ve promised voters a lot, in fact if you\'re elected president you\'d say you end Common Core immediately, abolish the IRS, and do away with

In [272]:
df

Unnamed: 0,000,10,100,11,12,15,20,200,25,30,...,wrong,wrote,www,yeah,year,years,yes,yet,york,young
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.066134,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156423
1,0.000000,0.000000,0.000000,0.056263,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.047234,0.000000,0.000000,0.000000,0.020163,0.037560,0.042325,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016040,...,0.032704,0.000000,0.047772,0.000000,0.000000,0.013003,0.014652,0.000000,0.000000,0.000000
3,0.023571,0.005274,0.030746,0.011485,0.000000,0.000000,0.005764,0.007142,0.006591,0.004729,...,0.028927,0.000000,0.000000,0.029990,0.020580,0.026836,0.017280,0.016042,0.006349,0.011157
4,0.000000,0.000000,0.037990,0.000000,0.026150,0.011826,0.000000,0.000000,0.000000,0.068174,...,0.000000,0.014571,0.029007,0.000000,0.050859,0.007895,0.008897,0.011012,0.000000,0.011488
5,0.041831,0.031200,0.009094,0.038219,0.014085,0.008493,0.098032,0.000000,0.019495,0.000000,...,0.014261,0.020928,0.000000,0.000000,0.030437,0.070873,0.028751,0.000000,0.000000,0.004125
6,0.028321,0.031685,0.046178,0.021563,0.023840,0.047439,0.012986,0.005363,0.000000,0.028412,...,0.000000,0.015940,0.000000,0.016891,0.077276,0.089249,0.064885,0.012047,0.000000,0.029324
7,0.032905,0.000000,0.021461,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016505,...,0.050479,0.000000,0.000000,0.026167,0.014365,0.040140,0.000000,0.000000,0.000000,0.000000
8,0.028790,0.064419,0.012518,0.011691,0.012925,0.011691,0.023468,0.000000,0.000000,0.009627,...,0.000000,0.014403,0.000000,0.030526,0.058655,0.023413,0.026384,0.000000,0.064625,0.000000
9,0.000000,0.024552,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.022016,...,0.000000,0.000000,0.000000,0.000000,0.057486,0.017847,0.000000,0.000000,0.000000,0.000000


In [269]:
df_freq = pd.DataFrame(frequencies)
df_freq.columns = feature_names
df_freq.shape

(351, 1000)