In [1]:
import pandas as pd
import numpy as np

## Load speech file

In [2]:
# load data from pickle file
import pickle

pickle_in = open("speeches.pickle", "rb")
speech_df = pickle.load(pickle_in)

In [3]:
speech_df.head()

Unnamed: 0,type,speaker,date,speech
0,convention,Hillary Clinton,2016,Thank you all for the great convention that we...
1,convention,Robert Dole,1996,The folks in Hollywood would be happy to know ...
2,convention,George W. Bush,2000,"Thank you. Thank you for this honor. [,],Thank..."
3,convention,George W. Bush,2004,"When I said those words 4 years ago, none of u..."
4,convention,John McCain,2008,"Tonight, I have a privilege given few American..."


In [7]:
speech_df['tag'] = speech_df['type'] + speech_df['date'].map(str) + speech_df['speaker']

In [4]:
speech_text = speech_df['speech']

## TFIDF and NMF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [13]:
t_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    token_pattern="\\b[a-z][a-z]+\\b"
)

tfidf_matrix = t_vectorizer.fit_transform(speech_text)

In [12]:
t_vectorizer.get_feature_names()

['aa',
 'aa cases',
 'aaa',
 'aaa program',
 'aana',
 'aana western',
 'aar',
 'aar participants',
 'aaron',
 'aaron burr',
 'aaron just',
 'aaron life',
 'aback',
 'aback thinking',
 'abandon',
 'abandon acted',
 'abandon adjacent',
 'abandon agenda',
 'abandon altogether',
 'abandon asia',
 'abandon blessings',
 'abandon cause',
 'abandon certain',
 'abandon children',
 'abandon cities',
 'abandon citizenship',
 'abandon claims',
 'abandon contest',
 'abandon contract',
 'abandon crime',
 'abandon delusions',
 'abandon desire',
 'abandon diminish',
 'abandon disregard',
 'abandon effect',
 'abandon effort',
 'abandon efforts',
 'abandon evidently',
 'abandon failed',
 'abandon false',
 'abandon far',
 'abandon fault',
 'abandon field',
 'abandon fundamental',
 'abandon government',
 'abandon graves',
 'abandon great',
 'abandon growth',
 'abandon habits',
 'abandon homes',
 'abandon hope',
 'abandon idea',
 'abandon ideals',
 'abandon imperfect',
 'abandon instead',
 'abandon iraqi',

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

dist = 1 - cosine_similarity(tfidf_matrix)

In [15]:
dist

array([[ 3.33066907e-15,  8.77271246e-01,  8.79598367e-01, ...,
         9.85381450e-01,  9.80462786e-01,  9.85607279e-01],
       [ 8.77271246e-01,  1.04360964e-14,  8.85040808e-01, ...,
         9.79530696e-01,  9.74863184e-01,  9.79613961e-01],
       [ 8.79598367e-01,  8.85040808e-01, -7.28306304e-14, ...,
         9.84836261e-01,  9.80640004e-01,  9.83685649e-01],
       ...,
       [ 9.85381450e-01,  9.79530696e-01,  9.84836261e-01, ...,
         1.46549439e-14,  9.16077675e-01,  9.44966737e-01],
       [ 9.80462786e-01,  9.74863184e-01,  9.80640004e-01, ...,
         9.16077675e-01,  4.66293670e-15,  9.32544573e-01],
       [ 9.85607279e-01,  9.79613961e-01,  9.83685649e-01, ...,
         9.44966737e-01,  9.32544573e-01,  1.77635684e-15]])