In [2]:

import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [3]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [6]:
import random
text_data = []
with open('speeches_clean.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['executive', 'compensation', 'salary', 'speech;04-feb-09', 'recovery', 'economic', 'crisis', 'unlike', 'lifetime', 'crisis', 'fall', 'confidence', 'rising', 'widely', 'distribute', 'narrowly', 'concentrate', 'reward', 'crisis', 'write', 'print', 'prime', 'ledger', 'mighty', 'financial', 'people', 'across', 'country', 'economy', 'million', 'alone', 'everything', 'crisis', 'making', 'around', 'mistake', 'failure', 'crisis', 'catastrophe', 'guarantee', 'longer', 'recession', 'robust', 'recovery', 'uncertain', 'future', 'million', 'defer', 'sense', 'urgency', 'economic', 'recovery', 'reinvestment', 'congress', 'today', 'create', 'three', 'million', 'strengthen', 'country', 'merely', 'prescription', 'short', 'spending', 'strategy', 'economic', 'growth', 'renewable', 'energy', 'health', 'education', 'somehow', 'want', 'economic', 'crisis', 'first', 'place', 'notion', 'alone', 'solve', 'ignore', 'fundamental', 'energy', 'independence', 'health', 'somehow', 'piecemeal', 'fashion', 'still', 'e

In [12]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [15]:
import gensim
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.027*"crisis" + 0.017*"people" + 0.015*"economic" + 0.015*"financial" + 0.015*"compensation"')
(1, '0.001*"people" + 0.001*"unite" + 0.001*"going" + 0.001*"country" + 0.001*"president"')
(2, '0.001*"president" + 0.001*"people" + 0.001*"think" + 0.001*"going" + 0.001*"unite"')
(3, '0.001*"people" + 0.001*"going" + 0.001*"think" + 0.001*"president" + 0.001*"health"')
(4, '0.001*"people" + 0.001*"country" + 0.001*"unite" + 0.001*"nation" + 0.001*"president"')
(5, '0.001*"going" + 0.001*"people" + 0.001*"president" + 0.001*"think" + 0.001*"health"')
(6, '0.029*"military" + 0.027*"defense" + 0.013*"world" + 0.013*"budget" + 0.012*"going"')
(7, '0.001*"going" + 0.001*"people" + 0.001*"president" + 0.001*"think" + 0.001*"health"')
(8, '0.026*"people" + 0.025*"think" + 0.025*"president" + 0.025*"going" + 0.017*"health"')
(9, '0.036*"people" + 0.024*"country" + 0.021*"unite" + 0.017*"freedom" + 0.015*"nation"')


In [16]:
new_doc = 'Obama Speeches Topics'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(766, 1)]
[(0, 0.05), (1, 0.050000004), (2, 0.050000004), (3, 0.050000004), (4, 0.050000004), (5, 0.050000004), (6, 0.05002306), (7, 0.050000004), (8, 0.5499689), (9, 0.050008032)]


In [19]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 5, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.001*"think" + 0.001*"president" + 0.001*"going" + 0.001*"people" + 0.001*"health"')
(1, '0.001*"going" + 0.001*"people" + 0.001*"president" + 0.001*"health" + 0.001*"think"')
(2, '0.033*"people" + 0.021*"country" + 0.019*"unite" + 0.016*"freedom" + 0.014*"world"')
(3, '0.021*"crisis" + 0.014*"people" + 0.012*"financial" + 0.012*"economic" + 0.012*"compensation"')
(4, '0.022*"going" + 0.021*"people" + 0.021*"think" + 0.020*"president" + 0.014*"health"')


In [21]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.036*"people" + 0.024*"country" + 0.021*"unite" + 0.017*"freedom" + 0.015*"nation"')
(1, '0.029*"military" + 0.027*"defense" + 0.013*"world" + 0.013*"budget" + 0.012*"going"')
(2, '0.001*"people" + 0.001*"going" + 0.001*"think" + 0.001*"president" + 0.001*"unite"')
(3, '0.001*"president" + 0.001*"think" + 0.001*"going" + 0.001*"people" + 0.001*"health"')
(4, '0.001*"people" + 0.001*"president" + 0.001*"going" + 0.001*"think" + 0.001*"unite"')
(5, '0.001*"people" + 0.001*"think" + 0.001*"going" + 0.001*"president" + 0.001*"health"')
(6, '0.026*"people" + 0.025*"think" + 0.025*"going" + 0.025*"president" + 0.017*"health"')
(7, '0.001*"think" + 0.001*"people" + 0.001*"going" + 0.001*"health" + 0.001*"president"')
(8, '0.027*"crisis" + 0.017*"people" + 0.015*"financial" + 0.015*"economic" + 0.015*"compensation"')
(9, '0.001*"people" + 0.001*"crisis" + 0.001*"going" + 0.001*"financial" + 0.001*"president"')


In [23]:
pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 25.1 MB/s eta 0:00:01[K     |▍                               | 20 kB 31.6 MB/s eta 0:00:01[K     |▋                               | 30 kB 37.7 MB/s eta 0:00:01[K     |▉                               | 40 kB 18.2 MB/s eta 0:00:01[K     |█                               | 51 kB 15.7 MB/s eta 0:00:01[K     |█▏                              | 61 kB 18.1 MB/s eta 0:00:01[K     |█▍                              | 71 kB 14.5 MB/s eta 0:00:01[K     |█▋                              | 81 kB 15.6 MB/s eta 0:00:01[K     |█▉                              | 92 kB 17.2 MB/s eta 0:00:01[K     |██                              | 102 kB 15.8 MB/s eta 0:00:01[K     |██▏                             | 112 kB 15.8 MB/s eta 0:00:01[K     |██▍                             | 122 kB 15.8 MB/s eta 0:00:01[K     |██▋                             | 133 kB 15.8 MB/s eta 0:00:01

In [30]:
pip install gensim



In [31]:
pip install pyldavis



In [37]:
pip install pyLDAvis==2.1.2

Collecting pyLDAvis==2.1.2
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
[?25l[K     |▏                               | 10 kB 22.9 MB/s eta 0:00:01[K     |▍                               | 20 kB 28.1 MB/s eta 0:00:01[K     |▋                               | 30 kB 33.1 MB/s eta 0:00:01[K     |▉                               | 40 kB 18.2 MB/s eta 0:00:01[K     |█                               | 51 kB 14.6 MB/s eta 0:00:01[K     |█▏                              | 61 kB 16.7 MB/s eta 0:00:01[K     |█▍                              | 71 kB 13.8 MB/s eta 0:00:01[K     |█▋                              | 81 kB 15.1 MB/s eta 0:00:01[K     |█▉                              | 92 kB 16.3 MB/s eta 0:00:01[K     |██                              | 102 kB 14.1 MB/s eta 0:00:01[K     |██▎                             | 112 kB 14.1 MB/s eta 0:00:01[K     |██▍                             | 122 kB 14.1 MB/s eta 0:00:01[K     |██▋                             | 133 kB 14.1 MB/s eta 0:

In [38]:
import pyLDAvis.gensim

In [40]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  return pd.DataFrame({'Term': vocab[term_ix], \
