In [1]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Consolidated imports for entire notebook
# =-=-=-=-=-=

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer
import gensim


# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

# Create pandas dataframe
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)

# Create lists for the data
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()

# Getting only the years from dates list
years = [re.sub('[A-Za-z ]', '', item) for item in dates]

# Combining year with presenter for citation
authordate = [author+" "+year for author, year in zip(authors, years)]

# Just to check to see if things are synced,
# let's create a new df with the two lists.

cited_texts = pandas.DataFrame(
    {'citation': authordate,
     'text': talks,
    })

# This just shows that the citation and the text are paired correctly.
# cited_texts.head()

In [2]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Documentation: https://pypi.python.org/pypi/lda
# LDA requires a DTM as input

# From the Stopwords Notebook:
tokenizer = WhitespaceTokenizer()
stopwords = re.split('\s+', open('../data/tt_stop.txt', 'r').read().lower())

# Test Strings
# doc_a = "You can call me Al."
# doc_b = "I can call you Betty."
# doc_c = "Who'll be my role model?"
# 
# doc_set = [doc_a, doc_b, doc_c] # Test list of strings

# List for loop
texts = []

# loop through document list
for i in talks:
    
    # clean and tokenize document string
    raw = re.sub(r"[^\w\d'\s]+",'', i).lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stopwords]
    
    # stem tokens
    # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(tokens)
#    texts.append(stopped_tokens)

print(texts[0:5])



In [3]:
# =-=-=-=-=-=
# Generate LDA Model
# =-=-=-=-=-=

# turn our tokenized documents into a id <-> term dictionary
dictionary = gensim.corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# print(corpus[0:3]) # to see the corpus

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=35, 
                                           id2word = dictionary, 
                                           passes=100)

In [4]:
#save the model
ldamodel.save('../data/lda_35_100_all_words.model')

# for later: load trained model from file
#model =  models.LdaModel.load('lda.model')

In [4]:
# =-=-=-=-=-=
# Printing Options
# =-=-=-=-=-=


# Prints all topics. Format T = words, Format F = ids
# print(ldamodel.show_topics(num_topics=25, num_words=10, formatted=True))

# This is pretty much the same output as show_topics() method
# except with "topic" at start of line and not formatted as a list of tuples.
for t in range(0, ldamodel.num_topics):
    print('topic {}, {}'.format(t, ldamodel.print_topic(t)))

topic 0, 0.051*the + 0.041*and + 0.032*to + 0.023*a + 0.022*in + 0.022*of + 0.019*we + 0.016*that + 0.015*you + 0.013*i
topic 1, 0.000*the + 0.000*and + 0.000*that + 0.000*of + 0.000*i + 0.000*a + 0.000*in + 0.000*to + 0.000*you + 0.000*it
topic 2, 0.044*la + 0.010*li + 0.006*thank + 0.005*marries + 0.005*janitor's + 0.005*boy + 0.004*haired + 0.003*red + 0.002*molly + 0.002*milly
topic 3, 0.037*and + 0.033*the + 0.025*you + 0.025*to + 0.021*is + 0.021*of + 0.015*use + 0.014*those + 0.014*a + 0.013*information
topic 4, 0.000*the + 0.000*and + 0.000*of + 0.000*a + 0.000*to + 0.000*i + 0.000*that + 0.000*in + 0.000*you + 0.000*we
topic 5, 0.000*the + 0.000*and + 0.000*to + 0.000*a + 0.000*you + 0.000*of + 0.000*i + 0.000*that + 0.000*is + 0.000*in
topic 6, 0.009*dragonflies + 0.009*monsoon + 0.006*maldives + 0.004*rains + 0.003*tori + 0.003*mal + 0.003*ah + 0.002*rowed + 0.001*amur + 0.001*capsizes
topic 7, 0.041*you + 0.023*deception + 0.022*truth + 0.020*lie + 0.020*to + 0.017*a + 0.01

In [None]:
# From the gensim documentation: "Calculate the Umass topic coherence for each topic. 
# Algorithm from Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence 
# in Topic Models, CEMNLP 2011."

ldamodel.top_topics(corpus, num_words=20)

In [19]:
# Still working on understanding this functionality.
ldamodel.get_document_topics(texts[0], minimum_probability=None, minimum_phi_value=None, per_word_topics=False)

ValueError: too many values to unpack (expected 2)