In [1]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re

# Create pandas dataframe
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)

# Create lists for the data
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()

# Getting only the years from dates list
years = [re.sub('[A-Za-z ]', '', item) for item in dates]

# Combining year with presenter for citation
authordate = [author+" "+year for author, year in zip(authors, years)]

# Just to check to see if things are synced,
# let's create a new df with the two lists.

cited_texts = pandas.DataFrame(
    {'citation': authordate,
     'text': talks,
    })

# This just shows that the citation and the text are paired correctly.
# cited_texts.head()

In [3]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Documentation: https://pypi.python.org/pypi/lda
# LDA requires a DTM as input

from nltk.tokenize import WhitespaceTokenizer

# From the Stopwords Notebook:
tokenizer = WhitespaceTokenizer()
stopwords = re.split('\s+', open('../data/tt_stop.txt', 'r').read().lower())

# Test Strings
doc_a = "You can call me Al."
doc_b = "I can call you Betty."
doc_c = "Who'll be my role model?"

doc_set = [doc_a, doc_b, doc_c] # Test list of strings

# List for loop
texts = []

# loop through document list
for i in talks:
    
    # clean and tokenize document string
    raw = re.sub(r"[^\w\d'\s]+",'', i).lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stopwords]
    
    # stem tokens
    # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stopped_tokens)

# print(texts[0:5])

In [4]:
# =-=-=-=-=-=
# Generate LDA Model
# =-=-=-=-=-=


# from gensim import corpora, models >>> Test to see if needed
import gensim

# turn our tokenized documents into a id <-> term dictionary
dictionary = gensim.corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# print(corpus[0:3]) # to see the corpus

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=35, 
                                           id2word = dictionary, 
                                           passes=100)

#save the model
#ldamodel.save('../data/lda.model')

# for later: load trained model from file
#model =  models.LdaModel.load('lda.model')

In [None]:
# =-=-=-=-=-=
# Printing Options
# =-=-=-=-=-=


# Prints all topics. Format T = words, Format F = ids
print(ldamodel.show_topics(num_topics=25, num_words=10, formatted=True))

# Same as above but working through list in for loop
for i in range(0, ldamodel.num_topics):
    print(ldamodel.print_topic(i))

In [47]:
print('{}'.format(ldamodel.print_topic(3)) + '\n' + '{}'.format(ldamodel.show_topic(3)))

0.019*like + 0.013*actually + 0.012*brain + 0.010*just + 0.010*cells + 0.010*think + 0.009*babies + 0.008*children + 0.008*see + 0.007*two
[('like', 0.018872414487262068), ('actually', 0.012801400450384949), ('brain', 0.012249001649413069), ('just', 0.0099968190527590477), ('cells', 0.0098472656722105659), ('think', 0.0096294033856906165), ('babies', 0.0091951516720024887), ('children', 0.0082945751438348765), ('see', 0.0078378135797309845), ('two', 0.0073985921731832063)]


In [66]:
for t in range(0, ldamodel.num_topics):
    print('topic {}, {}'.format(t, ldamodel.print_topic(t)))

topic 0, 0.013*people + 0.010*going + 0.007*percent + 0.007*need + 0.007*years + 0.007*food + 0.007*really + 0.007*world + 0.006*like + 0.006*get
topic 1, 0.040*india + 0.032*china + 0.031*country + 0.025*growth + 0.024*economic + 0.016*countries + 0.012*two + 0.011*terms + 0.010*indian + 0.010*political
topic 2, 0.032*game + 0.024*games + 0.010*re + 0.010*play + 0.009*vaccine + 0.009*video + 0.008*virus + 0.007*t + 0.007*m + 0.007*higgs
topic 3, 0.019*like + 0.013*actually + 0.012*brain + 0.010*just + 0.010*cells + 0.010*think + 0.009*babies + 0.008*children + 0.008*see + 0.007*two
topic 4, 0.005*envisions + 0.004*pigeon + 0.004*passenger + 0.004*adam + 0.002*mania + 0.001*sin + 0.001*tailed + 0.001*adrianne + 0.001*pigeons + 0.001*manias
topic 5, 0.011*like + 0.011*actually + 0.009*just + 0.008*see + 0.008*really + 0.007*plant + 0.007*species + 0.007*get + 0.007*know + 0.007*looking
topic 6, 0.012*grownup + 0.008*copyright + 0.006*iran + 0.006*nike + 0.005*awesome + 0.005*fashion + 0

In [64]:
ldamodel.print_topics(25)

[(0,
  '0.013*people + 0.010*going + 0.007*percent + 0.007*need + 0.007*years + 0.007*food + 0.007*really + 0.007*world + 0.006*like + 0.006*get'),
 (1,
  '0.040*india + 0.032*china + 0.031*country + 0.025*growth + 0.024*economic + 0.016*countries + 0.012*two + 0.011*terms + 0.010*indian + 0.010*political'),
 (2,
  '0.032*game + 0.024*games + 0.010*re + 0.010*play + 0.009*vaccine + 0.009*video + 0.008*virus + 0.007*t + 0.007*m + 0.007*higgs'),
 (3,
  '0.019*like + 0.013*actually + 0.012*brain + 0.010*just + 0.010*cells + 0.010*think + 0.009*babies + 0.008*children + 0.008*see + 0.007*two'),
 (4,
  '0.005*envisions + 0.004*pigeon + 0.004*passenger + 0.004*adam + 0.002*mania + 0.001*sin + 0.001*tailed + 0.001*adrianne + 0.001*pigeons + 0.001*manias'),
 (5,
  '0.011*like + 0.011*actually + 0.009*just + 0.008*see + 0.008*really + 0.007*plant + 0.007*species + 0.007*get + 0.007*know + 0.007*looking'),
 (6,
  '0.012*grownup + 0.008*copyright + 0.006*iran + 0.006*nike + 0.005*awesome + 0.005*