In [41]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re


# Create pandas dataframe
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# Create lists for the data
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()

# Getting only the years from dates list
years = [re.sub('[A-Za-z ]', '', item) for item in dates]

# Combining year with presenter for citation
authordate = [author+" "+year for author, year in zip(authors, years)]

# Just to check to see if things are synced,
# let's create a new df with the two lists.

citations = pandas.DataFrame(
    {'citation': authordate,
     'text': talks,
    })

# This just shows that the citation and the text are paired correctly.
# citations.head()

Unnamed: 0,citation,text
0,Author,Text
1,Al Gore 2006,Thank you so much Chris. And it's truly a gre...
2,David Pogue 2006,Hello voice mail my old friend. I've called f...
3,Cameron Sinclair 2006,I'm going to take you on a journey very quickl...
4,Sergey Brin + Larry Page 2007,Sergey Brin I want to discuss a question I kn...


In [43]:
# =-=-=-=-=-=-=-=-=-=-=
# Settings & Display Functions
# =-=-=-=-=-=-=-=-=-=-= 

n_topics = 50
n_features = 5000
n_top_words = 10
n_top_documents = 3


stopwords = re.split('\s+', open('../data/stopwords_all.txt', 'r').read().lower())

def display_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("| "+str(topic_idx)+" |"+' '.join([feature_names[i] + ' ' + str(round(topic[i], 2))
              +',' for i in topic.argsort()[:-n_top_words - 1:-1]])+"|")
        
# Both NMF **and** LDA produce two matrices: 
# H - words to topics
# W - topics to documents

def display_topics(H, W, feature_names, documents, n_top_words, n_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:n_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])

In [13]:
        print("| {}| ".format(topic_idx))
        print(''.join([feature_names[i] + ' ' + str(round(topic[i], 2))
              +', ' for i in topic.argsort()[:-no_top_words - 1:-1]]))


# This version of the print/display function only lists words (no values)
#        print(" ".join([feature_names[i]
#                        for i in topic.argsort()[:-no_top_words - 1:-1]]))       

In [27]:
# =-=-=-=-=-=
# Generate LDA Model
# =-=-=-=-=-=

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# LDA can only use raw term counts (virtual BoW)
tf_vectorizer = CountVectorizer(max_df = 0.95, 
                                min_df = 2, 
                                max_features = n_features, 
                                stop_words = stopwords)
tf = tf_vectorizer.fit_transform(talks)
tf_feature_names = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_topics = n_topics, 
                                max_iter = 5, 
                                learning_method = 'online', 
                                learning_offset = 50.,
                                random_state = 0).fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

In [30]:
display_topics(lda, tf_feature_names, n_top_words)

| 0 |just 0.59, africa 0.59, people 0.43, africans 0.39, sectors 0.31, think 0.31, time 0.26, million 0.25, dollars 0.24, sector 0.24,|
| 1 |know 0.54, battery 0.51, just 0.41, see 0.3, people 0.29, places 0.28, time 0.27, back 0.27, think 0.26, first 0.26,|
| 2 |energy 0.17, earth 0.14, warming 0.12, climate 0.12, year 0.12, co 0.11, years 0.11, change 0.1, time 0.09, ocean 0.09,|
| 3 |iran 145.87, east 113.02, israel 98.82, middle 87.35, region 66.98, arab 61.62, peace 59.92, israeli 49.68, islamic 47.91, great 47.7,|
| 4 |weather 14.61, scores 4.46, score 3.87, data 3.11, numbers 2.25, colored 1.9, versus 1.71, element 1.59, vertical 1.51, dimensional 1.38,|
| 5 |device 93.58, surveillance 67.07, gps 65.98, devices 59.69, wireless 59.34, hackers 55.55, security 54.6, rb 52.88, software 51.17, hacking 47.43,|
| 6 |universe 258.72, quantum 164.32, particles 103.74, higgs 73.19, atoms 62.62, mechanics 59.88, particle 57.07, field 39.75, physics 37.23, physicists 29.85,|
| 7 |data 460.8

In [36]:
# =-=-=-=-=-=-=-=-=-=-=
# NMF Model
# =-=-=-=-=-=-=-=-=-=-= 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, 
                                   min_df = 2, 
                                   max_features = n_features, 
                                   stop_words = stopwords)
tfidf = tfidf_vectorizer.fit_transform(talks)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

nmf = NMF(n_components=n_topics, 
          random_state=1, 
          alpha=.1, 
          l1_ratio=.5, 
          init='nndsvd').fit(tfidf)
nmf_W = nmf.transform(tfidf)
nmf_H = nmf.components_

In [31]:
display_topics(nmf, tfidf_feature_names, n_top_words)

| 0 |people 0.8, just 0.73, know 0.66, think 0.54, time 0.48, see 0.47, said 0.44, life 0.36, years 0.36, first 0.33,|
| 1 |kids 1.05, school 0.97, students 0.75, teachers 0.63, education 0.62, teacher 0.35, schools 0.3, learning 0.29, classroom 0.28, teach 0.24,|
| 2 |countries 0.47, people 0.46, percent 0.38, money 0.34, country 0.34, government 0.33, dollars 0.32, global 0.32, economic 0.32, economy 0.29,|
| 3 |brain 2.1, neurons 0.48, brains 0.28, cortex 0.16, neuron 0.15, activity 0.14, neuroscience 0.13, arm 0.13, memory 0.12, disorders 0.12,|
| 4 |cancer 1.78, tumor 0.42, breast 0.22, disease 0.19, drug 0.18, tumors 0.16, body 0.16, protein 0.14, blood 0.13, cancers 0.12,|
| 5 |dna 1.19, genome 0.5, genes 0.35, gene 0.3, chromosome 0.28, genetic 0.27, cell 0.24, molecular 0.21, code 0.21, species 0.19,|
| 6 |universe 1.2, galaxies 0.47, galaxy 0.43, stars 0.35, light 0.27, telescope 0.25, dark 0.25, see 0.2, planets 0.17, theory 0.17,|
| 7 |women 2.05, men 0.89, girls 0.51, woma

In [42]:
display_topics(nmf_H, nmf_W, tfidf_feature_names, talks, n_top_words, n_top_documents)

Topic 0:
people just know think time see said life years first


KeyError: 1051