In [2]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re


# Create pandas dataframe
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# Create lists for the data
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()

# Getting only the years from dates list
years = [re.sub('[A-Za-z ]', '', item) for item in dates]

# Combining year with presenter for citation
authordate = [author+" "+year for author, year in zip(authors, years)]

# Just to check to see if things are synced,
# let's create a new df with the two lists.

cited_texts = pandas.DataFrame(
    {'citation': authordate,
     'text': talks,
    })

# This just shows that the citation and the text are paired correctly.
cited_texts.head()

Unnamed: 0,citation,text
0,Author,Text
1,Al Gore 2006,Thank you so much Chris. And it's truly a gre...
2,David Pogue 2006,Hello voice mail my old friend. I've called f...
3,Cameron Sinclair 2006,I'm going to take you on a journey very quickl...
4,Sergey Brin + Larry Page 2007,Sergey Brin I want to discuss a question I kn...


In [24]:
# =-=-=-=-=-=-=-=-=-=-=
# Settings & Display Function
# =-=-=-=-=-=-=-=-=-=-= 

no_topics = 50
no_features = 5000
no_top_words = 15

stopwords = re.split('\s+', open('../data/stopwords_all.txt', 'r').read().lower())

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("| "+str(topic_idx)+" |"+' '.join([feature_names[i] + ' ' + str(round(topic[i], 2))
              +',' for i in topic.argsort()[:-no_top_words - 1:-1]])+"|")

In [13]:
        print("| {}| ".format(topic_idx))
        print(''.join([feature_names[i] + ' ' + str(round(topic[i], 2))
              +', ' for i in topic.argsort()[:-no_top_words - 1:-1]]))


# This version of the print/display function only lists words (no values)
#        print(" ".join([feature_names[i]
#                        for i in topic.argsort()[:-no_top_words - 1:-1]]))       

In [4]:
# =-=-=-=-=-=
# Generate LDA Model
# =-=-=-=-=-=

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# LDA can only use raw term counts (virtual BoW)
tf_vectorizer = CountVectorizer(max_df = 0.95, 
                                min_df = 2, 
                                max_features = no_features, 
                                stop_words = stopwords)
tf = tf_vectorizer.fit_transform(talks)
tf_feature_names = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_topics = no_topics, 
                                max_iter = 5, 
                                learning_method = 'online', 
                                learning_offset = 50.,
                                random_state = 0).fit(tf)

In [5]:
# =-=-=-=-=-=-=-=-=-=-=
# NMF Model
# =-=-=-=-=-=-=-=-=-=-= 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, 
                                   min_df = 2, 
                                   max_features = no_features, 
                                   stop_words = stopwords)
tfidf = tfidf_vectorizer.fit_transform(talks)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

nmf = NMF(n_components=no_topics, 
          random_state=1, 
          alpha=.1, 
          l1_ratio=.5, 
          init='nndsvd').fit(tfidf)

In [25]:
display_topics(lda, tf_feature_names, no_top_words)

| 0 |just 0.59, africa 0.59, people 0.43, africans 0.39, sectors 0.31, think 0.31, time 0.26, million 0.25, dollars 0.24, sector 0.24, percent 0.23, know 0.23, african 0.22, words 0.21, said 0.21,|
| 1 |know 0.54, battery 0.51, just 0.41, see 0.3, people 0.29, places 0.28, time 0.27, back 0.27, think 0.26, first 0.26, laughter 0.25, metal 0.25, liquid 0.25, life 0.24, fact 0.24,|
| 2 |energy 0.17, earth 0.14, warming 0.12, climate 0.12, year 0.12, co 0.11, years 0.11, change 0.1, time 0.09, ocean 0.09, life 0.09, ice 0.09, mother 0.09, level 0.08, global 0.08,|
| 3 |iran 145.87, east 113.02, israel 98.82, middle 87.35, region 66.98, arab 61.62, peace 59.92, israeli 49.68, islamic 47.91, great 47.7, palestinian 41.0, king 39.68, stasi 38.34, war 33.31, regime 29.32,|
| 4 |weather 14.61, scores 4.46, score 3.87, data 3.11, numbers 2.25, colored 1.9, versus 1.71, element 1.59, vertical 1.51, dimensional 1.38, elements 1.3, translation 1.28, temperature 1.25, horizontal 1.24, visualization

In [26]:
display_topics(nmf, tfidf_feature_names, no_top_words)

| 0 |people 0.8, just 0.73, know 0.66, think 0.54, time 0.48, see 0.47, said 0.44, life 0.36, years 0.36, first 0.33, back 0.32, ll 0.29, day 0.27, come 0.25, different 0.25,|
| 1 |kids 1.05, school 0.97, students 0.75, teachers 0.63, education 0.62, teacher 0.35, schools 0.3, learning 0.29, classroom 0.28, teach 0.24, teaching 0.24, class 0.22, student 0.19, kid 0.16, learn 0.16,|
| 2 |countries 0.47, people 0.46, percent 0.38, money 0.34, country 0.34, government 0.33, dollars 0.32, global 0.32, economic 0.32, economy 0.29, growth 0.28, billion 0.25, democracy 0.24, social 0.23, business 0.23,|
| 3 |brain 2.1, neurons 0.48, brains 0.28, cortex 0.16, neuron 0.15, activity 0.14, neuroscience 0.13, arm 0.13, memory 0.12, disorders 0.12, body 0.11, mental 0.11, electrical 0.11, human 0.11, consciousness 0.1,|
| 4 |cancer 1.78, tumor 0.42, breast 0.22, disease 0.19, drug 0.18, tumors 0.16, body 0.16, protein 0.14, blood 0.13, cancers 0.12, prostate 0.12, lung 0.1, devil 0.09, vessels 0.09