In [2]:
# TEDtalks: Topics with NMF

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re


# Load the talks CSV into a DataFrame, supplying the column names explicitly.
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# Pull the columns used below out as plain Python lists.
talks = df['text'].tolist()
authors = df['author'].tolist()
dates = df['date'].tolist()

# Strip ASCII letters and spaces from every date string; whatever remains
# is used as the year.
years = [re.sub('[A-Za-z ]', '', date_string) for date_string in dates]

# Build an "<author> <year>" citation label for every talk.
authordate = [" ".join(pair) for pair in zip(authors, years)]

# Pair each citation with its talk text in a second DataFrame so the
# alignment of the two lists can be checked by eye.
citations = pandas.DataFrame(dict(citation=authordate, text=talks))

# Uncomment to preview the first rows and confirm the pairing:
# citations.head()

In [6]:
# =-=-=-=-=-=-=-=-=-=-=
# Settings & Display Functions
# =-=-=-=-=-=-=-=-=-=-= 

# Model and display settings used by the cells below.
n_topics = 40          # number of NMF components (topics) to extract
n_features = 5000      # vocabulary cap handed to the vectorizer as max_features
n_top_words = 10       # words printed per topic
n_top_documents = 5    # documents printed per topic

# Read the stopword list: lower-case the whole file and split on any run of
# whitespace. The context manager closes the file handle, and str.split()
# with no arguments never yields empty strings (the previous
# re.split('\s+', ...) left '' entries when the file started or ended with
# whitespace, and its non-raw pattern is an invalid escape sequence).
with open('../data/stopwords_all.txt', 'r') as stopword_file:
    stopwords = stopword_file.read().lower().split()

def display_topics(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Args:
        model: fitted object exposing ``components_``, iterated row by row
            (one row of term weights per topic).
        feature_names: sequence mapping a column index to its term string.
        n_top_words: how many of the highest-weighted terms to show per topic.

    Each line has the form ``<topic index>: <term> <weight>, <term> <weight>,``
    with weights rounded to two decimals. A blank line is printed at the end.
    """
    for number, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        entries = []
        for column in ranked:
            entries.append(feature_names[column] + ' ' + str(round(weights[column], 2)) + ',')
        print("{:d}: ".format(number) + " ".join(entries))
    print()
    
# Both NMF **and** LDA produce two matrices:
# H - topics x words: the weight of each word within each topic
# W - documents x topics: the weight of each topic within each document

def display_topics_HW(H, W, feature_names, documents, n_top_words, n_top_documents):
    """Print each topic's top words followed by its top-ranked documents.

    Args:
        H: matrix iterated row by row, one row of word weights per topic.
        W: matrix whose column ``k`` holds the weight of topic ``k`` for
            every document.
        feature_names: sequence mapping a column index of ``H`` to its term.
        documents: sequence of documents, indexed like the rows of ``W``.
        n_top_words: number of highest-weighted words to print per topic.
        n_top_documents: number of highest-weighted documents to print per topic.

    Relies on the module-level name ``np`` (numpy) being available.
    """
    for k, word_weights in enumerate(H):
        print("Topic {}:".format(k))
        # Indices of the largest word weights, biggest first.
        best_words = word_weights.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[j] for j in best_words))
        # Rank documents by their weight for this topic, biggest first.
        doc_ranking = np.argsort(W[:, k])[::-1]
        for d in doc_ranking[:n_top_documents]:
            print(documents[d])

In [7]:
# =-=-=-=-=-=-=-=-=-=-=
# NMF Model
# =-=-=-=-=-=-=-=-=-=-= 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Build a tf-idf document-term matrix for the talks (NMF can work on tf-idf).
# Terms are dropped when they occur in more than 95% of the talks or in
# fewer than 2 of them, and the vocabulary is capped at n_features.
# NOTE(review): `get_feature_names()` and NMF's `alpha` argument appear to be
# deprecated/removed in newer scikit-learn releases (in favour of
# `get_feature_names_out()` and `alpha_W`/`alpha_H`) — confirm against the
# installed version before upgrading.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords,
                                   max_features=n_features,
                                   min_df=2,
                                   max_df=0.95)
tfidf = tfidf_vectorizer.fit_transform(talks)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Factorise the tf-idf matrix into n_topics components. The fixed
# random_state keeps the run repeatable.
nmf = NMF(n_components=n_topics,
          init='nndsvd',
          alpha=.1,
          l1_ratio=.5,
          random_state=1)
nmf.fit(tfidf)

# H holds the fitted components; W is the talks projected onto them.
nmf_H = nmf.components_
nmf_W = nmf.transform(tfidf)

In [8]:
# Print every NMF topic as its n_top_words highest-weighted terms, each
# followed by its weight rounded to two decimals.
display_topics(nmf, tfidf_feature_names, n_top_words)

0: people 0.81, just 0.73, know 0.57, think 0.56, see 0.48, time 0.48, years 0.36, life 0.35, first 0.33, back 0.3,
1: bacteria 1.39, molecule 0.28, antibiotics 0.2, molecules 0.15, organisms 0.12, bacterial 0.11, viruses 0.1, immune 0.08, sensing 0.07, tumors 0.07,
2: percent 0.46, countries 0.44, money 0.42, dollars 0.41, growth 0.34, economic 0.34, economy 0.33, people 0.32, business 0.3, billion 0.3,
3: patients 0.92, health 0.7, patient 0.6, care 0.48, medical 0.42, doctors 0.41, hospital 0.3, doctor 0.3, surgery 0.29, medicine 0.29,
4: ice 0.98, pole 0.44, antarctica 0.39, climate 0.25, glacier 0.23, snow 0.16, south 0.16, polar 0.15, north 0.14, expedition 0.13,
5: universe 1.22, galaxies 0.43, galaxy 0.36, space 0.27, dark 0.24, theory 0.24, quantum 0.21, physics 0.21, higgs 0.2, particles 0.19,
6: women 2.17, men 0.94, woman 0.38, gender 0.22, pm 0.18, violence 0.15, female 0.15, feminist 0.15, equality 0.14, stories 0.12,
7: brain 2.11, neurons 0.49, brains 0.28, cortex 0.16,

In [11]:
# display_topics_HW references the module-level name `np`, so numpy has to be
# imported before the call below runs.
import numpy as np

# For each topic, print its top words and then the n_top_documents talks
# with the largest weight for that topic in nmf_W.
display_topics_HW(nmf_H, nmf_W, tfidf_feature_names, talks, n_top_words, n_top_documents)

Topic 0:
people just know think see time years life first back
Wow  what an honor. I always wondered what this would feel like. So eight years ago  I got the worst career advice of my life. I had a friend tell me   Don't worry about how much you like the work you're doing now. It's all about just building your resume.  And I'd just come back from living in Spain for a while  and I'd joined this Fortune     company. I thought   This is fantastic. I'm going to have big impact on the world.  I had all these ideas. And within about two months  I noticed at about   am every morning I had this strange urge to want to slam my head through the monitor of my computer. I don't know if anyone's ever felt that. And I noticed pretty soon after that that all the competitors in our space had already automated my job role. And this is right about when I got this sage advice to build up my resume. Well  as I'm trying to figure out what two story window I'm going to jump out of and change things up  I r