In [1]:
# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re

# Create pandas dataframe & lists
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)
talks = df.text.tolist()

# =-=-=-=-=-=-=
# Create labels
# =-=-=-=-=-=-=

authors = df.author.tolist()
dates = df.date.tolist()
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
labels = [author+" "+year for author, year in zip(authors, years)]

In [2]:
# The list we generated above comes with the pandas dataframe labels.
# We pop that first item out of the list
labels.pop(0)

# Now check our list
labels[0:5]

['Al Gore 2006',
 'David Pogue 2006',
 'Cameron Sinclair 2006',
 'Sergey Brin + Larry Page 2007',
 'Nathalie Miebach 2011']

In [3]:
# Okay, here's how Alan Riddell does it (https://de.dariah.eu/tatom/topic_model_python.html)

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

tt_stopwords = open('../data/stopwords_all.txt', 'r').read().splitlines()

# Riddell uses a plain counter here (?)
vectorizer = CountVectorizer(input='talks', stop_words = tt_stopwords, min_df=10)
dtm = vectorizer.fit_transform(talks).toarray()
vocab = np.array(vectorizer.get_feature_names())

In [4]:
print(dtm.shape) # # of documents by the # of terms

(2069, 11470)


In [5]:
from sklearn import decomposition

n_topics = 50
n_top_words = 10

# This is the familiar way to set the NMF parameters.

clf = decomposition.NMF(n_components=n_topics, 
                        solver='cd',
                        random_state = 1,
                        alpha = 0.5,
                        l1_ratio = 0.5)

In [6]:
# And here we take the document-term matrix and grab the doctopic sub-matrix. (Wrong term, I know)
doctopic = clf.fit_transform(dtm) 

In [7]:
topic_words = []
for topic in clf.components_:
    word_idx = np.argsort(topic)[::-1][0:n_top_words]
    topic_words.append([vocab[i] for i in word_idx])

In [8]:
print("Top NMF topics in...")
for i in range(len(doctopic)):
    top_topics = np.argsort(doctopic[i,:])[::-1][0:3]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    print("{}: {}".format(labels[i], top_topics_str))

Top NMF topics in...
Al Gore 2006: 8 25 12
David Pogue 2006: 18 3 40
Cameron Sinclair 2006: 0 8 10
Sergey Brin + Larry Page 2007: 13 0 2
Nathalie Miebach 2011: 19 11 1
Richard Wilkinson 2011: 16 49 10
Malcolm Gladwell 2011: 33 16 1
Jay Bradner 2011: 39 32 1
Béatrice Coron 2011: 17 9 46
Hasan Elahi 2011: 28 34 13
Paul Zak 2011: 11 10 45
Anna Mracek Dietrich 2011: 32 1 26
Daniel Wolpert 2011: 34 15 13
Martin Hanczyc 2011: 4 45 40
Aparna Rao 2011: 6 46 24
Ben Kacyra 2011: 10 1 13
Allan Jones 2011: 8 34 41
Charlie Todd 2011: 4 15 46
Alexander Tsiaras 2011: 1 28 34
Yves Rossy 2011: 9 16 38
Thomas Suarez 2011: 11 10 2
Cynthia Kenyon 2011: 0 43 10
Robin Ince 2011: 46 10 38
James Howard Kunstler 2007: 47 26 0
Phil Plait 2011: 40 28 19
Péter Fankhauser 2011: 24 2 0
Joe Sabia 2011: 48 8 45
Britta Riley 2011: 26 34 2
Amy Purdy 2011: 37 1 28
Damon Horowitz 2011: 6 34 10
Annie Murphy Paul 2011: 19 11 7
John Bohannon 2011: 5 37 20
Charles Limb 2011: 9 32 36
Kathryn Schulz 2011: 49 29 7
David Kelley 

Siddhartha Mukherjee 2015: 18 21 14
Nicholas Negroponte 2008: 9 17 0
Neri Oxman 2015: 11 45 0
Teitur 2015: 13 46 24
Vijay Kumar 2015: 11 26 0
Alyson McGregor 2015: 48 15 30
Anders Fjellberg 2015: 5 42 9
Meklit Hadero 2015: 34 39 6
Will Potter 2015: 49 38 10
Jennifer Doudna 2015: 39 3 0
Tom Uglow 2015: 46 8 9
Francesco Sauro 2015: 10 8 13
Jill Bolte Taylor 2008: 24 2 10
Hilary Cottam 2015: 4 0 10
Cesar Harada 2015: 1 0 10
Christine Sun Kim 2015: 30 43 22
Mathias Jud 2015: 49 34 10
Daniel Levitin 2015: 39 1 13
Nancy Lublin 2015: 4 42 33
Melissa Fleming 2015: 16 33 1
Patrícia Medici 2015: 30 3 6
Harald Haas 2015: 14 12 16
Frank Gehry 2008: 21 9 36
Jenni Chang and Lisa Dazols 2015: 13 10 12
Andreas Ekström 2015: 1 26 39
Chelsea Shields 2015: 45 15 11
Jean-Paul Mari 2015: 5 1 0
Josh Luber 2015: 1 15 34
Nonny de la Peña 2015: 32 16 11
Anote Tong 2015: 1 0 37
Carl Safina 2015: 18 7 0
Genevieve von Petzinger 2015: 38 15 19
Ann Morgan 2015: 10 34 38
Majora Carter 2006: 34 1 25
Jimmy Wales 2006:

IndexError: list index out of range

In [None]:
doctopic_normed = doctopic / np.sum(doctopic, axis=1, keepdims=True)

In [9]:
# show the top 15 words
for t in range(len(topic_words)):
    print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))

Topic 0: ve back come done seen think ll looking business get
Topic 1: people person group different change lives give social live reason
Topic 2: years ago year last old million three age today five
Topic 3: said went came didn come thought back ll man people
Topic 4: brain neurons brains memory different body first control arm activity
Topic 5: women men woman girls gender talk heart young man sex
Topic 6: life second self lives live living death happiness happy question
Topic 7: think like thinking important idea sense talk quite fact thought
Topic 8: technology computer internet machine information time computers today system technologies
Topic 9: cells stem cell organs body disease organ liver heart tissue
Topic 10: like called okay first made laughter looks thank phone guys
Topic 11: just like bit laughter let okay open thank show couple
Topic 12: fish ocean sea animals species sharks back coral shark oceans
Topic 13: design building made idea art sort designers project create di

In [None]:
# Fit NMF
print("Fitting the NMF model of {} topics to the {} matrix.".format(n_topics, tfidf.shape))
nmf = NMF(n_components = n_topics,
            solver='cd',
            random_state = 1,
            alpha = 0.5,
            l1_ratio = 0.5).fit(tfidf)

# W = model.fit_transform(tfidf)
# H = model.components_

# print(W.shape, H.shape)
# np.savetxt("8505-40-1-02-05-dtm.csv", H, delimiter=",", fmt = "%s")
# np.savetxt("8505-40-1-02-05-twm.csv", W, delimiter=",", fmt = "%s")

feature_names = vectorizer.get_feature_names()

print("Topics in NMF model:")
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #{}:".format(topic_idx))
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [None]:
# =-=-=-=-=-=
# Get NMF topics
# =-=-=-=-=-=

import sklearn.feature_extraction.text as sk_text
from sklearn.decomposition import NMF

# All our variables are here to make it easier to make adjustments
n_samples = len(strungs)
n_features = 3000
n_topics = 40
n_top_words = 15
max_percent = 0.85
min_percent = 0.05
tt_stopwords = open('../data/stopwords-tt-20170523.txt', 'r').read().splitlines()

# Get tf-idf features for NMF
vectorizer = sk_text.TfidfVectorizer(max_df = max_percent, 
                                     min_df = min_percent,
                                     max_features = n_features,
                                     stop_words = tt_stopwords)
tfidf = vectorizer.fit_transform(strungs)

# Fit the NMF model
nmf = NMF(n_components = n_topics,
          random_state = 1,
          alpha = 0.1,
          l1_ratio = 0.5).fit(tfidf)
print("Fitting the NMF model with {} topics for {} documents with {} features."
      .format(n_topics, n_samples, n_features))

In [None]:
# =-=-=-=-=-=
# Get NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    for topic_id, topic in enumerate(model.components_):
        print('\nTopic {}:'.format(int(topic_id)))
        print(''.join([feature_names[i] + ' ' + str(round(topic[i], 2))
              +', ' for i in topic.argsort()[:-n_top_words - 1:-1]]))

features = vectorizer.get_feature_names()
# print(features)

# This should work now.

print("Topics in NMF model:")
print_top_words(nmf, features, n_top_words) #n_top_words can be changed on the fly

In [None]:
dtm = tfidf.toarray()
doctopic = nmf.fit_transform(dtm) # This is an array

# features = vectorizer.get_feature_names() # This is already done above.

# =-=-=-=-=-=-=-=-=-=-=
# Unused functionality
# =-=-=-=-=-=-=-=-=-=-= 
# print(features)
# for topicidx in enumerate(nmf.components_):
#     print(topicidx)

In [None]:
# =-=-=-=-=-=
# Saving output to CSV
# =-=-=-=-=-=

import numpy as np

# Since DOCTOPIC is an array, you can just do:
#      np.savetxt("foo.csv", doctopic, delimiter=",", fmt = "%s")
# http://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
#
# The above won't give you the names of the files. Instead try this:

topsnum = np.array([list(range(n_topics))])
# topsnum = np.indices((1,n_topics))[1] <-- this is more than we need,
#                                           but it's cool to know more tricks
#
# Two ways to get an array that is of the form [[0,1,2,3,...]].
# It will have the desired dimensions of (1,35) which is what we want

fileheader = np.concatenate((np.array([["citations"]]), topsnum),axis = 1)
authordate = np.array([df.author])

docTopics = np.concatenate((authordate.T, doctopic), axis = 1)
docTopics = np.concatenate((fileheader, docTopics), axis = 0)

np.savetxt("../outputs/nmf_topics_20170523.csv", doctopic, delimiter=",", fmt = "%s")
#np.savetxt("../data/nmf_topics.csv", docTopics, delimiter=",", fmt = "%s")