In [1]:
#! /usr/bin/env python

# =-=-=-=-=-=
# Consolidated imports for entire notebook
# =-=-=-=-=-=

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import sklearn.feature_extraction.text as sk_text
from sklearn.decomposition import NMF


# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

# Create pandas dataframe & lists
# The CSV is read with explicitly supplied column labels.
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)

# Pull the three columns used later out as plain Python lists.
talks, authors, dates = (df[column].tolist()
                         for column in ('text', 'author', 'date'))

# Strip ASCII letters and spaces from each date string to leave the year,
# then combine author and year into one label per talk.
years = [re.sub('[A-Za-z ]', '', entry) for entry in dates]
authordate = [" ".join(pair) for pair in zip(authors, years)]

In [2]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Documentation: https://pypi.python.org/pypi/lda
# LDA requires a DTM as input

# From the Stopwords Notebook:
# From the Stopwords Notebook:
tokenizer = WhitespaceTokenizer()

# Read the stopword list inside a context manager so the file handle is
# closed, and use a raw string for the whitespace pattern.
with open('../data/tt_stop.txt', 'r') as stopword_file:
    stopwords = re.split(r'\s+', stopword_file.read().lower())

# Membership is tested once per token, so look words up in a set rather
# than scanning the list each time. `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Clean, tokenize, and drop stopwords for every talk (no stemming is applied).
texts = []
for talk in talks:
    # Remove everything except word characters, apostrophes and whitespace,
    # then lowercase so tokens match the lowercased stopword list.
    raw = re.sub(r"[^\w\d'\s]+", '', talk).lower()
    tokens = tokenizer.tokenize(raw)
    texts.append([token for token in tokens if token not in stopword_set])

# =-=-=-=-=-=-=-=-=-=-=
# Re-Assemble Texts as Strings from Lists of Words
# =-=-=-=-=-=-=-=-=-=-=

# The vectorizer further down takes one string per document.
strungs = [' '.join(text) for text in texts]

In [3]:
# =-=-=-=-=-=
# Get NMF topics
# =-=-=-=-=-=


# Tunable settings, gathered in one place.
n_samples = len(strungs)   # number of documents
n_features = 2000          # cap on vocabulary size passed to the vectorizer
n_topics = 40              # number of NMF components
n_top_words = 15           # words shown per topic when printing

# Build the tf-idf matrix that NMF is fitted on.
vectorizer = sk_text.TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features)
tfidf = vectorizer.fit_transform(strungs)

# Fit the NMF model with a fixed random_state.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf = NMF(n_components=n_topics, random_state=1, alpha=0.1, l1_ratio=0.5)
nmf.fit(tfidf)
print("Fitting the NMF model with {} topics for {} documents with {} features."
      .format(n_topics, n_samples, n_features))

Fitting the NMF model with 40 topics for 2092 documents with 2000 features.


In [None]:
# =-=-=-=-=-=
# Get NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters:
        model: object exposing ``components_``, iterated one topic at a
            time; each topic must support ``argsort()`` and indexing.
        feature_names: sequence of strings indexed by feature position.
        n_top_words: how many words to show per topic.

    For each topic a blank line and ``Topic <id>:`` are printed, followed by
    one line of ``word weight, `` entries in descending weight order, with
    weights rounded to two decimals.
    """
    for topic_id, weights in enumerate(model.components_):
        print('\nTopic {}:'.format(int(topic_id)))
        # argsort is ascending, so walk it backwards to take the largest
        # n_top_words weights first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        entries = []
        for position in ranked:
            entries.append(feature_names[position] + ' '
                           + str(round(weights[position], 2)) + ', ')
        print(''.join(entries))

# Print the top words of each fitted topic.
print("Topics in NMF model:")
# The fitted vectorizer is bound to `vectorizer`; the previous name
# `tfidf_vectorizer` is not defined anywhere in this notebook.
tfidf_feature_names = vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words) #n_top_words can be changed on the fly

In [4]:
# Convert the tf-idf matrix to an array via its toarray() method.
dtm = tfidf.toarray()
# Left disabled: fit_transform would fit `nmf` again, although it was already
# fitted when it was created.
# doctopic = nmf.fit_transform(dtm) # This is an array

In [5]:
# Show the array's dimensions; the recorded output below is (2092, 2000).
dtm.shape

(2092, 2000)

In [6]:
# Show which class the fitted model object is an instance of.
type(nmf)

sklearn.decomposition.nmf.NMF

In [7]:
# Feature names reported by the fitted vectorizer.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out()
# instead of get_feature_names() -- confirm against the installed version.
features = vectorizer.get_feature_names()

In [11]:
# Print every topic as an (index, weight-vector) tuple for a quick look at
# the raw component values.
for numbered_topic in enumerate(nmf.components_):
    print(numbered_topic)

(0, array([ 0.08290412,  0.21683476,  0.02008461, ...,  0.03555429,
        0.01177436,  0.04738723]))
(1, array([ 0.        ,  0.04105155,  0.        , ...,  0.        ,
        0.        ,  0.        ]))
(2, array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
        0.        ,  0.03067794]))
(3, array([ 0.        ,  0.        ,  0.        , ...,  0.07211298,
        0.        ,  0.        ]))
(4, array([ 0.,  0.,  0., ...,  0.,  0.,  0.]))
(5, array([ 0.,  0.,  0., ...,  0.,  0.,  0.]))
(6, array([ 0.,  0.,  0., ...,  0.,  0.,  0.]))
(7, array([ 0.04815598,  0.0468227 ,  0.        , ...,  0.        ,
        0.        ,  0.        ]))
(8, array([ 0.        ,  0.        ,  0.        , ...,  0.02196816,
        0.00354517,  0.        ]))
(9, array([ 0.        ,  0.01496185,  0.        , ...,  0.        ,
        0.        ,  0.        ]))
(10, array([ 0.        ,  0.        ,  0.        , ...,  0.04021474,
        0.        ,  0.        ]))
(11, array([ 0.       ,  0.0502

In [None]:
# =-=-=-=-=-=
# Saving output to CSV
# =-=-=-=-=-=

# Write a CSV with one row per document: a label column followed by one
# column per topic weight.
#
# If only the raw matrix is wanted, a 2-D array can be dumped directly:
#      np.savetxt("foo.csv", doctopic, delimiter=",", fmt = "%s")
# http://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
#
# That omits the document labels, so they are prepended as a column below.

# Document-topic weights from the already-fitted model. transform() is used
# instead of fit_transform() so the fitted `nmf` object is not refitted.
doctopic = nmf.transform(tfidf)

# Topic ids as a single row of the form [[0, 1, 2, ...]], shape (1, n_topics).
# np.indices((1, n_topics))[1] would build the same array.
topsnum = np.array([list(range(n_topics))])

# Header row: "citations" followed by the topic ids. Numbers are converted to
# strings explicitly rather than relying on implicit promotion in concatenate.
fileheader = np.concatenate((np.array([["citations"]]), topsnum.astype(str)), axis = 1)

# `authordate` is a flat list; concatenating along axis 1 needs a 2-D column,
# so reshape it to (n_documents, 1) first.
labels = np.array(authordate, dtype = str).reshape(-1, 1)

docTopics = np.concatenate((labels, doctopic.astype(str)), axis = 1)
docTopics = np.concatenate((fileheader, docTopics), axis = 0)

# NOTE(review): the values are written as-is, joined by the delimiter, so a
# comma inside a label would shift that row's columns -- confirm the labels
# are comma-free.
np.savetxt("../data/nmf_topics.csv", docTopics, delimiter=",", fmt = "%s")