In [1]:
#! /usr/bin/env python

# =-=-=-=-=-=
# Consolidated imports for entire notebook
# =-=-=-=-=-=

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import sklearn.feature_extraction.text as text
from sklearn.decomposition import NMF


# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

# Load the talks CSV, supplying the column names explicitly
# NOTE(review): passing `names=` means the first row of the file is read as
# data, not as a header -- confirm the CSV really has no header row.
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)

# Pull the columns used downstream out as plain Python lists
talks = df['text'].tolist()
authors = df['author'].tolist()
dates = df['date'].tolist()

# Build "<author> <year>" labels: deleting ASCII letters and spaces from each
# date string leaves the remaining characters (presumably the year digits),
# which are then combined with the matching author.
years = []
for date in dates:
    years.append(re.sub('[A-Za-z ]', '', date))
authordate = [author + " " + year for author, year in zip(authors, years)]

In [2]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Documentation: https://pypi.python.org/pypi/lda
# LDA requires a DTM as input

# From the Stopwords Notebook:
tokenizer = WhitespaceTokenizer()

# Read the stopword file inside a context manager so the handle is closed,
# and use a raw string so `\s` is a regex class rather than an invalid
# string escape.
with open('../data/stopwords_tt.txt', 'r') as stopword_file:
    stopwords = re.split(r'\s+', stopword_file.read().lower())

# `stopwords` stays a list for any later cell that uses it; the set is only
# for fast membership tests inside the loop below.
stopword_set = set(stopwords)

# Loop to tokenize, stop, and stem (if needed) texts.
texts = []
for talk in talks:
    # clean and tokenize document string: drop every character that is not a
    # word character, apostrophe or whitespace, then lowercase
    raw = re.sub(r"[^\w\d'\s]+",'', talk).lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stopword_set]
    # stem tokens (disabled; `p_stemmer` is not defined in the cells shown here)
    # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stopped_tokens)

In [5]:
# =-=-=-=-=-=
# Get NMF topics
# =-=-=-=-=-=

# All our variables are here to make it easier to make adjustments
n_samples = len(texts)
n_features = 1000
n_topics = 35
n_top_words = 15

# Read the stopword list inside a context manager so the file handle is closed.
with open('../data/stopwords_tt.txt', 'r') as stopword_file:
    tt_stopwords = stopword_file.read().splitlines()
# NOTE(review): `tt_stopwords` is loaded but never handed to the vectorizer
# below, and the vectorizer is fit on the raw `talks`, not the stopped
# `texts`. If stopword removal is intended, pass `stop_words = tt_stopwords`
# -- left unchanged here because it would alter the resulting topics.

# Get tf-idf features for NMF
tfidf_vectorizer = text.TfidfVectorizer(max_df = 0.95,
                                        min_df = 2,
                                        max_features = n_features)
tfidf = tfidf_vectorizer.fit_transform(talks)

# Fit the NMF model (announce first, so the message precedes the slow step)
print("Fitting the NMF model with {} topics for {} documents with {} features."
      .format(n_topics, n_samples, n_features))
# NOTE(review): scikit-learn deprecated NMF's `alpha` argument in 1.0 in
# favour of `alpha_W` / `alpha_H`; confirm the installed version before
# upgrading, as the new parameters are not a drop-in replacement.
nmf = NMF(n_components = n_topics,
          random_state = 1,
          alpha = 0.1,
          l1_ratio = 0.5).fit(tfidf)

Fitting the NMF model with 35 topics for 2092 documents with 1000 features.


In [None]:
# Densify the sparse tf-idf matrix into a plain array.
dtm = tfidf.toarray()

# fit_transform fits `nmf` again on the dense matrix (the model was already
# fit on `tfidf` in the previous cell) and returns the transformed data,
# which is kept as the document-topic array.
doctopic = nmf.fit_transform(X = dtm)

In [None]:
# =-=-=-=-=-=
# Get NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` this prints a blank line, a
    ``Topic <id>:`` header, and then one line of ``<term> <weight>`` pairs
    separated by ``", "``, ordered from the largest weight to the smallest.
    Weights are rounded to two decimals.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D arrays that
        support ``argsort()`` (one array per topic).
    feature_names : sequence of str
        Term for each column position of a topic array.
    n_top_words : int
        How many of the top-weighted terms to print per topic.
    """
    for topic_id, topic in enumerate(model.components_):
        # argsort() is ascending, so this reversed slice walks the
        # n_top_words largest weights from highest to lowest.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print('\nTopic {}:'.format(topic_id))
        # Join with the separator (rather than appending it to every item)
        # so the line does not end in a dangling ", ".
        print(', '.join(feature_names[i] + ' ' + str(round(topic[i], 2))
                        for i in top_indices))

print("\nTopics in NMF model:")
# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); prefer the new method when it exists and fall back
# to the old one otherwise. list() keeps the result a plain list either way.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words) #n_top_words can be changed on the fly