In [1]:
#! /usr/bin/env python

# =-=-=-=-=-=
# Consolidated imports for entire notebook
# =-=-=-=-=-=

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import sklearn.feature_extraction.text as text
from sklearn.decomposition import NMF


# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

# Create pandas dataframe & lists
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()

# Get years from date list and combing with author list for labels
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
authordate = [author+" "+year for author, year in zip(authors, years)]

In [9]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Documentation: https://pypi.python.org/pypi/lda
# LDA requires a DTM as input

# From the Stopwords Notebook:
tokenizer = WhitespaceTokenizer()
stopwords = re.split('\s+', open('../data/tt_stop.txt', 'r').read().lower())

# Loop to tokenize, stop, and stem (if needed) texts.
texts = []
for i in talks:   
    # clean and tokenize document string
    raw = re.sub(r"[^\w\d'\s]+",'', i).lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stopwords]
    # stem tokens
    # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stopped_tokens)

In [11]:
# =-=-=-=-=-=-=-=-=-=-=
# Re-Assemble Texts as Strings from Lists of Words
# =-=-=-=-=-=-=-=-=-=-= 

strungs = []
for text in texts:
    strung = ' '.join(text)
    strungs.append(strung)

In [30]:
# =-=-=-=-=-=
# Get NMF topics
# =-=-=-=-=-=

import sklearn.feature_extraction.text as sk_text

# All our variables are here to make it easier to make adjustments
n_samples = len(strungs)
n_features = 2000
n_topics = 40
n_top_words = 15
# tt_stopwords = open('../data/stopwords_tt.txt', 'r').read().splitlines()

# Get tf-idf features for NMF
tfidf_vectorizer = sk_text.TfidfVectorizer(max_df = 0.95,
                                        min_df = 2,
                                        max_features = n_features)
tfidf = tfidf_vectorizer.fit_transform(strungs)

# Fit the NMF model
nmf = NMF(n_components = n_topics,
          random_state = 1,
          alpha = 0.1,
          l1_ratio = 0.5).fit(tfidf)
print("Fitting the NMF model with {} topics for {} documents with {} features."
      .format(n_topics, n_samples, n_features))

Fitting the NMF model with 40 topics for 2092 documents with 2000 features.


In [31]:
# =-=-=-=-=-=
# Get NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    for topic_id, topic in enumerate(model.components_):
        print('\nTopic {}:'.format(int(topic_id)))
        print(''.join([feature_names[i] + ' ' + str(round(topic[i], 2))
              +', ' for i in topic.argsort()[:-n_top_words - 1:-1]]))

print("Topics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words) #n_top_words can be changed on the fly

Topics in NMF model:

Topic 0:
just 0.9, going 0.82, really 0.77, people 0.71, think 0.71, know 0.7, get 0.63, actually 0.62, things 0.62, see 0.59, right 0.51, little 0.48, time 0.48, thing 0.47, go 0.46, 

Topic 1:
cells 1.81, stem 0.69, cell 0.55, tissue 0.28, bone 0.26, organs 0.23, disease 0.19, drug 0.18, actually 0.13, lab 0.12, drugs 0.12, heart 0.11, patient 0.1, blood 0.1, patients 0.09, 

Topic 2:
energy 1.24, oil 0.87, climate 0.4, carbon 0.36, nuclear 0.35, coal 0.34, gas 0.29, solar 0.27, fuel 0.26, electricity 0.24, emissions 0.2, power 0.2, wind 0.2, percent 0.17, billion 0.16, 

Topic 3:
women 2.5, men 1.01, woman 0.39, sex 0.26, female 0.26, gender 0.24, pm 0.19, male 0.17, violence 0.14, sexual 0.14, equal 0.1, girls 0.1, rights 0.09, boys 0.09, media 0.08, 

Topic 4:
data 2.62, information 0.43, map 0.15, web 0.14, see 0.08, patterns 0.07, numbers 0.07, points 0.06, look 0.06, privacy 0.06, text 0.06, analysis 0.06, computer 0.05, satellite 0.05, collect 0.05, 

Top

In [29]:
dtm = tfidf.toarray()
doctopic = nmf.fit_transform(dtm) # This is an array

In [33]:
# =-=-=-=-=-=
# Saving output to CSV
# =-=-=-=-=-=

# Since DOCTOPIC is an array, you can just do:
#      np.savetxt("foo.csv", doctopic, delimiter=",", fmt = "%s")
# http://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
#
# The above won't give you the names of the files. Instead try this:

topsnum = np.array([list(range(n_topics))])
# topsnum = np.indices((1,n_topics))[1] <-- this is more than we need,
#                                           but it's cool to know more tricks
#
# Two ways to get an array that is of the form [[0,1,2,3,...]].
# It will have the desired dimensions of (1,35) which is what we want

fileheader = np.concatenate((np.array([["citations"]]), topsnum),axis = 1)

docTopics = np.concatenate((authordate, doctopic), axis = 1)
docTopics = np.concatenate((fileheader, docTopics), axis = 0)

np.savetxt("../data/nmf_topics.csv", docTopics, delimiter=",", fmt = "%s")

IndexError: axis 1 out of bounds [0, 1)