In [1]:
#! /usr/bin/env python

# =-=-=-=-=-=
# Consolidated imports for entire notebook
# =-=-=-=-=-=

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import *
from nltk.stem.porter import *
import numpy as np
import sklearn.feature_extraction.text as sk_text
from sklearn.decomposition import NMF


# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

# Load the talks CSV into a DataFrame, naming its columns explicitly,
# then pull the columns used later out as plain Python lists.
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)
talks, authors, dates = (df[column].tolist()
                         for column in ('text', 'author', 'date'))

# Strip every ASCII letter and space from each date string (presumably
# leaving just the year -- confirm against the CSV), then combine each
# author with that remainder to form a label.
letters_and_spaces = re.compile('[A-Za-z ]')
years = [letters_and_spaces.sub('', date) for date in dates]
authordate = [name + " " + year for name, year in zip(authors, years)]

In [None]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Load tokenizer, stopwords, and stemmer
tokenizer = WhitespaceTokenizer()
# Read the stopword file inside a context manager so the handle is closed,
# and use a raw string for the pattern so `\s` reaches the regex engine
# instead of being parsed as a (deprecated) string escape.
with open('../data/tt_stop.txt', 'r') as stopword_file:
    stopwords = re.split(r'\s+', stopword_file.read().lower())
# Set copy for constant-time membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)
p_stemmer = PorterStemmer()

# Tokenize, stop, and stem each talk.
texts = []
for talk in talks:
    # Delete every run of characters that is not a word character, an
    # apostrophe, or whitespace; then lowercase and split on whitespace.
    raw = re.sub(r"[^\w\d'\s]+", '', talk).lower()
    tokens = tokenizer.tokenize(raw)
    # Remove stopwords first, then stem what is left.
    stopped_tokens = [token for token in tokens if token not in stopword_set]
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    texts.append(stemmed_tokens)

# =-=-=-=-=-=-=-=-=-=-=
# Re-Assemble Texts as Strings from Lists of Words
# =-=-=-=-=-=-=-=-=-=-=

# One space-joined string per talk, in the same order as `talks`.
strungs = [' '.join(text) for text in texts]

In [3]:
# =-=-=-=-=-=
# Get NMF topics
# =-=-=-=-=-=


# All our variables are here to make it easier to make adjustments
data_set = talks  # raw talk texts; `strungs` (stopped/stemmed strings) is the alternative input
n_samples = len(data_set)
n_features = 1000   # upper bound on vocabulary size passed to the vectorizer
n_topics = 35
n_top_words = 15

# Get tf-idf features for NMF.
# max_df / min_df are given as proportions of documents: terms appearing in
# more than 90% or fewer than 1% of the documents are dropped.
vectorizer = sk_text.TfidfVectorizer(max_df=0.90,
                                     min_df=0.01,
                                     max_features=n_features)
tfidf = vectorizer.fit_transform(data_set)

# Announce the fit *before* starting it, so the message is visible while the
# (potentially slow) factorization runs rather than after it has finished.
# NOTE: the feature count reported is the `max_features` cap, not a measured
# vocabulary size.
print("Fitting the NMF model with {} topics for {} documents with {} features."
      .format(n_topics, n_samples, n_features))

# Fit the NMF model; random_state is fixed so repeated runs are reproducible.
nmf = NMF(n_components=n_topics,
          random_state=1,
          alpha=0.1,
          l1_ratio=0.5).fit(tfidf)

Fitting the NMF model with 35 topics for 2092 documents with 1000 features.


In [4]:
# =-=-=-=-=-=
# Get NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row is treated as one topic's weights and
        must support ``argsort()`` and integer indexing.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many of the largest-weight terms to show per topic.

    For each topic this prints a blank line plus ``Topic <id>:`` and then a
    single line of ``"<term> <weight rounded to 2 places>, "`` entries,
    ordered from largest weight to smallest (each entry, including the last,
    ends with ``", "``).
    """
    for topic_id, weights in enumerate(model.components_):
        print('\nTopic {}:'.format(int(topic_id)))
        # argsort is ascending, so reverse it and keep the leading entries
        # to get the indices of the largest weights, heaviest first.
        heaviest_first = weights.argsort()[::-1][:n_top_words]
        entries = []
        for idx in heaviest_first:
            entries.append(feature_names[idx] + ' '
                           + str(round(weights[idx], 2)) + ', ')
        print(''.join(entries))

# Vocabulary terms, in the column order of the tf-idf matrix.
# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one otherwise, so this cell runs on either.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

print("Topics in NMF model:")

# The names come from the vectorizer: the NMF model only holds the weights
# and has no record of which term each column stands for.
print_top_words(nmf, features, n_top_words) #n_top_words can be changed on the fly

Topics in NMF model:

Topic 0:
very 0.88, actually 0.83, see 0.75, which 0.71, really 0.64, here 0.64, re 0.64, then 0.51, some 0.51, think 0.49, into 0.48, things 0.47, way 0.44, different 0.43, would 0.43, 

Topic 1:
he 3.77, his 1.63, him 1.01, said 0.4, who 0.27, man 0.26, guy 0.24, father 0.18, had 0.15, story 0.14, says 0.14, know 0.12, did 0.1, face 0.09, god 0.08, 

Topic 2:
women 2.69, men 1.09, woman 0.41, sex 0.27, who 0.18, violence 0.16, their 0.1, girls 0.1, rights 0.09, stories 0.09, man 0.08, were 0.08, media 0.08, country 0.08, young 0.07, 

Topic 3:
she 3.25, her 2.29, said 0.23, mother 0.22, girl 0.22, woman 0.2, women 0.13, who 0.12, mom 0.11, know 0.09, old 0.08, little 0.08, don 0.08, met 0.08, baby 0.07, 

Topic 4:
water 2.7, into 0.16, surface 0.12, use 0.1, ve 0.09, air 0.09, plant 0.09, environment 0.09, india 0.09, material 0.07, green 0.07, south 0.06, re 0.06, here 0.06, produce 0.06, 

Topic 5:
cancer 2.24, disease 0.44, patients 0.44, drug 0.33, patient 0

In [None]:
# Dense copy of the tf-idf matrix, kept for inspection / the shape display below.
dtm = tfidf.toarray()

# Document-topic weights from the model that is ALREADY fitted.
# transform() is used instead of fit_transform() so the factorization is not
# run a second time and nmf.components_ is not overwritten by this cell.
# NOTE: these weights are solved against the fixed components, so they can
# differ slightly from the weights a fresh fit_transform() would return.
doctopic = nmf.transform(tfidf) # This is an array

dtm.shape

In [None]:
# Print every topic as an (index, weight-vector) tuple, one per line.
for topic_id, weights in enumerate(nmf.components_):
    print((topic_id, weights))

In [None]:
# =-=-=-=-=-=
# Saving output to CSV
# =-=-=-=-=-=

# DOCTOPIC is a plain array, so it can be dumped directly with:
#      np.savetxt("foo.csv", doctopic, delimiter=",", fmt = "%s")
# http://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
#
# That dump has no row labels and no header row, so a labelled table is
# assembled below as well.

# Topic numbers as a single row of the form [[0, 1, 2, 3, ...]], i.e. with
# shape (1, n_topics).
topsnum = np.array([list(range(n_topics))])
# topsnum = np.indices((1,n_topics))[1] <-- gives the same row; more than we
#                                           need, but a useful trick to know

# Header row: "citations" followed by the topic numbers. The numbers are cast
# to strings explicitly so the concatenation never has to mix string and
# integer dtypes.
fileheader = np.concatenate((np.array([["citations"]]), topsnum.astype(str)),
                            axis = 1)

# NOTE(review): this rebinds `authordate`. Earlier in the notebook it is a
# list of "author year" labels; from here on it is a (1, n_docs) array of
# authors only. Confirm which of the two the "citations" column should hold.
authordate = np.array([df.author])

# Label column on the left of the weights, then the header row on top.
docTopics = np.concatenate((authordate.T, doctopic), axis = 1)
docTopics = np.concatenate((fileheader, docTopics), axis = 0)

# Only the unlabelled weight matrix is written out; the labelled table is
# saved by the commented-out line underneath.
np.savetxt("../data/dt_KK_test.csv", doctopic, delimiter=",", fmt = "%s")
#np.savetxt("../data/nmf_topics.csv", docTopics, delimiter=",", fmt = "%s")