In [1]:
# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas

# Load the talks CSV, labelling its columns in file order, then pull the
# `text` column out as a plain Python list (one entry per row).
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_3.csv', names=colnames)
talks = df['text'].tolist()

# =-=-=-=-=-=-=-=-=-=-=
# Create citations to identify individual texts
# =-=-=-=-=-=-=-=-=-=-= 
# authors = df.author.tolist()
# dates = df.date.tolist()
# years = [re.sub('[A-Za-z ]', '', item) for item in dates]
# authordate = [author+" "+year for author, year in zip(authors, years)]

In [2]:
# =-=-=-=-=-=
# Clean, Tokenize, Drop Stopwords, Stem
# =-=-=-=-=-=

from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import *
from nltk.stem.porter import *

# `re` is used throughout this cell but is never imported explicitly in the
# notebook (presumably it leaked in through one of the wildcard nltk imports).
# Import it here so the cell also works on a fresh kernel.
import re

# Load tokenizer, stopwords, and stemmer
tokenizer = WhitespaceTokenizer()
# Read the stop-word file inside a context manager so the handle is closed.
with open('../data/tt_stop.txt', 'r') as stop_file:
    stopwords = re.split(r'\s+', stop_file.read().lower())
# Set copy used only for fast membership tests; `stopwords` itself stays a list.
stopword_set = frozenset(stopwords)
p_stemmer = PorterStemmer()

# Tokenize, drop stop words, and stem every talk.
# `texts` ends up as one list of stemmed tokens per entry in `talks`.
texts = []
for talk in talks:
    # Strip everything except word characters, digits, apostrophes and
    # whitespace, then lowercase and split on whitespace.
    raw = re.sub(r"[^\w\d'\s]+", '', talk).lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stopword_set]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)

# =-=-=-=-=-=-=-=-=-=-=
# Re-Assemble Texts as Strings from Lists of Words
# =-=-=-=-=-=-=-=-=-=-=

# One space-joined string per token list in `texts`, in the same order.
strungs = [' '.join(token_list) for token_list in texts]

In [3]:
# =-=-=-=-=-=
# Get NMF topics
# =-=-=-=-=-=

import sklearn.feature_extraction.text as sk_text
from sklearn.decomposition import NMF

# All our variables are here to make it easier to make adjustments
data_set = talks  # raw talk texts; presumably `strungs` is the alternative (stemmed, stop-worded)
n_samples = len(data_set)
n_features = 1500   # upper bound on vocabulary size passed to the vectorizer
n_topics = 35       # number of NMF components
n_top_words = 15    # terms shown per topic when printing
# tt_stopwords = open('../data/stopwords_tt.txt', 'r').read().splitlines()

# Get tf-idf features for NMF
vectorizer = sk_text.TfidfVectorizer(max_df = 0.90,
                                        min_df = 0.01,
                                        max_features = n_features)
tfidf = vectorizer.fit_transform(data_set)

# Announce the fit *before* running it, so the message works as a progress
# notice instead of appearing only after the model has already been fitted.
print("Fitting the NMF model with {} topics for {} documents with {} features."
      .format(n_topics, n_samples, n_features))

# Fit the NMF model
# NOTE(review): newer scikit-learn releases replace the `alpha` keyword with
# `alpha_W` / `alpha_H` -- confirm which version this notebook is pinned to.
nmf = NMF(n_components = n_topics,
          random_state = 1,
          alpha = 0.1,
          l1_ratio = 0.5).fit(tfidf)

Fitting the NMF model with 35 topics for 2068 documents with 1500 features.


In [4]:
# =-=-=-=-=-=
# Print the top words for each NMF topic
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` this prints a blank line and a
    ``Topic <n>:`` header, then a single line listing the ``n_top_words``
    largest entries of that row in descending order. Each entry is written as
    ``<term> <weight rounded to 2 places>, `` (the trailing ``, `` is kept
    after the last entry as well).

    Parameters:
        model: object exposing ``components_``, an iterable of 1-D arrays.
        feature_names: sequence of strings indexed by column position.
        n_top_words: how many terms to list per topic.
    """
    for topic_id, weights in enumerate(model.components_):
        print('\nTopic {}:'.format(int(topic_id)))
        # argsort() is ascending, so walk it backwards to get the largest first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        line = ''
        for idx in ranked:
            line += feature_names[idx] + ' ' + str(round(weights[idx], 2)) + ', '
        print(line)

# Vocabulary terms, indexed by tf-idf column position.
# Prefer get_feature_names_out(), which replaces get_feature_names() in newer
# scikit-learn releases; fall back to the old method when it is the only one
# available. Converted to a list so both paths yield the same type.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
#print(features)

print("Topics in NMF model:")

# The feature names come from the vectorizer (above), not from the NMF model:
# `nmf` has no vocabulary of its own.
print_top_words(nmf, features, n_top_words) #n_top_words can be changed on the fly

Topics in NMF model:

Topic 0:
going 0.79, really 0.71, very 0.69, think 0.68, here 0.65, your 0.65, know 0.64, get 0.61, see 0.6, things 0.58, ve 0.57, some 0.54, then 0.53, don 0.52, right 0.49, 

Topic 1:
she 3.09, her 2.17, said 0.22, mother 0.2, woman 0.2, girl 0.19, daughter 0.16, who 0.14, women 0.14, mom 0.1, their 0.1, husband 0.09, old 0.09, don 0.08, know 0.08, 

Topic 2:
africa 2.0, african 0.78, hiv 0.32, south 0.23, countries 0.19, aid 0.16, leaders 0.16, aids 0.12, farmers 0.09, here 0.09, world 0.08, country 0.08, very 0.07, has 0.06, east 0.06, 

Topic 3:
he 3.53, his 1.56, him 0.98, said 0.36, man 0.26, who 0.26, had 0.23, guy 0.2, himself 0.19, father 0.19, story 0.15, were 0.12, says 0.1, did 0.1, arm 0.1, 

Topic 4:
brain 2.46, neurons 0.53, brains 0.33, your 0.25, activity 0.16, mental 0.15, arm 0.15, memory 0.14, body 0.13, human 0.13, consciousness 0.11, behavior 0.1, mind 0.1, visual 0.1, pain 0.09, 

Topic 5:
universe 1.35, earth 0.55, planets 0.54, stars 0.53

In [None]:
# Dense array copy of the tf-idf matrix built in the NMF cell.
dtm = tfidf.toarray()
# NOTE(review): fit_transform() fits `nmf` again on `dtm` (it was already
# fitted on `tfidf` when it was created) and returns the per-document topic
# weights. Confirm the refit is intended rather than nmf.transform().
doctopic = nmf.fit_transform(dtm) # This is an array

# Display the dimensions of the dense matrix as the cell output.
dtm.shape

In [None]:
# Print every (topic index, component weight vector) pair of the fitted model.
for indexed_topic in enumerate(nmf.components_):
    print(indexed_topic)

In [None]:
# =-=-=-=-=-=
# Saving output to CSV
# =-=-=-=-=-=

import numpy as np

# `doctopic` is a plain array, so the quickest export is simply:
#      np.savetxt("foo.csv", doctopic, delimiter=",", fmt = "%s")
# http://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
#
# That export carries no row labels, so a labelled table is assembled below.

# Row vector [[0, 1, 2, ...]] of topic numbers with shape (1, n_topics).
# (np.indices((1, n_topics))[1] would produce the same thing.)
topsnum = np.arange(n_topics).reshape(1, n_topics)

# Header row: the label "citations" followed by the topic numbers.
fileheader = np.hstack((np.array([["citations"]]), topsnum))
# Author column wrapped as a single-row 2-D array; transposed below to a column.
authordate = np.array([df['author']])

# Labelled table: header row on top, then one row per document consisting of
# the author followed by that document's topic weights.
docTopics = np.vstack((fileheader, np.hstack((authordate.T, doctopic))))

# NOTE(review): the file written here contains the raw `doctopic` weights, not
# the labelled `docTopics` table built above -- confirm which one is intended.
np.savetxt("../data/dt_KK_test.csv", doctopic, delimiter=",", fmt = "%s")
#np.savetxt("../data/nmf_topics.csv", docTopics, delimiter=",", fmt = "%s")