In [22]:
# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re

# Load the talks CSV into a DataFrame, supplying the column labels ourselves.
# NOTE(review): with `names=` and no `header=` argument, pandas treats the
# first line of the file as data -- confirm the CSV has no header row.
colnames = [
    'author',
    'title',
    'date',
    'length',
    'text',
]
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# One list entry per row, taken from the 'text' column.
talks = df['text'].tolist()


# =-=-=-=-=-=-=-=-=-=-=
# Create citations to identify individual texts
# =-=-=-=-=-=-=-=-=-=-= 

# authors = df.author.tolist()
# dates = df.date.tolist()
# years = [re.sub('[A-Za-z ]', '', item) for item in dates]
# authordate = [author+" "+year for author, year in zip(authors, years)]

In [25]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

from nltk.tokenize import WhitespaceTokenizer

# Whitespace-based tokenizer (approach carried over from the Stopwords Notebook).
tokenizer = WhitespaceTokenizer()

# Clean and tokenize each talk:
#   1. delete every run of characters that is not a word character, digit,
#      apostrophe or whitespace,
#   2. lowercase the result,
#   3. split it into tokens on whitespace.
# Stop-word removal and stemming are not performed in this cell (the earlier
# attempts at both were commented out).
texts = [
    tokenizer.tokenize(re.sub(r"[^\w\d'\s]+", '', talk).lower())
    for talk in talks
]

# =-=-=-=-=-=-=-=-=-=-=
# Re-Assemble Texts as Strings from Lists of Words
# =-=-=-=-=-=-=-=-=-=-= 

# One space-joined string per talk.
strungs = [' '.join(tokens) for tokens in texts]

In [28]:
# =-=-=-=-=-=
# Get NMF topics
# =-=-=-=-=-=

import sklearn.feature_extraction.text as sk_text
from sklearn.decomposition import NMF

# Tunable parameters, gathered in one place so they are easy to adjust.
n_samples = len(strungs)   # number of documents
n_features = 3000          # upper bound on vocabulary size (max_features)
n_topics = 40              # number of NMF components
n_top_words = 15           # words listed per topic when printing
max_percent = 0.85         # max_df: drop terms found in more than this share of documents
min_percent = 0.05         # min_df: drop terms found in less than this share of documents

# Custom stop-word list, one entry per line. The `with` block makes sure the
# file handle is closed as soon as the list has been read.
with open('../data/stopwords-tt-20170523.txt', 'r') as stopword_file:
    tt_stopwords = stopword_file.read().splitlines()

# Get tf-idf features for NMF
vectorizer = sk_text.TfidfVectorizer(max_df = max_percent, 
                                     min_df = min_percent,
                                     max_features = n_features,
                                     stop_words = tt_stopwords)
tfidf = vectorizer.fit_transform(strungs)

# Announce the fit before it starts. The reported feature count is the
# max_features cap; the fitted vocabulary may be smaller after the
# max_df/min_df filtering.
print("Fitting the NMF model with {} topics for {} documents with {} features."
      .format(n_topics, n_samples, n_features))

# Fit the NMF model
# NOTE(review): the `alpha` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`, whose scaling differs;
# it is left unchanged here so results stay comparable on the original version.
nmf = NMF(n_components = n_topics,
          random_state = 1,
          alpha = 0.1,
          l1_ratio = 0.5).fit(tfidf)

Fitting the NMF model with 40 topics for 2069 documents with 3000 features.


In [29]:
# =-=-=-=-=-=
# Get NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row holds one topic's per-term weights and
        must support ``argsort()`` and integer indexing.
    feature_names : sequence of str
        Term labels, indexed in the same order as the columns of each row.
    n_top_words : int
        How many terms to list per topic.

    For each topic this prints a blank line, a ``Topic <id>:`` heading, and
    then one line of ``<term> <weight rounded to 2 places>, `` entries in
    descending weight order (each entry, including the last, ends in ", ").
    """
    for topic_id, weights in enumerate(model.components_):
        print('\nTopic {}:'.format(int(topic_id)))
        # argsort() is ascending, so a reversed slice of its tail yields the
        # indices of the n_top_words largest weights, biggest first.
        strongest = weights.argsort()[:-n_top_words - 1:-1]
        entries = [
            feature_names[idx] + ' ' + str(round(weights[idx], 2)) + ', '
            for idx in strongest
        ]
        print(''.join(entries))

# Vocabulary terms in column order of the tf-idf matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it and
# fall back to the old method otherwise. The result is converted to a list so
# `features` has the same type under either API.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
# print(features)

print("Topics in NMF model:")
print_top_words(nmf, features, n_top_words) #n_top_words can be changed on the fly

Topics in NMF model:

Topic 0:
different 0.4, ll 0.39, years 0.39, let 0.38, back 0.36, idea 0.34, need 0.33, human 0.3, show 0.3, great 0.3, three 0.3, able 0.29, called 0.29, life 0.29, sort 0.29, 

Topic 1:
children 2.09, child 0.95, babies 0.39, parents 0.38, family 0.32, families 0.29, mother 0.2, baby 0.16, schools 0.16, boy 0.15, countries 0.14, years 0.09, old 0.09, school 0.09, adults 0.09, 

Topic 2:
war 0.7, democracy 0.68, political 0.51, government 0.51, country 0.41, power 0.34, rights 0.34, peace 0.31, states 0.3, security 0.28, civil 0.28, citizens 0.28, politics 0.28, countries 0.26, global 0.26, 

Topic 3:
cells 2.23, cell 0.73, body 0.29, drug 0.2, disease 0.18, blood 0.16, lab 0.15, drugs 0.14, diseases 0.12, skin 0.12, grow 0.12, heart 0.11, animal 0.1, patient 0.1, material 0.1, 

Topic 4:
women 2.61, men 1.04, woman 0.44, sex 0.3, female 0.28, gender 0.25, male 0.18, equal 0.11, girls 0.1, young 0.1, husband 0.1, rights 0.1, media 0.09, boys 0.09, stories 0.08, 


In [30]:
# Dense copy of the tf-idf matrix produced by the vectorizer.
dtm = tfidf.toarray()
# Document-topic weights: one row per document, one column per NMF component.
# NOTE(review): fit_transform() fits `nmf` a second time (it was already
# fitted on `tfidf` when it was created); nmf.transform(tfidf) would reuse the
# existing fit, but may give slightly different numbers -- confirm before changing.
doctopic = nmf.fit_transform(dtm) # This is an array

# features = vectorizer.get_feature_names() # This is already done above.

# =-=-=-=-=-=-=-=-=-=-=
# Unused functionality
# =-=-=-=-=-=-=-=-=-=-= 
# print(features)
# for topicidx in enumerate(nmf.components_):
#     print(topicidx)

In [32]:
# =-=-=-=-=-=
# Saving output to CSV
# =-=-=-=-=-=

import numpy as np

# DOCTOPIC is a plain array, so the quickest export is simply:
#      np.savetxt("foo.csv", doctopic, delimiter=",", fmt = "%s")
# http://stackoverflow.com/questions/6081008/dump-a-numpy-array-into-a-csv-file
#
# That export carries no row or column labels, so a labelled table is also
# assembled below.

# Topic numbers 0 .. n_topics-1 as a single row, i.e. [[0, 1, 2, 3, ...]] with
# shape (1, n_topics). np.indices((1, n_topics))[1] builds the same array.
topsnum = np.array([list(range(n_topics))])

# Header row: the label "citations" followed by the topic numbers.
fileheader = np.hstack((np.array([["citations"]]), topsnum))

# Author names as one row; transposed below so they form the first column.
authordate = np.array([df.author])

# Labelled table: author column glued to the left of the document-topic
# weights, with the header row stacked on top.
docTopics = np.vstack((fileheader, np.hstack((authordate.T, doctopic))))

# NOTE(review): the file written here is the unlabelled `doctopic` array, not
# the labelled `docTopics` table built above -- confirm which one is intended.
np.savetxt("../outputs/nmf_topics_20170523.csv", doctopic, delimiter=",", fmt = "%s")
#np.savetxt("../data/nmf_topics.csv", docTopics, delimiter=",", fmt = "%s")