In [2]:
# =-=-=-=-=-=-=-=-=-=-=
# Imports
# =-=-=-=-=-=-=-=-=-=-= 

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np


# =-=-=-=-=-=-=-=-=-=-=
# Inputs
# =-=-=-=-=-=-=-=-=-=-= 

# Create pandas dataframe & lists
# Column names are supplied explicitly via `names=`, so every row of the CSV
# (including the first) is read as data.
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)
talks = df.text.tolist()

# Import stoplist
# Read inside a context manager so the file handle is closed deterministically.
# str.split() with no argument splits on runs of whitespace and, unlike
# re.split on the same pattern, never yields empty strings for leading or
# trailing whitespace (e.g. a final newline in the file).
with open('../data/stopwords_2.txt', 'r') as stopword_file:
    stopwords = stopword_file.read().lower().split()


# =-=-=-=-=-=
# Drop Stopwords
# =-=-=-=-=-=

tokenizer = WhitespaceTokenizer()

# Membership tests against a list scan the whole list for every token;
# a set gives constant-time lookups with the same result.
stopword_set = set(stopwords)

# Loop to clean, tokenize, and drop stopwords from each talk.
# (No stemming is applied here.)
texts = []
for talk in talks:
    # clean and tokenize document string: remove every character that is not
    # a word character, an apostrophe, or whitespace, then lowercase
    raw = re.sub(r"[^\w\d'\s]+",'', talk).lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    passed = [token for token in tokens if token not in stopword_set]
    # add tokens to list
    texts.append(passed)

# Re-Assemble Texts as Strings from Lists of Words
strings = [' '.join(text) for text in texts]

In [12]:
# =-=-=-=-=-=
# NMF Topics
# =-=-=-=-=-=


# Settings for the TF-IDF vocabulary and for topic reporting
n_samples = len(strings)
# n_features = 3000
n_top_words = 15
max_percent = 0.85
min_percent = 0.05

# Build the TF-IDF document-term matrix from the stopword-filtered strings.
# max_df / min_df are passed as floats, i.e. as document-frequency proportions.
vectorizer = TfidfVectorizer(min_df=min_percent, max_df=max_percent)
tfidf = vectorizer.fit_transform(strings)

# Configure a 40-component NMF with a fixed seed, then fit it to the matrix.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `alpha_W` / `alpha_H` -- confirm the scikit-learn version this
# notebook is expected to run against.
model = NMF(n_components=40, l1_ratio=0.5, alpha=0.1, random_state=1)
model.fit(tfidf)

In [13]:
# Factorize the TF-IDF matrix: W is the transformed data returned by
# fit_transform, H is the fitted model's components_ array.
# NOTE(review): fit_transform() fits the model again although it was already
# fitted via .fit(tfidf) when it was created; the second fit looks redundant --
# confirm and consider fitting only once.
W = model.fit_transform(tfidf)
H = model.components_

In [14]:
# H is model.components_: one row per NMF component, one column per TF-IDF feature.
H.shape

(40, 1830)

In [15]:
# W is the fit_transform output: one row per row of tfidf, one column per NMF component.
W.shape

(2069, 40)

In [16]:
# tfidf is the vectorizer output for `strings`: one row per string, one column per retained term.
tfidf.shape

(2069, 1830)

In [17]:
# The document-topic matrix is W (one row per document, one column per topic);
# H is topics x terms and does not belong in this file.
np.savetxt("../nmf/doctopic_1.csv", W, delimiter=",", fmt = "%s")

In [18]:
# The topic-word matrix is H (one row per topic, one column per term);
# W is documents x topics and does not belong in this file.
np.savetxt("../nmf/topicword_1.csv", H, delimiter=",", fmt = "%s")

In [None]:
# =-=-=-=-=-=
# NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of each row of ``model.components_``.

    For every component a header line ``Topic <index>:`` (preceded by a blank
    line) is printed, followed by a single line listing the ``n_top_words``
    features with the largest weights in descending order. Each entry is
    written as ``<feature> <weight>, `` with the weight rounded to two
    decimals, so the line ends with a trailing ``, ``.

    Parameters
    ----------
    model : object exposing ``components_`` (iterable of 1-D weight arrays)
    feature_names : sequence of str, indexed by column position
    n_top_words : int, number of entries printed per component
    """
    for topic_id, weights in enumerate(model.components_):
        print('\nTopic {}:'.format(topic_id))
        # argsort is ascending; walk it backwards to take the largest weights first
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        entries = []
        for idx in top_indices:
            entries.append(feature_names[idx] + ' ' + str(round(weights[idx], 2)) + ', ')
        print(''.join(entries))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise. list() keeps `features` a plain list in
# both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

print("Topics in NMF model:")
print_top_words(model, features, n_top_words) #n_top_words can be changed on the fly