In [1]:
#! /usr/bin/env python

import pandas
import re
import sklearn.feature_extraction.text as sktext 
from sklearn.decomposition import NMF
import numpy as np


# Load the talks table, supplying the column names explicitly through
# `names=`, and keep the `text` column as a plain Python list for vectorizing.
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)
talks = df['text'].tolist()

# Import stoplist
# The file is read inside a context manager so the handle is closed promptly.
# str.split() with no argument splits on any run of whitespace and never
# yields empty strings, so leading/trailing blank lines in the file cannot
# put a '' entry into the stop-word list.
with open('../data/stopwords_2.txt', 'r') as stopword_file:
    stopwords = stopword_file.read().lower().split()

# TFIDF parameters
# n_top_words: how many terms per topic are requested from print_top_words.
n_top_words = 15
# max_percent / min_percent are handed to TfidfVectorizer as max_df / min_df.
max_percent = 0.85
min_percent = 0.01 # One percent = 20 talks (so not enough to warrant a topic?)

# Build the TF-IDF matrix from the talk texts.
# Tokens are lowercased, the custom stop list is applied, and the
# max_df / min_df thresholds come from the parameters defined above.
vectorizer = sktext.TfidfVectorizer(
    lowercase=True,
    stop_words=stopwords,
    max_df=max_percent,
    min_df=min_percent,
)
td_matrix = vectorizer.fit_transform(talks)
print(td_matrix.shape)

(2069, 6950)


In [2]:
# Show the vectorizer's configuration; the printed form lists its constructor
# parameters, including the (truncated) stop-word list.
print(vectorizer)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['a', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterward', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anyth...', 'god', 'try', 'stuff', 'please', 'little', 're', 'audience', 'hold', 'good', 'say', 'don', 'put'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)


In [3]:
# Fit NMF
# Configure the factorization once and fit it exactly once: fit_transform()
# both fits the model and returns the transformed data, so a separate
# .fit() call beforehand would only repeat the same computation.
# NOTE(review): the recorded topic listing further down shows a weight of 0.0
# for every term; the alpha / l1_ratio penalty may be strong enough to zero
# out the factors -- confirm before relying on these topics.
model = NMF(n_components=40,
            random_state=1,
            alpha=0.5,
            l1_ratio=0.5)

# W: transformed input (recorded shape 2069 x 40).
W = model.fit_transform(td_matrix)
# H: the fitted components (recorded shape 40 x 6950).
H = model.components_
print(model, W.shape, H.shape)

NMF(alpha=0.5, beta=1, eta=0.1, init=None, l1_ratio=0.5, max_iter=200,
  n_components=40, nls_max_iter=2000, random_state=1, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0) (2069, 40) (40, 6950)


In [None]:
# Persist both NMF factors as comma-separated text.
# NOTE(review): H (model.components_) goes to the "-dtm" file and W (the
# fit_transform output) to the "-twm" file. If those suffixes stand for
# doc-topic / topic-word matrix the two names are swapped -- confirm.
np.savetxt(fname="8505-40-1-01-05-dtm.csv", X=H, fmt="%s", delimiter=",")
np.savetxt(fname="8505-40-1-01-05-twm.csv", X=W, fmt="%s", delimiter=",")

In [4]:
# =-=-=-=-=-=
# NMF printing
# =-=-=-=-=-=

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every component of ``model``.

    For each row of ``model.components_`` a blank line and a ``Topic <k>:``
    header are printed, followed by one line containing the ``n_top_words``
    largest entries of that row, each rendered as
    ``<term> <weight rounded to 2 places>, `` (note the trailing ", ").

    Parameters
    ----------
    model : object with a ``components_`` attribute that iterates over 1-D
        arrays supporting ``argsort()``.
    feature_names : sequence indexable by column position, giving the term
        string for each column of ``components_``.
    n_top_words : int
        Number of terms to list per topic.
    """
    for topic_id, weights in enumerate(model.components_):
        print('\nTopic {}:'.format(int(topic_id)))
        # Indices sorted ascending by weight, reversed, then cut to the top n.
        ranked = weights.argsort()[::-1][:n_top_words]
        pieces = []
        for idx in ranked:
            pieces.append(feature_names[idx] + ' ' + str(round(weights[idx], 2)) + ', ')
        print(''.join(pieces))

# Vocabulary of the fitted vectorizer; print_top_words indexes into it with
# column positions taken from model.components_.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
features = vectorizer.get_feature_names()

print("Topics in NMF model:")
# n_top_words can be changed on the fly
print_top_words(model=model, feature_names=features, n_top_words=n_top_words)

Topics in NMF model:

Topic 0:
zoom 0.0, facilities 0.0, extremely 0.0, extremes 0.0, eye 0.0, eyes 0.0, fabric 0.0, fabulous 0.0, face 0.0, facebook 0.0, faced 0.0, faces 0.0, facial 0.0, facility 0.0, exploring 0.0, 

Topic 1:
zoom 0.0, facilities 0.0, extremely 0.0, extremes 0.0, eye 0.0, eyes 0.0, fabric 0.0, fabulous 0.0, face 0.0, facebook 0.0, faced 0.0, faces 0.0, facial 0.0, facility 0.0, exploring 0.0, 

Topic 2:
zoom 0.0, facilities 0.0, extremely 0.0, extremes 0.0, eye 0.0, eyes 0.0, fabric 0.0, fabulous 0.0, face 0.0, facebook 0.0, faced 0.0, faces 0.0, facial 0.0, facility 0.0, exploring 0.0, 

Topic 3:
zoom 0.0, facilities 0.0, extremely 0.0, extremes 0.0, eye 0.0, eyes 0.0, fabric 0.0, fabulous 0.0, face 0.0, facebook 0.0, faced 0.0, faces 0.0, facial 0.0, facility 0.0, exploring 0.0, 

Topic 4:
zoom 0.0, facilities 0.0, extremely 0.0, extremes 0.0, eye 0.0, eyes 0.0, fabric 0.0, fabulous 0.0, face 0.0, facebook 0.0, faced 0.0, faces 0.0, facial 0.0, facility 0.0, explo

In [None]:
import csv

# Write one tab-separated row per topic: the topic id in the first column,
# then its n_top_words highest-weighted terms, each as "<term> <weight>".
# The row is built as a list and handed to writerow(); print() returns None
# and therefore cannot be used to produce the row. Terms are looked up in
# `features`, the vocabulary list defined in the topic-printing cell.
# newline='' follows the csv module's guidance so the writer alone controls
# line endings.
with open('Topics.csv', 'w', newline='') as my_file:
    writer = csv.writer(my_file, delimiter='\t', lineterminator='\n')
    for topic_id, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_terms = [features[i] + ' ' + str(round(topic[i], 2))
                     for i in top_indices]
        writer.writerow([topic_id] + top_terms)

        