In [2]:
# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re

# Load the talks CSV into a DataFrame with explicit column labels, then pull
# the text column out as a plain Python list (one entry per row).
# NOTE(review): because `names` is given without `header`, pandas reads the
# first line of the file as data -- confirm the CSV has no header row.
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)
talks = df['text'].tolist()

# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

from nltk.tokenize import WhitespaceTokenizer

# Whitespace tokenizer, set up the same way as in the Stopwords notebook.
tokenizer = WhitespaceTokenizer()

# Stopword filtering and stemming are switched off in this notebook. To turn
# them back on, load the stop list from '../data/tt_stop.txt' (lower-cased and
# split on whitespace), drop listed tokens, and then stem what remains.

# For every talk: delete each run of characters that is not a word character,
# digit, apostrophe, or whitespace; lower-case the result; split on whitespace.
# `texts` ends up as one list of tokens per talk, in the same order as `talks`.
texts = [
    tokenizer.tokenize(re.sub(r"[^\w\d'\s]+", '', talk).lower())
    for talk in talks
]

# =-=-=-=-=-=-=-=-=-=-=
# Re-Assemble Texts as Strings from Lists of Words
# =-=-=-=-=-=-=-=-=-=-= 

# Join each token list back into a single space-separated string, because the
# vectorizers used later take raw strings rather than lists of tokens.
strungs = [' '.join(words) for words in texts]

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans

def find_topic(texts, topic_model, n_topics, vec_model="tf", thr=1e-2, **kwargs):
    """Return a printable list of topics found in texts - for demonstration on simple data.

    texts: array-like of strings, one per document
    topic_model: {"nmf", "svd", "lda", "kmeans"} for LSA_NMF, LSA_SVD, LDA, KMEANS (not actually a topic model)
    n_topics: # of topics in texts; passed as the first positional argument of the chosen model
    vec_model: "tf" for term_freq (CountVectorizer); any other value selects
        term_freq_inverse_doc_freq (TfidfVectorizer)
    thr: threshold for finding keywords in a topic model; a word is kept when the absolute
        value of its weight, after each topic row is scaled by the size of its largest
        entry, is >= thr
    **kwargs: forwarded unchanged to the model constructor

    Returns a single string with one line per topic, "Topic <i>: <word> | <word> | ...".
    Words whose weight is not positive are prefixed with "^".

    Raises KeyError if topic_model is not one of the four supported keys.
    """
    ## 1. vectorization
    vectorizer = CountVectorizer() if vec_model == "tf" else TfidfVectorizer()
    text_vec = vectorizer.fit_transform(texts)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back only on older releases that do not have it yet.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = np.asarray(vectorizer.get_feature_names_out())
    else:
        words = np.asarray(vectorizer.get_feature_names())
    ## 2. topic finding
    topic_models = {"nmf": NMF, "svd": TruncatedSVD, "lda": LatentDirichletAllocation, "kmeans": KMeans}
    topicfinder = topic_models[topic_model](n_topics, **kwargs).fit(text_vec)
    # Compare strings by value: `is not` tests object identity, which only works
    # when the interpreter happens to intern both strings.
    topic_dists = topicfinder.components_ if topic_model != "kmeans" else topicfinder.cluster_centers_
    # Scale every topic row by the magnitude of its largest entry.
    #  - abs() keeps the scale positive, so a row whose maximum is negative
    #    (possible with SVD) does not get all of its signs flipped;
    #  - an all-zero row is divided by 1 instead of 0, avoiding NaNs and warnings;
    #  - the division builds a new array, so the fitted model is left untouched.
    scale = np.abs(topic_dists.max(axis=1)).reshape((-1, 1))
    scale[scale == 0] = 1.0
    topic_dists = topic_dists / scale
    ## 3. keywords for topics
    ## Unlike other models, LSA_SVD will generate both positive and negative values in topic_word distribution,
    ## which makes it more ambiguous to choose keywords for topics. The sign of the weights are kept with the
    ## words for a demonstration here

    def _topic_keywords(topic_dist):
        # Boolean mask of the words whose scaled weight is large enough to report.
        keywords_index = np.abs(topic_dist) >= thr
        keywords_prefix = np.where(np.sign(topic_dist) > 0, "", "^")[keywords_index]
        return " | ".join(prefix + word for prefix, word in zip(keywords_prefix, words[keywords_index]))

    return "\n".join("Topic %i: %s" % (i, _topic_keywords(dist)) for i, dist in enumerate(topic_dists))

In [6]:
# Fit a 40-topic NMF model on TF-IDF weights and print one keyword line per topic.
nmf_topics = find_topic(strungs, topic_model="nmf", n_topics=40, vec_model="tfidf")
print(nmf_topics)

Topic 0: about | again | air | all | almost | always | an | and | any | anything | around | as | at | back | basically | be | because | been | before | big | bit | board | book | box | building | built | but | called | came | coming | could | couldn | couple | days | design | did | didn | dinosaurs | do | doesn | doing | don | done | down | end | everybody | feet | first | for | found | fun | get | gets | getting | go | goes | going | good | got | great | guess | guy | guys | had | happen | happened | have | hit | hours | important | internet | into | isn | it | its | itself | just | kind | know | later | like | line | little | ll | long | looked | lot | machine | made | make | me | mean | much | no | not | of | off | oh | on | one | out | over | page | pages | pretty | put | quite | really | right | room | said | same | saw | say | serious | she | shot | side | so | some | something | sort | started | stop | stuff | tail | the | them | then | there | they | thing | things | think | th