In [1]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re

# Create pandas dataframe
colnames = ['index', 'citation', 'author', 'gender', 'title', 'date' , 'length',
            'text','occupation','numDate','ValenceScore']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# Create lists of the data we want to work with
talks = df.text.tolist()
labels = df.citation.tolist()

In [2]:
# =-=-=-=-=-=-=-=-=-=-=
# Display Functions
# =-=-=-=-=-=-=-=-=-=-= 

def display_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "| {:d} | ".format(topic_idx)
        message += " ".join([feature_names[i] + ' ' + str(round(topic[i], 2)) + ','
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
# Both NMF **and** LDA produce two matrices: 
# H - words to topics
# W - topics to documents

def display_topics_HW(H, W, feature_names, documents, n_top_words, n_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:n_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])

In [4]:
# =-=-=-=-=-=-=-=-=-=-=
# Settings
# =-=-=-=-=-=-=-=-=-=-= 

n_topics = 50
n_features = 5000
n_top_words = 10
n_top_documents = 5

In [5]:
# =-=-=-=-=-=
# Generate LDA Model
# =-=-=-=-=-=

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

stopwords = re.split('\s+', open('../data/stopwords_all.txt', 'r').read().lower())

# LDA can only use raw term counts (virtual BoW)
tf_vectorizer = CountVectorizer(max_df = 0.95, 
                                min_df = 2, 
                                max_features = n_features, 
                                stop_words = stopwords)

tf = tf_vectorizer.fit_transform(talks)

# Optional code to safe TF array
# tf_array = tf.toarray()
# np.savetxt("../outputs/tentexts_tf.csv", tf_array.astype(np.int), fmt='%d', delimiter=",")
# print("A tf array of {} has been saved to CSV.".format(tf.shape))

lda = LatentDirichletAllocation(n_components = n_topics, 
                                max_iter = 20, 
                                learning_method = 'online', 
                                learning_offset = 50.,
                                random_state = 0)

lda.fit(tf)

AttributeError: 'numpy.float64' object has no attribute 'lower'

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Display the topics
# =-=-=-=-=-=-=-=-=-=-= 

tf_feature_names = tf_vectorizer.get_feature_names()
display_topics(lda, tf_feature_names, n_top_words)

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Create dataframes of TF, H, and W
# =-=-=-=-=-=-=-=-=-=-= 

# Create TF dataframe
df_tf = pd.DataFrame(data= tf_array, index = docs, columns = tf_feature_names)

# Uncomment to glimpse dataframe
# df_tf.head(10)

# Save TF dataframe to CSV file
df_tf.to_csv('../outputs/tf_frame.csv', sep=',')

# Get W (DTM) and H (WTM) arrays
lda_W = lda.transform(tf)
lda_H = lda.components_

df_lda_DTM = pd.DataFrame(data= lda_W, index = docs, columns = topic_labels)
df_lda_DTM.to_csv('../outputs/lda_DTM.csv', sep=',')
print(df_lda_DTM)

In [None]:
display_topics(nmf_H, nmf_W, tfidf_feature_names, talks, n_top_words, n_top_documents)