In [1]:
# TEDtalks: Topics with LDA

# =-=-=-=-=-=
# Read CSV into DataFrame and then create lists
# =-=-=-=-=-=

import pandas
import re


# Load the talks CSV, supplying the column names explicitly
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_3a.csv', names=colnames)

# Pull the columns used below out as plain Python lists
talks = df['text'].tolist()
authors = df['author'].tolist()
dates = df['date'].tolist()

# Remove letters and spaces from each date string; this is meant to
# leave only the year
years = [re.sub('[A-Za-z ]', '', date) for date in dates]

# Build an "author year" label for each talk, for citation
authordate = [" ".join((author, year)) for author, year in zip(authors, years)]

# Sanity check that the lists are still aligned: pair each citation
# with its text in a new dataframe.
citations = pandas.DataFrame({'citation': authordate, 'text': talks})

# This just shows that the citation and the text are paired correctly.
# citations.head()

In [8]:
# =-=-=-=-=-=-=-=-=-=-=
# Settings & Display Functions
# =-=-=-=-=-=-=-=-=-=-= 

n_topics = 40         # number of topics (components) the model is fit with
n_features = 5000     # cap on the vocabulary size given to the vectorizer
n_top_words = 10      # words shown per topic by the display functions
n_top_documents = 5   # documents shown per topic by display_topics_HW


# Read the stopword file as a lower-cased, whitespace-separated word list.
# The context manager closes the file handle, and str.split() with no
# arguments splits on any run of whitespace without producing empty strings
# when the file starts or ends with whitespace (e.g. a trailing newline).
with open('../data/stopwords_all.txt', 'r') as stopword_file:
    stopwords = stopword_file.read().lower().split()

def display_topics(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted features.

    Each line has the form ``| <topic index> | <feature> <weight>, ...``
    where the weight is rounded to two decimals. A blank line is printed
    after the last topic.

    Parameters
    ----------
    model : object exposing ``components_``, iterated one row per topic;
        each row must support ``argsort()`` and integer indexing.
    feature_names : sequence of str, indexed by feature position.
    n_top_words : int, number of features to show per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so walk the last n_top_words indices backwards
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        entries = []
        for i in top_indices:
            entries.append(feature_names[i] + ' ' + str(round(weights[i], 2)) + ',')
        print("| {:d} | ".format(topic_idx) + " ".join(entries))
    print()
    
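# Both NMF **and** LDA produce two matrices:
# H - topic-to-word weights (one row per topic, one column per feature)
# W - document-to-topic weights (one row per document, one column per topic)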

def display_topics_HW(H, W, feature_names, documents, n_top_words, n_top_documents):
    """Print each topic's top words followed by its top documents.

    Parameters
    ----------
    H : iterated one row per topic; each row must support ``argsort()``.
    W : 2-D array indexed as ``W[:, topic_idx]`` to get every document's
        weight on a topic.
    feature_names : sequence of str, indexed by feature position.
    documents : sequence indexed by document position; the selected
        entries are printed as-is.
    n_top_words : int, number of words to show per topic.
    n_top_documents : int, number of documents to show per topic.
    """
    for topic_idx, word_weights in enumerate(H):
        print("Topic {}:".format(topic_idx))

        # argsort() is ascending, so take the last n_top_words indices reversed
        top_words = [feature_names[i]
                     for i in word_weights.argsort()[:-n_top_words - 1:-1]]
        print(" ".join(top_words))

        # Rank documents by their weight on this topic, highest first
        ranked_docs = np.argsort(W[:, topic_idx])[::-1]
        for doc_index in ranked_docs[0:n_top_documents]:
            print(documents[doc_index])

In [6]:
# =-=-=-=-=-=
# Generate LDA Model
# =-=-=-=-=-=

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# LDA works on raw term counts, so vectorize the talks into a
# bag-of-words count matrix.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words=stopwords,
)
tf = tf_vectorizer.fit_transform(talks)

# Optional code to save the TF array
# tf_array = tf.toarray()
# np.savetxt("../outputs/tentexts_tf.csv", tf_array.astype(np.int), fmt='%d', delimiter=",")
# print("A tf array of {} has been saved to CSV.".format(tf.shape))

# Fit the topic model; random_state is fixed so reruns are repeatable
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)

lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_components=40, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [9]:
# =-=-=-=-=-=-=-=-=-=-=
# Display the topics
# =-=-=-=-=-=-=-=-=-=-= 

# Vocabulary terms in column order of the term-count matrix.
# get_feature_names() was removed from scikit-learn in 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back to the old
# one so the cell runs on either side of that change.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
display_topics(lda, tf_feature_names, n_top_words)

| 0 | just 0.03, people 0.03, think 0.03, time 0.03, know 0.03, million 0.03, said 0.03, percent 0.03, fish 0.03, dollars 0.03,
| 1 | energy 910.07, power 656.41, nuclear 318.98, electricity 211.46, wind 174.49, fuel 160.54, battery 157.58, coal 146.68, carbon 104.94, cell 58.55,
| 2 | energy 0.03, earth 0.03, move 0.03, years 0.03, climate 0.03, year 0.03, warming 0.03, co 0.03, time 0.03, let 0.03,
| 3 | century 319.58, th 259.33, religion 212.15, book 189.74, history 187.98, images 169.24, great 165.47, iran 153.75, war 139.71, east 138.46,
| 4 | water 1056.35, air 297.12, map 203.32, plant 177.61, sand 166.56, see 164.58, environment 163.47, green 150.67, mushroom 130.52, temperature 124.68,
| 5 | data 1111.57, information 1000.88, internet 806.08, people 638.91, computer 539.53, just 525.34, technology 505.28, system 447.64, phone 412.28, government 401.27,
| 6 | see 1012.47, space 966.87, earth 918.17, universe 866.99, light 702.06, life 633.89, science 564.03, years 512.29, plan

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Create dataframes of TF, H, and W
# =-=-=-=-=-=-=-=-=-=-= 

# Dense copy of the sparse term-count matrix (documents x features)
tf_array = tf.toarray()

# Row labels for the document-indexed frames.
# NOTE(review): assumes the "author year" citations are the intended
# document labels -- confirm.
docs = authordate

# Create TF dataframe
df_tf = pandas.DataFrame(data=tf_array, index=docs, columns=tf_feature_names)

# Uncomment to glimpse dataframe
# df_tf.head(10)

# Save TF dataframe to CSV file
df_tf.to_csv('../outputs/tf_frame.csv', sep=',')

# Get W (DTM) and H (WTM) arrays
lda_W = lda.transform(tf)
lda_H = lda.components_

# One column label per fitted topic, taken from the model itself so the
# labels always match the number of columns in lda_W
topic_labels = ["Topic {}".format(i) for i in range(lda_H.shape[0])]

df_lda_DTM = pandas.DataFrame(data=lda_W, index=docs, columns=topic_labels)
df_lda_DTM.to_csv('../outputs/lda_DTM.csv', sep=',')
print(df_lda_DTM)

In [None]:
# Show each topic's top words followed by its top documents.
# display_topics() accepts only (model, feature_names, n_top_words); the
# six-argument form is display_topics_HW(). The matrices come straight from
# the fitted LDA model: components_ is H, transform(tf) is W.
display_topics_HW(lda.components_, lda.transform(tf), tf_feature_names,
                  talks, n_top_words, n_top_documents)