podcast data: each csv is an episode and each line is a sentence from the transcript

In [131]:
import pandas as pd

data = pd.read_csv('data/podcast1.csv', error_bad_lines=False);
documents = data[['headline']]

In [132]:
len(documents)

142

In [133]:
documents[:5]

Unnamed: 0,headline
0,Let's face.
1,It podcasting is a booming business.
2,I have information that I want the world to he...
3,Well good news.
4,I found the answer anchor anchor dot.


### Data Preprocessing

In [134]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(42)

In [135]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/harrywang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [136]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [137]:
%%time
processed_docs = documents['headline'].map(preprocess)

CPU times: user 46.5 ms, sys: 2.26 ms, total: 48.7 ms
Wall time: 70.5 ms


In [138]:
processed_docs[:10]

0                                               [face]
1                                [podcast, boom, busi]
2    [inform, want, world, hear, want, outrag, fee,...
3                                         [good, news]
4                             [answer, anchor, anchor]
5    [anchor, creat, content, distribut, free, worl...
6            [record, add, pay, play, listen, audienc]
7    [record, host, guest, music, bumper, track, im...
8    [want, creat, podcast, budget, expens, studio,...
9    [work, match, respons, listen, audienc, pay, p...
Name: headline, dtype: object

### Bag of words on the dataset

In [139]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [140]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 face
1 boom
2 busi
3 podcast
4 charg
5 distributor
6 fee
7 hear
8 inform
9 outrag
10 want


In [141]:
# skip this for podcast
# filtering tokens
#less than 15 documents (absolute number) or
#more than 0.5 documents (fraction of total corpus size, not absolute number).
#after the above two steps, keep only the first 100000 most frequent tokens.

#dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [142]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [143]:
bow_doc_2 = bow_corpus[2]

for i in range(len(bow_doc_2)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_2[i][0], 
                                                     dictionary[bow_doc_2[i][0]], 
                                                     bow_doc_2[i][1]))

Word 4 ("charg") appears 1 time.
Word 5 ("distributor") appears 1 time.
Word 6 ("fee") appears 1 time.
Word 7 ("hear") appears 1 time.
Word 8 ("inform") appears 1 time.
Word 9 ("outrag") appears 1 time.
Word 10 ("want") appears 2 time.
Word 11 ("world") appears 1 time.


### TF-IDF

In [144]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [145]:
corpus_tfidf = tfidf[bow_corpus]

In [146]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 1.0)]


### Running LDA using Bag of Words

In [147]:
%%time
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

CPU times: user 196 ms, sys: 39.6 ms, total: 235 ms
Wall time: 863 ms


In [148]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.022*"babylonian" + 0.020*"king" + 0.017*"string" + 0.015*"music" + 0.013*"languag" + 0.013*"sound" + 0.012*"nebuchadnezzar" + 0.012*"imag" + 0.012*"worship" + 0.012*"second"
Topic: 1 
Words: 0.020*"string" + 0.019*"note" + 0.017*"instrument" + 0.015*"like" + 0.012*"list" + 0.011*"music" + 0.011*"look" + 0.010*"liar" + 0.010*"play" + 0.010*"anchor"
Topic: 2 
Words: 0.022*"podcast" + 0.017*"music" + 0.015*"broadcast" + 0.013*"anchor" + 0.011*"note" + 0.011*"import" + 0.010*"cuneiform" + 0.010*"worship" + 0.010*"record" + 0.009*"time"
Topic: 3 
Words: 0.026*"music" + 0.012*"understand" + 0.012*"instrument" + 0.012*"number" + 0.011*"know" + 0.011*"imag" + 0.010*"chapter" + 0.010*"bibl" + 0.010*"king" + 0.009*"like"
Topic: 4 
Words: 0.019*"string" + 0.018*"instrument" + 0.018*"mode" + 0.015*"go" + 0.014*"copi" + 0.010*"fourth" + 0.010*"node" + 0.009*"tune" + 0.009*"like" + 0.008*"today"


Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [149]:
%%time
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

CPU times: user 139 ms, sys: 31 ms, total: 170 ms
Wall time: 473 ms


In [150]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"yeshua" + 0.008*"podcast" + 0.008*"anchor" + 0.007*"copi" + 0.007*"babylonian" + 0.006*"spirit" + 0.006*"tuna" + 0.006*"amen" + 0.006*"fourth" + 0.006*"shall"
Topic: 1 Word: 0.009*"string" + 0.008*"imag" + 0.008*"final" + 0.007*"fall" + 0.007*"worship" + 0.006*"fifth" + 0.006*"liar" + 0.006*"kadosh" + 0.006*"music" + 0.006*"akaka"
Topic: 2 Word: 0.009*"time" + 0.008*"understand" + 0.007*"opportun" + 0.007*"nation" + 0.007*"face" + 0.007*"gain" + 0.006*"broadcast" + 0.006*"donat" + 0.006*"radio" + 0.006*"scale"
Topic: 3 Word: 0.014*"manuscript" + 0.009*"mode" + 0.007*"like" + 0.007*"matter" + 0.007*"generat" + 0.006*"joyous" + 0.006*"thing" + 0.006*"triton" + 0.006*"follow" + 0.006*"note"
Topic: 4 Word: 0.008*"string" + 0.007*"strength" + 0.007*"nebuchadnezzar" + 0.007*"king" + 0.007*"look" + 0.006*"answer" + 0.006*"list" + 0.006*"beauti" + 0.006*"centuri" + 0.006*"think"


### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [151]:
processed_docs[2]

['inform',
 'want',
 'world',
 'hear',
 'want',
 'outrag',
 'fee',
 'distributor',
 'charg']

In [152]:
for index, score in sorted(lda_model[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9195331931114197	 
Topic: 0.020*"string" + 0.019*"note" + 0.017*"instrument" + 0.015*"like" + 0.012*"list" + 0.011*"music" + 0.011*"look" + 0.010*"liar" + 0.010*"play" + 0.010*"anchor"

Score: 0.02022414468228817	 
Topic: 0.022*"babylonian" + 0.020*"king" + 0.017*"string" + 0.015*"music" + 0.013*"languag" + 0.013*"sound" + 0.012*"nebuchadnezzar" + 0.012*"imag" + 0.012*"worship" + 0.012*"second"

Score: 0.02012956328690052	 
Topic: 0.022*"podcast" + 0.017*"music" + 0.015*"broadcast" + 0.013*"anchor" + 0.011*"note" + 0.011*"import" + 0.010*"cuneiform" + 0.010*"worship" + 0.010*"record" + 0.009*"time"

Score: 0.020068539306521416	 
Topic: 0.019*"string" + 0.018*"instrument" + 0.018*"mode" + 0.015*"go" + 0.014*"copi" + 0.010*"fourth" + 0.010*"node" + 0.009*"tune" + 0.009*"like" + 0.008*"today"

Score: 0.020044509321451187	 
Topic: 0.026*"music" + 0.012*"understand" + 0.012*"instrument" + 0.012*"number" + 0.011*"know" + 0.011*"imag" + 0.010*"chapter" + 0.010*"bibl" + 0.010*"king" 

Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [153]:
for index, score in sorted(lda_model_tfidf[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9195690155029297	 
Topic: 0.009*"string" + 0.008*"imag" + 0.008*"final" + 0.007*"fall" + 0.007*"worship" + 0.006*"fifth" + 0.006*"liar" + 0.006*"kadosh" + 0.006*"music" + 0.006*"akaka"

Score: 0.020143453031778336	 
Topic: 0.009*"time" + 0.008*"understand" + 0.007*"opportun" + 0.007*"nation" + 0.007*"face" + 0.007*"gain" + 0.006*"broadcast" + 0.006*"donat" + 0.006*"radio" + 0.006*"scale"

Score: 0.020101875066757202	 
Topic: 0.008*"string" + 0.007*"strength" + 0.007*"nebuchadnezzar" + 0.007*"king" + 0.007*"look" + 0.006*"answer" + 0.006*"list" + 0.006*"beauti" + 0.006*"centuri" + 0.006*"think"

Score: 0.0200973991304636	 
Topic: 0.009*"yeshua" + 0.008*"podcast" + 0.008*"anchor" + 0.007*"copi" + 0.007*"babylonian" + 0.006*"spirit" + 0.006*"tuna" + 0.006*"amen" + 0.006*"fourth" + 0.006*"shall"

Score: 0.020088206976652145	 
Topic: 0.014*"manuscript" + 0.009*"mode" + 0.007*"like" + 0.007*"matter" + 0.007*"generat" + 0.006*"joyous" + 0.006*"thing" + 0.006*"triton" + 0.006*"follow

Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [154]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.4102005958557129	 Topic: 0.022*"podcast" + 0.017*"music" + 0.015*"broadcast" + 0.013*"anchor" + 0.011*"note"
Score: 0.38971903920173645	 Topic: 0.020*"string" + 0.019*"note" + 0.017*"instrument" + 0.015*"like" + 0.012*"list"
Score: 0.06670007854700089	 Topic: 0.019*"string" + 0.018*"instrument" + 0.018*"mode" + 0.015*"go" + 0.014*"copi"
Score: 0.0666906014084816	 Topic: 0.026*"music" + 0.012*"understand" + 0.012*"instrument" + 0.012*"number" + 0.011*"know"
Score: 0.06668967008590698	 Topic: 0.022*"babylonian" + 0.020*"king" + 0.017*"string" + 0.015*"music" + 0.013*"languag"
