podcast data: each csv is an episode and each line is a sentence from the transcript

In [96]:
import pandas as pd

data = pd.read_csv('data/podcast1.csv', error_bad_lines=False);
documents = data[['headline']]

In [97]:
len(documents)

142

In [98]:
documents[:5]

Unnamed: 0,headline
0,Let's face.
1,It podcasting is a booming business.
2,I have information that I want the world to he...
3,Well good news.
4,I found the answer anchor anchor dot.


### Data Preprocessing

In [99]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
#np.random.seed(42)

In [100]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/harrywang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [101]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [102]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [103]:
%%time
processed_docs = documents['headline'].map(preprocess)

CPU times: user 40.7 ms, sys: 2.84 ms, total: 43.5 ms
Wall time: 71.6 ms


In [104]:
processed_docs[:10]

0                                               [face]
1                                [podcast, boom, busi]
2    [inform, want, world, hear, want, outrag, fee,...
3                                         [good, news]
4                             [answer, anchor, anchor]
5    [anchor, creat, content, distribut, free, worl...
6            [record, add, pay, play, listen, audienc]
7    [record, host, guest, music, bumper, track, im...
8    [want, creat, podcast, budget, expens, studio,...
9    [work, match, respons, listen, audienc, pay, p...
Name: headline, dtype: object

### Bag of words on the dataset

In [105]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [106]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 face
1 boom
2 busi
3 podcast
4 charg
5 distributor
6 fee
7 hear
8 inform
9 outrag
10 want


In [107]:
# skip this for podcast
# filtering tokens
#less than 15 documents (absolute number) or
#more than 0.5 documents (fraction of total corpus size, not absolute number).
#after the above two steps, keep only the first 100000 most frequent tokens.

#dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [108]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [109]:
bow_doc_2 = bow_corpus[2]

for i in range(len(bow_doc_2)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_2[i][0], 
                                                     dictionary[bow_doc_2[i][0]], 
                                                     bow_doc_2[i][1]))

Word 4 ("charg") appears 1 time.
Word 5 ("distributor") appears 1 time.
Word 6 ("fee") appears 1 time.
Word 7 ("hear") appears 1 time.
Word 8 ("inform") appears 1 time.
Word 9 ("outrag") appears 1 time.
Word 10 ("want") appears 2 time.
Word 11 ("world") appears 1 time.


### TF-IDF

In [110]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [111]:
corpus_tfidf = tfidf[bow_corpus]

In [112]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 1.0)]


### Running LDA using Bag of Words

In [113]:
%%time
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

CPU times: user 128 ms, sys: 24.1 ms, total: 152 ms
Wall time: 546 ms


In [114]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.018*"music" + 0.015*"copi" + 0.013*"understand" + 0.012*"mode" + 0.012*"nebuchadnezzar" + 0.012*"king" + 0.011*"bibl" + 0.010*"languag" + 0.009*"instrument" + 0.009*"babylonian"
Topic: 1 
Words: 0.020*"music" + 0.016*"podcast" + 0.016*"anchor" + 0.015*"worship" + 0.013*"king" + 0.013*"fall" + 0.012*"shall" + 0.012*"nebuchadnezzar" + 0.012*"sound" + 0.011*"time"
Topic: 2 
Words: 0.025*"instrument" + 0.023*"like" + 0.016*"music" + 0.011*"scholar" + 0.010*"liar" + 0.010*"text" + 0.010*"guitar" + 0.010*"tablet" + 0.008*"king" + 0.008*"string"
Topic: 3 
Words: 0.047*"string" + 0.020*"number" + 0.016*"music" + 0.013*"manuscript" + 0.012*"second" + 0.011*"fourth" + 0.010*"come" + 0.008*"call" + 0.008*"tablet" + 0.008*"podcast"
Topic: 4 
Words: 0.024*"note" + 0.014*"mode" + 0.013*"tune" + 0.010*"music" + 0.009*"time" + 0.009*"understand" + 0.009*"go" + 0.009*"list" + 0.009*"seven" + 0.009*"worship"


Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [115]:
%%time
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

CPU times: user 128 ms, sys: 27.4 ms, total: 156 ms
Wall time: 473 ms


In [116]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"time" + 0.009*"babylonian" + 0.007*"add" + 0.006*"gain" + 0.006*"face" + 0.006*"amen" + 0.006*"tablet" + 0.006*"note" + 0.006*"nebuchadnezzar" + 0.006*"beauti"
Topic: 1 Word: 0.010*"nebuchadnezzar" + 0.010*"king" + 0.009*"imag" + 0.008*"understand" + 0.007*"worship" + 0.007*"nation" + 0.007*"holi" + 0.007*"answer" + 0.007*"string" + 0.006*"fifth"
Topic: 2 Word: 0.010*"fourth" + 0.009*"anchor" + 0.008*"deliv" + 0.007*"yeshua" + 0.006*"string" + 0.006*"joyous" + 0.006*"tablet" + 0.006*"composit" + 0.006*"text" + 0.005*"hand"
Topic: 3 Word: 0.010*"manuscript" + 0.010*"like" + 0.008*"note" + 0.007*"instrument" + 0.007*"triton" + 0.006*"generat" + 0.006*"serv" + 0.006*"worship" + 0.005*"fifth" + 0.005*"choos"
Topic: 4 Word: 0.008*"kadosh" + 0.008*"tuna" + 0.006*"follow" + 0.006*"millennium" + 0.006*"final" + 0.006*"broadcast" + 0.006*"list" + 0.005*"listen" + 0.005*"mode" + 0.005*"like"


### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [117]:
processed_docs[2]

['inform',
 'want',
 'world',
 'hear',
 'want',
 'outrag',
 'fee',
 'distributor',
 'charg']

In [118]:
for index, score in sorted(lda_model[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9195420742034912	 
Topic: 0.025*"instrument" + 0.023*"like" + 0.016*"music" + 0.011*"scholar" + 0.010*"liar" + 0.010*"text" + 0.010*"guitar" + 0.010*"tablet" + 0.008*"king" + 0.008*"string"

Score: 0.020303118973970413	 
Topic: 0.020*"music" + 0.016*"podcast" + 0.016*"anchor" + 0.015*"worship" + 0.013*"king" + 0.013*"fall" + 0.012*"shall" + 0.012*"nebuchadnezzar" + 0.012*"sound" + 0.011*"time"

Score: 0.020136237144470215	 
Topic: 0.018*"music" + 0.015*"copi" + 0.013*"understand" + 0.012*"mode" + 0.012*"nebuchadnezzar" + 0.012*"king" + 0.011*"bibl" + 0.010*"languag" + 0.009*"instrument" + 0.009*"babylonian"

Score: 0.020012253895401955	 
Topic: 0.024*"note" + 0.014*"mode" + 0.013*"tune" + 0.010*"music" + 0.009*"time" + 0.009*"understand" + 0.009*"go" + 0.009*"list" + 0.009*"seven" + 0.009*"worship"

Score: 0.020006300881505013	 
Topic: 0.047*"string" + 0.020*"number" + 0.016*"music" + 0.013*"manuscript" + 0.012*"second" + 0.011*"fourth" + 0.010*"come" + 0.008*"call" + 0.008*"

Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [119]:
for index, score in sorted(lda_model_tfidf[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9197124242782593	 
Topic: 0.008*"kadosh" + 0.008*"tuna" + 0.006*"follow" + 0.006*"millennium" + 0.006*"final" + 0.006*"broadcast" + 0.006*"list" + 0.005*"listen" + 0.005*"mode" + 0.005*"like"

Score: 0.020113665610551834	 
Topic: 0.010*"fourth" + 0.009*"anchor" + 0.008*"deliv" + 0.007*"yeshua" + 0.006*"string" + 0.006*"joyous" + 0.006*"tablet" + 0.006*"composit" + 0.006*"text" + 0.005*"hand"

Score: 0.020106647163629532	 
Topic: 0.009*"time" + 0.009*"babylonian" + 0.007*"add" + 0.006*"gain" + 0.006*"face" + 0.006*"amen" + 0.006*"tablet" + 0.006*"note" + 0.006*"nebuchadnezzar" + 0.006*"beauti"

Score: 0.020052103325724602	 
Topic: 0.010*"nebuchadnezzar" + 0.010*"king" + 0.009*"imag" + 0.008*"understand" + 0.007*"worship" + 0.007*"nation" + 0.007*"holi" + 0.007*"answer" + 0.007*"string" + 0.006*"fifth"

Score: 0.020015127956867218	 
Topic: 0.010*"manuscript" + 0.010*"like" + 0.008*"note" + 0.007*"instrument" + 0.007*"triton" + 0.006*"generat" + 0.006*"serv" + 0.006*"worship" + 

Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [120]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.3993603587150574	 Topic: 0.024*"note" + 0.014*"mode" + 0.013*"tune" + 0.010*"music" + 0.009*"time"
Score: 0.3985576927661896	 Topic: 0.047*"string" + 0.020*"number" + 0.016*"music" + 0.013*"manuscript" + 0.012*"second"
Score: 0.06868214905261993	 Topic: 0.025*"instrument" + 0.023*"like" + 0.016*"music" + 0.011*"scholar" + 0.010*"liar"
Score: 0.06670212745666504	 Topic: 0.018*"music" + 0.015*"copi" + 0.013*"understand" + 0.012*"mode" + 0.012*"nebuchadnezzar"
Score: 0.06669768691062927	 Topic: 0.020*"music" + 0.016*"podcast" + 0.016*"anchor" + 0.015*"worship" + 0.013*"king"
