In [27]:
### Podcast Micro-Categories
### Unsupervised LDA

from __future__ import print_function
from __future__ import division

import gc
import os
import time

import numpy as np
import pandas as pd
import scipy

import lda


In [114]:
# Toggle between the full dataset and the sampled one.
# Keep the boolean flag and the filename suffix under separate names so
# `samp` is always a string (it is concatenated into file paths below).
USE_SAMPLE = False
samp = '_samp' if USE_SAMPLE else ''

In [115]:
# I. LOAD ---------------------------------------------------

# Load TDM: rebuild the sparse CSR matrix from its saved components.
# np.load on an .npz returns an NpzFile that keeps the archive open, so use it
# as a context manager to release the file handle once the arrays are read.
with np.load('../interim/028_preproc_heavy_tdm' + samp + '.npz') as loader:
    tdm = scipy.sparse.csr_matrix(
        (loader['data'], loader['indices'], loader['indptr']),
        shape=loader['shape'])

# Load feature names (one per TDM column; enforced by the assert below)
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
feature_names = np.array(pd.read_pickle('../interim/028_preproc_heavy_names' + samp + '.p'))

# Load show names and subgenres (one row per TDM row; enforced below)
eps = pd.read_pickle('../interim/028_preproc_heavy_eps' + samp + '.p')

print("Episode Term Document Matrix Shape:", tdm.shape)
print("Episode Table Shape:", eps.shape)

# Sanity checks: the three inputs must describe the same rows and columns.
assert tdm.shape[0] == eps.shape[0], "TDM rows and episode table rows differ"
assert len(feature_names) == tdm.shape[1], "feature names and TDM columns differ"

Episode Term Document Matrix Shape: (11795, 58684)
Episode Table Shape: (11795, 2)


In [116]:
# Preview the first rows of the episode table as a sanity check on the load.
eps.head()


Unnamed: 0,podcast_name,subgenre
0,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School
1,! World's Funniest Ringtones for iPhone & iPad...,Gadgets
2,! iPhone Ringtones,Video Games
3,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences
4,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature


In [117]:
# Preview the top-left 5x20 corner of the TDM.
# Slice the sparse matrix first and densify only that corner; calling
# toarray() on the whole matrix would allocate every cell just to show 100.
tdm[:5, :20].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

# II. Filter

In [118]:
# Sum of each TDM row, flattened to a 1-D array (one entry per row).
row_totals = tdm.sum(axis=1)
row_sums = np.asarray(row_totals).ravel()

In [119]:
# Show the episodes whose TDM row sums to zero.
# np.where yields row *positions* in tdm, so select positionally with .iloc
# (.ix is deprecated and was removed in pandas 1.0).
eps.iloc[np.where(row_sums == 0)[0], :]

Unnamed: 0,podcast_name,subgenre
88,2 Minute Chef,Food
712,Arabic Grammar 2010,Islam
1126,Bhagavad Gita Class (Ch1) in Sanskrit by Dr. K...,Hinduism
1307,Brain Talk,Medicine
1658,Chio in the morning's podcast,Podcasting
2500,Dr. Usama Al-Atar Lectures,Islam
2534,Driving Sports TV,Automotive
2847,Experimental Game Development Podcast,Tech News
3022,Figure Drawing Online,Visual Arts
4511,Kabbalah Media Updates,Judaism


In [121]:
# Most frequent row totals and how many rows have each (top 5 by frequency).
row_sum_counts = pd.Series(row_sums).value_counts()
row_sum_counts.head()

6     48
3     47
8     44
10    39
4     38
dtype: int64

In [122]:
# Remove rows with zero words in the set
# Recompute the row totals from the *current* tdm instead of reusing the
# earlier row_sums: that keeps this cell safe to re-run (a second run simply
# keeps every row) rather than indexing filtered data with stale positions.
keep_rows = np.where(np.asarray(tdm.sum(axis=1)).ravel() > 0)[0]

# keep_rows holds row positions, so index eps positionally with .iloc
# (.ix is deprecated and was removed in pandas 1.0).
eps = eps.iloc[keep_rows, :]
tdm = tdm[keep_rows, :]

# The episode table and the TDM must stay row-aligned after filtering.
assert tdm.shape[0] == eps.shape[0]

# III. LDA Model

In [None]:
# Create model
model = lda.LDA(n_topics=56, n_iter=500, random_state=42)

# Fit model
%time model.fit(tdm)

In [None]:
# TODO: Best Fitting Episodes for Each Topic (not yet implemented)


In [113]:
# Most Important Words for Each Topic
topic_words = model.topic_word_
# Print the shape explicitly: a bare expression in the middle of a cell is
# not displayed.
print("Topic-Word Matrix Shape:", topic_words.shape)

top_n = 5
# Build the vocabulary array once, outside the loop.
vocab = np.asarray(feature_names)
# Guard against feature names that do not match the fitted model's columns.
assert topic_words.shape[1] == len(vocab), "vocabulary and model width differ"

for ii, dist in enumerate(topic_words):
    # argsort is ascending, so the reversed tail slice gives the indices of
    # the top_n highest-weight words, best first. Use a separate name here so
    # the topic_words matrix is not overwritten while it is being iterated.
    top_idx = np.argsort(dist)[:-(top_n + 1):-1]
    top_words = vocab[top_idx]
    print('Topic {}:  {}'.format(ii, ' '.join(top_words)))
    

(56L, 13463L)