# Podcast Micro-Categories
### Exploratory Notebook
### Unsupervised LDA

In [25]:
from __future__ import print_function
from __future__ import division

import gc
import os
import time

import numpy as np
import pandas as pd
import scipy

import lda

from sklearn.externals import joblib


In [26]:
samp = False
samp = '_samp' if samp else ''

# I. Load

In [27]:

# Load TDM
loader = np.load('../interim/028_preproc_heavy_tdm' + samp + '.npz')
tdm = scipy.sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

# Load feature names
feature_names = np.array(pd.read_pickle('../interim/028_preproc_heavy_names' + samp + '.p'))

# Load show names and subgenres that still remain
shows_concat = pd.read_pickle('../interim/028_preproc_heavy_shows_concat' + samp + '.p')

# Load full show table so that summaries can be pulled in
shows_full = pd.read_pickle('../interim/pods' + samp + '.p')

print("Episode Term Document Matrix Shape:", tdm.shape)
print("Episode Table Shape:", shows_concat.shape)
assert tdm.shape[0] == shows_concat.shape[0]
assert len(feature_names) == tdm.shape[1]

Episode Term Document Matrix Shape: (11795, 58684)
Episode Table Shape: (11795, 2)


In [28]:
# Remove duplicates from full show list (need to handle these earlier on in other scripts)
dupes = shows_full.groupby(['podcast_name', 'subgenre']).filter(lambda group: len(group) > 1).sort('podcast_name')
print(dupes.shape)
shows_full = shows_full.drop_duplicates(['podcast_name', 'subgenre'])
print(shows_full.shape)

(151, 18)
(12319, 18)


In [29]:
# Join data from full show table to list of shows in the model

print(shows_full.shape)
print(shows_concat.shape)

shows = pd.merge(shows_concat, shows_full[['podcast_name', 'subgenre', 'show_desc']], 
                 on = ['podcast_name', 'subgenre'], how='left', sort=False)
print(shows.shape)
assert shows.shape[0] == shows_concat.shape[0]
shows.head()

(12319, 18)
(11795, 2)
(11795, 3)


Unnamed: 0,podcast_name,subgenre,show_desc
0,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School,Get FREE RINGTONES when you subscribe! Footbal...
1,! World's Funniest Ringtones for iPhone & iPad...,Gadgets,Get FREE RINGTONES when you subscribe to The W...
2,! iPhone Ringtones,Video Games,Get FREE RINGTONES when you subscribe! The Wor...
3,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences,Dr. Tony Ferretti offers a psychological persp...
4,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature,"""A Tale of Two Cities"" was first published in ..."


In [30]:
# Small cutout of TDM - it is a sparse matrix and therefore mostly 0s
tdm.toarray()[:5,0:20]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

# II. Filter

In [31]:
row_sums = np.array(tdm.sum(axis=1)).flatten()

In [32]:
shows.ix[np.where(row_sums == 0)[0],:]

Unnamed: 0,podcast_name,subgenre,show_desc
88,2 Minute Chef,Food,Recipes you can cook fast and easily. These qu...
712,Arabic Grammar 2010,Islam,The beautiful Arabic language is the one chose...
1126,Bhagavad Gita Class (Ch1) in Sanskrit by Dr. K...,Hinduism,Bhagavad Gita Class (Ch1) in Sanskrit by Dr. K...
1307,Brain Talk,Medicine,"Brain Talk, a BSi production, hosts weekly int..."
1658,Chio in the morning's podcast,Podcasting,
2500,Dr. Usama Al-Atar Lectures,Islam,Al-Hajj Usama Al-Atar is originally from the h...
2534,Driving Sports TV,Automotive,The video series for a new generation of drivi...
2847,Experimental Game Development Podcast,Tech News,A show that talks about the development and se...
3022,Figure Drawing Online,Visual Arts,Figure Drawing Online provides free video podc...
4511,Kabbalah Media Updates,Judaism,Kabbalah podcast brings you the daily kabbalah...


In [33]:
pd.Series(row_sums).value_counts().head()

6     48
3     47
8     44
10    39
4     38
dtype: int64

In [34]:
# Remove rows with zero words in the set
shows = shows.ix[np.where(row_sums > 0)[0],:]
tdm = tdm[np.where(row_sums > 0)[0],:]
print(shows.shape)
print(tdm.shape)

(11770, 3)
(11770, 58684)


# III. Train LDA Model

In [123]:
# Create model
model = lda.LDA(n_topics=56, n_iter=500, random_state=42)

# Fit model (takes about 2 hours 15 minutes)
#%time model.fit(tdm)

Wall time: 2h 16min 22s


<lda.lda.LDA instance at 0x0000000012878BC8>

In [8]:
# (Save/Load model pickle object as needed)
# Pickle model
#joblib.dump(model, '../interim/trained_models/lda/lda' + samp + '.pkl')
# Load pickled model
#model = joblib.load('../interim/trained_models/lda/lda.pkl')

# IV. Exploration

## A. Model Words

In [35]:
# Most Important Words for Each Topic
topic_words = model.topic_word_
print(topic_words.shape)

top_n = 8
topic_names_temp = []
for ii, dist in enumerate(topic_words):
    topic_words = np.array(feature_names)[np.argsort(dist)][:-(top_n+1):-1]
    topic_names_temp.append('Topic ' + str(ii) + ': ' + '-'.join(topic_words.flatten()[:4]))
    print('Topic {}:  {}'.format(ii, ' '.join(topic_words.flatten())))
    

(56L, 58684L)
Topic 0:  buddhist zen dharma buddhism aviation retreat buddha lama
Topic 1:  nonprofit certified awardwinning internationally clinical therapist volunteer faculty
Topic 2:  survivor ron ian terry directed louis harris keith
Topic 3:  gay porn sexy dental toy emily sexuality spouse
Topic 4:  developer agile programming stack server chuck javascript github
Topic 5:  muslim islamic donate publisher reminder updated lasting countless
Topic 6:  gaming jj rpg dungeon kickstarter dice dragon miniature
Topic 7:  golf archive poker favourite luxury disc piano castaway
Topic 8:  paranormal psychic ufo tarot healer pagan intuitive investigation
Topic 9:  yoga oldtimeradiodvdcom dvd aired announcer shipping perry superman
Topic 10:  promo ep wwwyoutubecomwatch donation slate audible checkout discount
Topic 11:  bbc britain programme india historian centre royal german
Topic 12:  estate investor investing trading income retirement debt portfolio
Topic 13:  gang rant yeah gon installm

## B. Show Topics

In [39]:
model.doc_topic_.shape

(11770L, 56L)

In [52]:
model.doc_topic_.shape

(11770L, 56L)

In [59]:
# Best Fitting Episodes for Each Topic
show_topics = model.doc_topic_
print(show_topics.shape)


show_closest_topic = np.array(topic_names_temp)[show_topics.argmax(1)]
show_closest_topic_prob = show_topics.max(1)

(11770L, 56L)


In [64]:
shows.head()

Unnamed: 0,podcast_name,subgenre,show_desc
0,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School,Get FREE RINGTONES when you subscribe! Footbal...
1,! World's Funniest Ringtones for iPhone & iPad...,Gadgets,Get FREE RINGTONES when you subscribe to The W...
2,! iPhone Ringtones,Video Games,Get FREE RINGTONES when you subscribe! The Wor...
3,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences,Dr. Tony Ferretti offers a psychological persp...
4,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature,"""A Tale of Two Cities"" was first published in ..."


In [65]:
show_closest_topic.head()

Unnamed: 0,prob,topic
0,0.749332,Topic 51: marathon-runner-ufc-endurance
1,0.846425,Topic 51: marathon-runner-ufc-endurance
2,0.772727,Topic 51: marathon-runner-ufc-endurance
3,0.353501,Topic 20: mindfulness-yoga-miracle-gratitude
4,0.475575,Topic 36: yarn-knitting-sock-knit


In [61]:
show_closest_topic = pd.DataFrame({'topic': show_closest_topic, 
                                 'prob': show_closest_topic_prob})
show_closest_topic = pd.concat([show_closest_topic, shows], axis=1)

In [62]:
show_closest_topic

Unnamed: 0,prob,topic
0,0.749332,Topic 51: marathon-runner-ufc-endurance
1,0.846425,Topic 51: marathon-runner-ufc-endurance
2,0.772727,Topic 51: marathon-runner-ufc-endurance
3,0.353501,Topic 20: mindfulness-yoga-miracle-gratitude
4,0.475575,Topic 36: yarn-knitting-sock-knit
5,0.431818,Topic 45: celtic-bike-cycling-irish
6,0.274194,Topic 3: gay-porn-sexy-dental
7,0.340491,Topic 9: yoga-oldtimeradiodvdcom-dvd-aired
8,0.212329,Topic 47: startup-affiliate-seller-income
9,0.463105,Topic 20: mindfulness-yoga-miracle-gratitude


In [261]:
show_closest_topic.to_csv('C:/Users/JBLAUVELT/Desktop/review_cats.csv')

##C. Episode Words

In [66]:
# Most Important Words for Each Topic
topic_words = model.topic_word_
print(topic_words.shape)

(56L, 58684L)


In [None]:
topic_words.

In [None]:
top_n = 8
topic_names_temp = []
for ii, dist in enumerate(topic_words):
    topic_words = np.array(feature_names)[np.argsort(dist)][:-(top_n+1):-1]
    topic_names_temp.append('Topic ' + str(ii) + ': ' + '-'.join(topic_words.flatten()[:4]))
    print('Topic {}:  {}'.format(ii, ' '.join(topic_words.flatten())))