# Podcast Micro-Categories
### Exploratory Notebook
### Unsupervised LDA

In [1]:
from __future__ import print_function
from __future__ import division

import gc
import os
import time

import numpy as np
import pandas as pd
import scipy

import lda

import re

from sklearn.externals import joblib


In [2]:
samp = False
samp = '_samp' if samp else ''

# I. Load

In [3]:
preproc = '025_preproc_bigrams'
# Load TDM
loader = np.load('../interim/' + preproc + '_tdm' + samp + '.npz')
tdm = scipy.sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

# Load feature names
feature_names = np.array(pd.read_pickle('../interim/' + preproc + '_names' + samp + '.p'))

# Load show names and subgenres that still remain
shows_concat = pd.read_pickle('../interim/' + preproc + '_shows_concat' + samp + '.p')

# Load full show table so that summaries can be pulled in
shows_full = pd.read_pickle('../interim/pods' + samp + '.p')

print("Episode Term Document Matrix Shape:", tdm.shape)
print("Episode Table Shape:", shows_concat.shape)
assert tdm.shape[0] == shows_concat.shape[0]
assert len(feature_names) == tdm.shape[1]

Episode Term Document Matrix Shape: (11795, 334244)
Episode Table Shape: (11795, 2)


In [4]:
# Remove duplicates from full show list (need to handle these earlier on in other scripts)
dupes = shows_full.groupby(['podcast_name', 'subgenre']).filter(lambda group: len(group) > 1).sort('podcast_name')
print("Number of dupes removed: ", dupes.shape)
shows_full = shows_full.drop_duplicates(['podcast_name', 'subgenre'])
print("Number of shows remaining: ", shows_full.shape)

Number of dupes removed:  (151, 18)
Number of shows remaining:  (12319, 18)


In [5]:
# Join data from full show table to list of shows in the model

print(shows_full.shape)
print(shows_concat.shape)

shows = pd.merge(shows_concat, shows_full[['podcast_name', 'subgenre', 'show_desc']], 
                 on = ['podcast_name', 'subgenre'], how='left', sort=False)
print(shows.shape)
assert shows.shape[0] == shows_concat.shape[0]
shows.head()

(12319, 18)
(11795, 2)
(11795, 3)


Unnamed: 0,podcast_name,subgenre,show_desc
0,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School,Get FREE RINGTONES when you subscribe! Footbal...
1,! World's Funniest Ringtones for iPhone & iPad...,Gadgets,Get FREE RINGTONES when you subscribe to The W...
2,! iPhone Ringtones,Video Games,Get FREE RINGTONES when you subscribe! The Wor...
3,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences,Dr. Tony Ferretti offers a psychological persp...
4,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature,"""A Tale of Two Cities"" was first published in ..."


In [6]:
# Small cutout of TDM - it is a sparse matrix and therefore mostly 0s
tdm.toarray()[:5,0:20]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

# II. Filter

### A. Remove features with podcast-specific phrases

In [7]:
# List features with words that are specific to podcasts and are seen a lot
podcast_stop_words = ['download', 'subscribe', 'listen', 'itunes', 'audio', 'comment', 'question', 
                      'recorded', 'episode', 'listener', 'podcast', '\\brate\\b', 'sponsor', 'support', 
                      'review', 'http', 'instagram', 'twitter', 'youtube']
podcast_stop_words_regex = '|'.join(podcast_stop_words)

# Identify and count the features with these stop words
where_not_podstop = [1 if re.match(podcast_stop_words_regex, i) else 0 for i in feature_names.flatten()]
print('Removing {} features because they contain podcast stop words'.format(feature_names[np.where(where_not_podstop)].shape[0]))

# Remove the features with these stop words from the feature_names variable and the TDM
feature_names = feature_names[np.where(np.array(where_not_podstop) == 0)]
tdm = tdm[:,np.where(np.array(where_not_podstop) == 0)[0]]
print('{:,} features remaining'.format(feature_names.shape[0]))
assert tdm.shape[1] == feature_names.shape[0]

Removing 7718 features because they contain podcast stop words
326526 features remaining


### B. Remove shows with no words in the vocab

In [9]:
# Identify show that have no words in the vocabulary
row_sums = np.array(tdm.sum(axis=1)).flatten() # sum tdm counts across rows (across shows)
print("Number of shows with no words in the vocabulary: ", 
      shows.ix[np.where(row_sums == 0)[0],:].shape[0]) # print out count of rows that will be removed

Number of shows with no words in the vocabulary:  50


In [10]:
# Remove rows with zero words in the set
shows = shows.ix[np.where(row_sums > 0)[0],:]
tdm = tdm[np.where(row_sums > 0)[0],:]
print(shows.shape)
print(tdm.shape)

(11745, 3)
(11745, 326526)


# III. Train LDA Model

In [None]:
# Create model
model = lda.LDA(n_topics=56, n_iter=75, random_state=42)

# Fit model (takes about 17 minutes for 75 iterations
%time model.fit(tdm)

In [25]:
# (Save/Load model pickle object as needed)
# Pickle model
joblib.dump(model, '../interim/trained_models/lda/lda_25' + samp + '.pkl')
# Load pickled model
#model = joblib.load('../interim/trained_models/lda/lda.pkl')

['../interim/trained_models/lda/lda_25.pkl',
 '../interim/trained_models/lda/lda_25.pkl_01.npy',
 '../interim/trained_models/lda/lda_25.pkl_02.npy',
 '../interim/trained_models/lda/lda_25.pkl_03.npy',
 '../interim/trained_models/lda/lda_25.pkl_04.npy',
 '../interim/trained_models/lda/lda_25.pkl_05.npy',
 '../interim/trained_models/lda/lda_25.pkl_06.npy',
 '../interim/trained_models/lda/lda_25.pkl_07.npy']

# IV. Exploration

## A. Model Words

In [50]:
# Most Important Words for Each Topic
topic_words = model.topic_word_
print(topic_words.shape)

top_n = 6
topic_names_temp = []
for ii, dist in enumerate(topic_words):
    topic_words = np.array(feature_names)[np.argsort(dist)][:-(top_n+1):-1]
    topic_names_temp.append('Topic ' + str(ii) + ': ' + ' - '.join(topic_words.flatten()))
    print('Topic {}:  {}'.format(ii, ' - '.join(topic_words.flatten())))
    

(56L, 296959L)
Topic 0:  latest book - fly fishing - alexander technique - state university - south africa - 20th century
Topic 1:  listen episode - martial arts - want hear - rate review - contact info - fans podcast
Topic 2:  real estate - estate investing - cash flow - estate investors - stock market - personal finance
Topic 3:  rating review - links mentioned - send questions - nt forget - love hear - sure check
Topic 4:  podcast itunes - subscribe podcast - talk radio - email info - review itunes - episode notes
Topic 5:  real life - tells story - years later - executive producer - submit question - everyday life
Topic 6:  visit website - chat room - website http - click play - join host - psychic medium
Topic 7:  ll learn - episode ll - today talk - today podcast - today talking - answer question
Topic 8:  rss feed - welcome episode - send feedback - leave voicemail - subscribe rss - leave feedback
Topic 9:  award winning - today guest - author books - 30 years - 25 years - autho

## B. Show Topics

In [51]:
model.doc_topic_.shape

(11744L, 56L)

In [52]:
model.doc_topic_.shape

(11744L, 56L)

In [53]:
# Best Fitting Episodes for Each Topic
show_topics = model.doc_topic_
print(show_topics.shape)


show_closest_topic = np.array(topic_names_temp)[show_topics.argmax(1)]
show_closest_topic_prob = show_topics.max(1)

(11744L, 56L)


In [54]:
shows.head()

Unnamed: 0,podcast_name,subgenre,show_desc
0,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School,Get FREE RINGTONES when you subscribe! Footbal...
1,! World's Funniest Ringtones for iPhone & iPad...,Gadgets,Get FREE RINGTONES when you subscribe to The W...
2,! iPhone Ringtones,Video Games,Get FREE RINGTONES when you subscribe! The Wor...
3,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences,Dr. Tony Ferretti offers a psychological persp...
4,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature,"""A Tale of Two Cities"" was first published in ..."


In [55]:
show_closest_topic = pd.DataFrame({'topic': show_closest_topic, 
                                 'prob': show_closest_topic_prob})
show_closest_topic = pd.concat([show_closest_topic, shows], axis=1)

In [56]:
show_closest_topic

Unnamed: 0,prob,topic,podcast_name,subgenre,show_desc
0,0.671409,Topic 36: middle school - photography podcast ...,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School,Get FREE RINGTONES when you subscribe! Footbal...
1,0.238132,Topic 38: share experience - experience streng...,! World's Funniest Ringtones for iPhone & iPad...,Gadgets,Get FREE RINGTONES when you subscribe to The W...
2,0.234069,Topic 36: middle school - photography podcast ...,! iPhone Ringtones,Video Games,Get FREE RINGTONES when you subscribe! The Wor...
3,0.407588,Topic 42: mental health - health care - breast...,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences,Dr. Tony Ferretti offers a psychological persp...
4,0.475575,Topic 13: continue reading - week talk - week ...,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature,"""A Tale of Two Cities"" was first published in ..."
5,0.421182,Topic 13: continue reading - week talk - week ...,"""Adventures of Huckleberry Finn"" Audiobook (Au...",Literature,"""All modern American literature comes from one..."
6,0.114583,Topic 30: visit http - celtic music - web site...,"""He's Just Not That Into You: Ten Chick Flick ...",Music,He's Just Not That Into You: Ten Chick Flick C...
7,0.433594,Topic 13: continue reading - week talk - week ...,"""Heart of Darkness"" Audiobook (Audio book)",Literature,"""Heart of Darkness"" was first published in 190..."
8,0.246032,Topic 41: notes http - law school - says `` - ...,"""How-to"" (I hope) build an online e-commerce s...",Shopping,"A weekly audio journal sharing the inspired, a..."
9,0.241462,Topic 51: men women - sex life - love life - s...,"""Living in the Quantum Field"" with Asara Lovejoy",Spirituality,Are you ready to leap into another dimension o...


In [57]:
show_closest_topic.to_csv('C:/Users/JBLAUVELT/Desktop/review_cats.csv')

##C. Episode Words

In [66]:
# Most Important Words for Each Topic
topic_words = model.topic_word_
print(topic_words.shape)

(56L, 58684L)


In [None]:
topic_words.

In [None]:
top_n = 8
topic_names_temp = []
for ii, dist in enumerate(topic_words):
    topic_words = np.array(feature_names)[np.argsort(dist)][:-(top_n+1):-1]
    topic_names_temp.append('Topic ' + str(ii) + ': ' + '-'.join(topic_words.flatten()[:4]))
    print('Topic {}:  {}'.format(ii, ' '.join(topic_words.flatten())))

# V. Evaluation

In [None]:
cat_map = pd.read_csv('../raw/cat_maps/025_cat_map.csv')