# Podcast Micro-Categories
### Exploratory Notebook
### Unsupervised LDA

In [58]:
from __future__ import print_function
from __future__ import division

import gc
import os
import time

import numpy as np
import pandas as pd
import scipy

import lda

import re

from sklearn.externals import joblib


In [59]:
samp = False
samp = '_samp' if samp else ''

# I. Load

### A. Load Files

In [74]:
# Define scenario number (s[1-6])
scenario = 'final'

In [75]:
# 1. Load TDM
if scenario == 'final': 
    scen = 's1'
else:
    scen = scenario
loader = np.load('../interim/' + scen + '_tdm' + samp + '.npz')
tdm = scipy.sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

# 2. Load feature names
feature_names = np.array(pd.read_pickle('../interim/' + scen + '_names' + samp + '.p'))

# 3. Load show names and subgenres that still remain
shows_concat = pd.read_pickle('../interim/' + scen + '_shows_concat' + samp + '.p')

# 4. Load full show table so that summaries can be pulled in
shows_full = pd.read_pickle('../interim/pods' + samp + '.p')

# 5. Print shapes and check assertions
print("Episode Term Document Matrix Shape: {:,} shows x  {:,} features".format(tdm.shape[0], tdm.shape[1]))
print("Episode Table Shape: {:,} shows x  {:,} columns (show name and concatenated description)".format(shows_concat.shape[0], shows_concat.shape[1]))
assert tdm.shape[0] == shows_concat.shape[0]
assert len(feature_names) == tdm.shape[1]

Episode Term Document Matrix Shape: 11,795 shows x  55,821 features
Episode Table Shape: 11,795 shows x  2 columns (show name and concatenated description)


### B. Remove duplicates 

In [76]:
# Remove duplicates from full show list (need to handle these earlier on in other scripts)
dupes = shows_full.groupby(['podcast_name', 'subgenre']).filter(lambda group: len(group) > 1).sort('podcast_name')
print("Number of dupes removed: ", dupes.shape)
shows_full = shows_full.drop_duplicates(['podcast_name', 'subgenre'])
print("Number of shows remaining: ", shows_full.shape)

Number of dupes removed:  (151, 18)
Number of shows remaining:  (12319, 18)


### C. Join features from full show table to de-duplicated shows

In [77]:
# Join data from full show table to list of shows in the model

print(shows_full.shape)
print(shows_concat.shape)

shows = pd.merge(shows_concat, shows_full[['podcast_name', 'subgenre', 'show_desc']], 
                 on = ['podcast_name', 'subgenre'], how='left', sort=False)
print(shows.shape)
assert shows.shape[0] == shows_concat.shape[0]
shows.head()

(12319, 18)
(11795, 2)
(11795, 3)


Unnamed: 0,podcast_name,subgenre,show_desc
0,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School,Get FREE RINGTONES when you subscribe! Footbal...
1,! World's Funniest Ringtones for iPhone & iPad...,Gadgets,Get FREE RINGTONES when you subscribe to The W...
2,! iPhone Ringtones,Video Games,Get FREE RINGTONES when you subscribe! The Wor...
3,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences,Dr. Tony Ferretti offers a psychological persp...
4,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature,"""A Tale of Two Cities"" was first published in ..."


# II. Filter

### A. Remove shows with no words in the vocab

In [78]:
# Identify show that have no words in the vocabulary
row_sums = np.array(tdm.sum(axis=1)).flatten() # sum tdm counts across rows (across shows)

print("Number of shows with no words in the vocabulary: ", 
      shows.ix[np.where(row_sums == 0)[0],:].shape[0]) 

Number of shows with no words in the vocabulary:  33


In [79]:
# Remove shows with no words in the vocabulary, from both the original shows table
# and the TDM
shows = shows.ix[np.where(row_sums > 0)[0],:]
tdm = tdm[np.where(row_sums > 0)[0],:]
print(shows.shape)
print(tdm.shape)
assert shows.shape[0] == tdm.shape[0]

(11762, 3)
(11762, 55821)


# III. Train LDA Model

In [21]:
# Create model
n_topics = 200 if scenario == 'final' else 56
n_iter = 200 if scenario == 'final' else 75
model = lda.LDA(n_topics=56, n_iter=n_iter, random_state=42)

# Fit model (takes about 20 minutes for 75 iterations
%time model.fit(tdm)

Wall time: 5min 45s


<lda.lda.LDA instance at 0x000000000A97FA08>

In [22]:
# Save Pickle model
joblib.dump(model, '../interim/trained_models/lda/lda_' + scenario + samp + '.pkl')


['../interim/trained_models/lda/lda_s2.pkl',
 '../interim/trained_models/lda/lda_s2.pkl_01.npy',
 '../interim/trained_models/lda/lda_s2.pkl_02.npy',
 '../interim/trained_models/lda/lda_s2.pkl_03.npy',
 '../interim/trained_models/lda/lda_s2.pkl_04.npy',
 '../interim/trained_models/lda/lda_s2.pkl_05.npy',
 '../interim/trained_models/lda/lda_s2.pkl_06.npy',
 '../interim/trained_models/lda/lda_s2.pkl_07.npy']

In [50]:
# Load pickled model
model = joblib.load('../interim/trained_models/lda/lda_' + scenario + samp + '.pkl')

# IV. Exploration

## A. Model Words

In [70]:
# Most Important Words for Each Topic
topic_words = model.topic_word_
print(topic_words.shape)

top_n = 6
topic_names_temp = []
for ii, dist in enumerate(topic_words):
    top_n_words = np.array(feature_names)[np.argsort(dist)][:-(top_n+1):-1]
    topic_names_temp.append('Topic ' + str(ii) + ': ' + ' - '.join(top_n_words.flatten()))
    #print('Topic {}:  {}'.format(ii, ' - '.join(topic_names_temp)))
    
topic_names_temp
    

(200L, 55821L)


[u'Topic 0: wrestling - vince - wrestlemania - reign - hogan - elite',
 u'Topic 1: whisky - cocktail - cider - bourbon - whiskey - bartender',
 u'Topic 2: tumblr - kate - narrated - cat - narrator - lane',
 u'Topic 3: dana - bos - brad - href= - carl - christine',
 u'Topic 4: marvel - batman - superman - xmen - avenger - captain',
 u'Topic 5: piano - composer - opera - guitar - jazz - orchestra',
 u'Topic 6: cyber - hack - hacker - breach - packet - vulnerability',
 u'Topic 7: vegan - vegetarian - andrea - plantbased - nutrition - jazzy',
 u'Topic 8: shoe - plastic - cheap - bottle - yard - paint',
 u'Topic 9: licensed - macleod - incompetechcom - attribution - license - located',
 u'Topic 10: moral - philosopher - ethic - argues - argument - philosophical',
 u'Topic 11: brewing - brewery - brew - jimmy - brewer - homebrew',
 u'Topic 12: julie - darren - joshua - millennial - drunk - alcohol',
 u'Topic 13: appreciated - directed - tuning - greatly - sponsorship - commentator',
 u'Topic

## B. Show Topics

In [93]:
# Best Fitting Episodes for Each Topic
show_topics = model.doc_topic_
print(show_topics.shape)

# Get closest topic for each show
show_closest_topic = np.array(topic_names_temp)[show_topics.argmax(1)]
# Get probability of closest topic for each show
show_closest_topic_prob = show_topics.max(1)

# Create data frame with the above two vectors
show_closest_topic = pd.DataFrame({'usub': show_closest_topic,
                                   'usub_prob': show_closest_topic_prob})
# Add in the full show table
show_closest_topic = pd.concat([show_closest_topic, shows], axis=1)
# Extract the ID from the usub
show_closest_topic['usub_id'] = show_closest_topic['usub'].str.extract('(Topic [0-9]{1,2})').str.replace('Topic ', '')

# Reorder and rename columns
show_closest_topic = show_closest_topic[['usub_id', 'usub', 'usub_prob', 'podcast_name', 'subgenre', 'show_desc']]
show_closest_topic = show_closest_topic.rename(columns={'subgenre': 'isub'})


(11762L, 200L)


In [72]:
show_closest_topic.head()

Unnamed: 0,usub_id,usub,usub_prob,podcast_name,isub,show_desc
0,27,Topic 27: basketball - recruiting - alabama - ...,0.702484,"! Football Ringtones, Text Tones, Mail Alerts ...",College & High School,Get FREE RINGTONES when you subscribe! Footbal...
1,27,Topic 27: basketball - recruiting - alabama - ...,0.643662,! World's Funniest Ringtones for iPhone & iPad...,Gadgets,Get FREE RINGTONES when you subscribe to The W...
2,27,Topic 27: basketball - recruiting - alabama - ...,0.675,! iPhone Ringtones,Video Games,Get FREE RINGTONES when you subscribe! The Wor...
3,77,Topic 77: disorder - adhd - therapist - sympto...,0.237685,""" Talk to Tony - Radio Show "" Licensed Psychol...",Social Sciences,Dr. Tony Ferretti offers a psychological persp...
4,17,Topic 173: kyle - architecture - architect - j...,0.382143,"""A Tale of Two Cities"" Audiobook (Audio book)",Literature,"""A Tale of Two Cities"" was first published in ..."


In [80]:
# Export to CSV
show_closest_topic.to_csv('../output/' + scenario + '_shows_and_closest_topics.csv', index=False)

In [81]:
pd.DataFrame(model.doc_topic_).to_csv('../output/' + scenario + '_doctopics.csv', index=False)