In [1]:
import gc
import os
import time

import numpy as np
import pandas as pd
import scipy
import scipy.sparse

from sklearn.externals import joblib

In [13]:
# Toggle: True -> read/write the '_samp' variants of the interim files,
# False -> use the un-suffixed files.
USE_SAMPLE = True
# Filename suffix spliced into the interim paths used by the cells below.
# Kept separate from the boolean toggle so `samp` is always a string.
samp = '_samp' if USE_SAMPLE else ''

## I. Load

In [14]:
def _load_csr(path):
    """Rebuild a scipy CSR matrix from an .npz archive.

    The archive must contain the arrays 'data', 'indices', 'indptr' and
    'shape' (the layout written by np.savez in the export step of this
    notebook).
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the file open;
    # the with-block closes it once the arrays have been read.
    with np.load(path) as archive:
        return scipy.sparse.csr_matrix(
            (archive['data'], archive['indices'], archive['indptr']),
            shape=tuple(archive['shape']))


# Load TDM (raw counts) and its TF-IDF weighted counterpart
tdm = _load_csr('../interim/028_preproc_heavy_tdm' + samp + '.npz')
tfidf_tdm = _load_csr('../interim/028_preproc_heavy_tfidf_tdm' + samp + '.npz')

# Load feature names
feature_names = np.array(pd.read_pickle('../interim/028_preproc_heavy_names' + samp + '.p'))

# Load show names and subgenres that still remain
shows_concat = pd.read_pickle('../interim/028_preproc_heavy_shows_concat' + samp + '.p')

# Load full show table so that summaries can be pulled in
shows_full = pd.read_pickle('../interim/pods' + samp + '.p')

# Single-argument print(...) renders identically under Python 2 and 3
# (a multi-argument print(...) is shown as a tuple by Python 2).
print("Episode Term Document Matrix Shape: {}".format(tdm.shape))
print("Episode TFIDF Term Document Matrix Shape: {}".format(tfidf_tdm.shape))
print("Episode Table Shape: {}".format(shows_concat.shape))

# Later cells index the show table by matrix row position, so both matrices
# must have exactly one row per show and one column per feature name.
assert tdm.shape[0] == shows_concat.shape[0], "tdm rows do not match the show table"
assert tfidf_tdm.shape[0] == shows_concat.shape[0], "tfidf_tdm rows do not match the show table"
assert len(feature_names) == tdm.shape[1], "feature names do not match tdm columns"

('Episode Term Document Matrix Shape:', (1192, 14183))
('Episode TFIDF Term Document Matrix Shape:', (1192, 14183))
('Episode Table Shape:', (1192, 2))


In [15]:
# Remove duplicates from full show list
show_key = ['podcast_name', 'subgenre']

# Rows whose (podcast_name, subgenre) key occurs more than once; collected
# only so their count can be inspected before they are dropped.
# DataFrame.sort was deprecated in favour of sort_values and later removed.
dupes = (shows_full.groupby(show_key)
         .filter(lambda group: len(group) > 1)
         .sort_values('podcast_name'))
print(dupes.shape)

# drop_duplicates keeps the first row seen for each key.
shows_full = shows_full.drop_duplicates(show_key)
print(shows_full.shape)

(2, 18)
(1240, 18)


  from ipykernel import kernelapp as app


In [24]:
# Pull each modelled show's description in from the full show table.
join_keys = ['podcast_name', 'subgenre']

print(shows_full.shape)
print(shows_concat.shape)

# Left join: every row of shows_concat is kept; unmatched shows get a
# missing show_desc.
show_descriptions = shows_full[join_keys + ['show_desc']]
shows = shows_concat.merge(show_descriptions, how='left', on=join_keys, sort=False)

print(shows.shape)
# A duplicated key in shows_full would fan rows out; the row count must be unchanged.
assert shows.shape[0] == shows_concat.shape[0]
shows.head()

(1240, 18)
(1192, 2)
(1192, 3)


Unnamed: 0,podcast_name,subgenre,show_desc
0,#NerdyCast,K-12,This is an education podcast featuring some of...
1,#SmartBrownVoices - Learning from Diversity,Business News,"When you look through iTunes podcast, you rare..."
2,1045 Home Improvement Show,Educational Technology,1045-WFLA's Home Improvement Show is all about...
3,2009 K-12 Online Conference Audio Podcast Channel,Educational Technology,The K-12 Online Conference invites participati...
4,3 Minute Hypnosis | Confidence Boost | Relaxat...,Alternative Health,Change your Day in 3 Minutes.... Specialised i...


In [21]:
# Small cutout of TDM - it is a sparse matrix and therefore mostly 0s.
# Slice while still sparse so only the 5x20 corner is densified, instead of
# materialising the whole matrix with toarray() first.
print(tdm[:5, :20].toarray())
print('-' * 80)
print(tfidf_tdm[:5, :20].toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
--------------------------------------------------------------------------------
[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.05003277  0.          0.          0.
   0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.        ]
 [ 0.     

## II. Filter

In [28]:
# Per-row totals of each matrix, collapsed from the (n_rows, 1) result of
# sum(axis=1) into flat 1-D arrays.
row_sums = np.asarray(tdm.sum(axis=1)).ravel()
tfidf_row_sums = np.asarray(tfidf_tdm.sum(axis=1)).ravel()

In [30]:
# Shows whose raw-count row sums to zero. row_sums has one entry per matrix
# row, so rows are selected by position (.iloc) rather than via the
# deprecated, label/position-ambiguous .ix indexer.
shows.iloc[np.flatnonzero(row_sums == 0)]

Unnamed: 0,podcast_name,subgenre,show_desc
749,Saloeurm Savath,Buddhism,This podcast is created for sharing the Khmer ...


In [31]:
# Shows whose TF-IDF row sums to zero. tfidf_row_sums has one entry per
# matrix row, so rows are selected by position (.iloc) rather than via the
# deprecated, label/position-ambiguous .ix indexer.
shows.iloc[np.flatnonzero(tfidf_row_sums == 0)]

Unnamed: 0,podcast_name,subgenre,show_desc
749,Saloeurm Savath,Buddhism,This podcast is created for sharing the Khmer ...


In [32]:
# How often each raw row total occurs; display the five most frequent totals.
row_sum_counts = pd.Series(row_sums).value_counts()
row_sum_counts.head(5)

92     7
323    6
32     6
6      6
123    6
dtype: int64

In [33]:
# How often each TF-IDF row total occurs; display the five most frequent totals.
tfidf_row_sum_counts = pd.Series(tfidf_row_sums).value_counts()
tfidf_row_sum_counts.head(5)

1.000000    13
5.775120     2
6.718904     2
4.281483     2
4.956252     2
dtype: int64

In [36]:
# Remove rows with zero words in the set
keep_rows = np.flatnonzero(row_sums > 0)
tfidf_keep_rows = np.flatnonzero(tfidf_row_sums > 0)

# The row sums were computed on the unfiltered inputs. If this cell is run a
# second time the lengths no longer agree, so fail loudly instead of silently
# selecting the wrong rows.
assert len(row_sums) == shows.shape[0] == tdm.shape[0], \
    "row_sums is stale - re-run the notebook from the load step"
assert len(tfidf_row_sums) == shows.shape[0] == tfidf_tdm.shape[0], \
    "tfidf_row_sums is stale - re-run the notebook from the load step"

# Both tables are taken from the *unfiltered* `shows`, by position, so each
# one lines up row-for-row with its own matrix. tfidf_shows is built first
# because `shows` is rebound on the next line.
tfidf_shows = shows.iloc[tfidf_keep_rows]
shows = shows.iloc[keep_rows]

tdm = tdm[keep_rows, :]
tfidf_tdm = tfidf_tdm[tfidf_keep_rows, :]

assert shows.shape[0] == tdm.shape[0]
assert tfidf_shows.shape[0] == tfidf_tdm.shape[0]

print('Shows:"')
print(shows.shape)
print('tfidf_Shows:')
print(tfidf_shows.shape)
print('TDM:"')
print(tdm.shape)
print('tfidf_TDM:"')
print(tfidf_tdm.shape)

Shows:"
(1191, 3)
TDM:"
(1191, 14183)
tfidf_TDM:"
(1191, 14183)


## III. Export Arrays

In [42]:
def _dump_csr(path, matrix):
    """Write the CSR components (data, indices, indptr, shape) of `matrix` to `path`."""
    np.savez(path, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)


# Filtered count and TF-IDF matrices
_dump_csr('../interim/preprocessed_deduped_tdm' + samp + '.npz', tdm)
_dump_csr('../interim/preprocessed_deduped_tfidf_tdm' + samp + '.npz', tfidf_tdm)

# Show name / subgenre tables that accompany each matrix
label_cols = ['podcast_name', 'subgenre']
shows[label_cols].to_pickle('../interim/preprocessed_deduped_show_subgenre' + samp + '.p')
tfidf_shows[label_cols].to_pickle('../interim/preprocessed_deduped_tfidf_show_subgenre' + samp + '.p')