In [11]:
# General libraries.
import numpy as np
import pandas as pd
import feather
import time
from __future__ import division

# SK-learn libraries for learning
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text
from sklearn.feature_extraction.text import *
from sklearn.cross_validation import train_test_split

In [2]:
#eps_path = '../interim/eps.feather' 
#pods_path = '../interim/pods.feather'

eps_path = '../interim/eps_samp.feather' 
pods_path = '../interim/pods_samp.feather'

eps_df = feather.read_dataframe(eps_path)
pods_df = feather.read_dataframe(pods_path)

print(eps_df.shape)
print(pods_df.shape)

eps_df.head(5)

(102688, 5)
(1241, 18)


Unnamed: 0,description,podcast_name,release_date,title,subgenre
0,"Coming to Charlottesville, VA in April is the ...",GovLove,"Feb 24, 2017",#108 A One-of-a-Kind Celebration of Hometowns,Local
1,What's it like managing a community with a the...,GovLove,"Feb 17, 2017",#107 Managing Adorable Communities,Local
2,Dan Ralley the Assistant City Manager of Upper...,GovLove,"Feb 14, 2017",#106 Customer Service & Permitting How LocalGo...,Local
3,Our fearless intern is back! The ICMA/ELGL Ore...,GovLove,"Feb 10, 2017","#105 The Intern Files with Julie Rusk, Santa M...",Local
4,Taking a Silicon Valley approach to city manag...,GovLove,"Feb 07, 2017",#104 Empathy & Disruption in City Management w...,Local


In [3]:
pods_df.head(5)

Unnamed: 0,also_sub_1,also_sub_2,also_sub_3,also_sub_4,also_sub_5,by,genre,more_by_1,more_by_2,more_by_3,more_by_4,more_by_5,num_ratings,podcast_name,rating,show_desc,subgenre,website
0,Edward G. Talbot - New World Orders/Short Stories,Down From Ten,Title Fight by Scott Sigler,Black Shadow - Podcast Novel by Steve Saylor,Hell Comes With Wood Paneled Doors,Seth Harwood,Arts,Great Moments in History,Infected by Scott Sigler,7th Son: Book One - Descent (The Beta Version),Contagious by Scott Sigler,Ancestor by Scott Sigler,91.0,Jack Wakes Up,4.62636,What does a movie-star one-hit-wonder and ex-d...,Literature,http://www.podiobooks.com/title/jack-wakes-up
1,ICMA: Local Gov Life,The GovTech Social Podcast,Municipal Equation Podcast,Gov Innovator podcast,GovEx Data Points,ELGL,Government & Organizations,,,,,,41.0,GovLove,5.0,A podcast about local government.,Local,http://elgl.org/govlove/
2,,,,,,Evelar Solar,Technology,,,,,,,Going Solar: The Evelar Experience,,Going Solar: The Evelar Experience Podcast is ...,Gadgets,http://goingsolar.libsyn.com/podcast
3,Supreme Court Oral Argument Audio,Short Circuit,U.S. Supreme Court 2012 Term Arguments,ABA Journal: Asked and Answered,U.S. Supreme Court Opinion Announcements,Pacific Legal Foundation,Government & Organizations,,,,,,,Courting Liberty,,A weekly look at developments in our high-prof...,National,https://www.pacificlegal.org/
4,Pastor Rick's Daily Hope,Saddleback Church Weekend Messages,Max LucadoMax Lucado,Willow Creek Community Church Weekend Podcast,Catalyst Podcast,Rick Warren,Religion & Spirituality,,,,,,43.0,Rick Warren's Ministry Podcast,3.83721,A weekly conversation with pastors from around...,Christianity,http://mediacenter.saddleback.com/mediacenter/...


In [4]:
min_desc_length = 100

comb_eps = eps_df[(eps_df['description'].str.len() > min_desc_length) & (~eps_df['description'].isnull())]
print('%d / %d episodes removed because len() < %d' % (eps_df.shape[0] - comb_eps.shape[0], eps_df.shape[0], min_desc_length))

comb_eps = comb_eps.groupby(['podcast_name' , 'subgenre']).apply(lambda x: ' '.join(x['description']))
comb_eps = comb_eps.reset_index()
comb_eps.columns = ['podcast_name', 'subgenre', 'comb_desc']
print("%d unique podcasts with concatenated descriptions" % comb_eps.shape[0])

17991 / 102688 episodes removed because len() < 100
1173 unique podcasts with concatenated descriptions


In [5]:
# reserve 10% for test data
x_all, x_test, y_all, y_test = train_test_split(comb_eps['comb_desc'], comb_eps['subgenre'], test_size=0.1, random_state=np.random.RandomState())

print("x_all shape:", x_all.shape)
print("y_all shape:", y_all.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

('x_all shape:', (1055,))
('y_all shape:', (1055,))
('x_test shape:', (118,))
('y_test shape:', (118,))


In [6]:
# split the rest .75/.25 as train/dev

def random_test_train():
    x_train, x_dev, y_train, y_dev = train_test_split(x_all, y_all, test_size=0.25, random_state=np.random.RandomState())

    print "test:  ", x_test.shape, y_test.shape
    print "dev:   ", x_dev.shape, y_dev.shape
    print "train: ", x_train.shape, y_train.shape
    return x_train, x_dev, y_train, y_dev

x_train, x_dev, y_train, y_dev = random_test_train()

test:   (118,) (118,)
dev:    (264,) (264,)
train:  (791,) (791,)


In [13]:
# vectorize the train and dev data
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_dev_vectors = count_vect.transform(x_dev)

print 'Score for dev_data:\n%s' % ('-'*45)

n_neighbors_list = [1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 50]

for n in n_neighbors_list:
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(x_train_counts, y_train)

    predictions = clf.predict( x_dev_vectors )
    cnt = np.sum(predictions == y_dev)

    print 'Score for %d neighbors: %3.4f, matches:%d' % ( n, clf.score(x_dev_vectors , y_dev), cnt )


Score for dev_data:
---------------------------------------------
Score for 1 neighbors: 0.0833, matches:22
Score for 2 neighbors: 0.0909, matches:24
Score for 3 neighbors: 0.0833, matches:22
Score for 4 neighbors: 0.0606, matches:16
Score for 5 neighbors: 0.0644, matches:17
Score for 7 neighbors: 0.0758, matches:20
Score for 10 neighbors: 0.0682, matches:18
Score for 15 neighbors: 0.0530, matches:14
Score for 20 neighbors: 0.0606, matches:16
Score for 25 neighbors: 0.0530, matches:14
Score for 30 neighbors: 0.0492, matches:13
Score for 50 neighbors: 0.0568, matches:15
