In [1]:
# General libraries.
import numpy as np
import pandas as pd
import feather
import time
from __future__ import division

# SK-learn libraries for learning
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text
from sklearn.feature_extraction.text import *
from sklearn.cross_validation import train_test_split

In [12]:
# Input feather files.  The "_samp" pair is the active one (presumably a
# sampled extract -- confirm); uncomment the two lines below to point at the
# non-"_samp" files instead.
#eps_path = '../interim/eps.feather'
#pods_path = '../interim/pods.feather'
eps_path, pods_path = '../interim/eps_samp.feather', '../interim/pods_samp.feather'

# Read both tables with the same loader, in (eps, pods) order.
eps_df, pods_df = map(feather.read_dataframe, (eps_path, pods_path))

# Report (rows, columns) of each table, eps first.
print(eps_df.shape)
print(pods_df.shape)

# Last expression of the cell: show the first five rows of eps_df.
eps_df.head(5)

(107340, 5)
(1241, 18)


Unnamed: 0,description,podcast_name,release_date,title,subgenre
0,Corey and RC are moving up to the 'Big Show' a...,Layers TV,"Sep 29, 2010",Episode 148: Final Episode,Software How-To
1,Corey designs an image composite utilizing mas...,Layers TV,"Sep 23, 2010",Episode 147,Software How-To
2,RC opens up Fireworks and talks about layout d...,Layers TV,"Sep 15, 2010",Episode 146,Software How-To
3,RC experiments with the new divide blend mode ...,Layers TV,"Sep 02, 2010",Episode 145,Software How-To
4,"Corey expands on his tutorial from last week, ...",Layers TV,"Aug 25, 2010",Episode 144,Software How-To


In [11]:
# Show the first five rows of eps_df (head() defaults to n=5).
# NOTE(review): this repeats the preview that already ends the loading cell.
eps_df.head()

Unnamed: 0,description,podcast_name,release_date,title,subgenre
0,Corey and RC are moving up to the 'Big Show' a...,Layers TV,"Sep 29, 2010",Episode 148: Final Episode,Software How-To
1,Corey designs an image composite utilizing mas...,Layers TV,"Sep 23, 2010",Episode 147,Software How-To
2,RC opens up Fireworks and talks about layout d...,Layers TV,"Sep 15, 2010",Episode 146,Software How-To
3,RC experiments with the new divide blend mode ...,Layers TV,"Sep 02, 2010",Episode 145,Software How-To
4,"Corey expands on his tutorial from last week, ...",Layers TV,"Aug 25, 2010",Episode 144,Software How-To


In [4]:
# Minimum number of characters an episode description must have to be kept.
min_desc_length = 100

# Keep only episodes that have a description of at least min_desc_length
# characters.  The comparison is inclusive (>=) so the filter agrees with the
# constant's name and with the "len() < %d" wording of the message below.
# Rows with a missing description are dropped too and are counted in the
# "removed" figure.
desc_col = eps_df['description']
long_eps = eps_df[desc_col.notnull() & (desc_col.str.len() >= min_desc_length)]
print('%d / %d episodes removed because len() < %d' % (eps_df.shape[0] - long_eps.shape[0], eps_df.shape[0], min_desc_length))

# Join the surviving descriptions into one space-separated string per
# (podcast_name, subgenre) pair; the result has exactly the columns
# podcast_name, subgenre, comb_desc.
# NOTE(review): the count printed below is the number of
# (podcast_name, subgenre) pairs, which equals the number of podcasts only if
# each podcast has a single subgenre -- confirm.
comb_eps = (long_eps.groupby(['podcast_name', 'subgenre'])['description']
            .apply(' '.join)
            .reset_index(name='comb_desc'))
print("%d unique podcasts with concatenated descriptions" % comb_eps.shape[0])

18420 / 107340 episodes removed because len() < 100
1166 unique podcasts with concatenated descriptions


In [5]:
# Reserve 10% of the rows in comb_eps as held-out test data.
# A fixed integer seed keeps the held-out rows identical on every run, so the
# test set is never mixed back into the train/dev pool after a kernel restart.
TEST_SPLIT_SEED = 42
x_all, x_test, y_all, y_test = train_test_split(
    comb_eps['comb_desc'], comb_eps['subgenre'],
    test_size=0.1, random_state=TEST_SPLIT_SEED)

# Each line is built as ONE string before printing: passing two arguments in
# parentheses to a Python 2 print statement prints a tuple repr such as
# ('x_all shape:', (1049,)) instead of a readable line.
print('x_all shape: %s' % (x_all.shape,))
print('y_all shape: %s' % (y_all.shape,))
print('x_test shape: %s' % (x_test.shape,))
print('y_test shape: %s' % (y_test.shape,))

('x_all shape:', (1049,))
('y_all shape:', (1049,))
('x_test shape:', (117,))
('y_test shape:', (117,))


In [6]:
# split the rest .75/.25 as train/dev

def random_test_train(random_state=None):
    """Randomly split the non-test data into train (75%) and dev (25%) parts.

    Reads the module-level ``x_all`` / ``y_all`` (and ``x_test`` / ``y_test``
    for the size report) and prints the shapes of the test, dev and train
    partitions.

    Parameters
    ----------
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``train_test_split``.  With the default ``None`` a fresh,
        unseeded ``RandomState`` is created on every call, so each call draws
        a different split.  Pass a value to control the split instead.

    Returns
    -------
    tuple
        ``(x_train, x_dev, y_train, y_dev)`` as returned by
        ``train_test_split``.
    """
    if random_state is None:
        random_state = np.random.RandomState()
    x_train, x_dev, y_train, y_dev = train_test_split(x_all, y_all, test_size=0.25, random_state=random_state)

    # One pre-formatted string per call: prints the same text whether print
    # is a statement (Python 2) or a function (Python 3).
    print("test:   %s %s" % (x_test.shape, y_test.shape))
    print("dev:    %s %s" % (x_dev.shape, y_dev.shape))
    print("train:  %s %s" % (x_train.shape, y_train.shape))
    return x_train, x_dev, y_train, y_dev

x_train, x_dev, y_train, y_dev = random_test_train()

test:   (117,) (117,)
dev:    (263,) (263,)
train:  (786,) (786,)


In [7]:
# vectorize the train and dev data
# The vocabulary is learned from the training text only; the dev text is
# mapped onto that same vocabulary.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_dev_vectors = count_vect.transform(x_dev)

print('Score for dev_data:\n%s' % ('-' * 45))

# Values of k to try for the k-nearest-neighbours classifier.
n_neighbors_list = [1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 50]

# Plain array of dev labels so the comparison with the predictions below is
# purely positional.
y_dev_labels = np.asarray(y_dev)

# NOTE(review): the classifier is fed raw term counts of concatenated
# descriptions of very different lengths; a normalised representation
# (e.g. TF-IDF) may be worth comparing -- not changed here.
for n in n_neighbors_list:
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(x_train_counts, y_train)

    predictions = clf.predict(x_dev_vectors)
    cnt = np.sum(predictions == y_dev_labels)
    # Accuracy is derived from the predictions already made; clf.score()
    # would run predict() on the whole dev set a second time for the same
    # number.  float() keeps this a true division on Python 2 as well.
    accuracy = cnt / float(len(y_dev_labels))

    print('Score for %d neighbors: %3.4f, matches:%d' % (n, accuracy, cnt))


Score for dev_data:
---------------------------------------------
Score for 1 neighbors: 0.0494, matches:13
Score for 2 neighbors: 0.0684, matches:18
Score for 3 neighbors: 0.0494, matches:13
Score for 4 neighbors: 0.0608, matches:16
Score for 5 neighbors: 0.0570, matches:15
Score for 7 neighbors: 0.0570, matches:15
Score for 10 neighbors: 0.0456, matches:12
Score for 15 neighbors: 0.0418, matches:11
Score for 20 neighbors: 0.0304, matches:8
Score for 25 neighbors: 0.0380, matches:10
Score for 30 neighbors: 0.0266, matches:7
Score for 50 neighbors: 0.0266, matches:7
