In [73]:
# General libraries.
import numpy as np
import pandas as pd
import feather
import time

# SK-learn libraries for learning
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text
from sklearn.feature_extraction.text import *
from sklearn.cross_validation import train_test_split

In [26]:
#eps_path = '../interim/eps.feather' 
#pods_path = '../interim/pods.feather'

eps_path = '../interim/eps_samp.feather' 
pods_path = '../interim/pods_samp.feather'

eps_df = feather.read_dataframe(eps_path)
pods_df = feather.read_dataframe(pods_path)

eps_df.head(5)

Unnamed: 0,description,podcast_name,release_date,title,subgenre
0,Bert Beeckman and his partners at Forgotten Em...,Hanselminutes,"Feb 24, 2017","Forgotten Empires, amazing games - Age of Empi...",Software How-To
1,It's been a few hundred episodes. It's not epi...,Hanselminutes,"Feb 17, 2017",The Return of Mo - Lessons from Scott,Software How-To
2,Laron Walker is a technologist and entrepreneu...,Hanselminutes,"Feb 10, 2017",A new kind of STEM learning with Laron Walker,Software How-To
3,Ada Rose is an engineer and developer advocate...,Hanselminutes,"Feb 03, 2017",Discovering WebVR with Ada Rose Edwards,Software How-To
4,"Mina Markham built ""Pantsuit,"" Hillary for Ame...",Hanselminutes,"Jan 27, 2017",Building Pantsuit: The Hillary Clinton UI Patt...,Software How-To


In [8]:
pods_df.head(5)

Unnamed: 0,also_sub_1,also_sub_2,also_sub_3,also_sub_4,also_sub_5,by,genre,more_by_1,more_by_2,more_by_3,more_by_4,more_by_5,num_ratings,podcast_name,rating,show_desc,subgenre,website
0,GrapeRadio  Wine Talk Show,Wine Spectator Video,Guild of Sommeliers Wine Podcasts,3 Wine Guys,Understanding Wine (with Austin Beeman) HD,Chris Scott,Arts,,,,,,16.0,UK Wine Show,4.625,The UK Wine Show is all about wine and the UK ...,Food,http://www.thirtyfifty.co.uk/uk-wine-show.asp
1,Driving Participation Podcast: Whats Working ...,The Fundraising Coach - Tom Iselin |Nonprofit ...,Nonprofit Hub Radio,Through the Noise,Nonproft and Charity Fundraising Podcast,CauseVox,Government & Organizations,,,,,,6.0,Rally & Engage - Online Fundraising & Marketin...,5.0,You?re listening to Rally & Engage by CauseVox...,Non-Profit,https://www.causevox.com/podcast
2,"Tiny Leaps, Big Changes: Motivation | Inspirat...",Fearless And Healthy Podcast:High Performance|...,Your Motivational High 5 | 5-Minute Inspiratio...,Love Is A Verb - Self love stories & actionabl...,The Positivity Effect | Daily chats on positiv...,"Jacob Sokol - Life Coach, Philosopher, Old Sch...",Health,,,,,,50.0,WTF Should I Do W/ My Life?!,4.94,"No BS, real-life, street-smart conversations w...",Self-Help,https://sensophy.com/
3,SQL Server Pain Relief: Office Hours with Bren...,Dear SQL DBA,SQL Data Partners Podcast,voiceofthedba's podcast,RunAs Radio,"Matan Yungman, Guy Glantser",Technology,,,,,,6.0,SQL Server Radio,5.0,SQL Server Radio is a Podcast for SQL Server D...,Software How-To,http://www.sqlserverradio.com/
4,The Smoking Tire,Alison Rosen Is Your New Best Friend,CarStuff,Larry Miller Show,Hooniverse,PodcastOne / Carolla Digital,Games & Hobbies,The Adam Carolla Show,The Adam and Dr. Drew Show,The Dr. Drew Podcast,Penn's Sunday School,,1657.0,CarCast,4.78576,CarCast is an automotive podcast and Internet ...,Automotive,http://carcastshow.com/


In [81]:
all_pods = pods_df.as_matrix(['podcast_name', 'show_desc'])
all_eps = eps_df.as_matrix(['podcast_name', 'description'])

start = time.time()
# loop through all podcasts
for i in range(all_pods.shape[0]):
    # loop through all episodes
    for j in range(all_eps.shape[0]):
        # match episode's podcast_name to the show's podcast_name
        if ( all_eps[j, 0] == all_pods[i, 0] ) and ( all_eps[j, 1]!= None ) and ( len(all_eps[j, 1]) > 100 ):
            # if episode's description is greater than 100 characters, concat onto show's description
            if all_pods[i, 1] == None:
                all_pods[i, 1] = all_eps[j, 1]
            else:
                all_pods[i, 1] += ' ' + all_eps[j, 1]
    if all_pods[i, 1] == None:
        all_pods[i, 1] = ''
print(time.time() - start)

79.623319149


In [28]:
# store the all_pods array in a file
all_pods_array = '../interim/all_pods_array' 
np.save(all_pods_array, all_pods)

In [29]:
# restore the all_pods array from a file
all_pods_array = '../interim/all_pods_array.npy' 
all_pods = np.load(all_pods_array)

In [77]:
# = pods_df.as_matrix( ['show_desc'] )
# = pods_df.as_matrix( ['subgenre'] )
#x_all, x_test, y_all, y_test = train_test_split( X, Y, test_size=0.1, random_state=np.random.RandomState())

# reserve 10% for test data
x_all, x_test, y_all, y_test = train_test_split( all_pods[:,1].reshape(-1), all_pods[:,0].reshape(-1), test_size=0.1, random_state=np.random.RandomState())

In [78]:
# split the rest .75/.25 as train/dev

def random_test_train():
    x_train, x_dev, y_train, y_dev = train_test_split( x_all, y_all, test_size=0.25, random_state=np.random.RandomState())

    print "test:  ", x_test.shape, y_test.shape
    print "dev:   ", x_dev.shape, y_dev.shape
    print "train: ", x_train.shape, y_train.shape
    return x_train, x_dev, y_train, y_dev

x_train, x_dev, y_train, y_dev = random_test_train()

test:   (125,) (125,)
dev:    (279,) (279,)
train:  (837,) (837,)


In [79]:
# vectorize the train and dev data
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_dev_vectors = count_vect.transform(x_dev)

print 'Score for dev_data:\n%s' % ('-'*45)

n_neighbors_list = [1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 50]

for n in n_neighbors_list:
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(x_train_counts, y_train)

    predictions = clf.predict( x_dev_vectors )
    cnt = 0
    for j in range(y_dev.shape[0]):
        if predictions[j]== y_dev[j]:
            cnt +=1

    print 'Score for %d neighbors: %3.4f, matches:%d' % ( n, clf.score(x_dev_vectors , y_dev), cnt )


Score for dev_data:
---------------------------------------------
Score for 1 neighbors: 0.0000, matches:0
Score for 2 neighbors: 0.0000, matches:0
Score for 3 neighbors: 0.0000, matches:0
Score for 4 neighbors: 0.0000, matches:0
Score for 5 neighbors: 0.0000, matches:0
Score for 7 neighbors: 0.0000, matches:0
Score for 10 neighbors: 0.0000, matches:0
Score for 15 neighbors: 0.0000, matches:0
Score for 20 neighbors: 0.0000, matches:0
Score for 25 neighbors: 0.0000, matches:0
Score for 30 neighbors: 0.0000, matches:0
Score for 50 neighbors: 0.0000, matches:0
