In [3]:
# Import pandas and the KModes clustering estimator.

import pandas as pd
from kmodes.kmodes import KModes

In [4]:
# Load the podcast shows dataset and preview its first three rows.
df = pd.read_csv('shows.csv')
df.head(n=3)

Unnamed: 0,id,feed_url,title,subtitle,description,summary,author,email,link,language,explicit,image,category,subcategory,created_at,last_build_date
0,0002da5f-49a2-31ba-b44c-cdeabdf113cb,http://www.cbc.ca/podcasting/includes/dispatch...,Dispatches from CBC Radio,,CBC Radio's Dispatches host Rick MacInnes-Rae ...,CBC Radio's Dispatches host Rick MacInnes-Rae ...,CBC Radio,podcasting@cbc.ca,http://www.cbc.ca/podcasting,en-ca,0,http://www.cbc.ca/podcasting/images/promo-disp...,News & Politics,Public Radio,2016-11-27T03:15:54,2016-11-27T03:15:54
1,00044845-beb8-35f3-99e2-01dbcfb6eb63,http://everydayissaturday.com/feed/podcast/,Motivation | Inspiration| Success with Entrepr...,Motivation - Inspiration - Success - Mindset -...,Sam Crowley delivers million dollar motivation...,Sam Crowley delivers million dollar motivation...,Sam Crowley,sam@everydayissaturday.com,http://everydayissaturday.com,en-us,0,http://everydayissaturday.com/wp-content/uploa...,Business,Business,2016-11-26T19:11:59,2016-11-21T14:19:34
2,00048bfa-8363-3f10-ac8e-8f27a31fe11f,http://disability411.jinkle.com/disability411.xml,Disability411 Podcast,,Disability411 - The Podcast for Disability Inf...,,Beth Case,,http://disability411.jinkle.com/,en-us,0,,Government & Organizations,Business,2016-11-27T00:37:30,2016-11-27T00:37:30


In [5]:
# Build a KModes clustering model from the podcast metadata.
def k_modes_model(dataset, cluster_num, max_rows=20000, random_state=None):
    """Cluster podcasts on their categorical metadata with KModes.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'id', 'language', 'category' and
        'subcategory'. Other columns are ignored. The frame is not modified.
    cluster_num : int
        Number of clusters, passed to KModes as ``n_clusters``.
    max_rows : int or None, default 20000
        Only the first ``max_rows`` rows of ``dataset`` are clustered.
        ``None`` uses every row.
    random_state : int or None, default None
        Forwarded to KModes. Centroids are initialised with ``init="random"``,
        so pass a fixed value to get the same clusters on every run.

    Returns
    -------
    tuple
        ``(clustered, kmode)`` where ``clustered`` is a DataFrame indexed by
        'id' with the columns 'Cluster', 'language', 'category' and
        'subcategory', and ``kmode`` is the fitted KModes model.

    Raises
    ------
    KeyError
        If one of the four required columns is missing from ``dataset``.
    """
    feature_cols = ['language', 'category', 'subcategory']

    # Select only the columns that are actually used, limit the number of
    # rows, index by podcast id and replace missing values with explicit
    # placeholder labels so they can be clustered like any other category.
    df_data_kmodes = (
        dataset[['id'] + feature_cols]
        .iloc[:max_rows]
        .set_index('id')
        .fillna({
            'language': 'No language',
            'category': 'No category',
            'subcategory': 'No subcategory',
        })
    )

    # Fit KModes and attach each row's cluster label as the first column.
    kmode = KModes(
        n_clusters=cluster_num,
        init="random",
        n_init=5,
        verbose=1,
        random_state=random_state,
    )
    clusters = kmode.fit_predict(df_data_kmodes)
    df_data_kmodes.insert(0, "Cluster", clusters, True)

    # Return the labelled frame together with the fitted model.
    return df_data_kmodes, kmode



In [12]:
def k_modes_recommend(dataset, kmode, predict_dataset, array_num,
                      n_recommend=5, random_state=None):
    """Recommend rows that fall in the same cluster as one example row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Clustered frame containing a 'Cluster' column.
    kmode : object
        Fitted model exposing ``predict(predict_dataset)``; the result is
        indexed with ``array_num``.
    predict_dataset : pandas.DataFrame
        Example rows to assign to clusters via ``kmode.predict``.
    array_num : int
        Position, within the predictions, of the example row to recommend for.
    n_recommend : int, default 5
        Maximum number of rows to return.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample`` for a reproducible draw.

    Returns
    -------
    pandas.DataFrame
        A random sample of at most ``n_recommend`` rows of ``dataset`` whose
        'Cluster' equals the predicted cluster. It holds fewer rows (possibly
        none) when the cluster is smaller than ``n_recommend``.
    """
    # Predict the cluster of every example row and pick the requested one.
    predicted_clusters = kmode.predict(predict_dataset)
    target_cluster = predicted_clusters[array_num]

    # Boolean indexing already yields a new frame, so no explicit copy of the
    # full dataset is needed.
    same_cluster = dataset[dataset['Cluster'] == target_cluster]

    # Cap the sample size: sampling more rows than exist (without
    # replacement) raises ValueError.
    sample_size = min(n_recommend, len(same_cluster))
    return same_cluster.sample(n=sample_size, random_state=random_state)

In [7]:
# Example rows to get recommendations for: one podcast profile per row,
# described by language, category and subcategory, and indexed by 'id'.
df_ex_preidct = pd.DataFrame(
    {
        'language': ['en-us', 'en-us', 'en'],
        'category': ['Health', 'News & Politics', 'Music'],
        'subcategory': ['No subcategory', 'Education', 'Music'],
    },
    index=pd.Index(['1', '2', '3'], name='id'),
)
df_ex_preidct

Unnamed: 0_level_0,language,category,subcategory
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,en-us,Health,No subcategory
2,en-us,News & Politics,Education
3,en,Music,Music


In [28]:
def Kmodes_recommend(dataset, cluster_number, example_dataset, array_number):
    """Fit a KModes model on `dataset` and recommend rows for one example.

    The model is built with ``k_modes_model(dataset, cluster_number)``; the
    resulting clustered frame and fitted model are then handed to
    ``k_modes_recommend`` together with `example_dataset` and `array_number`
    (the position of the example row to recommend for). Whatever
    ``k_modes_recommend`` returns is returned unchanged.
    """
    # Step 1: cluster the dataset and keep the fitted model.
    clustered_frame, fitted_model = k_modes_model(dataset, cluster_number)

    # Step 2: sample recommendations from the example row's cluster.
    return k_modes_recommend(clustered_frame, fitted_model, example_dataset, array_number)

In [29]:
# Example run of the pipeline above: cluster the podcasts into 50 groups and
# recommend shows for the example row at position 1 of df_ex_preidct.
result = Kmodes_recommend(
    dataset=df,
    cluster_number=50,
    example_dataset=df_ex_preidct,
    array_number=1,
)
result

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 8334, cost: 26854.0
Run 1, iteration: 2/100, moves: 1709, cost: 26577.0
Run 1, iteration: 3/100, moves: 1433, cost: 26577.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 8313, cost: 26838.0
Run 2, iteration: 2/100, moves: 871, cost: 26838.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 8190, cost: 25391.0
Run 3, iteration: 2/100, moves: 1023, cost: 24558.0
Run 3, iteration: 3/100, moves: 681, cost: 24558.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 4590, cost: 26285.0
Run 4, iteration: 2/100, moves: 519, cost: 26205.0
Run 4, iteration: 3/100, moves: 53, cost: 26205.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 4

Unnamed: 0_level_0,Cluster,language,category,subcategory
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
b079e9bd-6ca3-31be-a7ac-277c013a3331,6,en-ca,gay,News & Politics
d5a2c7b8-6ca6-35ea-b666-f5e39bc2f12a,6,ja,News & Politics,News & Politics
053c982b-cda9-37ec-b88f-471b004c578c,6,en-us,News & Politics,Government & Organizations
39e58247-6fdf-3688-9e25-7cf2e50a417a,6,en-us,Cold War,News & Politics
3be2bb1e-019f-3b40-b44b-a1493c9a3f78,6,en-us,News & Politics,News & Politics


Unnamed: 0_level_0,Cluster,language,category,subcategory
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ff8c9846-f5c4-320c-a6c8-669ed999c463,9,en-us,News & Politics,News & Politics
47f9a7f6-0b86-3b53-9277-930f217f258a,9,en-us,News & Politics,Religion
ef5f8df4-5460-3a2c-af66-8db94e4a9b1d,9,en-au,News & Politics,Music
a74289f1-9c11-32a0-a87f-3229119b2680,9,en-us,News & Politics,Education
858d31c8-2a65-3ee3-b806-2e5f1a80451a,9,cs,News & Politics,No subcategory
