## Imports

In [25]:
import pandas as pd
import pickle
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import unicodedata
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load Data

In [26]:
data = pd.read_csv('../data/translated_podcast_samples.csv')

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47679 entries, 0 to 47678
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   47679 non-null  int64 
 1   uuid         47679 non-null  object
 2   title        47679 non-null  object
 3   description  47679 non-null  object
 4   pred_lang    47679 non-null  object
 5   categories   47679 non-null  object
 6   author       47679 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.5+ MB


In [28]:
data.dropna(axis=0, inplace=True)

In [29]:
data['uuid'].nunique()

47679

In [30]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [31]:
data['description'] = data['description'].str.replace('https?\S+|www.\S+', '', case=False)

In [32]:
data.shape

(47679, 6)

In [33]:
data.head()

Unnamed: 0,uuid,title,description,pred_lang,categories,author
0,c539085ab36a499eb25ab06505ce10c3,RoomOfRequirement,Room of Requirement is a podcast dedicated to ...,en,News & Politics,Room of Requirement
1,8ad3e312defb4faab8f87946f6f67a13,The Pollsters,Politics. Policy. Polling. Pop Culture.\n\nExp...,en,Social Sciences | Science & Medicine | Managem...,audioBoom
2,5ac07e5979ac4c9d99db71962b468378,BABME™,BABME™ is a company that Micah Sanders (@micah...,en,Comedy,BABME™ / Anchor
3,e22d49ae12dc44349bc9c1ae45d5f443,Baseball Tonight with Buster Olney,ESPN MLB Insider Buster Olney leads the baseba...,en,Sports & Recreation,ESPN Radio
4,0443cb89bcf242d89d3075a381d794ee,Legacy Baptist Church Sermons,AM and PM Sermons from the weekly services at ...,en,Christianity | Religion & Spirituality,Jamen


In [34]:
descriptions = data['description'].to_list()
descriptions[0]

'Room of Requirement is a podcast dedicated to reason and resilience in the Time of Trump. Hosted by Miracle Jones and Kamalesh Rao and based in Jackson Heights, Queens, the podcast aims to be a voice of advocacy for democracy and human rights.'

## Clean Text

In [35]:
# Create remove_accents function.
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

In [36]:
# Create clean_text function.
def clean_text(docs):
    # Make all words in documents lowercase.
    low_docs = [doc.lower() for doc in docs]
    # Remove all accents from documents.
    acc_docs = [remove_accents(doc) for doc in low_docs]
    # Tokenize each document.
    tokens = [word_tokenize(doc) for doc in acc_docs]
    # Remove stopwords and punctuation.
    stopwords_ = set(stopwords.words('english'))
    punctuation_ = set(string.punctuation)
    tokens = [[word for word in token if word not in stopwords_ and word not in punctuation_] for token in tokens]
    # Apply Lemmatizer Stemmer.
    lemmatizer = WordNetLemmatizer()
    lemmatize_tokens = [list(map(lemmatizer.lemmatize, token)) for token in tokens]
    # Join tokens in each document.
    token_docs = [' '.join(tokens) for tokens in lemmatize_tokens]
    return token_docs

In [37]:
clean_descriptions = clean_text(descriptions)

In [38]:
clean_descriptions[0]

'room requirement podcast dedicated reason resilience time trump hosted miracle jones kamalesh rao based jackson height queen podcast aim voice advocacy democracy human right'

## Vectorize

In [39]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.5)
tfidf_matrix = tfidf.fit_transform(clean_descriptions)

In [40]:
tfidf_matrix.shape

(47679, 18406)

In [41]:
tfidf_matrix = tfidf_matrix.toarray()
feature_names = tfidf.get_feature_names()
feature_df = pd.DataFrame(tfidf_matrix, index=data['title'], columns=feature_names)
feature_df.shape

(47679, 18406)

## Make Recommendations

In [66]:
test = ['how to get my dog to go poop']
cleaned_test = clean_text(test)
test_matrix = tfidf.transform(cleaned_test)
test_matrix = test_matrix.toarray()
feature_names = tfidf.get_feature_names()
test_df = pd.DataFrame(test_matrix, index=['test'], columns=feature_names)
test_df.shape

(1, 18406)

In [67]:
test_similarities = cosine_similarity(feature_df, test_df).T[0]
test_similarities

array([0., 0., 0., ..., 0., 0., 0.])

In [68]:
test_similarities.shape

(47679,)

In [69]:
num_recs = 3

In [70]:
idxs = test_similarities.argsort()[-(num_recs):]
idxs

array([38396, 41886, 29695])

In [71]:
list(feature_df.iloc[idxs].index)

['Pups n PopCulture - The Podcast for Dog Lovers',
 'Can I Pet Your Dog?',
 'Der Hunderatgeber-alles Rund um den Hund']