## Imports

In [1]:
import pandas as pd
import pickle
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import unicodedata
import string
from langid.langid import LanguageIdentifier, model

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load Data

In [2]:
# Read the podcast metadata CSV into a DataFrame. The path is relative, so the
# notebook must be run from a directory that has ../data next to it.
data = pd.read_csv('../data/podcasts.csv')

In [3]:
data.head()

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id
0,8d62d3880db2425b890b986e58aca393,"Ecommerce Conversations, by Practical Ecommerce",http://is4.mzstatic.com/image/thumb/Music6/v4/...,Listen in as the Practical Ecommerce editorial...,English,Technology,http://www.practicalecommerce.com,Practical Ecommerce,874457373
1,cbbefd691915468c90f87ab2f00473f9,Eat Sleep Code Podcast,http://is4.mzstatic.com/image/thumb/Music71/v4...,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,http://developer.telerik.com/,Telerik,1015556393
2,73626ad1edb74dbb8112cd159bda86cf,SoundtrackAlley,http://is5.mzstatic.com/image/thumb/Music71/v4...,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,https://soundtrackalley.podbean.com,Randy Andrews,1158188937
3,0f50631ebad24cedb2fee80950f37a1a,The Tech M&A Podcast,http://is1.mzstatic.com/image/thumb/Music71/v4...,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,http://www.corumgroup.com,Timothy Goddard,538160025
4,69580e7b419045839ca07af06cf0d653,"The Tech Informist - For fans of Apple, Google...",http://is4.mzstatic.com/image/thumb/Music62/v4...,The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,http://techinformist.com,The Tech Informist,916080498


In [4]:
data.shape

(121175, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121175 entries, 0 to 121174
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uuid         121175 non-null  object
 1   title        121173 non-null  object
 2   image        121175 non-null  object
 3   description  119832 non-null  object
 4   language     121175 non-null  object
 5   categories   121175 non-null  object
 6   website      120005 non-null  object
 7   author       118678 non-null  object
 8   itunes_id    121175 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 8.3+ MB


## Clean Data

In [6]:
# Keep only rows that have a value in every column, re-binding `data` to the
# filtered frame instead of mutating it in place.
# NOTE(review): this also discards rows that are only missing `website` or
# `author`, while the cells below read `language`, `description`, `title` and
# `uuid` -- consider whether a `subset=` would retain more podcasts.
data = data.dropna(axis='index', how='any')
data.shape

(116374, 9)

## Filter for English Podcasts

In [7]:
def pred_lang(df, column):
    """Predict a language label for every value in ``df[column]``.

    A langid ``LanguageIdentifier`` is built from the imported ``model`` with
    normalised probabilities, each value is classified, and only the predicted
    label is kept (the accompanying score is discarded).

    Args:
        df: Table-like object supporting ``df[column].values``.
        column: Name of the column holding the text to classify.

    Returns:
        list: One predicted label per value, in the order of
        ``df[column].values``.
    """
    classifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    return [label for label, _score in map(classifier.classify, df[column].values)]

In [8]:
# Keep only the rows whose metadata declares the podcast as English.
# The explicit copy makes `english_data` an independent frame; without it the
# later column assignments write into a slice of `data` and pandas emits a
# SettingWithCopyWarning.
english_data = data[data['language']=='English'].copy()

In [9]:
english_data.shape

(95118, 9)

In [10]:
# Classify the language of each description text with langid, independently of
# the `language` metadata column used for the first filter.
lang_lst = pred_lang(english_data, 'description')

In [11]:
# Attach the predicted language as a new column. `assign` returns a new frame,
# so nothing is written into a slice of `data` (direct column assignment here
# triggers pandas' SettingWithCopyWarning).
english_data = english_data.assign(pred_lang=lang_lst)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
# Keep only rows whose description langid also labelled as English ('en').
# The explicit copy makes the result an independent frame, so the description
# rewrite in the text-preprocessing section does not write into a slice.
english_data = english_data[english_data['pred_lang']=='en'].copy()

In [13]:
english_data.shape

(90049, 10)

In [14]:
english_data['uuid'].nunique()

90049

In [15]:
english_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90049 entries, 0 to 121174
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uuid         90049 non-null  object
 1   title        90049 non-null  object
 2   image        90049 non-null  object
 3   description  90049 non-null  object
 4   language     90049 non-null  object
 5   categories   90049 non-null  object
 6   website      90049 non-null  object
 7   author       90049 non-null  object
 8   itunes_id    90049 non-null  int64 
 9   pred_lang    90049 non-null  object
dtypes: int64(1), object(9)
memory usage: 7.6+ MB


## Text Preprocessing

In [16]:
# Strip URLs from the descriptions before tokenizing.
# - raw string: '\S' in a normal string literal is an invalid escape sequence;
# - '\.' : the dot after "www" is meant literally, not as "any character";
# - regex=True is explicit because the pandas default changed to False in 2.0,
#   which would make the whole pattern be searched for as literal text;
# - `assign` builds a new frame rather than writing into a filtered slice.
english_data = english_data.assign(
    description=english_data['description'].str.replace(
        r'https?\S+|www\.\S+', '', case=False, regex=True
    )
)

In [17]:
english_data.shape

(90049, 10)

In [18]:
english_data.head()

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id,pred_lang
0,8d62d3880db2425b890b986e58aca393,"Ecommerce Conversations, by Practical Ecommerce",http://is4.mzstatic.com/image/thumb/Music6/v4/...,Listen in as the Practical Ecommerce editorial...,English,Technology,http://www.practicalecommerce.com,Practical Ecommerce,874457373,en
1,cbbefd691915468c90f87ab2f00473f9,Eat Sleep Code Podcast,http://is4.mzstatic.com/image/thumb/Music71/v4...,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,http://developer.telerik.com/,Telerik,1015556393,en
2,73626ad1edb74dbb8112cd159bda86cf,SoundtrackAlley,http://is5.mzstatic.com/image/thumb/Music71/v4...,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,https://soundtrackalley.podbean.com,Randy Andrews,1158188937,en
3,0f50631ebad24cedb2fee80950f37a1a,The Tech M&A Podcast,http://is1.mzstatic.com/image/thumb/Music71/v4...,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,http://www.corumgroup.com,Timothy Goddard,538160025,en
4,69580e7b419045839ca07af06cf0d653,"The Tech Informist - For fans of Apple, Google...",http://is4.mzstatic.com/image/thumb/Music62/v4...,The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,http://techinformist.com,The Tech Informist,916080498,en


In [19]:
# Pull the description column out as a plain Python list and preview the
# first entry.
descriptions = english_data['description'].tolist()
descriptions[0]

'Listen in as the Practical Ecommerce editorial staff interviews interesting personalities in the ecommerce space.'

In [20]:
# Create remove_accents function.
def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII.

    The text is NFKD-normalised so that accented letters split into a base
    letter plus combining marks, then encoded to ASCII with errors ignored.
    Combining marks -- and any other character with no ASCII form -- are
    therefore dropped rather than replaced.
    """
    return (
        unicodedata.normalize('NFKD', input_str)
        .encode('ASCII', 'ignore')
        .decode()
    )

In [21]:
# Create clean_text function.
def clean_text(docs):
    """Normalise raw documents into space-joined, lemmatized token strings.

    Each document is lower-cased, passed through ``remove_accents``, tokenized
    with NLTK's ``word_tokenize``, filtered of English stopwords and of tokens
    that are a single ``string.punctuation`` character, lemmatized with
    ``WordNetLemmatizer`` and re-joined with single spaces.

    Args:
        docs: Iterable of document strings.

    Returns:
        list: One cleaned string per input document, in input order.
    """
    # Stopwords and single punctuation characters are both simply discarded,
    # so one combined lookup set is enough.
    discard = set(stopwords.words('english')) | set(string.punctuation)
    lemmatize = WordNetLemmatizer().lemmatize

    cleaned_docs = []
    for doc in docs:
        words = word_tokenize(remove_accents(doc.lower()))
        kept = (lemmatize(word) for word in words if word not in discard)
        cleaned_docs.append(' '.join(kept))
    return cleaned_docs

In [22]:
# Run the full cleaning pipeline (lowercase, de-accent, tokenize, drop
# stopwords/punctuation, lemmatize) over every description.
clean_descriptions = clean_text(descriptions)

In [23]:
clean_descriptions[0]

'listen practical ecommerce editorial staff interview interesting personality ecommerce space'

## Vectorize

In [24]:
# Fit a TF-IDF model on the cleaned descriptions and vectorize them.
# Per scikit-learn's documented semantics, an integer min_df=3 ignores terms
# found in fewer than 3 documents, and a float max_df=0.5 ignores terms found
# in more than half of the documents.
tfidf = TfidfVectorizer(min_df=3, max_df=0.5)
tfidf_matrix = tfidf.fit_transform(clean_descriptions)

In [25]:
tfidf_matrix.shape

(90049, 25867)

In [26]:
# Densify the TF-IDF matrix so it can be wrapped in a title-indexed DataFrame.
# The guard keeps the cell re-runnable: once `tfidf_matrix` has been rebound to
# a dense array it no longer has `.toarray()`.
# NOTE(review): at the (90049, 25867) shape reported above, a dense array takes
# roughly 18.6 GB assuming the default float64 dtype -- confirm the target
# machine can hold it, or keep the matrix sparse.
if hasattr(tfidf_matrix, 'toarray'):
    tfidf_matrix = tfidf_matrix.toarray()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate `get_feature_names_out`.
try:
    feature_names = list(tfidf.get_feature_names_out())
except AttributeError:
    feature_names = tfidf.get_feature_names()

feature_df = pd.DataFrame(tfidf_matrix, index=english_data['title'], columns=feature_names)
feature_df.shape

(90049, 25867)

In [27]:
# Persist the TF-IDF feature table to disk in pandas' pickle format.
feature_df.to_pickle('../data/features.pkl')

In [None]:
# Persist the fitted vectorizer. The context manager guarantees the file handle
# is flushed and closed, which a bare `open(...)` passed to `pickle.dump` does
# not.
with open('../data/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

## Make Recommendations

In [51]:
# Vectorize an ad-hoc keyword query with the already-fitted TF-IDF model so it
# shares its columns with `feature_df`.
test = ['crime, murder, mystery, killer']
cleaned_test = clean_text(test)
# A single query row is cheap to densify; doing it in one expression keeps the
# cell re-runnable.
test_matrix = tfidf.transform(cleaned_test).toarray()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate `get_feature_names_out`.
try:
    feature_names = list(tfidf.get_feature_names_out())
except AttributeError:
    feature_names = tfidf.get_feature_names()

test_df = pd.DataFrame(test_matrix, index=['test'], columns=feature_names)
test_df.shape

(1, 25867)

In [52]:
# Cosine similarity between every podcast row and the single query row. The
# result has one column (one per query), so take that column as a 1-D array.
test_similarities = cosine_similarity(feature_df, test_df)[:, 0]
test_similarities

array([0., 0., 0., ..., 0., 0., 0.])

In [53]:
test_similarities.shape

(90049,)

In [54]:
# Number of top-scoring podcasts to keep as recommendations.
num_recs = 3

In [55]:
# Positions of the `num_recs` highest similarities, in ascending score order
# (the best match is last). The start offset is computed explicitly because a
# plain `[-num_recs:]` slice returns *every* index when num_recs is 0; clamping
# at 0 also covers num_recs larger than the number of podcasts.
idxs = test_similarities.argsort()[max(len(test_similarities) - num_recs, 0):]
idxs

array([40434, 82958, 20892])

In [56]:
test_similarities.argmax()

20892

In [57]:
# Look up the titles (the index of `feature_df`) at the selected positions and
# flip them so the best match comes first.
titles = list(feature_df.index[idxs][::-1])
titles

['The Hidden Staircase', 'MurderCast', 'True Crime and Mysteries']

In [58]:
# Similarity scores for the same positions, flipped to best-first so they line
# up with `titles`.
cos_scores = list(test_similarities[idxs][::-1])
cos_scores

[0.6851918515923576, 0.5976549010716439, 0.4883104660343828]

In [None]:
# Round each score to two decimal places for display.
cos_scores_round = list(map(lambda score: round(score, 2), cos_scores))
cos_scores_round