# Text Preprocessing

In [44]:
import pandas as pd
import pickle
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import unicodedata
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import NMF

  return f(*args, **kwds)


## Load Data

In [2]:
data = pd.read_csv('../data/translated_podcast_samples.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19126 entries, 0 to 19125
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   19126 non-null  int64 
 1   uuid         19126 non-null  object
 2   title        19126 non-null  object
 3   description  19126 non-null  object
 4   pred_lang    19126 non-null  object
 5   categories   19126 non-null  object
 6   author       19126 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


## Clean Data

In [4]:
# Drop null values.
data.dropna(axis=0, inplace=True)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19126 entries, 0 to 19125
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   19126 non-null  int64 
 1   uuid         19126 non-null  object
 2   title        19126 non-null  object
 3   description  19126 non-null  object
 4   pred_lang    19126 non-null  object
 5   categories   19126 non-null  object
 6   author       19126 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.2+ MB


In [6]:
# Check for uuid uniqueness.
data['uuid'].nunique()

19126

In [7]:
# Drop extra index column.
data.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0,uuid,title,description,pred_lang,categories,author
0,c539085ab36a499eb25ab06505ce10c3,RoomOfRequirement,Room of Requirement is a podcast dedicated to ...,en,News & Politics,Room of Requirement
1,8ad3e312defb4faab8f87946f6f67a13,The Pollsters,Politics. Policy. Polling. Pop Culture.\n\nExp...,en,Social Sciences | Science & Medicine | Managem...,audioBoom
2,5ac07e5979ac4c9d99db71962b468378,BABME™,BABME™ is a company that Micah Sanders (@micah...,en,Comedy,BABME™ / Anchor
3,e22d49ae12dc44349bc9c1ae45d5f443,Baseball Tonight with Buster Olney,ESPN MLB Insider Buster Olney leads the baseba...,en,Sports & Recreation,ESPN Radio
4,0443cb89bcf242d89d3075a381d794ee,Legacy Baptist Church Sermons,AM and PM Sermons from the weekly services at ...,en,Christianity | Religion & Spirituality,Jamen


In [9]:
# Remove any hyperlinks hidden in the descriptions.
data['description'] = data['description'].str.replace('http\S+|www.\S+', '', case=False)

In [10]:
data.shape

(19126, 6)

## Clean Text

In [11]:
# List where each element is a string of the description.
descriptions = data['description'].to_list()
descriptions[0]

'Room of Requirement is a podcast dedicated to reason and resilience in the Time of Trump. Hosted by Miracle Jones and Kamalesh Rao and based in Jackson Heights, Queens, the podcast aims to be a voice of advocacy for democracy and human rights.'

In [12]:
# List where each element is a string of the category.
categories = data['categories'].to_list()
categories[0]

'News & Politics'

In [13]:
# Create remove_accents function.
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

In [14]:
# Create clean_text function.
def clean_text(docs):
    # Make all words in documents lowercase.
    low_docs = [doc.lower() for doc in docs]
    # Remove all accents from documents.
    acc_docs = [remove_accents(doc) for doc in low_docs]
    # Tokenize each document.
    tokens = [word_tokenize(doc) for doc in acc_docs]
    # Remove stopwords and punctuation.
    stopwords_ = set(stopwords.words('english'))
    punctuation_ = set(string.punctuation)
    tokens = [[word for word in token if word not in stopwords_ and word not in punctuation_] for token in tokens]
    # Apply Snowball Stemmer.
    snowball = SnowballStemmer('english')
    snowball_tokens = [list(map(snowball.stem, token)) for token in tokens]
    # Join tokens in each document.
    token_docs = [' '.join(tokens) for tokens in snowball_tokens]
    return token_docs

In [15]:
# Create clean_cat function.
def clean_cat(docs):
    # Make all words in documents lowercase.
    low_docs = [doc.lower() for doc in docs]
    # Remove all accents from documents.
    acc_docs = [remove_accents(doc) for doc in low_docs]
    # Tokenize each document.
    tokens = [word_tokenize(doc) for doc in acc_docs]
    # Remove stopwords and punctuation.
    stopwords_ = set(stopwords.words('english'))
    punctuation_ = set(string.punctuation)
    docs = [[word for word in token if word not in stopwords_ and word not in punctuation_] for token in tokens]
    # Apply Snowball Stemmer.
    ## snowball = SnowballStemmer('english')
    ## snowball_tokens = [list(map(snowball.stem, token)) for token in tokens]
    # Join tokens in each document.
    token_docs = [' '.join(tokens) for tokens in docs]
    return token_docs

In [16]:
descriptions = clean_text(descriptions)

In [17]:
descriptions[0]

'room requir podcast dedic reason resili time trump host miracl jone kamalesh rao base jackson height queen podcast aim voic advocaci democraci human right'

In [18]:
categories = clean_cat(categories)

In [19]:
categories[0]

'news politics'

__I'll use the snowball stemmer because I feel that it reduced the words efficiently without losing meaning.__

__ie. 'month' and 'develop' instead of 'monthly' and 'developing'__

## Create Feature Matrix

In [36]:
descriptions_tfidf = TfidfVectorizer(min_df=3, max_df=0.5)
categories_tfidf = TfidfVectorizer(min_df=3, max_df=0.5)
descriptions_matrix = descriptions_tfidf.fit_transform(descriptions)
categories_matrix = categories_tfidf.fit_transform(categories)

In [37]:
descriptions_matrix.shape

(19126, 9077)

In [38]:
categories_matrix.shape

(19126, 83)

In [39]:
descriptions_matrix = descriptions_matrix.toarray()
feature_names = descriptions_tfidf.get_feature_names()
descriptions_df = pd.DataFrame(descriptions_matrix, index=data['title'], columns=feature_names)
descriptions_df.shape

(19126, 9077)

In [40]:
categories_matrix = categories_matrix.toarray()
feature_names = categories_tfidf.get_feature_names()
categories_df = pd.DataFrame(categories_matrix, index=data['title'], columns=feature_names)
categories_df.shape

(19126, 83)

In [41]:
lang_df = pd.get_dummies(data['pred_lang'])
lang_df['title'] = data['title'].tolist()
lang_df.set_index('title', inplace=True)
lang_df.shape

(19126, 37)

In [43]:
feature_matrix = pd.concat([descriptions_df, categories_df, lang_df], axis=1)

## Create Feature Matrix

In [20]:
# Vectorize descriptions and categories.
cv_descriptions = CountVectorizer()
cv_categories = CountVectorizer()
X_descriptions = cv_descriptions.fit_transform(descriptions)
X_categories = cv_categories.fit_transform(categories)

In [21]:
X_descriptions.shape

(19126, 34180)

In [22]:
X_categories.shape

(19126, 83)

In [23]:
# Change X_descriptions into a dataframe.
X_descriptions = X_descriptions.toarray()
feature_names = cv_descriptions.get_feature_names()
descriptions_df = pd.DataFrame(X_descriptions, index=data['title'], columns=feature_names)
descriptions_df.shape

(19126, 34180)

In [24]:
descriptions_df.head()

Unnamed: 0_level_0,00,000,001,007,0084,0096,00a,00am,00p,00pm,...,zurich,zurita,zweig,zwell,zwicker,zydeco,zyla,zylka,zyuzyaev,zyx
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RoomOfRequirement,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Pollsters,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BABME™,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Baseball Tonight with Buster Olney,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Legacy Baptist Church Sermons,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Change X_categories into a dataframe.
X_categories = X_categories.toarray()
feature_names = cv_categories.get_feature_names()
categories_df = pd.DataFrame(X_categories, index=data['title'], columns=feature_names)
categories_df.shape

(19126, 83)

In [26]:
categories_df.head()

Unnamed: 0_level_0,12,alternative,amateur,arts,automotive,aviation,beauty,buddhism,business,careers,...,spirituality,sports,tech,technology,to,training,travel,tv,video,visual
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RoomOfRequirement,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Pollsters,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
BABME™,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Baseball Tonight with Buster Olney,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Legacy Baptist Church Sermons,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [27]:
# Create dataframe with language dummy variables.
lang_df = pd.get_dummies(data['pred_lang'])
lang_df['title'] = data['title'].tolist()
lang_df.set_index('title', inplace=True)
lang_df.shape

(19126, 37)

In [28]:
lang_df.head()

Unnamed: 0_level_0,af,ca,cs,da,de,en,eo,es,et,eu,...,pt,rw,si,sk,sl,sv,tl,vi,xh,zh
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RoomOfRequirement,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Pollsters,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BABME™,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Baseball Tonight with Buster Olney,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Legacy Baptist Church Sermons,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Concatenate the descriptions_df, categories_df, and lang_df.
feature_df = pd.concat([descriptions_df, categories_df, lang_df], axis=1)

In [30]:
feature_df.shape

(19126, 34300)

In [31]:
feature_df.head()

Unnamed: 0_level_0,00,000,001,007,0084,0096,00a,00am,00p,00pm,...,pt,rw,si,sk,sl,sv,tl,vi,xh,zh
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RoomOfRequirement,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Pollsters,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BABME™,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Baseball Tonight with Buster Olney,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Legacy Baptist Church Sermons,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
test_description = ['crime, murder, kill, thriller']
test_category = ['true crime']
language = ['en']

In [62]:
cleaned_test = clean_text(test)
test_matrix = descriptions_tfidf.transform(cleaned_test)
test_matrix = test_matrix.toarray()
feature_names = descriptions_tfidf.get_feature_names()
test_df = pd.DataFrame(test_matrix, index=['test'], columns=feature_names)
test_df.shape

(1, 9077)

In [70]:
feature_matrix = pd.concat([feature_matrix, test_df], axis=0)

ValueError: Plan shapes are not aligned

## Create Similarity Matrix

In [48]:
feature_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19126 entries, RoomOfRequirement to Mundo de Restauracion
Columns: 9197 entries, 00 to zh
dtypes: float64(9160), uint8(37)
memory usage: 1.3+ GB


In [49]:
titles_list = list(feature_matrix.index)

In [50]:
# Save list of titles for reference.
with open('../app/titles_list.pkl', 'wb') as f:
    pickle.dump(titles_list, f)

In [51]:
cat_dict = {title:category for title, category in zip(titles_list, categories)}

In [52]:
# Save title and category dictionary as pickle file.
with open('../app/category_dict.pkl', 'wb') as f:
    pickle.dump(cat_dict, f)

In [54]:
# Create similarity matrix.
sim_mat = pd.DataFrame(cosine_similarity(feature_matrix, feature_matrix))

In [55]:
sim_mat.shape

(19126, 19126)

In [56]:
sim_mat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19116,19117,19118,19119,19120,19121,19122,19123,19124,19125
0,1.0,0.450129,0.341157,0.333333,0.333333,0.339872,0.618935,0.338864,0.333333,0.35111,...,0.342342,0.012177,0.0,0.00524,0.0,0.0,0.012372,0.017148,0.044168,0.01308
1,0.450129,1.0,0.335955,0.341931,0.342139,0.348794,0.458474,0.340587,0.348334,0.333333,...,0.117853,0.00408,0.0,0.0,0.006807,0.0,0.004145,0.005745,0.014797,0.004382
2,0.341157,0.335955,1.0,0.333333,0.333333,0.342075,0.520384,0.336511,0.333333,0.344,...,0.012043,0.01628,0.0,0.0,0.333333,0.0,0.016539,0.022925,0.059048,0.017487
3,0.333333,0.341931,0.333333,1.0,0.333333,0.333333,0.333333,0.338259,0.333333,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.333333,0.342139,0.333333,0.333333,1.0,0.342144,0.339942,0.402067,0.741655,0.687598,...,0.0,0.175962,0.333333,0.0,0.0,0.145068,0.0,0.262979,0.333333,0.333333


In [57]:
# Save similarity matrix for reference.
sim_mat.to_pickle('../app/similarity_matrix.pkl')