# Content-Based Recommender

This notebook builds a content-based recommender: given a title, it finds the most similar titles by comparing director, cast, genre, and keywords extracted from the description.

In [1]:
import os

import numpy as np
import pandas as pd
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [95]:
# Location of the Netflix titles CSV. Set the NETFLIX_TITLES_CSV environment
# variable to point at your own copy; the fallback is the original author's path.
DATA_PATH = os.environ.get('NETFLIX_TITLES_CSV', 'C:/Users/kparekh/netflix_titles_nov_2019.csv')

# Only these columns feed the recommender.
FEATURE_COLUMNS = ['title', 'director', 'cast', 'listed_in', 'description']

# .copy() gives later cells an independent frame to add/overwrite columns on,
# instead of a selection of the freshly parsed CSV frame.
df = pd.read_csv(DATA_PATH)[FEATURE_COLUMNS].copy()
df.head(n=2)

Unnamed: 0,title,director,cast,listed_in,description
0,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...","International TV Shows, Korean TV Shows, Roman...",Brought together by meaningful meals in the pa...
1,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,"Documentaries, International Movies","From Sierra de las Minas to Esquipulas, explor..."


In [96]:
# Count missing values per column before any cleaning
df.isna().sum()

title             0
director       1901
cast            556
listed_in         0
description       0
dtype: int64

In [97]:
def _description_keywords(text):
    """Return the RAKE keywords of ``text`` as one comma-separated string."""
    rake = Rake()
    rake.extract_keywords_from_text(text)
    return ','.join(rake.get_word_degrees().keys())

# Extract keywords from each description.
# The result is assigned as a whole column: writing to the `row` yielded by
# DataFrame.iterrows() is not guaranteed to reach the DataFrame, which can
# leave the keyword column silently empty.
df = (
    df.assign(description_keywords=df['description'].map(_description_keywords))
      .drop(columns='description')
)

# Fill null values, then remove spaces so that a multi-word name or genre is
# not split apart once the commas are turned into spaces below.
for col in ['director', 'cast']:
    df[col] = df[col].fillna('')
for col in ['director', 'cast', 'listed_in']:
    df[col] = df[col].str.replace(' ', '', regex=False)

df = df.set_index('title')

# One space-separated string per title combining every feature
df['bagofwords'] = (
    df.description_keywords + ',' + df.listed_in + ',' + df.cast + ',' + df.director
).str.replace(',', ' ', regex=False)

df.head(n=2)

Unnamed: 0_level_0,director,cast,listed_in,description_keywords,bagofwords
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chocolate,,"HaJi-won,YoonKye-sang,JangSeung-jo,KangBu-ja,L...","InternationalTVShows,KoreanTVShows,RomanticTVS...","reacquainted,hospice,ward,present,meaningful,m...",reacquainted hospice ward present meaningful m...
Guatemala: Heart of the Mayan World,"LuisAra,IgnacioJaunsolo",ChristianMorales,"Documentaries,InternationalMovies","cultural,natural,wonders,esquipulas,including,...",cultural natural wonders esquipulas including ...


In [98]:
# Re-check missing values after cleaning
df.isna().sum()

director                0
cast                    0
listed_in               0
description_keywords    0
bagofwords              0
dtype: int64

In [99]:
# Title lookup: position i in `indices` holds the title of row i of `df`
indices = pd.Series(df.index)

# Tokenize every bag of words and count term occurrences per title
count = CountVectorizer()
count_matrix = count.fit_transform(df['bagofwords'])

# Pairwise cosine similarity between all titles; called with a single
# matrix, the rows are compared against each other
cosine_sim = cosine_similarity(count_matrix)

In [100]:
#recommend the top choices based on similarity
def recommendations(Title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Title to look up in the module-level ``indices`` series. If it occurs
        more than once, the first occurrence is used.
    cosine_sim : array-like
        Square similarity matrix whose row/column order matches ``indices``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar, never
        including the queried row itself.

    Raises
    ------
    IndexError
        If ``Title`` is not present in ``indices``.
    """
    matches = indices[indices == Title].index
    if len(matches) == 0:
        raise IndexError(f"title {Title!r} not found in the catalogue")
    idx = matches[0]

    # Remove the queried row by position rather than assuming it sorts first:
    # another title with an identical score could otherwise take its place and
    # the queried title would be recommended to itself.
    scores = pd.Series(cosine_sim[idx]).drop(idx)

    # A stable sort keeps tied scores in catalogue order.
    ranked = scores.sort_values(ascending = False, kind = 'mergesort')
    top_positions = list(ranked.index[:top_n])
    return indices.iloc[top_positions].tolist()

recommendations('Rocky')

['Rocky III',
 'Rocky II',
 'Rocky IV',
 'Rocky V',
 'Abdo Mota',
 'Defiance',
 'Submission',
 'The Bleeder',
 'Rangoon',
 "Logan's Run"]

In [101]:
# Example: titles most similar to 'NCIS'
recommendations('NCIS')

['MINDHUNTER',
 'Criminal Minds',
 'How to Get Away with Murder',
 'Manhunt',
 'Re:Mind',
 'Persona',
 'The Sinner',
 'Unit 42',
 'Brotherhood',
 'Secret City']

In [102]:
# Example: titles most similar to 'Swades'
recommendations('Swades')

['English Babu Desi Mem',
 'Lagaan',
 'Pimpal',
 'Janaan',
 'Pahuna',
 'Luv Shuv Tey Chicken Khurana',
 'Jodhaa Akbar',
 'Tanu Weds Manu',
 'Haapus',
 "What's Your Raashee?"]