# Content Based Recommender

# Data

https://www.kaggle.com/rounakbanik/the-movies-dataset

# Setup

In [109]:
import pandas as pd
import numpy as np
from ast import literal_eval
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
warnings.filterwarnings("ignore")

# Plot description based recommender

In [91]:
df = pd.read_csv('data/movies_metadata.csv', low_memory=False)

In [92]:
df = df[['id', 'title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count', 'overview']]
df['year'] = df['release_date'].fillna('9999-12-31').apply(lambda x: int(str(x).split('-')[0]))
df = df.drop('release_date', axis=1)
df['overview'] = df['overview'].fillna('')

In [72]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

(45466, 75827)

In [37]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
        0.        ],
       [0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
        0.00929411],
       [0.        , 0.04681953, 1.        , ..., 0.        , 0.01402548,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00595453, 0.02198641, 0.01402548, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00929411, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [73]:
indices = pd.Series(df.index, index=df['title'])

In [74]:
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

In [75]:
#Get recommendations for The Lion King
content_recommender('The Lion King')

34682    How the Lion Cub and the Turtle Sang a Song
9353                                The Lion King 1½
9115                  The Lion King 2: Simba's Pride
42829                                           Prey
25654                                 Fearless Fagan
17041                                   African Cats
27933              Massaï, les guerriers de la pluie
6094                                       Born Free
37409                                     Sour Grape
3203                                The Waiting Game
Name: title, dtype: object

# Metadata Based Recommender

In [94]:
cred_df = pd.read_csv('data/credits.csv')
key_df = pd.read_csv('data/keywords.csv')

In [95]:
cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [96]:
key_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [97]:
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

In [98]:
df['id'] = df['id'].apply(clean_ids)
df = df[df['id'].notnull()]

In [99]:
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')
df.head()

Unnamed: 0,id,title,genres,runtime,vote_average,vote_count,overview,year,cast,crew,keywords
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,A family wedding reignites the ancient feud be...,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,Just when George Banks has recovered from his ...,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [100]:
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].fillna('[]')
    df[feature] = df[feature].apply(literal_eval)
    df[feature] = df[feature].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])
    df[feature] = df[feature].apply(lambda x: x[:5])

In [101]:
df.head()

Unnamed: 0,id,title,genres,runtime,vote_average,vote_count,overview,year,cast,crew,keywords
0,862,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,"[tom hanks, tim allen, don rickles, jim varney...","[john lasseter, joss whedon, andrew stanton, j...","[jealousy, toy, boy, friendship, friends]"
1,8844,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,"[robin williams, jonathan hyde, kirsten dunst,...","[larry j. franco, jonathan hensleigh, james ho...","[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92.0,A family wedding reignites the ancient feud be...,1995,"[walter matthau, jack lemmon, ann-margret, sop...","[howard deutch, mark steven johnson, mark stev...","[fishing, best friend, duringcreditsstinger, o..."
3,31357,Waiting to Exhale,"[comedy, drama, romance]",127.0,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",1995,"[whitney houston, angela bassett, loretta devi...","[forest whitaker, ronald bass, ronald bass, ez...","[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,[comedy],106.0,5.7,173.0,Just when George Banks has recovered from his ...,1995,"[steve martin, diane keaton, martin short, kim...","[alan silvestri, elliot davis, nancy meyers, n...","[baby, midlife crisis, confidence, aging, daug..."


In [102]:
def sanitize(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [103]:
for feature in ['cast', 'crew', 'keywords', 'genres']:
    df[feature] = df[feature].apply(sanitize)

In [104]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['crew']) + ' ' + ' '.join(x['genres'])

In [105]:
df['soup'] = df.apply(create_soup, axis=1)

In [112]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])
count_matrix.shape

(46628, 139833)

In [111]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [114]:
indices2 = pd.Series(df.index, index=df['title'])

In [115]:
content_recommender('The Lion King', cosine_sim2, df, indices2)

8263                                            Shark Tale
40904                   VeggieTales: Josh and the Big Wall
40913    VeggieTales: Minnesota Cuke and the Search for...
10177                                           Madagascar
29198                                      Superstar Goofy
29607                                          Cheburashka
30244                                              My Love
31179                Pokémon: Arceus and the Jewel of Life
32498                               Puff, the Magic Dragon
34495                                      Yedyanchi Jatra
Name: title, dtype: object