In [18]:
import pandas as pd
import json
import numpy as np
import requests
import sklearn

# Parse the stringified features into their corresponding python objects
from ast import literal_eval


In [160]:
movies = pd.read_csv('movies.csv')
movies.head(2)

Unnamed: 0,id,title,genres,overview,tagline,keywords,poster,lang,date,cast,director,popularity,vote_average,vote_count
0,19995,Avatar,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,"['culture clash', 'future', 'space war', 'spac...",/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,en,2009-12-10,"[{'name': 'Sam Worthington', 'character': 'Jak...",James Cameron,150,7.2,11800
1,285,Pirates of the Caribbean: At World's End,"['Adventure', 'Fantasy', 'Action']","Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.","['ocean', 'drug abuse', 'exotic island', 'east...",/jGWpG4YhpQwVmjyHEGkxEkeRf0S.jpg,en,2007-05-19,"[{'name': 'Johnny Depp', 'character': 'Captain...",Gore Verbinski,139,6.9,4500


In [161]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

def extract_cast_names(entries_str):
    entries = literal_eval(entries_str)
    return [entry['name'] for entry in entries[:5]]

In [162]:
features = ['cast', 'director', 'keywords', 'genres']
df2 = movies.copy()[ features]

df2['cast'] = df2['cast'].apply(extract_cast_names) # extract cast names


In [163]:
for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [152]:
df2.iloc[1]['cast']

['johnnydepp',
 'orlandobloom',
 'keiraknightley',
 'stellanskarsgård',
 'chowyun-fat']

In [136]:
df2['director']

id
19995         jamescameron
285          goreverbinski
206647           sammendes
49026     christophernolan
49529        andrewstanton
                ...       
9367       robertrodriguez
72766          edwardburns
231617          scottsmith
126186          danielhsia
25975      brianherzlinger
Name: director, Length: 4803, dtype: object

Creating a "soup" of the selected metadata
Also - give the director 3 times weight in the soup

In [164]:
df2['director'] = df2['director'].apply(lambda x: (x + ' ') * 3 if isinstance(x, str) else x) # give the director more weight
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df2['soup'] = df2.apply(create_soup, axis=1)

Not using TF-IDF beacause it might eliminate directors/cast that played in ,ultiple movies

In [165]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [166]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [170]:

title_to_id = pd.Series(movies.index, index=movies['title'])

In [173]:
id_to_index = pd.Series(movies.index, index=movies['id'])

In [182]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations_by_title(title, cosine_similarity=cosine_similarity):
    # Get the index of the movie that matches the title
    idx = title_to_id[title]

    # Get the pairwsie similarity scores of all movies with that movie
    similarity_scores = list(enumerate(cosine_similarity[idx]))

    # Sort the movies based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    similarity_scores = similarity_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in similarity_scores]

    # Return the top 10 most similar movies
    return movies['title'].iloc[movie_indices]

In [183]:
rec = get_recommendations_by_title('A Serious Man', cosine_sim2)
rec

1283           The Hudsucker Proxy
1330            Burn After Reading
1365                     True Grit
1790    O Brother, Where Art Thou?
1829        No Country for Old Men
2041                 Hail, Caesar!
2607              The Big Lebowski
2951           Inside Llewyn Davis
756            Intolerable Cruelty
1397               The Ladykillers
Name: title, dtype: object

In [174]:
def get_recommendations_by_id(id, cosine_similarity=cosine_similarity):
    # Get the index of the movie that matches the title
    idx = id_to_index[id]

    # Get the pairwsie similarity scores of all movies with that movie
    similarity_scores = list(enumerate(cosine_similarity[idx]))

    # Sort the movies based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    similarity_scores = similarity_scores[1:11]

    # Get the movie indices
    movies_idx = [i[0] for i in similarity_scores]

    # Return the top 10 most similar movies
    return movies['id'].iloc[movies_idx]

In [176]:
movies.iloc[title_to_id['A Serious Man']]

index                                                        3362
id                                                          12573
title                                               A Serious Man
genres                                        ['Comedy', 'Drama']
overview        A Serious Man is the story of an ordinary man'...
tagline                                             ...seriously!
keywords        ['professor', 'dark comedy', 'telephone call',...
poster                           /5gGxDS8WmrebPlMHexVS8EVehiP.jpg
lang                                                           en
date                                                   2009-10-02
cast            [{'name': 'Michael Stuhlbarg', 'character': 'L...
director                                                Joel Coen
popularity                                                     23
vote_average                                                  6.6
vote_count                                                    483
Name: 3362

In [177]:
rec = get_recommendations_by_id(12573, cosine_sim2)
rec

1283     11934
1330      4944
1365     44264
1790       134
1829      6977
2041    270487
2607       115
2951     86829
756      11775
1397      5516
Name: id, dtype: int64

In [178]:
rec.to_list()

[11934, 4944, 44264, 134, 6977, 270487, 115, 86829, 11775, 5516]

export cosine similarity

In [188]:
id_to_index

id
19995        0
285          1
206647       2
49026        3
49529        4
          ... 
9367      4798
72766     4799
231617    4800
126186    4801
25975     4802
Length: 4803, dtype: int64

In [190]:
import pickle

with open('similarity_matrix.pkl', 'wb') as f1:
    pickle.dump(cosine_sim2, f1)

with open('movie_id_to_matrix_similarity_idx.pkl', 'wb') as f2:
    pickle.dump(id_to_index, f2)