In [78]:
import pandas as pd
import json
import numpy as np
import requests
import sklearn

# Parse the stringified features into their corresponding python objects
from ast import literal_eval


In [79]:
# Load the movie metadata table. The path is relative, so it is resolved
# against the kernel's current working directory.
movies = pd.read_csv('datasets/mine/top_voted_movies.csv')
# Preview the first two rows to check which columns were read.
movies.head(2)

Unnamed: 0,adult,backdrop_path,id,original_language,overview,popularity,poster_path,release_date,title,vote_average,vote_count,genres,year,imdb_id,revenue,production_companies,cast,director,tagline,keywords
0,False,/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg,278,en,Framed in the 1940s for the double murder of h...,113.704,/7Gvzbduz2iApKA3g7nbyGcfoBCL.jpg,1994-09-23,The Shawshank Redemption,8.7,24399,"['Drama', 'Crime']",1994,tt0111161,28341469,['Castle Rock Entertainment'],"[{'name': 'Tim Robbins', 'character': 'Andy Du...",Frank Darabont,Fear can hold you prisoner. Hope can set you f...,"['prison', 'corruption', 'police brutality', '..."
1,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,424,en,The true story of how businessman Oskar Schind...,58.232,/bnF8WWJvoL3pWdT8h7FkZlKFcke.jpg,1993-12-15,Schindler's List,8.6,14432,"['Drama', 'History', 'War']",1993,tt0108052,321365567,['Amblin Entertainment'],"[{'name': 'Liam Neeson', 'character': 'Oskar S...",Steven Spielberg,"Whoever saves one life, saves the world entire.","['based on novel or book', 'factory', 'concent..."


In [80]:
def clean_data(x):
    """Normalise a metadata value by removing spaces and lower-casing it.

    A list is normalised element by element, a string is normalised
    directly, and any other value (for example a missing value) becomes
    an empty string.
    """
    def _squash(text):
        # "Tim Robbins" -> "timrobbins": one token per entity
        return str.lower(text.replace(" ", ""))

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    return ''

def extract_cast_names(entries_str, max_names=5):
    """Return the names of the first `max_names` cast members.

    Parameters
    ----------
    entries_str : str or list
        Either a stringified list of cast dicts, e.g.
        "[{'name': 'Tim Robbins', 'character': '...'}, ...]", or an
        already-parsed list of such dicts.
    max_names : int, default 5
        Maximum number of leading cast entries to keep.

    Returns
    -------
    list of str
        The ``'name'`` of each kept entry, in the original order. Empty
        when the value is missing (NaN/None) or a blank string.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string is not a valid Python literal.
    KeyError
        If a kept entry has no ``'name'`` key.
    """
    if isinstance(entries_str, str):
        # A blank cell has no cast; literal_eval('') would raise SyntaxError
        if not entries_str.strip():
            return []
        entries = literal_eval(entries_str)
    elif isinstance(entries_str, (list, tuple)):
        # Already parsed (e.g. the cell was re-run on converted data)
        entries = entries_str
    else:
        # Missing values arrive as float NaN / None, which literal_eval rejects
        return []
    return [entry['name'] for entry in entries[:max_names]]

In [81]:
# Metadata columns that feed the similarity "soup"
features = [
    'cast',
    'director',
    'keywords',
    'genres',
    'production_companies',
]

# Work on an independent frame holding only those columns
df2 = movies[features].copy()

# Swap each stringified cast record for the list of its leading cast names
df2['cast'] = df2['cast'].apply(extract_cast_names)


In [82]:
# Normalise every selected column with clean_data (spaces removed, lower-cased).
# 'cast' holds lists here and is cleaned element-wise; the other columns hold
# strings and are cleaned as a whole, so stringified lists stay strings.
for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [83]:
# Spot-check the cleaned cast list of the second row
df2.iloc[1]['cast']

['liamneeson',
 'benkingsley',
 'ralphfiennes',
 'carolinegoodall',
 'jonathansagall']

In [84]:
# Spot-check the cleaned director column (one squashed, lower-case token per movie)
df2['director']

0            frankdarabont
1          stevenspielberg
2             adityachopra
3            hayaomiyazaki
4            makotoshinkai
               ...        
8667        michaellehmann
8668    johnpatrickshanley
8669         bobbyfarrelly
8670         bernardcampan
8671              jamescox
Name: director, Length: 8672, dtype: object

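Creating a "soup" of the selected metadata (director, keywords, genres, production companies and cast).
The director is given three times the weight of any other token by repeating the director token three times in the soup.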

In [85]:
# Spot-check genres: after cleaning, each cell is still a *stringified* list,
# which is why the soup step parses it with literal_eval.
df2['genres']

0                      ['drama','crime']
1              ['drama','history','war']
2           ['comedy','drama','romance']
3       ['animation','family','fantasy']
4        ['romance','animation','drama']
                      ...               
8667     ['action','adventure','comedy']
8668                ['comedy','romance']
8669                ['comedy','romance']
8670    ['comedy','adventure','fantasy']
8671                ['drama','thriller']
Name: genres, Length: 8672, dtype: object

In [86]:
from ast import literal_eval

# How many times the director token is repeated, i.e. its weight in the soup
DIRECTOR_WEIGHT = 3


def _as_token_list(cell):
    """Turn a cleaned feature cell into a list of tokens.

    After cleaning, keywords/genres/production_companies are stringified
    lists, and missing values have become '' (which literal_eval cannot
    parse). Blank cells therefore yield an empty list; a cell that is
    already a list is returned unchanged.
    """
    if isinstance(cell, list):
        return cell
    return literal_eval(cell) if cell else []


# Concatenate the per-movie token lists: weighted director first, then
# keywords, genres, production companies and cast. A missing director ('')
# contributes no tokens instead of three empty strings.
df2['soup'] = df2['director'].apply(lambda x: [x] * DIRECTOR_WEIGHT if x else []) + \
        df2['keywords'].apply(_as_token_list) + \
        df2['genres'].apply(_as_token_list) + \
        df2['production_companies'].apply(_as_token_list) + \
        df2['cast']

# Flatten each token list into a single space-separated string
df2['soup'] = df2['soup'].apply(lambda x: ' '.join(x))

In [87]:
# Spot-check the soup string of the row labelled 0
df2['soup'][0]

'frankdarabont frankdarabont frankdarabont prison corruption policebrutality basedonnovelorbook prisoncell delinquent paroleboard prisonescape wrongfulimprisonment framedformurder 1940s voiceover drama crime castlerockentertainment timrobbins morganfreeman bobgunton williamsadler clancybrown'

Not using TF-IDF because it would down-weight (and could effectively eliminate) directors and cast members who appear in multiple movies.

In [88]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the soup strings and build the token-count matrix
# that the cosine similarity is computed from.
# NOTE(review): stop_words='english' drops common English words and the default
# token pattern splits on punctuation, so squashed names containing hyphens or
# periods (or keywords that happen to be stop words) may not survive as single
# tokens; confirm this is intended.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [89]:
# Compute the pairwise cosine similarity between every pair of rows of count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): presumably a dense (n_movies x n_movies) array; with the 8672
# movies shown above that is roughly 600 MB as float64. Confirm memory is acceptable.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [90]:

# Lookup from movie title to its row index in `movies`. Despite the name, the
# values are row indices, not the `id` column. Titles are not de-duplicated, so
# a title shared by several movies maps to several rows.
title_to_id = pd.Series(movies.index, index=movies['title'])

In [91]:
# Lookup from the `id` column to the movie's row index in `movies`, which is
# also its row/column position in the similarity matrix.
id_to_index = pd.Series(movies.index, index=movies['id'])

In [92]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations_by_title(title, cosine_similarity=None, top_n=14):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact movie title as it appears in ``movies['title']``.
    cosine_similarity : 2-D array-like, optional
        Pairwise similarity matrix whose row/column order matches the rows
        of ``movies``. Defaults to the notebook's ``cosine_sim2``. The
        parameter name is kept for backward compatibility even though it
        shadows the scikit-learn function inside this function.
    top_n : int, default 14
        Number of recommendations to return (14 is what the original
        ``[1:15]`` slice produced).

    Returns
    -------
    pandas.Series
        Titles of the `top_n` most similar movies, most similar first,
        keeping their index labels from ``movies``.

    Raises
    ------
    KeyError
        If `title` is not present in ``title_to_id``.
    """
    # Resolve the default at call time; the previous default was the
    # scikit-learn function itself, which cannot be indexed.
    if cosine_similarity is None:
        cosine_similarity = cosine_sim2

    # Row index of the movie that matches the title
    idx = title_to_id[title]
    # Several movies can share a title (e.g. remakes); the lookup then
    # returns a Series, so fall back to the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_similarity[idx])

    # Rank rows from most to least similar; the stable sort keeps ties in
    # row order, like sorted(..., reverse=True) on the enumerated scores.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly rather than assuming it ranks first
    # (a lower-index movie with an identical soup would otherwise displace it).
    ranked = ranked[ranked != idx][:top_n]

    # Titles of the top_n most similar movies
    return movies['title'].iloc[ranked]

In [93]:
# Example: title-based recommendations for one movie, using the count-based similarity matrix
rec = get_recommendations_by_title('A Serious Man', cosine_sim2)
rec

4438              Burn After Reading
7835                   Hail, Caesar!
1133                     Barton Fink
1574      O Brother, Where Art Thou?
2198    The Ballad of Buster Scruggs
6935                 The Ladykillers
888         The Man Who Wasn't There
367                            Fargo
1641                       True Grit
1963             Inside Llewyn Davis
2612                 Raising Arizona
2454             The Hudsucker Proxy
7805             Intolerable Cruelty
2805          The Tragedy of Macbeth
Name: title, dtype: object

In [94]:
def get_recommendations_by_id(id, cosine_similarity=None, top_n=10):
    """Return the ids of the movies most similar to the movie with this `id`.

    Parameters
    ----------
    id : int
        Movie id as it appears in ``movies['id']``. The name shadows the
        built-in ``id`` but is kept so existing keyword calls still work.
    cosine_similarity : 2-D array-like, optional
        Pairwise similarity matrix whose row/column order matches the rows
        of ``movies``. Defaults to the notebook's ``cosine_sim2``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Ids of the `top_n` most similar movies, most similar first,
        keeping their index labels from ``movies``.

    Raises
    ------
    KeyError
        If `id` is not present in ``id_to_index``.
    """
    # Resolve the default at call time; the previous default was the
    # scikit-learn function itself, which cannot be indexed.
    if cosine_similarity is None:
        cosine_similarity = cosine_sim2

    # Row index of the movie that matches the id
    idx = id_to_index[id]
    # Guard against a duplicated id, where the lookup returns a Series
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_similarity[idx])

    # Rank rows from most to least similar; the stable sort keeps ties in row order
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly rather than assuming it ranks first
    ranked = ranked[ranked != idx][:top_n]

    # Ids of the top_n most similar movies
    return movies['id'].iloc[ranked]

In [95]:
# Show the full metadata row for 'A Serious Man' (its `id` is used for the id-based lookup)
movies.iloc[title_to_id['A Serious Man']]

adult                                                               False
backdrop_path                            /ihwfod79qlBbFpjVQNeXxmX03ql.jpg
id                                                                  12573
original_language                                                      en
overview                It is 1967, and Larry Gopnik, a physics profes...
popularity                                                         18.706
poster_path                              /vWWxILkTc0OMQwkKsePVophxkhi.jpg
release_date                                                   2009-09-30
title                                                       A Serious Man
vote_average                                                          6.8
vote_count                                                           1873
genres                                                ['Comedy', 'Drama']
year                                                                 2009
imdb_id                               

In [96]:
# Example: id-based recommendations for id 12573 ('A Serious Man' in the row shown above)
rec = get_recommendations_by_id(12573, cosine_sim2)
rec

4438      4944
7835    270487
1133       290
1574       134
2198    537996
6935      5516
888      10778
367        275
1641     44264
1963     86829
Name: id, dtype: int64

In [97]:
# Plain Python list of the recommended movie ids
rec.to_list()

[4944, 270487, 290, 134, 537996, 5516, 10778, 275, 44264, 86829]

Export the cosine similarity matrix and the movie-id-to-matrix-index mapping as pickle files.

In [98]:
# Inspect the id -> row-index mapping that is exported alongside the matrix
id_to_index

id
278          0
424          1
19404        2
129          3
372058       4
          ... 
9292      8667
2565      8668
48988     8669
18298     8670
385360    8671
Length: 8672, dtype: int64

In [99]:
import pickle

# Persist the similarity matrix together with the movie-id -> matrix-row
# lookup, one pickle file per object, in the current working directory.
exports = (
    ('similarity_matrix.pkl', cosine_sim2),
    ('movie_id_to_matrix_similarity_idx.pkl', id_to_index),
)
for filename, payload in exports:
    with open(filename, 'wb') as handle:
        pickle.dump(payload, handle)