# Simple Recommenders

In [19]:
import pandas as pd

# Read the complete movies metadata table from the local CSV file
metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)

# Report the element count and the (rows, columns) shape of the full table
print(metadata.size, metadata.shape, sep='\n')

# Continue with only the first 100 rows and report the reduced dimensions
metadata = metadata.iloc[:100]
print(metadata.size, metadata.shape, sep='\n')

1091184
(45466, 24)
2400
(100, 24)


In [20]:
# List the name of every column in the metadata table
metadata.columns.tolist()

['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count']

In [21]:
# Preview the first 15 rows, restricted to the first three columns
print(metadata.head(15).iloc[:, :3])

    adult                              belongs_to_collection    budget
0   False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000
1   False                                                NaN  65000000
2   False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0
3   False                                                NaN  16000000
4   False  {'id': 96871, 'name': 'Father of the Bride Col...         0
5   False                                                NaN  60000000
6   False                                                NaN  58000000
7   False                                                NaN         0
8   False                                                NaN  35000000
9   False  {'id': 645, 'name': 'James Bond Collection', '...  58000000
10  False                                                NaN  62000000
11  False                                                NaN         0
12  False  {'id': 117693, 'name': 'Balto Collection', 'po...         0
13  Fa

In [22]:
# Preview the first five rows across all columns
print(metadata.iloc[:5])

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497         

In [23]:
# C: the mean of the 'vote_average' column, i.e. the average rating over all movies
C = metadata.loc[:, 'vote_average'].mean()
print(C)

6.219000000000001


In [24]:
# m: the minimum number of votes a movie needs to appear in the chart.
# It is the 90th percentile of 'vote_count', so a qualifying movie has at least
# as many votes as about 90% of the movies in the list.
m = metadata['vote_count'].quantile(q=0.9)
print(m)

1134.6000000000006


In [25]:
# Filter the qualifying movies (vote_count >= m) into a new DataFrame.
# The rows are selected first and copied afterwards, so only the qualifying rows
# are duplicated and `q_movies` can be modified without touching `metadata`.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()
q_movies.shape

(10, 24)

In [26]:
# Weighted rating of a single movie, following the IMDB formula
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating for one movie record.

    x -- record that supports lookup by 'vote_count' and 'vote_average'
    m -- vote-count threshold used as the weight of the overall mean
         (defaults to the notebook-level `m`)
    C -- mean rating across all movies (defaults to the notebook-level `C`)

    The result blends the movie's own average rating with C: the more votes
    the movie has relative to m, the closer the result is to its own rating.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    own_weight = votes / total
    mean_weight = m / total
    return (own_weight * rating) + (mean_weight * C)

v = number of votes for the movie  
m = minimum votes required to be listed in the chart  
R = average rating for the movie  
C = mean vote across all movies

In [27]:
# Add a 'score' column holding the `weighted_rating()` result for every row
q_movies = q_movies.assign(score=q_movies.apply(weighted_rating, axis=1))

In [28]:
# Order the qualified movies from the highest score to the lowest
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show title, vote statistics and score for up to 15 of the best-scoring movies
q_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(15)

Unnamed: 0,title,vote_count,vote_average,score
46,Se7en,5915.0,8.1,7.797262
49,The Usual Suspects,3334.0,8.1,7.622405
0,Toy Story,5415.0,7.7,7.443443
5,Heat,1886.0,7.7,7.143706
15,Casino,1343.0,7.8,7.075992
31,Twelve Monkeys,2470.0,7.4,7.028263
1,Jumanji,2413.0,6.9,6.682201
69,From Dusk Till Dawn,1644.0,6.9,6.621924
47,Pocahontas,1509.0,6.7,6.493561
9,GoldenEye,1194.0,6.6,6.414359


# Content-Based Recommender System

## Plot Description Based Recommender

In [29]:
# Show the plot overview text of the first 5 movies
metadata['overview'].iloc[:5]

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [30]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import  TfidfVectorizer

# Replace missing overviews (NaN) with empty strings so every entry is text
metadata['overview'] = metadata['overview'].fillna('')

# Create a TfidfVectorizer that removes English stop words, then fit it on the
# overviews and transform them into the TF-IDF matrix in a single step
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# (number of movies, number of distinct terms)
tfidf_matrix.shape

(100, 1895)

Calculate cosine similarity
Since `TfidfVectorizer` L2-normalizes each row by default (`norm='l2'`), the dot product of two overview vectors is already their cosine similarity. Therefore, you can use `sklearn`'s `linear_kernel()` instead of `cosine_similarity()`, since it skips the redundant normalization step and is faster.

In [31]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all overview vectors; when the second argument
# is omitted, the kernel is computed between `tfidf_matrix` and itself
cosine_sim = linear_kernel(tfidf_matrix)

In [32]:
# Construct a reverse map from movie title to row index.
# Duplicates are removed on the titles (the index of this Series), keeping the
# first occurrence, so that `indices[title]` always yields a single value.
# `Series.drop_duplicates()` would compare the row-index values instead, which
# are already unique, and would therefore leave repeated titles in place.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [33]:
# get_recommendations function
# input - movie title
# output - most similar movies

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to `title`.

    title      -- movie title to look up in `indices`
    cosine_sim -- pairwise similarity matrix; row i holds the similarity of
                  movie i to every movie (defaults to the notebook-level matrix)
    top_n      -- maximum number of recommendations to return (default 10)

    Returns a Series of titles taken from `metadata`, ordered from the most to
    the least similar. Raises KeyError when `title` is not present in `indices`.
    """
    # Get the index of the movie that matches the title
    # NOTE(review): the index is used as a row position in `cosine_sim` and in
    # `.iloc` below, which assumes `metadata` has a default 0..n-1 index - confirm.
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie,
    # leaving out the movie itself. It is excluded by position rather than by
    # assuming it sorts first: another movie can tie with it at the top score,
    # and a movie with an empty overview has a similarity of 0 to itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores, most similar first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the row positions of the `top_n` most similar movies
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return metadata['title'].iloc[movie_indices]

In [34]:
# Movies whose plot overviews are most similar to that of 'Casino'
get_recommendations(title='Casino')

24                           Leaving Las Vegas
80     Things to Do in Denver When You're Dead
44                                  To Die For
46                                       Se7en
12                                       Balto
96                                    Shopping
84                          Angels and Insects
74                                   Big Bully
59                  The Indian in the Cupboard
82    Once Upon a Time... When We Were Colored
Name: title, dtype: object

In [35]:
# Movies whose plot overviews are most similar to that of 'Toy Story'
get_recommendations(title='Toy Story')

17                                  Four Rooms
59                  The Indian in the Cupboard
83                 Last Summer in the Hamptons
56                       Home for the Holidays
33                                        Babe
57                                 The Postman
82    Once Upon a Time... When We Were Colored
54                                     Georgia
76                                   Nico Icon
50                              Guardian Angel
Name: title, dtype: object

In [36]:
# Movies whose plot overviews are most similar to that of 'Se7en'
get_recommendations(title='Se7en')

21                                    Copycat
44                                 To Die For
31                             Twelve Monkeys
15                                     Casino
12                                      Balto
25                                    Othello
75                                  Screamers
14                           Cutthroat Island
80    Things to Do in Denver When You're Dead
35                           Dead Man Walking
Name: title, dtype: object

In [37]:
# Movies whose plot overviews are most similar to that of 'Balto'
get_recommendations(title='Balto')

9                                    GoldenEye
91                         Vampire in Brooklyn
15                                      Casino
46                                       Se7en
7                                 Tom and Huck
74                                   Big Bully
50                              Guardian Angel
59                  The Indian in the Cupboard
31                              Twelve Monkeys
82    Once Upon a Time... When We Were Colored
Name: title, dtype: object