# Building a Movie Recommendation System

In [1]:
import pandas as pd
import numpy as np
import regex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

## Exploring the Data

In [2]:
movies = pd.read_csv('movies.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(62423, 3)

In [5]:
movies.tail()

Unnamed: 0,movieId,title,genres
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama


## Cleaning the Title Column

In [6]:
# Take the year only from a parenthesised "(YYYY)" group so that four-digit
# numbers inside the name itself (a title like "2001: A Space Odyssey (1968)")
# are not mistaken for the release year. Rows without such a group get NaN.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

In [7]:
# Remove every parenthesised four-digit year from the title. The raw string
# keeps the backslashes as regex escapes instead of (invalid) Python string escapes.
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True)

In [8]:
# Strip first: at this point the title still carries the whitespace that was
# left behind when "(YYYY)" was removed, which would otherwise produce a
# double space before the year.
movies['title_year'] = movies['title'].str.strip() + ' ' + movies['year']

In [9]:
movies['title'] = movies['title'].str.strip()

## Dealing with Null Values

Below, we see that 406 rows have no release year (and therefore no `title_year`). Many of these rows are also missing genre information: their `genres` value is the placeholder `(no genres listed)`. Because 406 rows is less than one percent of the roughly 62,000 rows in the data, we will simply remove them.

In [10]:
movies.isna().sum()

movieId         0
title           0
genres          0
year          406
title_year    406
dtype: int64

In [11]:
movies[ movies['year'].isnull()].head(10)

Unnamed: 0,movieId,title,genres,year,title_year
15036,79607,"Millions Game, The (Das Millionenspiel)",Action|Drama|Sci-Fi|Thriller,,
25387,123619,Terrible Joe Moran,(no genres listed),,
26284,125571,The Court-Martial of Jackie Robinson,(no genres listed),,
26309,125632,In Our Garden,(no genres listed),,
26392,125958,Stephen Fry In America - New World,(no genres listed),,
26576,126438,Two: The Story of Roman & Nyro,Documentary|Drama,,
26699,127005,A Year Along the Abandoned Road,(no genres listed),,
27216,128612,Body/Cialo,Comedy|Drama|Mystery,,
27269,128734,Polskie gówno,Comedy|Musical,,
27577,129651,The Third Reich: The Rise & Fall,(no genres listed),,


In [12]:
# Drop only the rows that lack a release year (title_year is null for exactly
# the same rows, since it is built from year). Rebinding instead of
# inplace=True keeps the cell explicit about what it produces.
movies = movies.dropna(subset=['year'])

## Creating the Search Function

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [14]:
tfid = vectorizer.fit_transform(movies['title_year'])

In [15]:
def get_five_most_similar(title):
    """Return the movies whose title/year text best matches ``title``.

    The query is vectorized with the notebook-level ``vectorizer`` and compared
    by cosine similarity against the notebook-level ``tfid`` matrix, whose rows
    line up positionally with ``movies``.

    Parameters
    ----------
    title : str
        Free-text search string, e.g. ``'harry potter'``.

    Returns
    -------
    pandas.DataFrame
        Up to five rows of ``movies``, ordered from most to least similar.
    """
    # Replace punctuation with a space. The previous pattern (r' \W+') deleted
    # the punctuation *and* the surrounding whitespace, gluing neighbouring
    # words together ("Wallace & Gromit" -> "WallaceGromit").
    title = regex.sub(r'[^\w\s]+', ' ', title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfid).flatten()

    top_k = min(5, similarity.size)
    if top_k == 0:
        return movies.iloc[[]]

    # argpartition only guarantees *which* entries are the top-k, not their
    # order, so rank the selected candidates explicitly (best first).
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [16]:
get_five_most_similar('harry potter')

Unnamed: 0,movieId,title,genres,year,title_year
4790,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,2001,Harry Potter and the Sorcerer's Stone (a.k.a. ...
5704,5816,Harry Potter and the Chamber of Secrets,Adventure|Fantasy,2002,Harry Potter and the Chamber of Secrets 2002
10408,40815,Harry Potter and the Goblet of Fire,Adventure|Fantasy|Thriller|IMAX,2005,Harry Potter and the Goblet of Fire 2005
11700,54001,Harry Potter and the Order of the Phoenix,Adventure|Drama|Fantasy|IMAX,2007,Harry Potter and the Order of the Phoenix 2007
13512,69844,Harry Potter and the Half-Blood Prince,Adventure|Fantasy|Mystery|Romance|IMAX,2009,Harry Potter and the Half-Blood Prince 2009


In [17]:
# Text box the user types a movie title into, plus an output area for results.
movie_input = widgets.Text(value='Toy Story', description='Movie Title', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the result area whenever the text box value changes.

    ``data`` is the change dictionary passed by ``observe``; its ``'new'``
    entry holds the current text. Results are only shown once the text is
    longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data['new']
        if len(typed) <= 5:
            return
        display(get_five_most_similar(typed))


# Fire on_type on every change of the widget's value.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

## Building the Recommendation System

### Exploring the Ratings Data

In [18]:
ratings = pd.read_csv('ratings.csv')

In [19]:
ratings.shape

(25000095, 4)

In [20]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [21]:
ratings['userId'].unique().size #Number of Distinct Reviewers

162541

### Building a Recommendation Function 

In [22]:
similar_reviewers = ratings[ (ratings['movieId'] == 1) & (ratings['rating'] >= 5)]

In [23]:
similar_user_movies = ratings[ (ratings['userId'].isin(similar_reviewers['userId'])) & (ratings['rating'] >= 5)]['movieId']

In [24]:
movie_users_perc_like = similar_user_movies.value_counts() / len(similar_reviewers)

In [25]:
ten_perc_plus_like = movie_users_perc_like[movie_users_perc_like > .1]

In [26]:
ten_perc_plus_like

1        1.000000
318      0.367762
260      0.358285
296      0.299126
356      0.292240
           ...   
1089     0.105064
590      0.104620
780      0.102991
78499    0.101436
750      0.100992
Name: movieId, Length: 64, dtype: float64

In [27]:
all_users = ratings[(ratings['movieId'].isin(ten_perc_plus_like.index)) & (ratings['rating'] >= 5)]

In [28]:
all_user_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

In [29]:
all_user_recs

318      0.293673
296      0.238848
356      0.192436
260      0.191589
2571     0.189198
           ...   
6377     0.039507
1073     0.039218
1148     0.039099
8961     0.037465
78499    0.020982
Name: movieId, Length: 64, dtype: float64

In [30]:
combine_recs = pd.concat([ten_perc_plus_like,all_user_recs], axis=1)
combine_recs.columns = ['similar_user_perc', 'all_user_perc']

In [31]:
combine_recs #Comparing the percent of similar users who liked each movie vs the percent of all users that liked each movie

Unnamed: 0,similar_user_perc,all_user_perc
1,1.000000,0.100279
318,0.367762,0.293673
260,0.358285,0.191589
296,0.299126,0.238848
356,0.292240,0.192436
...,...,...
1089,0.105064,0.068961
590,0.104620,0.062680
780,0.102991,0.045744
78499,0.101436,0.020982


In [32]:
combine_recs['score'] = combine_recs['similar_user_perc'] / combine_recs['all_user_perc']

In [33]:
combine_recs = combine_recs.sort_values(by='score', ascending=False)
top_ten = combine_recs.head(10)

In [34]:
top_ten = top_ten.merge(movies, left_on=top_ten.index, right_on=movies['movieId'])

In [35]:
top_ten

Unnamed: 0,key_0,similar_user_perc,all_user_perc,score,movieId,title,genres,year,title_year
0,1,1.0,0.100279,9.972161,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,Toy Story 1995
1,3114,0.236043,0.039849,5.923467,3114,Toy Story 2,Adventure|Animation|Children|Comedy|Fantasy,1999,Toy Story 2 1999
2,78499,0.101436,0.020982,4.834345,78499,Toy Story 3,Adventure|Animation|Children|Comedy|Fantasy|IMAX,2010,Toy Story 3 2010
3,4886,0.149119,0.041111,3.627223,4886,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy,2001,"Monsters, Inc. 2001"
4,588,0.193988,0.055753,3.479433,588,Aladdin,Adventure|Animation|Children|Comedy|Musical,1992,Aladdin 1992
5,6377,0.131423,0.039507,3.326552,6377,Finding Nemo,Adventure|Animation|Children|Comedy,2003,Finding Nemo 2003
6,595,0.170295,0.053496,3.183341,595,Beauty and the Beast,Animation|Children|Fantasy|Musical|Romance|IMAX,1991,Beauty and the Beast 1991
7,364,0.211832,0.069229,3.059883,364,"Lion King, The",Adventure|Animation|Children|Drama|Musical|IMAX,1994,"Lion King, The 1994"
8,1073,0.119502,0.039218,3.047154,1073,Willy Wonka & the Chocolate Factory,Children|Comedy|Fantasy|Musical,1971,Willy Wonka & the Chocolate Factory 1971
9,8961,0.112469,0.037465,3.001925,8961,"Incredibles, The",Action|Adventure|Animation|Children|Comedy,2004,"Incredibles, The 2004"


In [36]:
# Repeat of the strip already applied in the "Cleaning the Title Column"
# section; expected to be a no-op here. Exact-match title lookups below rely
# on titles having no surrounding whitespace.
movies['title'] = movies['title'].str.strip()

In [37]:
movie_id = movies[movies['title'] == 'Jumanji']['movieId']
movie_id

1    2
Name: movieId, dtype: int64

In [38]:
def get_10_recommendations(title):
    """Recommend up to ten movies favoured by users who gave ``title`` five stars.

    Reads the notebook-level ``movies`` and ``ratings`` frames. A candidate is
    any movie that more than 10% of the five-star fans of ``title`` also rated
    five stars. Its score is that share divided by the share of five-star
    ratings it gets among all users who five-starred at least one candidate.
    A later cell in this notebook redefines this function with an optional
    genre filter.

    Parameters
    ----------
    title : str
        Exact (cleaned, year-less) movie title as stored in ``movies['title']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first. The
        queried movie itself is never included.

    Raises
    ------
    ValueError
        If no movie in ``movies`` has the given title.
    """
    matches = movies.loc[movies['title'] == title, 'movieId']
    if matches.empty:
        raise ValueError(f'No movie titled {title!r} found in `movies`.')
    # Several films can share a cleaned title; use the first match rather than
    # calling int() on a multi-row Series (which fails, and is deprecated even
    # for single-row Series).
    movie_id = int(matches.iloc[0])

    # Filter the ratings to five-star rows once and reuse the result.
    five_star = ratings[ratings['rating'] >= 5]
    fan_ids = five_star.loc[five_star['movieId'] == movie_id, 'userId']
    fan_favourites = five_star.loc[five_star['userId'].isin(fan_ids), 'movieId']
    similar_user_perc = fan_favourites.value_counts() / len(fan_ids)
    similar_user_perc = similar_user_perc[similar_user_perc > .1]

    candidates = five_star[five_star['movieId'].isin(similar_user_perc.index)]
    all_user_perc = candidates['movieId'].value_counts() / candidates['userId'].nunique()

    combine_recs = pd.concat([similar_user_perc, all_user_perc], axis=1)
    combine_recs.columns = ['similar_user_perc', 'all_user_perc']
    combine_recs['score'] = combine_recs['similar_user_perc'] / combine_recs['all_user_perc']

    # Remove the queried movie by id instead of assuming it is ranked first.
    ranked = (
        combine_recs
        .drop(index=movie_id, errors='ignore')
        .sort_values(by='score', ascending=False)
        .rename_axis('movieId')
        .reset_index()
    )
    # Inner merge keeps the left (score) order; take the top ten afterwards so
    # candidates missing from `movies` do not shrink the result.
    top_ten = ranked.merge(movies, on='movieId').head(10)
    return top_ten[['score', 'title', 'genres']]

In [39]:
get_10_recommendations('Casper')

Unnamed: 0,score,title,genres
0,44.905005,Junior,Comedy|Sci-Fi
1,33.046973,Congo,Action|Adventure|Mystery|Sci-Fi
2,29.857885,Nine Months,Comedy|Romance
3,26.688152,Pocahontas,Animation|Children|Drama|Musical|Romance
4,25.942701,"Santa Clause, The",Comedy|Drama|Fantasy
5,25.10274,Dangerous Minds,Drama
6,23.667654,Beverly Hills Cop III,Action|Comedy|Crime|Thriller
7,20.906159,Jumanji,Adventure|Children|Fantasy
8,18.642358,Addams Family Values,Children|Comedy|Fantasy
9,17.824431,"Walk in the Clouds, A",Drama|Romance


In [47]:
def get_10_recommendations(title, my_genre=None):
    """Recommend up to ten movies favoured by users who gave ``title`` five stars.

    Reads the notebook-level ``movies`` and ``ratings`` frames. A candidate is
    any movie that more than 10% of the five-star fans of ``title`` also rated
    five stars. Its score is that share divided by the share of five-star
    ratings it gets among all users who five-starred at least one candidate.

    Parameters
    ----------
    title : str
        Exact (cleaned, year-less) movie title as stored in ``movies['title']``.
    my_genre : str, optional
        When given, only candidates whose ``genres`` string contains this
        pattern are kept (matched with ``Series.str.contains``).

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first. The
        queried movie itself is never included.

    Raises
    ------
    ValueError
        If no movie in ``movies`` has the given title.
    """
    matches = movies.loc[movies['title'] == title, 'movieId']
    if matches.empty:
        raise ValueError(f'No movie titled {title!r} found in `movies`.')
    # Several films can share a cleaned title; use the first match rather than
    # calling int() on a multi-row Series (which fails, and is deprecated even
    # for single-row Series).
    movie_id = int(matches.iloc[0])

    # Filter the ratings to five-star rows once and reuse the result.
    five_star = ratings[ratings['rating'] >= 5]
    fan_ids = five_star.loc[five_star['movieId'] == movie_id, 'userId']
    fan_favourites = five_star.loc[five_star['userId'].isin(fan_ids), 'movieId']
    similar_user_perc = fan_favourites.value_counts() / len(fan_ids)
    similar_user_perc = similar_user_perc[similar_user_perc > .1]

    candidates = five_star[five_star['movieId'].isin(similar_user_perc.index)]
    all_user_perc = candidates['movieId'].value_counts() / candidates['userId'].nunique()

    combine_recs = pd.concat([similar_user_perc, all_user_perc], axis=1)
    combine_recs.columns = ['similar_user_perc', 'all_user_perc']
    combine_recs['score'] = combine_recs['similar_user_perc'] / combine_recs['all_user_perc']
    combine_recs = combine_recs.rename_axis('movieId').reset_index().merge(movies, on='movieId')

    # Drop the queried movie by id. Skipping "the first row" after sorting
    # would throw away the best real recommendation whenever the genre filter
    # below has already removed the queried movie.
    combine_recs = combine_recs[combine_recs['movieId'] != movie_id]
    if my_genre is not None:
        combine_recs = combine_recs[combine_recs['genres'].str.contains(my_genre)]

    top_ten = combine_recs.sort_values(by='score', ascending=False).head(10)
    return top_ten[['score', 'title', 'genres']]

In [48]:
get_10_recommendations('Toy Story')

Unnamed: 0,score,title,genres
10,5.923467,Toy Story 2,Adventure|Animation|Children|Comedy|Fantasy
62,4.834345,Toy Story 3,Adventure|Animation|Children|Comedy|Fantasy|IMAX
34,3.627223,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
20,3.479433,Aladdin,Adventure|Animation|Children|Comedy|Musical
40,3.326552,Finding Nemo,Adventure|Animation|Children|Comedy
23,3.183341,Beauty and the Beast,Animation|Children|Fantasy|Musical|Romance|IMAX
14,3.059883,"Lion King, The",Adventure|Animation|Children|Drama|Musical|IMAX
50,3.047154,Willy Wonka & the Chocolate Factory,Children|Comedy|Fantasy|Musical
54,3.001925,"Incredibles, The",Action|Adventure|Animation|Children|Comedy
55,2.819701,Wallace & Gromit: The Wrong Trousers,Animation|Children|Comedy|Crime


In [51]:
get_10_recommendations('Toy Story', 'Comedy')

Unnamed: 0,score,title,genres
10,5.923467,Toy Story 2,Adventure|Animation|Children|Comedy|Fantasy
62,4.834345,Toy Story 3,Adventure|Animation|Children|Comedy|Fantasy|IMAX
34,3.627223,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
20,3.479433,Aladdin,Adventure|Animation|Children|Comedy|Musical
40,3.326552,Finding Nemo,Adventure|Animation|Children|Comedy
50,3.047154,Willy Wonka & the Chocolate Factory,Children|Comedy|Fantasy|Musical
54,3.001925,"Incredibles, The",Action|Adventure|Animation|Children|Comedy
55,2.819701,Wallace & Gromit: The Wrong Trousers,Animation|Children|Comedy|Crime
48,2.623763,Shrek,Adventure|Animation|Children|Comedy|Fantasy|Ro...
16,2.419551,Back to the Future,Adventure|Comedy|Sci-Fi
