# Recommender System

## Load libraries and dataset

In [1]:
# Load libraries
import pandas as pd
import numpy as np

In [2]:
# Read the user ratings from the CSV file into a dataframe
rating_df = pd.read_csv(filepath_or_buffer=r'Data/ratings.csv')

# Preview the first five rows
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Read the movie metadata from the CSV file into a dataframe
title_df = pd.read_csv(filepath_or_buffer=r'Data/movies.csv')

# Preview the first five rows
title_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Attach the movie metadata to each rating row.
# A left join on movieId keeps every rating, even if its movie is missing from title_df.
rating_df = rating_df.merge(right=title_df, how='left', on='movieId')

# Preview the first five rows of the merged dataframe
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [5]:
# Remove the columns that are not used below
rating_df = rating_df.drop(columns=['movieId', 'timestamp', 'genres'])

In [6]:
# Count the ratings per title; reset_index turns the title index back into a regular column
total_ratings = rating_df.groupby('title')['rating'].count().reset_index()

In [7]:
# Give the count column a descriptive name
total_ratings = total_ratings.rename({'rating': 'number of ratings'}, axis='columns')

# Display the per-title rating counts
total_ratings

Unnamed: 0,title,number of ratings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [8]:
# Attach each title's rating count to every rating row,
# so that ratings can be filtered by how often their title was rated.
# A left join on title keeps every rating row.
rating_df = rating_df.merge(right=total_ratings, how='left', on='title')

# Display the merged dataframe
rating_df

Unnamed: 0,userId,rating,title,number of ratings
0,1,4.0,Toy Story (1995),215
1,1,4.0,Grumpier Old Men (1995),52
2,1,4.0,Heat (1995),102
3,1,5.0,Seven (a.k.a. Se7en) (1995),203
4,1,5.0,"Usual Suspects, The (1995)",204
...,...,...,...,...
100831,610,4.0,Split (2017),6
100832,610,5.0,John Wick: Chapter Two (2017),7
100833,610,5.0,Get Out (2017),15
100834,610,5.0,Logan (2017),25


In [9]:
# Keep only the ratings of movies that have more than 100 ratings
# (the comparison is strict, so titles with exactly 100 ratings are dropped as well)
rating_df = rating_df.loc[rating_df['number of ratings'] > 100]

In [10]:
# The rating count was only needed for filtering; remove it again
rating_df = rating_df.drop(columns='number of ratings')

## Pivot the ratings table

In [13]:
# These steps are only needed with the 25M version of the dataset, to reduce memory usage

# Downcast rating to use float32 to save memory
# rating_df['rating'] = rating_df['rating'].astype(np.float32)

# Clear other dataframes to save memory
# total_ratings = ""
# title_df = ""

In [11]:
# Reshape the ratings into a user-by-title matrix: one row per userId, one column per title.
# pivot_table averages duplicate (user, title) ratings by default; unrated cells are NaN.
user_df = rating_df.pivot_table(values='rating', index='userId', columns='title')

## Obtain movie suggestions

The next steps return the 10 movies whose user ratings correlate most strongly with the ratings of the selected movie.

In [12]:
# Set the title of the movie to base the suggestions on.
# It is used below as a column label of user_df, so it must match a title in the data exactly.
movie = 'Dark Knight, The (2008)'

In [24]:
# Correlate every movie's rating column with the selected movie's rating column,
# ordered from the strongest positive correlation downwards (sort_values puts NaN last).
correlations = user_df.corrwith(user_df[movie]).sort_values(ascending=False)

# Build the top-10 list.
# The selected movie is removed by label instead of assuming it is sorted into position 0:
# another title with a correlation of 1.0 could otherwise take its place, and the selected
# movie itself would then be printed as a suggestion. Undefined (NaN) correlations are
# skipped, and head(10) copes with fewer than ten candidates instead of raising IndexError.
suggestions = correlations.drop(labels=movie, errors='ignore').dropna().head(10)

# Print movie suggestions
print('Your movie suggestions are:\n')
for title in suggestions.index:
    print(title)

Your movie suggestions are:

Batman Begins (2005)
Ferris Bueller's Day Off (1986)
Pretty Woman (1990)
Inception (2010)
Fugitive, The (1993)
X-Men (2000)
Rock, The (1996)
WALL·E (2008)
Up (2009)
Breakfast Club, The (1985)
