In [25]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt


In [26]:
moviedf = pd.read_csv("movies.csv")
ratingsdf = pd.read_csv("ratings.csv")
moviedf.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
#remove the year from the title column and create new year column
moviedf['year'] = moviedf.title.str.extract('(\(\d\d\d\d\))', expand = False)
#remove the parenthesis
moviedf['year'] = moviedf.year.str.extract('(\d\d\d\d)', expand = False)
#remove the year form the 'title' column
moviedf['title'] = moviedf.title.str.replace('(\(\d\d\d\d\))', '')
#apply strip function to get rid of any ending whitespace char
    #lambda x: apply strip functioin to every x in the 'title' col
moviedf['title'] = moviedf['title'].apply(lambda x: x.strip())
moviedf.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [28]:
#split the values in Genres column into a list
moviedf['genres'] = moviedf.genres.str.split('|')
moviedf.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [29]:
#Use OneHotEncoding to covert list of genres to a vector where each column corresponds to 
    #one possible value of the feature
#Store every different genre in columns that contain either 1 or 0. 1: movie has that genre

#store dataframe in another variable since genres won't be important for our 1st rec system
moviesWithGenres = moviedf.copy()

#for every row in the df, iterate through the list of genres and place a 1 into the corresponding column
for index, row in moviedf.iterrows():
    for genre in row['genres']:
        moviesWithGenres.at[index, genre] = 1

#fill in the NaN values with 0
moviesWithGenres = moviesWithGenres.fillna(0)
moviesWithGenres.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
#now look at the ratings df
ratingsdf.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [31]:
#remove 'timestamp' column
ratingsdf = ratingsdf.drop('timestamp', axis = 1)
ratingsdf.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [32]:
#Content-Based Recommendation System

#User input
userInput = [
    {'title': 'Jumanji', 'rating':4},
    {'title': 'Bad Boys', 'rating':3.5},
    {'title': 'Jurassic Park', 'rating': 4.5},
    {'title': 'Hush', 'rating': 3.5},
    {'title': 'Black Mirror', 'rating': 5},
    {'title': 'Parasite', 'rating': 5},
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Jumanji,4.0
1,Bad Boys,3.5
2,Jurassic Park,4.5
3,Hush,3.5
4,Black Mirror,5.0
5,Parasite,5.0


In [33]:
#extract the input movie's ID's from the movies df and add them into input

#filtering out the movies by title
inputId = moviedf[moviedf['title'].isin(inputMovies['title'].tolist())]
#merge it so we can get the movieID
inputMovies = pd.merge(inputId, inputMovies)
#drop info we won't use from the input df
inputMovies = inputMovies.drop('genres', axis = 1).drop('year', axis = 1)

inputMovies

Unnamed: 0,movieId,title,rating
0,2,Jumanji,4.0
1,145,Bad Boys,3.5
2,480,Jurassic Park,4.5
3,1798,Hush,3.5
4,156726,Hush,3.5
5,2256,Parasite,5.0
6,176601,Black Mirror,5.0


In [34]:
#create new df with no dupes
inputMovies_nodupes = inputMovies.drop_duplicates('title')
inputMovies_nodupes

Unnamed: 0,movieId,title,rating
0,2,Jumanji,4.0
1,145,Bad Boys,3.5
2,480,Jurassic Park,4.5
3,1798,Hush,3.5
5,2256,Parasite,5.0
6,176601,Black Mirror,5.0


In [35]:
#Learn the input's preferences
    #get subset of movies that the user has watched from the df containing genres defined with binary values
#filter out the movies from the input
userMovies = moviesWithGenres[moviesWithGenres['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118,145,Bad Boys,"[Action, Comedy, Crime, Drama, Thriller]",1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
418,480,Jurassic Park,"[Action, Adventure, Sci-Fi, Thriller]",1993,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1329,1798,Hush,[Thriller],1998,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1676,2256,Parasite,"[Horror, Sci-Fi]",1982,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9264,156726,Hush,[Thriller],2016,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9611,176601,Black Mirror,[(no genres listed)],0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
#We only need the genre table so clean up by resetting the index and dropping movieId, title, genre, and year columns

#resetting the index
userMovies = userMovies.reset_index(drop = True)
#dropping unnecessary issues due to save memory and to avoid issues
userGenreTable = userMovies.drop('movieId', axis=1).drop('title', axis=1).drop('genres', axis=1).drop('year', axis=1)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [40]:
#Learn the input's preferences
inputMovies['rating']

0    4.0
1    3.5
2    4.5
3    3.5
4    3.5
5    5.0
6    5.0
Name: rating, dtype: float64

In [44]:
#Turn each genre into weights by using the user's reviews and multiplying them into the user's
    #genre table and then summing up the resulting table by column
#This operation is a dot product between a matrix and a vector
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])

#user profile
userProfile

Adventure              8.5
Animation              0.0
Children               4.0
Comedy                 3.5
Fantasy                4.0
Romance                0.0
Drama                  3.5
Action                 8.0
Crime                  3.5
Thriller              16.5
Horror                 3.5
Mystery                0.0
Sci-Fi                 8.0
War                    0.0
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     5.0
dtype: float64

In [45]:
#get genres of every movie in our orig df
genreTable = moviesWithGenres.set_index(moviesWithGenres['movieId'])
#drop unnecessary info
genreTable = genreTable.drop('movieId', axis=1).drop('title', axis=1).drop('genres', axis=1).drop('year', axis=1)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
genreTable.shape

(9742, 20)

In [48]:
#With the user's profile and the complete list of movies and their genres, we're going to take
    #the weighted avg of every movie based on the user profile and recommend top 20 movies
    
#multiply the genres by the weights and then take the weighted avg
recommendationTable = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable.head()

movieId
1    0.294118
2    0.242647
3    0.051471
4    0.102941
5    0.051471
dtype: float64

In [55]:
#sort our recommendations in descending order
recommendationTable = recommendationTable.sort_values(ascending = False)
recommendationTable.head()

movieId
164226    0.713235
71999     0.713235
81132     0.691176
58025     0.654412
27032     0.654412
dtype: float64

In [56]:
#The Final Recommendation Table
moviedf.loc[moviedf['movieId'].isin(recommendationTable.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
1972,2617,"Mummy, The","[Action, Adventure, Comedy, Fantasy, Horror, T...",1999
1978,2625,Black Mask (Hak hap),"[Action, Adventure, Crime, Sci-Fi, Thriller]",1996
4631,6902,Interstate 60,"[Adventure, Comedy, Drama, Fantasy, Mystery, S...",2002
5161,8361,"Day After Tomorrow, The","[Action, Adventure, Drama, Sci-Fi, Thriller]",2004
5612,27032,Who Am I? (Wo shi shei),"[Action, Adventure, Comedy, Sci-Fi, Thriller]",1998
5665,27618,"Sound of Thunder, A","[Action, Adventure, Drama, Sci-Fi, Thriller]",2005
5980,36509,"Cave, The","[Action, Adventure, Horror, Mystery, Sci-Fi, T...",2005
6145,43932,Pulse,"[Action, Drama, Fantasy, Horror, Mystery, Sci-...",2006
6330,48774,Children of Men,"[Action, Adventure, Drama, Sci-Fi, Thriller]",2006
6681,58025,Jumper,"[Action, Adventure, Drama, Sci-Fi, Thriller]",2008
