In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from math import sqrt

%matplotlib inline

In [2]:
#------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------
# Content-Based recommendation system
#------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------

In [3]:
# Fetch the MovieLens-style movie data set used by the rest of the notebook.
# The three steps below are IPython shell escapes (`!`), so they need `wget`,
# `unzip` and `rm` to be available on the host running the kernel.

# Download the archive into the working directory as moviedataset.zip
!wget -O moviedataset.zip https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
print('unziping ...')

# Unpack the data set into data/:
#   -o  overwrite existing files without prompting (keeps the cell re-runnable)
#   -j  discard the directory structure stored in the archive
!unzip -o -j moviedataset.zip -d data/

# Remove the archive; -f suppresses the error if it is already gone
!rm -f moviedataset.zip

--2020-02-14 11:05:07--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
Resolving s3-api.us-geo.objectstorage.softlayer.net... 67.228.254.196
Connecting to s3-api.us-geo.objectstorage.softlayer.net|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 160301210 (153M) [application/zip]
Saving to: 'moviedataset.zip'


2020-02-14 11:05:23 (9.99 MB/s) - 'moviedataset.zip' saved [160301210/160301210]

unziping ...
Archive:  moviedataset.zip
  inflating: data/links.csv          
  inflating: data/movies.csv         
  inflating: data/ratings.csv        
  inflating: data/README.txt         
  inflating: data/tags.csv           


In [4]:
# Read the movie catalogue and the user ratings from the unpacked data directory.
# index_col=False tells pandas not to use the first column as the index.
movies_path = os.path.join('data', 'movies.csv')
movies_df = pd.read_csv(movies_path, index_col=False)
display(movies_df.head())

ratings_path = os.path.join('data', 'ratings.csv')
ratings_df = pd.read_csv(ratings_path, index_col=False)
display(ratings_df.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [5]:
# Titles look like "Toy Story (1995)". Pull the four-digit release year out of
# the title into a new `year` column, then remove it from the title itself.
print('Movies df shape:', movies_df.shape)

# Raw strings: `\(` and `\d` are regex escapes, not Python string escapes.
YEAR_IN_PARENS = r'(\(\d\d\d\d\))'
YEAR_DIGITS = r'(\d\d\d\d)'

# Extract the year - the year in round brackets
movies_df['year'] = movies_df['title'].str.extract(YEAR_IN_PARENS, expand=False)
display(movies_df.year.head())

# Remove the round brackets from the year
movies_df['year'] = movies_df['year'].str.extract(YEAR_DIGITS, expand=False)
display(movies_df.year.head())

# Remove the year from the title.
# regex=True must be explicit: pandas >= 2.0 defaults to a literal match, which
# would leave "(1995)" in every title.
movies_df['title'] = movies_df['title'].str.replace(YEAR_IN_PARENS, '', regex=True)
display(movies_df.title.head())

# Strip the whitespace left behind where the year used to be
movies_df['title'] = movies_df['title'].str.strip()
display(movies_df.title.head())

# Titles without a "(YYYY)" part end up with a missing year; fill those with
# the placeholder '1900' so the column can later be cast to an integer dtype.
missing_year = movies_df['year'].isnull()
print('Number of NULL year movies, before:', int(missing_year.sum()))
movies_df.loc[missing_year, 'year'] = '1900'
print('Number of NULL year movies, after:', int(movies_df['year'].isnull().sum()))

Movies df shape: (34208, 3)


0    (1995)
1    (1995)
2    (1995)
3    (1995)
4    (1995)
Name: year, dtype: object

0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object

0                      Toy Story 
1                        Jumanji 
2               Grumpier Old Men 
3              Waiting to Exhale 
4    Father of the Bride Part II 
Name: title, dtype: object

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

Number of NULL year movies, before: 68
Number of NULL year movies, after: 0


In [6]:
# Show the dtypes before the cast: `year` is still stored as text (object)
print(movies_df.dtypes)

# Cast the year strings to 32-bit integers
movies_df['year'] = movies_df['year'].astype('int32')

# Show the dtypes again to confirm the cast took effect
print(movies_df.dtypes)

movieId     int64
title      object
genres     object
year       object
dtype: object
movieId     int64
title      object
genres     object
year        int32
dtype: object


In [7]:
# Genres arrive as one pipe-delimited string per movie ("Comedy|Romance").
# Break each string into a Python list so individual genres are easy to reach.
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [8]:
# Keeping genres in a list format isn't optimal for the content-based
# recommendation technique, so one-hot encode them: every distinct genre
# becomes its own column holding 1.0 when the movie has that genre and 0.0
# when it doesn't.

# Distinct genres in order of first appearance, so the columns come out in
# the same order a row-by-row pass over the frame would create them.
genre_order = list(dict.fromkeys(
    genre for genres in movies_df['genres'] for genre in genres
))

# Vectorized encoding: re-join each list with '|' and let pandas build the
# indicator columns in one pass instead of writing cell by cell in a loop.
genre_flags = (
    movies_df['genres']
    .str.join('|')
    .str.get_dummies(sep='|')
    .reindex(columns=genre_order, fill_value=0)
    .astype('float64')
)

# Replace the list-valued `genres` column with the indicator columns. Only the
# genre columns are generated here, so no blanket fillna over title/year is needed.
movies_wg_df = pd.concat([movies_df.drop(columns='genres'), genre_flags], axis=1)
movies_wg_df.head()

Unnamed: 0,movieId,title,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,1995,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,1995,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,1995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,1995,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,1995,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# We won't be needing the timestamp column, so let's drop it to save on memory.
# `columns=` replaces the positional axis argument, which pandas deprecated in
# 1.3 and removed in 2.0. errors='ignore' keeps the cell re-runnable once the
# column is already gone.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')
display(ratings_df.head())

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [10]:
# Content-Based or Item-Item recommendation system:
#     Attempts to figure out what a user's favourite aspects of an item are,
#     and then recommends items that present those aspects. In our case,
#     we're going to try to figure out the user's favourite genres from
#     the movies and ratings given.

# Take ratings for the user with id 5
user_id = 5
user_r_df = ratings_df[ratings_df.userId == user_id]
display(user_r_df.head())

# Look up the catalogue entries for the movies this user rated
user_movies = movies_df[movies_df.movieId.isin(user_r_df.movieId)]
display(user_movies.head())

# Merge the title column into the user ratings. The join key is spelled out
# so the merge cannot silently change if the frames gain another shared column.
user_r_df = pd.merge(user_r_df, user_movies, on='movieId')
# `columns=` instead of a positional axis (removed in pandas 2.0)
user_r_df = user_r_df.drop(columns=['genres', 'year'])
display(user_r_df.head())


Unnamed: 0,userId,movieId,rating
194,5,1203,4.0
195,5,2571,2.5
196,5,6016,5.0
197,5,7502,4.5
198,5,58559,4.0


Unnamed: 0,movieId,title,genres,year
1178,1203,12 Angry Men,[Drama],1957
2487,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999
5918,6016,City of God (Cidade de Deus),"[Action, Adventure, Crime, Drama, Thriller]",2002
7357,7502,Band of Brothers,"[Action, Drama, War]",2001
12540,58559,"Dark Knight, The","[Action, Crime, Drama, IMAX]",2008


Unnamed: 0,userId,movieId,rating,title
0,5,1203,4.0,12 Angry Men
1,5,2571,2.5,"Matrix, The"
2,5,6016,5.0,City of God (Cidade de Deus)
3,5,7502,4.5,Band of Brothers
4,5,58559,4.0,"Dark Knight, The"


In [11]:
# Let us now start by learning the user's preferences. We only need the genre
# indicator columns of the movies this user rated, so drop the movieId, year,
# title and '(no genres listed)' columns and reset the index.
#
# Rows are selected in the order of user_r_df.movieId: after the index reset,
# row i of this table is paired with rating i purely by position, so the two
# must be in the same movie order.
user_movies_pref = (
    movies_wg_df
    .set_index('movieId')
    .loc[user_r_df['movieId']]
    # `columns=` instead of a positional axis (removed in pandas 2.0)
    .drop(columns=['year', 'title', '(no genres listed)'])
    .reset_index(drop=True)
)
display(user_movies_pref.head())

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Turn each genre into a weight: scale every rated movie's genre flags by the
# rating the user gave it and add the results up per genre. That is exactly
# the dot product of the rating vector with the genre table.
user_ratings = user_r_df['rating']
user_profile = user_ratings.dot(user_movies_pref)
print(user_profile)
print('User profile shape:', user_profile.shape)

Adventure      16.0
Animation       3.0
Children        3.0
Comedy         25.5
Fantasy         3.0
Romance         1.5
Drama          59.5
Action         41.0
Crime          21.0
Thriller       32.5
Horror          0.0
Mystery         4.5
Sci-Fi         16.0
IMAX            7.0
Documentary     0.0
War            15.0
Musical         0.0
Western         0.0
Film-Noir       0.0
dtype: float64
User profile shape: (19,)


In [15]:
# Using the user profile, we can recommend movies that satisfy the user's preferences.
# Index the encoded movie table by movieId (the movieId column itself is kept
# on movies_wg_df), then reduce it to just the genre indicator columns.
movies_wg_df.set_index(movies_wg_df['movieId'], inplace=True)

# One drop with `columns=` instead of four chained positional-axis drops
# (the positional axis argument was removed in pandas 2.0).
non_genre_columns = ['movieId', 'title', 'year', '(no genres listed)']
all_movie_genres = movies_wg_df.drop(columns=non_genre_columns)
display(all_movie_genres.head())
print('All movie genres:', all_movie_genres.shape)

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


All movie genres: (34208, 19)


In [23]:
# Score every movie for this user: weight each genre flag by the user's
# profile, add the weights up per movie, and normalise by the total profile
# weight so the result is a weighted average.
weighted_genres = all_movie_genres.mul(user_profile, axis='columns')
user_movie_prefs = weighted_genres.sum(axis=1).div(user_profile.sum())
print('user_movie_prefs type:', type(user_movie_prefs))

# Wrap the scores in a one-column frame so they can be merged with the catalogue
user_movie_prefs = user_movie_prefs.to_frame(name='userScore')
display(user_movie_prefs.head())

user_movie_prefs type: <class 'pandas.core.series.Series'>


Unnamed: 0_level_0,userScore
movieId,Unnamed: 1_level_1
1,0.203219
2,0.088531
3,0.108652
4,0.348089
5,0.102616


In [28]:
# Attach the catalogue details (title, genres, year) to every score by matching
# the score index against movies_df.movieId, then rank from best to worst.
scored_movies = user_movie_prefs.merge(movies_df, left_index=True, right_on='movieId')
user_movie_final = scored_movies.sort_values(by='userScore', ascending=False)
display(user_movie_final.head())

Unnamed: 0,userScore,movieId,title,genres,year
24565,0.808853,115479,"Whip Hand, The","[Action, Adventure, Crime, Drama, Sci-Fi, Thri...",1951
16055,0.804829,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
13250,0.78672,64645,The Wrecking Crew,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1968
26442,0.78672,122787,The 39 Steps,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1959
15001,0.758551,75408,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,"[Action, Animation, Comedy, Crime, Drama, Myst...",2008


In [37]:
# Alternatively, rank the scores first and only look up the 20 best movies.
ranked_scores = user_movie_prefs.sort_values(by='userScore', ascending=False)
top_20_user_scores = ranked_scores.head(20)

# Catalogue rows for exactly those 20 movie ids
is_top_20 = movies_df['movieId'].isin(top_20_user_scores.index)
top_20_user_movies = movies_df[is_top_20]

# Join scores and catalogue details; the left frame's order keeps the ranking
top_20_user_movies = top_20_user_scores.merge(
    top_20_user_movies, left_index=True, right_on='movieId'
)
display(top_20_user_movies)

Unnamed: 0,userScore,movieId,title,genres,year
24565,0.808853,115479,"Whip Hand, The","[Action, Adventure, Crime, Drama, Sci-Fi, Thri...",1951
16055,0.804829,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
13250,0.78672,64645,The Wrecking Crew,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1968
26442,0.78672,122787,The 39 Steps,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1959
15001,0.758551,75408,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,"[Action, Animation, Comedy, Crime, Drama, Myst...",2008
15073,0.758551,76153,Lupin III: First Contact (Rupan Sansei: Faasut...,"[Action, Animation, Comedy, Crime, Drama, Myst...",2002
11494,0.744467,49530,Blood Diamond,"[Action, Adventure, Crime, Drama, Thriller, War]",2006
4625,0.740443,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
15562,0.730382,79132,Inception,"[Action, Crime, Drama, Mystery, Sci-Fi, Thrill...",2010
27514,0.72837,127341,Longshot,"[Action, Comedy, Crime, Drama, Romance, Thriller]",2001


In [38]:
#------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------
# Collaborative Filtering
#------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------