In [1]:
# Importing standard libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
%matplotlib inline

In [2]:
# Load the user -> movie ratings table from the CSV file in the working directory
ratings = pd.read_csv('ratings.csv')
# Preview the first five rows to check which columns were parsed
ratings.head(n=5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Load the user table
users = pd.read_csv('users.csv')
# One-hot encode gender.
# MultiLabelBinarizer expects one *collection* of labels per row. Given bare strings it iterates
# over the characters of every value, which only produces sensible columns while each label is a
# single character. Wrapping each value in a one-element list encodes the whole value as one label.
mlb = MultiLabelBinarizer()
# users.pop('sex') removes the original column in place; the indicator columns are then joined
# back on the same index, named after the classes the binarizer found.
users = users.join(pd.DataFrame(mlb.fit_transform(users.pop('sex').map(lambda label: [label])),
                                columns=mlb.classes_,
                                index=users.index))
# Preview the encoded frame. Only the last expression of a cell is rendered, so the preview
# has to come last to be shown at all.
users.head()

In [4]:
# Load the movie table
items = pd.read_csv('items.csv')
# Discard the video_release_date and imdb_url columns, then every row that still has a missing value.
# The result is reassigned instead of mutated with inplace=True, so each step yields a new frame
# and the pipeline reads top-down.
items = (
    items
    .drop(columns=['video_release_date', 'imdb_url'])
    .dropna()
)
# Preview the cleaned frame. Only the last expression of a cell is rendered, so the preview
# has to come last to be shown at all.
items.head()

In [5]:
# Build a table of the movies that received more than 50 ratings, in one chain:
#   1. count the ratings of each movie_id
#   2. turn the counts into a one-column frame called number_of_ratings
#   3. keep only the movies whose count is above 50
#   4. move movie_id from the index into a regular column (a fresh 0..n-1 index takes its place)
ratings_over_fifty = (
    ratings.groupby('movie_id')['rating']
    .count()
    .to_frame('number_of_ratings')
    .loc[lambda counts: counts['number_of_ratings'] > 50]
    .reset_index()
)
# Show the first 20 movies that passed the threshold
ratings_over_fifty.head(20)

Unnamed: 0,movie_id,number_of_ratings
0,1,452
1,2,131
2,3,90
3,4,209
4,5,86
5,7,392
6,8,219
7,9,299
8,10,89
9,11,236


In [6]:
# Define function to check if a movie in the main dataset has over 50 ratings or not
def rows_above_under_fifty(row, popular_movie_ids=None):
    """Tell whether a movie id belongs to the set of movies with over 50 ratings.

    Parameters
    ----------
    row : scalar
        A single movie id. Despite the name it is one value, not a DataFrame row; this is
        what ``Series.apply`` passes in for each element.
    popular_movie_ids : array-like, set or None, optional
        The movie ids to test membership against (list, ndarray, Series, set or frozenset).
        When omitted, the ``movie_id`` column of the notebook-level ``ratings_over_fifty``
        frame is used, which is the original behaviour.

    Returns
    -------
    bool
        True if ``row`` equals at least one of the ids, False otherwise (including when the
        collection of ids is empty).
    """
    if popular_movie_ids is None:
        # Backward-compatible default: rely on the table built earlier in the notebook
        popular_movie_ids = ratings_over_fifty['movie_id']
    if isinstance(popular_movie_ids, (set, frozenset)):
        # np.asarray would wrap a set into a 0-d object array and the comparison below would
        # silently be False, so sets are handled with a plain membership test.
        return row in popular_movie_ids
    # Element-wise comparison against every id, collapsed into a single plain Python bool
    return bool(np.any(np.asarray(popular_movie_ids) == row))

In [7]:
# Flag every rating row whose movie is listed in ratings_over_fifty. The result is a boolean
# Series aligned with `ratings`.
# Series.isin performs the membership test in one vectorised pass, instead of scanning the whole
# ratings_over_fifty table once per rating row from a Python-level .apply callback.
series_above_under_fifty = ratings['movie_id'].isin(ratings_over_fifty['movie_id'])
# In the preview below the 1st and 2nd rows are flagged True, whereas the 3rd one is False
# 1st corresponds to movie_id 242
# 2nd corresponds to movie_id 302
# 3rd corresponds to movie_id 377
series_above_under_fifty.head()

0     True
1     True
2    False
3     True
4     True
Name: movie_id, dtype: bool

In [8]:
# Cross-check the flags above against the ratings_over_fifty table
# movie_id 242 is listed: the printed row reports 117 ratings
print(ratings_over_fifty.loc[ratings_over_fifty['movie_id'].eq(242)])
# movie_id 302 is listed: the printed row reports 297 ratings
print(ratings_over_fifty.loc[ratings_over_fifty['movie_id'].eq(302)])
# movie_id 377 is not listed, so an empty dataframe is printed => it does not have over 50 ratings
print(ratings_over_fifty.loc[ratings_over_fifty['movie_id'].eq(377)])

     movie_id  number_of_ratings
209       242                117
     movie_id  number_of_ratings
257       302                297
Empty DataFrame
Columns: [movie_id, number_of_ratings]
Index: []


In [9]:
# Keep only the rating rows flagged True, i.e. the ratings of movies with more than 50 ratings
ratings_filtered = ratings.loc[series_above_under_fifty]
# Preview: the original row labels are kept, so dropped rows show up as gaps in the index
ratings_filtered.head(n=5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806


In [10]:
# Combine users, items and ratings into a single frame holding every feature:
# ratings are first joined to their movie (on movie_id), then to their user (on user_id)
merged = users.merge(items.merge(ratings, on='movie_id'), on='user_id')
# Same combination, but built from the ratings of movies with more than 50 ratings only
merged_filtered = users.merge(items.merge(ratings_filtered, on='movie_id'), on='user_id')
# Preview the unfiltered result
merged.head(n=5)

Unnamed: 0,user_id,age,occupation,zip_code,F,M,movie_id,title,release_date,unknown,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating,unix_timestamp
0,1,24,technician,85711,0,1,1,Toy Story (1995),01-Jan-1995,0,...,0,0,0,0,0,0,0,0,5,874965758
1,1,24,technician,85711,0,1,2,GoldenEye (1995),01-Jan-1995,0,...,0,0,0,0,0,1,0,0,3,876893171
2,1,24,technician,85711,0,1,3,Four Rooms (1995),01-Jan-1995,0,...,0,0,0,0,0,1,0,0,4,878542960
3,1,24,technician,85711,0,1,4,Get Shorty (1995),01-Jan-1995,0,...,0,0,0,0,0,0,0,0,3,876893119
4,1,24,technician,85711,0,1,5,Copycat (1995),01-Jan-1995,0,...,0,0,0,0,0,1,0,0,3,889751712


In [11]:
# Count the rows of a movie by summing the boolean mask (True counts as 1)
# movie_id 242 has rows in the filtered dataset
print('Filtered, movie_id = 242, Num of rows = ' + str(int((merged_filtered['movie_id'] == 242).sum())))
# movie_id 377 has no rows left in the filtered dataset
print('Filtered, movie_id = 377, Num of rows = ' + str(int((merged_filtered['movie_id'] == 377).sum())))
# movie_id 377 is still present in the unfiltered dataset
print('Unfiltered, movie_id = 377, Num of rows = ' + str(int((merged['movie_id'] == 377).sum())))

Filtered, movie_id = 242, Num of rows = 117
Filtered, movie_id = 377, Num of rows = 0
Unfiltered, movie_id = 377, Num of rows = 13


In [12]:
# Split the merged dataframe (users + items + ratings) into a train and a test set
from sklearn.model_selection import train_test_split

# random_state pins the shuffle: without it every run draws a different 90/10 split, so the
# recommendations and the RMSE reported further down could not be reproduced.
# Model with the whole dataset
ratings_train, ratings_test = train_test_split(merged, test_size=0.10, random_state=42)
# Model with only the movies that have more than 50 ratings
#ratings_train, ratings_test = train_test_split(merged_filtered, test_size=0.10, random_state=42)

In [13]:
# Turi Create is Apple's high-level machine learning library
import turicreate as tc
# Wrap both splits in SFrames, rebinding the same two names (train first, then test)
ratings_train, ratings_test = (tc.SFrame(split) for split in (ratings_train, ratings_test))

  from ._conv import register_converters as _register_converters


In [14]:
# Baseline: a popularity recommender, which recommends movies based on how popular they are.
# user_id names the column identifying users, item_id the column holding what gets
# recommended (movie_id), and target the column with the rating to learn from.
popularity_model = tc.popularity_recommender.create(
    ratings_train,
    user_id='user_id',
    item_id='movie_id',
    target='rating',
)

In [15]:
# Ask the popularity model for 5 movies for each of the users with id 1 to 5
popularity_recomm = popularity_model.recommend(users=list(range(1, 6)), k=5)
# 5 users x 5 movies = 25 rows, so print them all
popularity_recomm.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   814    |  5.0  |  1   |
|    1    |   1500   |  5.0  |  2   |
|    1    |   1293   |  5.0  |  3   |
|    1    |   1467   |  5.0  |  4   |
|    1    |   1367   |  5.0  |  5   |
|    2    |   814    |  5.0  |  1   |
|    2    |   1500   |  5.0  |  2   |
|    2    |   1293   |  5.0  |  3   |
|    2    |   1467   |  5.0  |  4   |
|    2    |   1367   |  5.0  |  5   |
|    3    |   814    |  5.0  |  1   |
|    3    |   1500   |  5.0  |  2   |
|    3    |   1293   |  5.0  |  3   |
|    3    |   1467   |  5.0  |  4   |
|    3    |   1367   |  5.0  |  5   |
|    4    |   814    |  5.0  |  1   |
|    4    |   1500   |  5.0  |  2   |
|    4    |   1293   |  5.0  |  3   |
|    4    |   1467   |  5.0  |  4   |
|    4    |   1367   |  5.0  |  5   |
|    5    |   814    |  5.0  |  1   |
|    5    |   1500   |  5.0  |  2   |
|    5    |   1293   |  5.0  |  3   |
|    5    | 

In [16]:
# The model above was a simple popularity baseline; next comes a collaborative-filtering model.
# A ranking factorization recommender is trained to predict a rating for each possible
# combination of user and movie. Its internal coefficients are learned from the known ratings
# users gave to movies, and recommendations are then based on the predicted ratings.
# NOTE: the variable is called item_sim_model, yet what it holds is the ranking factorization
# recommender created here.
item_sim_model = tc.ranking_factorization_recommender.create(
    ratings_train,
    user_id='user_id',
    item_id='movie_id',
    target='rating',
)

In [17]:
# Ask the factorization model for 5 movies for each of the users with id 1 to 5
item_sim_recomm = item_sim_model.recommend(users=list(range(1, 6)), k=5)
# 5 users x 5 movies = 25 rows, so print them all
item_sim_recomm.print_rows(num_rows=25)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   286    | 4.0907851423458705 |  1   |
|    1    |   302    | 4.049964782162155  |  2   |
|    1    |   762    | 3.966235239370311  |  3   |
|    1    |   471    | 3.9660469855503684 |  4   |
|    1    |   591    | 3.9560885991291648 |  5   |
|    2    |   191    | 4.2581757924841295 |  1   |
|    2    |   427    | 4.246132301478232  |  2   |
|    2    |    12    | 4.244044440178717  |  3   |
|    2    |   238    | 4.243991466550673  |  4   |
|    2    |   357    | 4.236781122474516  |  5   |
|    3    |   153    | 4.139021735949958  |  1   |
|    3    |   185    |  4.1327195282416   |  2   |
|    3    |   483    | 4.070365678830589  |  3   |
|    3    |   211    | 4.062940310998405  |  4   |
|    3    |   480    | 4.058870997591461  |  5   |
|    4    |   302    | 4.052674558325256  |  1   |
|    4    |   313    | 4.035897

In [18]:
# Evaluate the model with RMSE (Root Mean Square Error) on the held-out test set; lower is better.
# A lower value means the predicted ratings lie closer to the ratings users actually gave.
# For a visual explanation of the idea (shown there for a regression line) refer to:
# https://www.khanacademy.org/math/ap-statistics/bivariate-data-ap/assessing-fit-least-squares-regression/v/standard-dev-residuals
item_sim_model.evaluate_rmse(
    ratings_test,
    target='rating',
)
# In the original author's run the RMSE on the testing data was ~1, which is not too bad. It was also
# quite close to the RMSE of the model on the training data, reported as ~0.73.

{'rmse_by_user': Columns:
 	user_id	int
 	rmse	float
 	count	int
 
 Rows: 924
 
 Data:
 +---------+--------------------+-------+
 | user_id |        rmse        | count |
 +---------+--------------------+-------+
 |   747   | 1.0253579557650696 |   26  |
 |   118   | 1.1566056772649114 |   6   |
 |   153   | 0.7576928995309292 |   3   |
 |   660   | 0.7986550819196537 |   24  |
 |    92   | 0.9467938661862798 |   36  |
 |   264   | 0.8111248554854931 |   11  |
 |   690   | 1.013448401167215  |   5   |
 |   839   | 1.340271080153556  |   7   |
 |   837   | 1.1959237840553119 |   3   |
 |   208   | 0.7263953991261606 |   5   |
 +---------+--------------------+-------+
 [924 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_item': Columns:
 	movie_id	int
 	rmse	float
 	count	int
 
 Rows: 1253
 
 Data:
 +----------+--------------------+-------+
 | movie_id |        rmse        | cou