In [1]:
import numpy as np
import pandas as pd

### Read data

In [2]:
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
data = pd.merge(ratings_df, movies_df, on=['movieId'])
data.drop(['timestamp'], axis=1, inplace=True)
print("data dimensions:", data.shape)
data.head()

data dimensions: (100836, 5)


Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
num_users = data['userId'].nunique()
num_movies = movies_df['movieId'].nunique()
print("total number of users:", num_users)
print("total number of movies:", num_movies)

total number of users: 610
total number of movies: 9742


In [16]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [15]:
# create training/testing datasets
train_df, test_df = train_test_split(data, test_size=0.2)

reader = Reader() # default rating scale is (1,5)
train_data_sur = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
train_data = train_data_sur.build_full_trainset()

test_data_sur = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader)
test_data = test_data_sur.build_full_trainset()

In [24]:
# finding the best params
param_grid = {'n_factors': [1, 10, 100, 1000],
              'n_epochs': [5, 10, 15, 20], 
              'lr_all': [0.001, 0.003, 0.009, 0.01],
              'reg_all': [0, 0.01, 0.02, 0.04]}
gs = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(train_data_sur)
gs.best_score['rmse']

0.8739117884702884

In [25]:
params = gs.best_params['rmse']
params

{'n_factors': 1, 'n_epochs': 20, 'lr_all': 0.009, 'reg_all': 0.04}

In [26]:
svd = SVD(n_factors=params['n_factors'], 
          n_epochs=params['n_epochs'],
          lr_all=params['lr_all'], 
          reg_all=params['reg_all'], verbose=True)
svd.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1796a5670>

In [27]:
preds = svd.test(test_data.build_testset())
print("RMSE:", accuracy.rmse(preds, verbose=False))

RMSE: 0.8714248515222702
