In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [2]:
# Read the ml-25m ratings and movie metadata into two separate DataFrames.
# The integer/float dtypes are pinned to 32-bit, narrower than the pandas defaults.
RATINGS_DTYPES = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}
MOVIES_DTYPES = {'movieId': 'int32', 'title': 'str', 'genres': 'str'}

# Each dtype mapping doubles as the column selection (its keys, in order)
ratings_df = pd.read_csv(
    './ml-25m/ratings.csv',
    usecols=list(RATINGS_DTYPES),
    dtype=RATINGS_DTYPES,
)
movies_df = pd.read_csv(
    './ml-25m/movies.csv',
    usecols=list(MOVIES_DTYPES),
    dtype=MOVIES_DTYPES,
)

In [3]:
# Expand the pipe-delimited genres string into one 0/1 indicator column per genre
genres_df = movies_df['genres'].str.get_dummies(sep='|')

# Swap the raw genres column for the indicator columns, placed side by side
movies_df = pd.concat(
    [movies_df.drop(columns=['genres']), genres_df],
    axis='columns',
)
movies_df.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Attach movie metadata to every rating. merge() defaults to an inner join, so
# ratings whose movieId is missing from movies_df are dropped here.
merge_df = ratings_df.merge(movies_df, on='movieId')

# Declare the rating bounds for Surprise. MovieLens ratings go from 0.5 to 5.0
# stars in half-star steps, so the lower bound is 0.5 rather than 0; Surprise
# clips its predicted ratings to this range.
reader = Reader(rating_scale=(0.5, 5))

# Surprise expects exactly three columns, in this order: user id, item id, rating
data = Dataset.load_from_df(merge_df[['userId', 'movieId', 'rating']], reader)

In [5]:
# Hold out 20% of the ratings for evaluation. The split shuffles the data, so
# pin the seed to make the train/test partition identical on every run.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
# Define SVD model: 50 latent factors trained for 20 epochs, with a single
# learning rate and regularization term shared by all parameters.
# Factors are randomly initialized, so fix the seed for repeatable training.
model = SVD(n_factors=50, n_epochs=20, reg_all=0.02, lr_all=0.005, random_state=42)

In [7]:
# Train on the training split, then predict every held-out rating;
# fit() hands back the fitted estimator, so the two calls chain directly
predictions = model.fit(train).test(test)

In [8]:
# Evaluate the model using RMSE on the held-out predictions.
# accuracy.rmse() prints its own "RMSE: ..." line by default; silence it so the
# score is reported once, by the formatted print below.
rmse = accuracy.rmse(predictions, verbose=False)

print("Root Mean Squared Error = {:.4f}".format(rmse))

RMSE: 0.9932
Root Mean Squared Error = 0.9932
