In [13]:
import pandas as pd
import requests
from surprise import Dataset
from surprise import Reader

from surprise import SVD # matrix factorization algorithm
from surprise.model_selection import GridSearchCV

In [9]:
# Ratings file from the ml-25m dataset directory; show two random rows as a sanity check.
RATINGS_CSV = 'datasets/ml-25m/ratings_tmdb.csv'
ratings = pd.read_csv(RATINGS_CSV)
ratings.sample(n=2)

Unnamed: 0,userId,tmdbId,rating,timestamp
11792271,102076,1637,3.5,1054321149
4351997,37664,544,3.5,1199090557


In [11]:
# Largest user id present in the ratings data.
ratings.loc[:, 'userId'].max()

162541

Adding my votes

In [14]:
# Id under which my own votes are stored by the voting service.
USER_ID = 200_000

# Fetch this user's votes. The timeout stops the cell from hanging forever on an
# unresponsive server, and raise_for_status() turns an HTTP error response into
# an exception instead of letting an error body be parsed as votes.
response = requests.get(f"http://cinema.itamar.live/votes/{USER_ID}", timeout=30)
response.raise_for_status()
votes_json = response.json()

# One row per vote; show a random row as a sanity check.
my_votes = pd.DataFrame.from_dict(votes_json)
my_votes.sample(1)

Unnamed: 0,movie_id,vote,user_id
31,210577,3,200000


In [15]:
# Give the vote columns the same names the ratings frame uses.
column_aliases = {'movie_id': 'tmdbId', 'vote': 'rating', 'user_id': 'userId'}
my_votes = my_votes.rename(columns=column_aliases)
my_votes.sample(n=1)

Unnamed: 0,tmdbId,rating,userId
21,532067,1,200000


In [16]:
# Append my votes to the ratings. Rows that already belong to the user id(s) in
# my_votes are dropped first, so re-running this cell replaces my votes instead
# of appending a second copy of them to the training data.
others_ratings = ratings[~ratings['userId'].isin(my_votes['userId'])]
ratings = pd.concat([others_ratings, my_votes], ignore_index=True)
ratings.sample(1)

Unnamed: 0,userId,tmdbId,rating,timestamp
179314,1686,18,2.0,915471473.0


## Important
SVD (via `Dataset.load_from_df`) expects a dataframe with the columns in this order: userId, movieId (here `tmdbId`), rating

In [18]:

# Take the rating scale from the data instead of hard-coding (1, 5): the ratings
# include half-star values (presumably down to 0.5 - confirm), and Surprise clips
# its estimates to this scale.
rating_bounds = (ratings["rating"].min(), ratings["rating"].max())
reader = Reader(rating_scale=rating_bounds)

# Load the pandas dataframe; the columns must be ordered user, item, rating.
data = Dataset.load_from_df(ratings[["userId", "tmdbId", "rating"]], reader)

Params Tuning

In [19]:
# Candidate SVD hyper-parameters; every combination is cross-validated.
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)

# 3-fold cross-validation, scored on both RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Report the best RMSE, then the parameter combination that achieved it.
for report in (gs.best_score, gs.best_params):
    print(report["rmse"])

KeyboardInterrupt: 

In [20]:
# Train on every available rating (no hold-out split).
trainingSet = data.build_full_trainset()

# The factor initialisation is random, so a fixed random_state keeps the fitted
# model and every estimate derived from it reproducible between runs.
svd = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4, random_state=42)

svd.fit(trainingSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8beed686d0>

In [28]:
# Spot-check a single estimate: my predicted rating for RocknRolla.
ROCKNROLLA_TMDB_ID = 13809
prediction = svd.predict(uid=USER_ID, iid=ROCKNROLLA_TMDB_ID)
prediction.est

3.4509163780793792

In [None]:
import pickle

# Persist the fitted model so it can be reloaded without retraining.
with open('svd_25m.pkl', mode='wb') as model_file:
    model_file.write(pickle.dumps(svd))

## Prediction for all movies

In [29]:
# Read only the id column. This gives a fresh frame rather than a selection from
# a wider one, so adding a column below cannot raise SettingWithCopyWarning.
movies = pd.read_csv("datasets/mine/top_voted_movies.csv", usecols=["id"])

# Estimated rating of USER_ID for every listed movie. The lambda parameter is
# named movie_id so it does not shadow the builtin id().
movies["est"] = movies["id"].apply(lambda movie_id: svd.predict(USER_ID, movie_id).est)

In [35]:
# Write one JSON object per movie (id and estimated rating) to disk.
movies.to_json(path_or_buf="recommended.json", orient="records")