In [1]:
import surprise
print(surprise.__version__)

1.1.1


### System construction using Surprise

In [2]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [3]:
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] 

 y


Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\zephy/.surprise_data/ml-100k


In [7]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2326e6bcfd0>

In [11]:
# predict ratings for the input test data set
predictions = algo.test(testset)
print('prediction type:', type(predictions), 'size:', len(predictions))
print('prediction 5 results:\n')
predictions[:5]

prediction type: <class 'list'> size: 25000
prediction 5 results:



[Prediction(uid='120', iid='282', r_ui=4.0, est=3.6540622220817593, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=3.9815433839887824, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=3.8101968034755096, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.4069739105580714, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.3978810744459222, details={'was_impossible': False})]

In [12]:
# access to data
[(pred.uid, pred.iid, pred.est) for pred in predictions[:3]]

[('120', '282', 3.6540622220817593),
 ('882', '291', 3.9815433839887824),
 ('535', '507', 3.8101968034755096)]

In [13]:
# predict ratings for the input user info -> .predict()
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
print(pred)

user: 196        item: 302        r_ui = None   est = 4.47   {'was_impossible': False}


In [14]:
# evaluation
accuracy.rmse(predictions)

RMSE: 0.9513


0.9512640835251172

### Surprise modules 

In [15]:
# Dataset.load_builtin
# Dataset.load_from_file
# Dataset.load_from_df
import pandas as pd

ratings = pd.read_csv('./MovieLens_Nearest_Neighbor/ratings.csv')
# remove index and header
ratings.to_csv('./MovieLens_Nearest_Neighbor/ratings_no_header.csv', index=False, header=False)

In [17]:
from surprise import Reader

# Defining data parsing format 
# When it's being loades, only three columns, column 0~ column 2, are loaded.
reader = Reader(line_format = 'user item rating timestamp', sep=',', rating_scale=(0.5, 5))
data = Dataset.load_from_file('./MovieLens_Nearest_Neighbor/ratings_no_header.csv', reader=reader)

In [18]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)
# n_factors : number of latent factors
algo = SVD(n_factors=50, random_state=0)

# training with trainding data, and predicting ratings using testing data
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8682


0.8681952927143516

In [19]:
# Data loading from data frame
import pandas as pd
from surprise import Reader, Dataset

ratings = pd.read_csv('./MovieLens_Nearest_Neighbor/ratings.csv')
reader = Reader(rating_scale=(0.5,5.0))

# rating datafrome needs to have columns in order of "userId, itemId, rating"
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25, random_state=0)

algo = SVD(n_factors=50, random_state=0)
algo.fit(trainset)
predictions=algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8682


0.8681952927143516

### Cross Validation and Hyperparameter tuning

In [22]:
from surprise.model_selection import cross_validate

# Data loading from Pandas dataframe
ratings = pd.read_csv('./MovieLens_Nearest_Neighbor/ratings.csv')
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

algo = SVD(random_state=0)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8695  0.8692  0.8619  0.8810  0.8800  0.8723  0.0072  
MAE (testset)     0.6688  0.6663  0.6663  0.6762  0.6753  0.6706  0.0043  
Fit time          5.04    5.07    5.13    5.66    5.23    5.23    0.23    
Test time         0.13    0.13    0.27    0.13    0.13    0.16    0.05    


{'test_rmse': array([0.86950635, 0.86917774, 0.8619118 , 0.88098171, 0.88001759]),
 'test_mae': array([0.66881643, 0.66627282, 0.66633127, 0.67618699, 0.67526475]),
 'fit_time': (5.044509649276733,
  5.073760032653809,
  5.125361680984497,
  5.662141799926758,
  5.225922584533691),
 'test_time': (0.13382458686828613,
  0.1288740634918213,
  0.2667093276977539,
  0.13184857368469238,
  0.1328601837158203)}

In [26]:
from surprise.model_selection import GridSearchCV
# Parameters for optimization
param_grid = {'n_epochs':[20, 40, 60], 'n_factors':[50, 100, 200]}

# CV =3, measures = ['rmse', 'mae'] GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8777028987418909
{'n_epochs': 20, 'n_factors': 50}


### Personalized movie recommendation system using Surprise

In [27]:
# This code will raise error
# In order to use .fit() methode, input data should be Trainset type that can be made through train_test_split
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
algo = SVD(n_factors=50, random_state=0)
algo.fit(data)

AttributeError: 'DatasetAutoFolds' object has no attribute 'global_mean'

In [28]:
# In order to use whole data set as an input, use DatasetAutoFold class
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5,5))
data_folds = DatasetAutoFolds(ratings_file='./MovieLens_Nearest_Neighbor/ratings_no_header.csv', reader=reader)

# Trainset transform
trainset = data_folds.build_full_trainset()

In [29]:
algo = SVD(n_epochs=20, n_factors=50, random_state=0)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23276f37bb0>

In [30]:
# movies info loading
movies = pd.read_csv('./MovieLens_Nearest_Neighbor/movies.csv')

# userId=9, movieId=42 rating check
movieIds = ratings[ratings['userId']==9]['movieId']
if movieIds[movieIds==42].count()==0:
    print('User9 didn''t rate the movieId 42')
print(movies[movies['movieId']==42])

User9 didnt rate the movieId 42
    movieId                   title              genres
38       42  Dead Presidents (1995)  Action|Crime|Drama


In [31]:
uid = str(9)
iid = str(42)
pred = algo.predict(uid, iid, verbose=True)

user: 9          item: 42         r_ui = None   est = 3.13   {'was_impossible': False}


In [32]:
def get_unseen_surprise(ratings, movies, userId):
    seen_movies = ratings[ratings['userId']==userId]['movieId'].tolist()
    # Whole movie list
    total_movies = movies['movieId'].tolist()
    # unseen movie extraction
    unseen_movies = [movie for movie in total_movies if movie not in seen_movies]
    print('number of rated movies:', len(seen_movies), 'number of unseen movies:',
          len(unseen_movies), 'number of total movies:', len(total_movies))
    return unseen_movies

unseen_movies = get_unseen_surprise(ratings, movies, 9)


number of rated movies: 46 number of unseen movies: 9696 number of total movies: 9742


In [36]:
def recomm_movie_by_surprise(algo, userId, unseen_movies, top_n=10):
    predictions = [algo.predict(str(userId), str(movieId)) for movieId in unseen_movies]
    # predictions list [Prediction(uid='9', iid='1', est=3.69), Prediction(....)]
    # sort by est
    def sortkey_est(pred):
        return pred.est
    predictions.sort(key=sortkey_est, reverse=True)
    top_predictions = predictions[:top_n]
    
    # get top_prediction movie info 
    top_movie_ids = [int(pred.iid) for pred in top_predictions]
    top_movie_rating = [pred.est for pred in top_predictions]
    top_movie_titles = movies[movies['movieId'].isin(top_movie_ids)]['title']
    top_movie_preds = [(id, title, rating) for id, title, rating in zip(top_movie_ids, top_movie_titles, top_movie_rating)]
    
    return top_movie_preds

unseen_movies = get_unseen_surprise(ratings, movies, 9)
top_movie_preds = recomm_movie_by_surprise(algo, 9, unseen_movies, top_n=10)

print('#### Top-10 recommanded movies for user 9 ####')
for top_movie in top_movie_preds:
    print(top_movie[1], ":", top_movie[2])

number of rated movies: 46 number of unseen movies: 9696 number of total movies: 9742
#### Top-10 recommanded movies for user 9 ####
Usual Suspects, The (1995) : 4.306302135700814
Star Wars: Episode IV - A New Hope (1977) : 4.281663842987387
Pulp Fiction (1994) : 4.278152632122759
Silence of the Lambs, The (1991) : 4.226073566460876
Godfather, The (1972) : 4.1918097904381995
Streetcar Named Desire, A (1951) : 4.154746591122658
Star Wars: Episode V - The Empire Strikes Back (1980) : 4.122016128534504
Star Wars: Episode VI - Return of the Jedi (1983) : 4.108009609093436
Goodfellas (1990) : 4.083464936588478
Glory (1989) : 4.07887165526957
