In [10]:
import os as os
import pandas as pd
from surprise import Dataset, Reader

In [13]:
# path to dataset file
file_path = os.path.expanduser("C:/Users/AI-Lab/Desktop/推薦系統/ml-100k/ml-100k/u.data")

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format="user item rating timestamp", sep="\t")

data = Dataset.load_from_file(file_path, reader=reader)

#Set columns for pandas to read
data_cols = ["user_id", "movie_id", "rating", "timestamp"]

data_pandas = pd.read_csv("C:/Users/AI-Lab/Desktop/推薦系統/ml-100k/ml-100k/u.data",
                            sep="\t", names=data_cols, encoding="latin-1",)
data_pandas.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [16]:
import numpy as np
from surprise import SVD
from surprise.model_selection import GridSearchCV

In [17]:
# Define a parameter grid to search over
param_grid = {
    'n_epochs': [5, 10], # Number of epochs. You can try different numbers here.
    'lr_all': [0.002, 0.005], # Learning rate. You can try different values here.
    'reg_all': [0.4, 0.6] # Regularization term. You can try different values here.
}

# Setup the grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)

# Perform the grid search
gs.fit(data)

# Best RMSE score
print(f'RMSE: {gs.best_score["rmse"]}')


RMSE: 0.9611798776421395


In [18]:
from surprise.model_selection import train_test_split

In [19]:
# Load the dataset and train the model using the best parameters found by GridSearchCV
trainset, testset = train_test_split(data, test_size=0.2)

best_params = gs.best_params['rmse']
algo = SVD(n_epochs=best_params['n_epochs'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])
algo.fit(trainset)

# Make predictions on the testset
predictions = algo.test(testset)

In [20]:
# Calculate_ndcg
def calculate_ndcg(predictions, k=10):
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))

    ndcg = 0
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        user_ratings = user_ratings[:k]

        dcg = sum([true_r / np.log2(i + 2) for i, (_, true_r) in enumerate(user_ratings)])
        idcg = sum([np.log2(i + 2) for i in range(len(user_ratings))])
        ndcg += dcg / idcg if idcg > 0 else 0

    return ndcg / len(user_est_true)

ndcg_value = calculate_ndcg(predictions, k=10)
print(f'NDCG: {ndcg_value}')

NDCG: 0.8889543854295812
