In [1]:
import os as os
import numpy as np
import pandas as pd
from surprise import AlgoBase, Dataset, PredictionImpossible
from surprise import BaselineOnly, Reader
from surprise.model_selection import cross_validate
from scipy.sparse import csr_matrix, diags

# path to dataset file
file_path = os.path.expanduser("C:/Users/AI-Lab/Desktop/推薦系統/ml-100k/ml-100k/u.data")

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format="user item rating timestamp", sep="\t")

data = Dataset.load_from_file(file_path, reader=reader)



In [2]:
# Create a dummy sparse cosine similarity matrix
num_items = 1682  # Update this to the actual number of items in your dataset
density = 0.01  # The density of the matrix
data_points = np.random.choice([0, 1], size=(num_items, num_items), p=[1 - density, density])
cosine_sim_sparse = csr_matrix(data_points)
cosine_sim_sparse.setdiag(np.ones(num_items))



  self._set_arrayXarray(i, j, x)


In [3]:
# Define a custom algorithm using the cosine similarity
class CosineSimilarityBasedAlgorithm(AlgoBase):
    def __init__(self, cosine_sim, k=10):
        AlgoBase.__init__(self)
        self.cosine_sim = cosine_sim
        self.k = k  # Number of top similar items to consider
        self.similarity_cache = {}  # Cache for storing item similarities

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        return self

    def estimate(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        if i not in self.similarity_cache:
            # Compute top-k similar items
            similarities = self.cosine_sim[i].toarray().ravel()
            top_k_items = np.argsort(similarities)[-self.k:][::-1]
            self.similarity_cache[i] = top_k_items

        neighbors = [(alt_iid, self.cosine_sim[i, alt_iid])
                     for alt_iid in self.similarity_cache[i]
                     if self.trainset.knows_item(alt_iid) and alt_iid != i]

        if not neighbors:
            raise PredictionImpossible('No neighbors')

        sim_total = weighted_sum = 0
        for alt_iid, similarity in neighbors:
          for (item, rating) in self.trainset.ur[u]:
            if item == alt_iid:
              similarity_value = similarity  # similarity is already an integer
              sim_total += similarity_value
              weighted_sum += similarity_value * rating
              break
          if sim_total == 0:
            raise PredictionImpossible('No neighbors with non-zero similarity')

        predicted_rating = weighted_sum / sim_total if sim_total else 0
        return predicted_rating

In [4]:
# Instantiate the custom algorithm with the precomputed cosine similarity matrix
algo = CosineSimilarityBasedAlgorithm(cosine_sim=cosine_sim_sparse, k=10)

# Evaluate the performance on the dataset
results = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=True)

Evaluating RMSE of algorithm CosineSimilarityBasedAlgorithm on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.1439  1.1422  1.1408  1.1423  0.0013  
Fit time          0.00    0.01    0.01    0.01    0.00    
Test time         6.67    6.46    6.45    6.53    0.10    


In [5]:
# Split the dataset into train and test sets
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=0.25)

# Train the algorithm on the training set
algo.fit(trainset)

# Make recommendations for all users in the test set
predictions = algo.test(testset)

In [6]:
# Organize predictions into a dictionary where keys are user IDs and values are lists of recommended item IDs
user_predictions = {}
for prediction in predictions:
    user_id = prediction.uid
    item_id = prediction.iid
    estimated_rating = prediction.est
    if user_id not in user_predictions:
        user_predictions[user_id] = []
    user_predictions[user_id].append((item_id, estimated_rating))

In [7]:
# Calculate NDCG for each user
user_true_ratings = {}
for uid, iid, true_r in testset:
    if uid not in user_true_ratings:
        user_true_ratings[uid] = {}
    user_true_ratings[uid][iid] = true_r

# calcuate every client NDCG
ndcg_scores = []
for uid, user_ratings in user_predictions.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    true_ratings = [user_true_ratings[uid][iid] if iid in user_true_ratings[uid] else 0 for iid, _ in user_ratings[:10]]
    estimated_ratings = [rating for _, rating in user_ratings[:10]]
    dcg = sum([(2 ** true - 1) / np.log2(i + 2) for i, true in enumerate(true_ratings)])
    idcg = sum([(2 ** true - 1) / np.log2(i + 2) for i, true in enumerate(sorted(true_ratings, reverse=True))])
    ndcg = dcg / idcg if idcg > 0 else 0
    ndcg_scores.append(ndcg)

average_ndcg = np.mean(ndcg_scores)
print("Average NDCG:", average_ndcg)

Average NDCG: 0.8211228297594961
