In [3]:
pip install scikit-surprise



In [25]:
from surprise import SVD, SVDpp, Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd
from collections import defaultdict
from surprise import SVDpp, Reader, Dataset, accuracy
import math
import numpy as np
from sklearn.metrics import ndcg_score


In [8]:
train_data_path = 'training_data.csv'
test_data_path = 'testing_data.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

In [11]:
#setting up surprise and calculations
def converter_surprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))  # ratings' range
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

trainset, testset = converter_surprise(train_df, test_df)

def calculate_rmse_mae(model, trainset, testset):
    model.fit(trainset)

    test_predictions = model.test(testset)

    rmse = accuracy.rmse(test_predictions)
    mae = accuracy.mae(test_predictions)

    return rmse, mae


In [15]:
#regular svd
svd_model = SVD()
svd_rmse, svd_mae = calculate_rmse_mae(svd_model, trainset, testset)

RMSE: 0.8695
MAE:  0.6682


In [16]:
# svdpp
svdpp_model = SVDpp()
svdpp_rmse, svdpp_mae = calculate_rmse_mae(svdpp_model, trainset, testset)

RMSE: 0.8584
MAE:  0.6594


In [None]:
#svdpp has better values so using that model

In [39]:
#top n
def get_top_n_recommendations(predictions, n):
    recommendations = defaultdict(list)
    actual_ratings = defaultdict(list)

    for user_id, item_id, actual_rating, estimated_rating, _ in predictions:
        recommendations[user_id].append((item_id, estimated_rating))
        actual_ratings[user_id].append((item_id, actual_rating))

    for user_id, recs in recommendations.items():
        recs.sort(key=lambda x: x[1], reverse=True)
        recommendations[user_id] = recs[:n]

    return recommendations, actual_ratings


In [40]:
#precision an recall
def calculate_precision_recall_at_k(predictions, k=10, threshold=3.5):
    user_estimated_true = defaultdict(list)

    for user_id, _, true_rating, estimated_rating, _ in predictions:
        user_estimated_true[user_id].append((estimated_rating, true_rating))

    precision_values = dict()
    recall_values = dict()

    for user_id, ratings in user_estimated_true.items():
        ratings.sort(key=lambda x: x[0], reverse=True)

        relevant_items_count = sum((true_rating >= threshold) for _, true_rating in ratings)
        recommended_items_at_k = sum((estimated_rating >= threshold) for estimated_rating, _ in ratings[:k])
        relevant_and_recommended_at_k = sum(((true_rating >= threshold) and (estimated_rating >= threshold))
                                            for estimated_rating, true_rating in ratings[:k])

        precision_values[user_id] = relevant_and_recommended_at_k / recommended_items_at_k if recommended_items_at_k != 0 else 1
        recall_values[user_id] = relevant_and_recommended_at_k / relevant_items_count if relevant_items_count != 0 else 1

    avg_precision = sum(precision for precision in precision_values.values()) / len(precision_values)
    avg_recall = sum(recall for recall in recall_values.values()) / len(recall_values)

    return avg_precision, avg_recall

In [41]:
def calculate_dcg_at_k(scores):
    return scores[0] + sum(score / math.log(index, 2) for score, index in zip(scores[1:], range(2, len(scores) + 1)))

def calculate_ndcg_at_k(scores):
    ideal_dcg = calculate_dcg_at_k(sorted(scores, reverse=True))
    return calculate_dcg_at_k(scores) / ideal_dcg if ideal_dcg > 0.0 else 0.0


In [45]:
def evaluate_model_performance(model, trainset, testset, n=10):

    model.fit(trainset)

    predictions = model.test(testset)

    top_recs, true_ratings = get_top_n_recommendations(predictions, n)

    avg_precision, avg_recall = calculate_precision_recall_at_k(predictions, k=n)
    f_measure_avg = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) != 0 else 0

    ndcg_scores = dict()
    for user_id, recs in top_recs.items():
        scores = []
        for item_id, estimated_rating in recs:
            found = False
            original_user_ratings = true_ratings[user_id]
            for i, rating in original_user_ratings:
                if item_id == i:
                    scores.append(rating)
                    found = True
                    break
            if not found:
                scores.append(0)
        ndcg_scores[user_id] = calculate_ndcg_at_k(scores)

    avg_ndcg = sum(ndcg for ndcg in ndcg_scores.values()) / len(ndcg_scores)

    # storing reccomendations for future use in optional section
    recommendations_data = []

    for user_id, recs in top_recs.items():
        recs_row = {'userId': user_id}
        for i, (item_id, estimated_rating) in enumerate(recs):
            recs_row[f'rec{i+1}_movieId'] = item_id
            recs_row[f'rec{i+1}_rating'] = estimated_rating
        recommendations_data.append(recs_row)

    recommendations_df = pd.DataFrame(recommendations_data)

    return avg_precision, avg_recall, f_measure_avg, avg_ndcg, predictions, recommendations_df

model = SVDpp()
avg_precision, avg_recall, avg_f_measure, avg_ndcg, test_predictions, recommendations_df = evaluate_model_performance(
    model, trainset, testset, n=10
)

print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F-measure: {avg_f_measure}")
print(f"Average NDCG: {avg_ndcg}")


Average Precision: 0.790362347124642
Average Recall: 0.5442763504878859
Average F-measure: 0.6446321909076375
Average NDCG: 0.9539926393565159


In [46]:
#recommendations_df.to_csv('recommendations.csv', index=False)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>