In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score

from collections import Counter
from tqdm import tqdm
import random

In [2]:
random_seed = 42

In [3]:
data_path = '../../../data/All.csv'
df = pd.read_csv(data_path)

In [4]:
sampled_df = df['SessionId'].sample(n=1000, random_state=random_seed)
df = df[df['SessionId'].isin(sampled_df)]

In [5]:
last_items = df.loc[df.groupby('SessionId')['Time'].idxmax()]
last_items = last_items[['SessionId', 'ItemId']]

In [6]:
other_items = df.drop(last_items.index)
other_items = other_items[['SessionId', 'ItemId']]

In [7]:
user_item_matrix = other_items.pivot_table(index='SessionId', columns='ItemId', aggfunc='size', fill_value=0)

In [8]:
sparse_user_item = csr_matrix(user_item_matrix.values)

In [9]:
def calculate_ndcg_in_batches(sparse_matrix, last_items, session_index_mapping, batch_size):
    ndcg_scores = []
    num_sessions = sparse_matrix.shape[0]

    for start_row in tqdm(range(0, num_sessions, batch_size)):
        end_row = min(start_row + batch_size, num_sessions)
        sim_batch = cosine_similarity(sparse_matrix[start_row:end_row], sparse_matrix)

        for batch_index, global_index in enumerate(range(start_row, end_row)):
            session_id = list(session_index_mapping.keys())[list(session_index_mapping.values()).index(global_index)]
            
            if session_id in last_items['SessionId'].values:
                true_item = last_items.loc[last_items['SessionId'] == session_id, 'ItemId'].values[0]
                similar_sessions = sim_batch[batch_index]
                top_similar_indices = similar_sessions.argsort()[-101:-1] #100
                recommended_item_ids = []
                for idx in top_similar_indices:
                    session_data = sparse_matrix.getrow(idx)
                    interacted_items = session_data.indices
                    recommended_item_ids.extend(interacted_items)

                item_counts = Counter(recommended_item_ids)
                viewed_items = set(other_items.loc[other_items['SessionId'] == session_id, 'ItemId'])
                candidate_items = [item for item, count in item_counts.items() if item not in viewed_items]

                if len(candidate_items) > 100:
                    predicted_items_id = random.sample(candidate_items, 100) #100
                    relevance = np.isin(predicted_items_id, true_item).astype(int)
                    ndcg = ndcg_score([relevance], [relevance])
                    ndcg_scores.append(ndcg)
                else:
                    continue

    return ndcg_scores

In [10]:
session_index_mapping = {session_id: index for index, session_id in enumerate(df['SessionId'].unique())}

In [11]:
ndcgs = calculate_ndcg_in_batches(sparse_user_item, last_items, session_index_mapping, batch_size=1000)
average_ndcg = np.mean(ndcgs)
print("Average NDCG@100:", average_ndcg)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.77s/it]

Average NDCG@100: 0.00911854103343465



