In [151]:
import sys
from recommenders.datasets.python_splitters import python_chrono_split

from recommenders.datasets import covid_utils
from recommenders.models.tfidf.tfidf_utils import TfidfRecommender
from recommenders.datasets import movielens
# from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
import pandas as pd
from tqdm import tqdm
import torch

In [152]:
movies_df = pd.read_csv('/home/ee303/test/dataset/GENRES/ml-1m/movies.dat',
                        delimiter='::', engine= 'python', header=None,
                        names=['movie_name', 'genre'],encoding='latin1')
#preprocess movie.csv file
movies_df.reset_index(inplace=True)
movies_df.rename(columns={"index": "itemID"}, inplace=True)

In [153]:
#applying tfidf to genres
recommender=TfidfRecommender(id_col="itemID",tokenization_method='bert')
clean_movies=recommender.clean_dataframe(movies_df,cols_to_clean=["genre"],new_col_name="genres").drop(columns=["genre"])
tf, vectors_tokenized = recommender.tokenize_text(df_clean=clean_movies, text_col="genres")
recommender.fit(tf, vectors_tokenized)

In [154]:
df = movielens.load_pandas_df(size="1m", local_cache_path='./dataset/')

train, validate, test = python_chrono_split(df, ratio=[0.8,0.1,0.1], filter_by="user",col_user="userID", col_item="itemID", col_timestamp="timestamp")
userID_list = list(train['userID'].unique())

#creat rating matrix
r_matrix_train = train.pivot_table(index='userID', columns='itemID', values='rating')
r_matrix_validate = validate.pivot_table(index='userID', columns='itemID', values='rating')
r_matrix_test = test.pivot_table(index='userID', columns='itemID', values='rating')

In [155]:
#Generate recommendations based on the k most similar items to the last item rated by the user,according to timestamp,movies_df,movies info
def recommended_items(user_id, history, k):
    target_item=int(history[history["userID"] == user_id]["itemID"].iloc[-1])
    sim_mov=[t[1] for t in(recommender.recommendations[target_item])][:k]
    return sim_mov

In [156]:
def create_similarity_matrix(similarity_dict):
    # Find the total number of movies
    num_movies = max(similarity_dict.keys())

    # Initialize the similarity matrix with zeros
    similarity_matrix = [[0.0] * num_movies for _ in range(num_movies)]

    # Fill in the similarity values from the dictionary
    for movie_id, similarities in similarity_dict.items():
        for similarity in similarities:
            other_movie_id = similarity[1]
            similarity_value = similarity[0]
            similarity_matrix[movie_id - 1][other_movie_id - 1] = similarity_value

    return similarity_matrix

def are_list_consecutive(input_list):
    return input_list == list(range(min(input_list), max(input_list)+1))

K = 20
recommender.recommend_top_k_items(clean_movies, k=K)
item_sim_mat = torch.tensor(create_similarity_matrix(recommender.recommendations), dtype=torch.float32)

In [157]:
def dataframe_to_tensor(df, total_num_of_users, total_num_of_movies):
    # Initialize a tensor with zeros
    user_movie_tensor = torch.zeros(total_num_of_users, total_num_of_movies)

    # Iterate over DataFrame and fill in the tensor
    with tqdm(total=len(df)) as pbar:
        for index, row in df.iterrows():
            user_id = index
            for movie_id, rating in row.iteritems():
                user_movie_tensor[user_id - 1, movie_id - 1] = rating
            pbar.update()

    return user_movie_tensor

r_matrix_train_tensor = dataframe_to_tensor(r_matrix_train, len(userID_list), len(item_sim_mat))

  for movie_id, rating in row.iteritems():
100%|██████████| 6040/6040 [01:29<00:00, 67.51it/s]


In [158]:
r_matrix_predict = torch.matmul(r_matrix_train_tensor, item_sim_mat)

In [159]:
r_matrix_validate_tensor = dataframe_to_tensor(r_matrix_validate, len(userID_list), len(item_sim_mat))

  for movie_id, rating in row.iteritems():
100%|██████████| 6040/6040 [01:22<00:00, 73.41it/s]


In [160]:
def recall_at_k(r_matrix_predict, r_matrix_validate_tensor, k):
    # Get the number of users
    num_users = r_matrix_predict.size(0)
    
    # Initialize recall sum
    recall_sum = 0.0
    
    for user_idx in range(num_users):
        # Sort predicted ratings for the user
        predicted_ratings = r_matrix_predict[user_idx]
        _, top_indices = torch.topk(predicted_ratings, k)
        
        # Get the set of movies in the top K for the user
        top_movies_predicted = set(top_indices.numpy())
        
        # Get the set of actual rated movies for the user in the validation set
        actual_movies_rated = set(torch.nonzero(r_matrix_validate_tensor[user_idx]).flatten().numpy())
        
        # Calculate the intersection of predicted and actual movies
        intersection = top_movies_predicted.intersection(actual_movies_rated)
        
        # Calculate Recall@K for this user
        recall_at_k_user = len(intersection) / len(actual_movies_rated) if len(actual_movies_rated) > 0 else 0.0
        
        # Add to recall sum
        recall_sum += recall_at_k_user
    
    # Calculate average recall across all users
    recall_at_k_avg = recall_sum / num_users
    
    return recall_at_k_avg

k = 20  # Example value for k
recall_at_20 = recall_at_k(r_matrix_predict, r_matrix_validate_tensor, k)
print("Recall@20:", recall_at_20)

Recall@20: 0.005936479667556973


In [None]:

def actual_items(user_id, actual_rating_matrix):   
    user_ratings = actual_rating_matrix.loc[user_id]
    movies_rated = user_ratings.dropna().index.tolist()
    return movies_rated


In [None]:
def recall_at_k(user_id, train, k, evaluate_rating_matrix):
    actual_items_list = actual_items(user_id, evaluate_rating_matrix)
    recommended_items_list = recommended_items(user_id, train, k=k)
    matched_item = set(actual_items_list).intersection(set(recommended_items_list))

    recall = len(matched_item) / len(actual_items_list) if len(actual_items_list) > 0 else 0 
    return recall

def average_recall_at_k(userID_list, train, k, evaluate_rating_matrix=r_matrix_validate):
    total_recall = 0
    num_users = len(userID_list)
    
    with tqdm(total=len(userID_list)) as pbar:
        for user_id in userID_list:
            recall_at_k_value = recall_at_k(user_id, train, k, evaluate_rating_matrix)       
            total_recall += recall_at_k_value
            pbar.update(1)
        
    average_recall = total_recall / num_users if num_users > 0 else 0
    return average_recall


In [None]:
recommender.recommend_top_k_items(clean_movies, 10)
average_recall_at_k(userID_list, train, 10, r_matrix_validate)

100%|██████████| 6040/6040 [00:05<00:00, 1191.56it/s]


0.015911516912690847