In [1]:
import numpy as np
import pandas as pd
import math
from itertools import product
from tqdm import tqdm
import pickle
import csv
from matplotlib import pyplot as plt
import urllib
from PIL import Image
import random
from sklearn.metrics.pairwise import cosine_distances, linear_kernel

In [2]:
rating_df = pd.read_csv('../data/cleaned/cleaned_user_rating.csv')
rating_df_copy = rating_df.copy()
movies_df = pd.read_csv('../data/cleaned/cleaned_movies_details.csv')

rating_df.head()

rating_df_copy['user_id_number'], user_mapping = pd.factorize(rating_df_copy['user_id'])
rating_df_copy['movie_id_number'], movie_mapping = pd.factorize(rating_df_copy['movie_id'])
assert(rating_df_copy['user_id_number'].nunique() == rating_df_copy['user_id'].nunique())
assert(rating_df_copy['movie_id_number'].nunique() == rating_df_copy['movie_id'].nunique())

rating_df_copy.head()

Unnamed: 0,user_id,movie_id,rating,user_id_number,movie_id_number
0,ur127508339,tt7737800,8,0,0
1,ur6312156,tt7737800,8,1,0
2,ur150453978,tt7737800,1,2,0
3,ur18519281,tt7737800,3,3,0
4,ur0806494,tt7737800,7,4,0


In [3]:
n_users = rating_df_copy['user_id_number'].max() + 1
n_movies = rating_df_copy['movie_id_number'].max() + 1
print("Number of users: ", n_users)
print("Number of movies: ", n_movies)

Number of users:  173693
Number of movies:  18662


In [4]:
user_id_to_number = {user_mapping[i]: i for i in range(len(user_mapping))}
movie_id_to_number = {movie_mapping[i]: i for i in range(len(movie_mapping))}
number_to_user_id = {i: user_mapping[i] for i in range(len(user_mapping))}
number_to_movie_id = {i: movie_mapping[i] for i in range(len(movie_mapping))}
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movies_df['movie_id'])}

In [5]:
user_id = 'ur6312156'
user_id_number = user_id_to_number[user_id]
print('User_id number: ', user_id_number)
assert(number_to_user_id[user_id_number] == user_id)

movie_id = 'tt0062292'
movie_id_number = movie_id_to_number[movie_id]
print('Movie_id number: ', movie_id_number)
assert(number_to_movie_id[movie_id_number] == movie_id)

User_id number:  1
Movie_id number:  18660


In [6]:
new_id_start = n_movies

# Function to assign or create a movie_id_number
def assign_movie_id_number(movie_id):
    global new_id_start
    if movie_id not in movie_id_to_number:
        movie_id_to_number[movie_id] = new_id_start
        number_to_movie_id[new_id_start] = movie_id
        new_id_start += 1
    return movie_id_to_number[movie_id]

# Apply the function to create the 'movie_id_number' column
movies_df['movie_id_number'] = movies_df['movie_id'].apply(assign_movie_id_number)

In [7]:
rating_data = rating_df_copy[['user_id_number', 'movie_id_number', 'rating']].to_numpy()
rating_data[:5]

array([[0, 0, 8],
       [1, 0, 8],
       [2, 0, 1],
       [3, 0, 3],
       [4, 0, 7]])

In [8]:
def split_ratings_by_user(data, train_ratio=0.6, val_ratio=0.2, random_state=420):
    """
    Split ratings for each user into train, validation, and test sets
    with 60:20:20 split and maintaining rating count distribution.
    """
    np.random.seed(random_state)
    
    # Initialize empty lists for split data
    train_data_list, val_data_list, test_data_list = [], [], []
    
    # Group ratings by user using a dictionary for better efficiency
    user_groups = {}
    for row in data:
        user_groups.setdefault(row[0], []).append(row)
    
    for user_ratings in user_groups.values():
        user_ratings = np.array(user_ratings)
        total_ratings = len(user_ratings)
        
        # Shuffle user ratings to ensure randomness
        np.random.shuffle(user_ratings)
        
        # Calculate split indices
        train_end = math.ceil(total_ratings * train_ratio)
        val_end = train_end + math.ceil(total_ratings * val_ratio)
        
        # Append splits
        train_data_list.append(user_ratings[:train_end])
        val_data_list.append(user_ratings[train_end:val_end])
        test_data_list.append(user_ratings[val_end:])
    
    # Concatenate all users' split data
    train_data = np.vstack(train_data_list)
    val_data = np.vstack(val_data_list)
    test_data = np.vstack(test_data_list)
    
    return train_data, val_data, test_data

In [9]:
train_data, val_data, test_data = split_ratings_by_user(rating_data)

In [10]:
movies_df['genres'] = movies_df['genres'].fillna('Unknown')
movies_df['director'] = movies_df['director'].fillna('Unknown')
movies_df['stars'] = movies_df['stars'].fillna('Unknown')
movies_df['duration'] = movies_df['duration'].fillna(0)  # Default duration as 0
movies_df['rating'] = movies_df['rating'].fillna(0.0)    # Default rating as 0.0
movies_df['oscar'] = movies_df['oscar'].fillna(0)        # Default oscar count as 0
movies_df['country'] = movies_df['country'].fillna('Unknown')
movies_df['overview'] = movies_df['overview'].fillna('No overview available')

movies_df['info'] = 'Genres: ' + movies_df['genres'] + '\n' \
                    + 'Director: ' + movies_df['director'] + '\n' \
                    + 'Cast: ' + movies_df['stars'] + '\n' \
                    + 'Duration: ' + movies_df['duration'].astype(str) + ' minutes \n' \
                    + 'Average rating: ' + movies_df['rating'].astype(str) + '\n' \
                    + 'Number of oscars: ' + movies_df['oscar'].astype(str) + '\n' \
                    + 'Country: ' + movies_df['country'] + '\n' \
                    + 'Overview: ' + movies_df['overview']

In [11]:
print(movies_df['info'][0])

Genres: Adventure,Drama,Fantasy,Horror
Director: Francesco Bertolini
Cast: Salvatore Papa,Arturo Pirovano,Giuseppe de Liguoro
Duration: 71 minutes 
Average rating: 7.0
Number of oscars: 0
Country: Italy
Overview: 1911 silent film and Italy's first full-length feature film, loosely adapted from "Inferno", the first canticle of Dante Alighieri's "Divine Comedy". It chronicles Dante's travel through the Circles of Hell, guided by the poet Virgil.


In [12]:
# Embedding of movie info

embeddings = np.load('../checkpoints/embeddings.npz')['embeds']
print(embeddings.shape)

(19108, 768)


In [13]:
def evaluate_recommendation(cf_model, test_data, n_movies, n_top=5):
    hit_ratios, ndcg_scores = [], []

    unique_users = np.unique(test_data[:, 0])
    for u in tqdm(unique_users, desc="Evaluating Users"):
        user_test_ratings = test_data[test_data[:, 0] == u]
        high_rated_test_movies = user_test_ratings[user_test_ratings[:, 2] >= 8, 1]

        if len(high_rated_test_movies) != 0:
            train_movies = cf_model.train_data[cf_model.train_data[:, 0] == u, 1]
            all_movies = np.arange(n_movies)
            unrated_movies = all_movies[~np.isin(all_movies, np.concatenate([train_movies, user_test_ratings[:, 1]]))]

            sampled_unrated_movies = np.random.choice(unrated_movies, 99, replace=False)

            hit_ratio = 0
            dcg = 0
            for high_rated_movie in high_rated_test_movies:
                candidate_movies = np.concatenate([[high_rated_movie], sampled_unrated_movies])
                movie_scores = {i: cf_model.pred(u, i) for i in candidate_movies}
                
                sorted_movies = sorted(movie_scores, key=movie_scores.get, reverse=True)[:n_top]

                if high_rated_movie in sorted_movies:
                    hit_ratio += 1
                    rank = sorted_movies.index(high_rated_movie) + 1
                    dcg += 1 / math.log2(rank + 1)

            hit_ratios.append(hit_ratio / len(high_rated_test_movies))
            ndcg_scores.append(dcg / len(high_rated_test_movies))

    return np.mean(hit_ratios), np.mean(ndcg_scores)

## Recommend based on movie_id

### Using cosine similarity of Google's embeddings

In [14]:
# Precompute cosine distances between all movie embeddings
cosine_dist_matrix = cosine_distances(embeddings)

In [15]:
def get_similar_movies(movieid, movies_df, movieid_to_index, cosine_dist_matrix, top_k=5):
    if movieid not in movieid_to_index:
        raise ValueError('Movie ID does not exist.')
    
    # Get the index of the target movie
    index = movieid_to_index[movieid]
    
    # Retrieve the similarity scores for the target movie
    similarity_scores = 1 - cosine_dist_matrix[index]
    
    # Exclude the target movie by setting its similarity score to a very low value
    similarity_scores[index] = -np.inf
    
    # Get the indices of the top-k most similar movies
    top_indices = np.argsort(similarity_scores)[-top_k:][::-1]  # Sort descending
    
    # Print target movie information
    movie_info = movies_df.iloc[index]
    print('Target movie name:', movie_info['title'])
    print('Target movie info:\n', movie_info['info'])
    print('------------------')
    
    # Retrieve and print information for the top-k similar movies
    for i, similar_index in enumerate(top_indices):
        similar_movie_info = movies_df.iloc[similar_index]
        print(f"Similar movie {i + 1} name: {similar_movie_info['title']}")
        print(f"Similar movie {i + 1} info:\n{similar_movie_info['info']}")
        print('------------------')

In [16]:
get_similar_movies('tt4154796', movies_df, movie_id_to_index, cosine_dist_matrix)

Target movie name: Avengers: Endgame
Target movie info:
 Genres: Action,Adventure,Drama,Sci-Fi
Director: Anthony Russo
Cast: Robert Downey Jr.,Chris Evans,Mark Ruffalo
Duration: 181 minutes 
Average rating: 8.4
Number of oscars: 0
Country: United States
Overview: After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
------------------
Similar movie 1 name: Avengers: Infinity War
Similar movie 1 info:
Genres: Action,Adventure,Sci-Fi
Director: Anthony Russo
Cast: Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo
Duration: 149 minutes 
Average rating: 8.4
Number of oscars: 0
Country: United States
Overview: The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos before his blitz of devastation and ruin puts an end to the universe.
------------------
Similar movie 2 name

### Evaluation

In [17]:
sim_matrix = linear_kernel(embeddings, embeddings)

In [18]:
class ContentBasedRecommender:
    def __init__(self, sim_matrix, train_data):
        self.sim_matrix = sim_matrix
        self.train_data = train_data

    def pred(self, user_id, movie_id):
        # Get movies rated by the user in training data
        user_train_ratings = self.train_data[self.train_data[:, 0] == user_id]
        if len(user_train_ratings) == 0:
            return 5  # No training data for user

        # Weighted average similarity score
        rated_movie_ids = user_train_ratings[:, 1].astype(int)
        rated_movie_scores = user_train_ratings[:, 2].astype(float)
        sim_scores = self.sim_matrix[movie_id, rated_movie_ids]

        # Weighted average
        return np.dot(sim_scores, rated_movie_scores) / np.sum(sim_scores)

In [21]:
cf_model = ContentBasedRecommender(sim_matrix, train_data)

hit_ratio, ndcg = evaluate_recommendation(cf_model, val_data, n_movies)
print(f"Validation - Hit Ratio: {hit_ratio:.4f}, NDCG: {ndcg:.4f}")

hit_ratio, ndcg = evaluate_recommendation(cf_model, test_data, n_movies)
print(f"Test - Hit Ratio: {hit_ratio:.4f}, NDCG: {ndcg:.4f}")

Evaluating Users: 100%|██████████| 16317/16317 [33:52<00:00,  8.03it/s]
Validation - Hit Ratio: 0.0790, NDCG: 0.0583
Evaluating Users: 100%|██████████| 16317/16317 [34:23<00:00,  7.91it/s]
Test - Hit Ratio: 0.0804, NDCG: 0.0584
