In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

np.random.seed(123)

In [2]:
ratings = pd.read_csv("../two_towers/data/ml-1m/ratings.dat", sep="::", header=None)
ratings.columns = ["user_id", "movie_id", "rating", "timestamp"]

movies = pd.read_csv("../two_towers/data/ml-1m/movies.dat", sep="::", header=None)
movies.columns = ["movie_id", "title", "genres"]

users = pd.read_csv("../two_towers/data/ml-1m/users.dat", sep="::", header=None)
users.columns = ["user_id", "gender", "age", "occupation", "zip_code"]

print(ratings.head())
print(movies.head())
print(users.head())


  ratings = pd.read_csv("../two_towers/data/ml-1m/ratings.dat", sep="::", header=None)


   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
   user_id gender  age  occupation zip_code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455


  movies = pd.read_csv("../two_towers/data/ml-1m/movies.dat", sep="::", header=None)
  users = pd.read_csv("../two_towers/data/ml-1m/users.dat", sep="::", header=None)


In [3]:
rand_userIds = np.random.choice(ratings['user_id'].unique(),
                               size=int(len(ratings['user_id'].unique())*0.1),
                               replace=False)
print(len(rand_userIds))
ratings = ratings.loc[ratings['user_id'].isin(rand_userIds)]

print(ratings.head())

604
     user_id  movie_id  rating  timestamp
799       10      2622       5  978228212
800       10       648       4  978224925
801       10      2628       3  978228408
802       10      3358       5  978226378
803       10      3359       3  978227125


In [4]:

from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

# Feature preprocessing functions
class FeatureProcessor:
    def __init__(self):
        self.user_encoders = {}
        self.movie_encoders = {}
        self.sentence_model = None
        self.user_features_cache = {}
        self.movie_features_cache = {}
        
    def prepare_user_features(self, users_df):
        """Prepare user features: gender, age, occupation one-hot encoding"""
        print("Preparing user features...")
        
        # Gender encoding (M=1, F=0)
        gender_encoded = (users_df['gender'] == 'M').astype(float)
        
        # Age one-hot encoding (7 categories: 1, 18, 25, 35, 45, 50, 56)
        age_onehot = pd.get_dummies(users_df['age'], prefix='age').astype(float)
        
        # Occupation one-hot encoding (0-20)
        occupation_onehot = pd.get_dummies(users_df['occupation'], prefix='occ').astype(float)
        
        # Combine all user features into a single DataFrame
        feature_columns = ['user_id']
        feature_data = [users_df['user_id'].values]
        
        # Add gender
        feature_columns.append('gender')
        feature_data.append(gender_encoded.values)
        
        # Add age features
        for col in age_onehot.columns:
            feature_columns.append(col)
            feature_data.append(age_onehot[col].values)
        
        # Add occupation features
        for col in occupation_onehot.columns:
            feature_columns.append(col)
            feature_data.append(occupation_onehot[col].values)
        
        # Create feature matrix
        feature_matrix = np.column_stack(feature_data)
        user_features = pd.DataFrame(feature_matrix, columns=feature_columns)
        
        # Ensure all feature columns (except user_id) are float
        for col in user_features.columns:
            if col != 'user_id':
                user_features[col] = user_features[col].astype(float)
        
        print(f"User features shape: {user_features.shape}")
        print(f"User feature columns: {list(user_features.columns)}")
        print(f"User feature dtypes:\n{user_features.dtypes}")
        
        # Cache features for quick lookup (keep on CPU)
        for _, row in user_features.iterrows():
            user_id = int(row['user_id'])
            # Get feature values excluding user_id and convert to numpy array
            feature_values = row.drop('user_id').values.astype(np.float32)
            features = torch.tensor(feature_values, dtype=torch.float32)
            self.user_features_cache[user_id] = features
            
        self.user_feature_dim = len(user_features.columns) - 1  # Exclude user_id
        print(f"User feature dimension: {self.user_feature_dim}")
        return user_features
    
    def prepare_movie_features(self, movies_df, device='cpu'):
        """Prepare movie features using sentence transformers for title and genres"""
        print("Preparing movie features with sentence transformers...")
        
        # Initialize sentence transformer
        if self.sentence_model is None:
            print("Loading sentence transformer model...")
            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
        
        # Encode movie titles
        print("Encoding movie titles...")
        titles = movies_df['title'].tolist()
        title_embeddings = self.sentence_model.encode(
            titles, convert_to_tensor=True, device=device, batch_size=64
        )
        
        # Process genres and encode them
        print("Encoding movie genres...")
        genre_texts = []
        for genres_str in movies_df['genres']:
            # Convert pipe-separated genres to readable text
            genres_list = genres_str.split('|')
            genre_text = ' '.join(genres_list).replace("Children's", "Children")
            genre_texts.append(genre_text)
        
        genre_embeddings = self.sentence_model.encode(
            genre_texts, convert_to_tensor=True, device=device, batch_size=64
        )
        
        # Concatenate title and genre embeddings
        movie_embeddings = torch.cat([title_embeddings, genre_embeddings], dim=1)
        
        print(f"Title embeddings shape: {title_embeddings.shape}")
        print(f"Genre embeddings shape: {genre_embeddings.shape}")
        print(f"Combined movie embeddings shape: {movie_embeddings.shape}")
        
        # Normalize the embeddings for better training stability
        print("Normalizing movie embeddings...")
        movie_embeddings_normalized = F.normalize(movie_embeddings, p=2, dim=1)
        
        # Print normalization stats
        print(f"Original embeddings - mean: {movie_embeddings.mean().item():.4f}, std: {movie_embeddings.std().item():.4f}")
        print(f"Normalized embeddings - mean: {movie_embeddings_normalized.mean().item():.4f}, std: {movie_embeddings_normalized.std().item():.4f}")
        
        # Cache features for quick lookup (move to CPU for storage)
        for idx, row in movies_df.iterrows():
            movie_id = int(row['movie_id'])
            # Store normalized embeddings on CPU to avoid memory issues
            self.movie_features_cache[movie_id] = movie_embeddings_normalized[idx].cpu()
            
        self.movie_feature_dim = movie_embeddings_normalized.shape[1]
        print(f"Movie feature dimension: {self.movie_feature_dim}")
        return movie_embeddings_normalized
    
    def get_user_features(self, user_id):
        """Get cached user features"""
        return self.user_features_cache.get(user_id, torch.zeros(self.user_feature_dim))
    
    def get_movie_features(self, movie_id):
        """Get cached movie features"""
        return self.movie_features_cache.get(movie_id, torch.zeros(self.movie_feature_dim))


In [5]:
# Initialize feature processor and prepare features
print("Preparing user and movie features...")

# Initialize feature processor
feature_processor = FeatureProcessor()

# Prepare user features
user_features_df = feature_processor.prepare_user_features(users)

# Prepare movie features  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
movie_embeddings = feature_processor.prepare_movie_features(movies, device=device)

print(f"\nFeature preparation complete!")
print(f"User feature dimension: {feature_processor.user_feature_dim}")
print(f"Movie feature dimension: {feature_processor.movie_feature_dim}")


Preparing user and movie features...
Preparing user features...
User features shape: (6040, 30)
User feature columns: ['user_id', 'gender', 'age_1', 'age_18', 'age_25', 'age_35', 'age_45', 'age_50', 'age_56', 'occ_0', 'occ_1', 'occ_2', 'occ_3', 'occ_4', 'occ_5', 'occ_6', 'occ_7', 'occ_8', 'occ_9', 'occ_10', 'occ_11', 'occ_12', 'occ_13', 'occ_14', 'occ_15', 'occ_16', 'occ_17', 'occ_18', 'occ_19', 'occ_20']
User feature dtypes:
user_id    float64
gender     float64
age_1      float64
age_18     float64
age_25     float64
age_35     float64
age_45     float64
age_50     float64
age_56     float64
occ_0      float64
occ_1      float64
occ_2      float64
occ_3      float64
occ_4      float64
occ_5      float64
occ_6      float64
occ_7      float64
occ_8      float64
occ_9      float64
occ_10     float64
occ_11     float64
occ_12     float64
occ_13     float64
occ_14     float64
occ_15     float64
occ_16     float64
occ_17     float64
occ_18     float64
occ_19     float64
occ_20     float64


  return torch._C._cuda_getDeviceCount() > 0


Encoding movie titles...
Encoding movie genres...
Title embeddings shape: torch.Size([3883, 384])
Genre embeddings shape: torch.Size([3883, 384])
Combined movie embeddings shape: torch.Size([3883, 768])
Normalizing movie embeddings...
Original embeddings - mean: 0.0010, std: 0.0510
Normalized embeddings - mean: 0.0007, std: 0.0361
Movie feature dimension: 768

Feature preparation complete!
User feature dimension: 29
Movie feature dimension: 768


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97208 entries, 799 to 998118
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   user_id    97208 non-null  int64
 1   movie_id   97208 non-null  int64
 2   rating     97208 non-null  int64
 3   timestamp  97208 non-null  int64
dtypes: int64(4)
memory usage: 3.7 MB


In [7]:
ratings['rank_latest'] = ratings.groupby(['user_id'])['timestamp'] \
                                .rank(method = 'first',ascending=False)

train_ratings = ratings[ratings['rank_latest'].isin([1,2])]
validation_ratings = ratings[ratings['rank_latest'] == 1]
test_ratings = ratings[ratings['rank_latest'] == 2]

# drop columns that we no Longer need 
train_ratings = train_ratings[['user_id', 'movie_id', 'rating']]
test_ratings = test_ratings[['user_id','movie_id','rating']]

In [8]:
num_users = ratings['user_id'].max()+1
num_items = ratings['movie_id'].max()+1

all_movieIds = ratings['movie_id'].unique()

In [9]:
class MovieLensTrainDataset(Dataset):
    """MovieLens Pytorch Dataset for Training
    Args:
        ratings(pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    """
    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)
        
    def __len__(self):
        return len(self.users)
    
    def __getitem__ (self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]
    
    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['user_id'], ratings['movie_id']))
        
        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)
        
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [10]:
class OptimizedMovieLensTrainDataset(Dataset):
    """Optimized MovieLens Dataset using Candidate Generator for Negative Sampling with Features
    Args:
        ratings(pd.DataFrame): Dataframe containing the movie ratings
        candidate_generator(CandidateGenerator): Optimized candidate generator
        feature_processor(FeatureProcessor): Feature processor for user and movie features
        num_negatives(int): Number of negative samples per positive sample
        negative_method(str): Method for generating negative candidates
        sampling_strategy(str): Strategy for negative sampling ('unique_per_user', 'per_positive', 'stratified')
    """
    def __init__(self, ratings, candidate_generator, feature_processor, num_negatives=4, negative_method="hybrid", sampling_strategy="unique_per_user"):
        self.ratings = ratings
        self.candidate_generator = candidate_generator
        self.feature_processor = feature_processor
        self.num_negatives = num_negatives
        self.negative_method = negative_method
        self.sampling_strategy = sampling_strategy
        self.user_features, self.movie_features, self.labels = self.get_dataset()
        
    def __len__(self):
        return len(self.user_features)
    
    def __getitem__(self, idx):
        return self.user_features[idx], self.movie_features[idx], self.labels[idx]
    
    def get_dataset(self):
        user_features_list, movie_features_list, labels = [], [], []
        
        print(f"Generating training dataset with {self.negative_method} negative sampling")
        print(f"Sampling strategy: {self.sampling_strategy}")
        
        # Process each user to generate negatives using candidate generator
        user_positive_items = self.ratings.groupby('user_id')['movie_id'].apply(list).to_dict()
        
        for user_id, positive_items in tqdm(user_positive_items.items(), desc="Processing users"):
            # Get user features once for this user
            user_feat = self.feature_processor.get_user_features(user_id)
            
            # Add positive samples
            for item_id in positive_items:
                movie_feat = self.feature_processor.get_movie_features(item_id)
                user_features_list.append(user_feat)
                movie_features_list.append(movie_feat)
                labels.append(1)
            
            # Generate negatives based on sampling strategy
            if self.sampling_strategy == "unique_per_user":
                self._add_unique_negatives_per_user(user_features_list, movie_features_list, labels, user_id, user_feat, positive_items)
            elif self.sampling_strategy == "per_positive":
                self._add_negatives_per_positive(user_features_list, movie_features_list, labels, user_id, user_feat, positive_items)
            elif self.sampling_strategy == "stratified":
                self._add_stratified_negatives(user_features_list, movie_features_list, labels, user_id, user_feat, positive_items)
            else:
                raise ValueError(f"Unknown sampling strategy: {self.sampling_strategy}")
        
        print(f"Generated {len(user_features_list)} samples ({labels.count(1)} positive, {labels.count(0)} negative)")
        print(f"Negative-to-positive ratio: {labels.count(0) / labels.count(1):.2f}")
        
        # Convert to tensors
        user_features_tensor = torch.stack(user_features_list)
        movie_features_tensor = torch.stack(movie_features_list)
        labels_tensor = torch.tensor(labels, dtype=torch.long)
        
        return user_features_tensor, movie_features_tensor, labels_tensor
    
    def _add_unique_negatives_per_user(self, user_features_list, movie_features_list, labels, user_id, user_feat, positive_items):
        """Add unique negative samples per user (most efficient approach)"""
        total_negatives_needed = len(positive_items) * self.num_negatives
        
        # Generate candidate pool
        negative_candidates = self.candidate_generator.generate_candidates(
            user_id, 
            method=self.negative_method, 
            num_candidates=min(total_negatives_needed * 2, 300)  # Get more candidates than needed
        )
        
        # Filter out positive items to ensure no overlap
        negative_candidates = [item for item in negative_candidates if item not in positive_items]
        
        # Sample unique negatives
        if len(negative_candidates) > 0:
            sample_size = min(total_negatives_needed, len(negative_candidates))
            sampled_negatives = np.random.choice(
                negative_candidates,
                size=sample_size,
                replace=False
            )
            
            # Add all sampled negatives for this user
            for neg_item in sampled_negatives:
                movie_feat = self.feature_processor.get_movie_features(neg_item)
                user_features_list.append(user_feat)
                movie_features_list.append(movie_feat)
                labels.append(0)
    
    def _add_negatives_per_positive(self, user_features_list, movie_features_list, labels, user_id, user_feat, positive_items):
        """Add negatives per positive item (original intended approach but fixed)"""
        # Generate candidate pool once per user
        negative_candidates = self.candidate_generator.generate_candidates(
            user_id,
            method=self.negative_method,
            num_candidates=min(len(positive_items) * self.num_negatives * 2, 300)
        )
        
        # Filter out positive items
        negative_candidates = [item for item in negative_candidates if item not in positive_items]
        
        if len(negative_candidates) == 0:
            return
        
        # For each positive item, sample unique negatives
        all_used_negatives = set()
        for _ in positive_items:
            # Available negatives (excluding already used ones)
            available_negatives = [item for item in negative_candidates if item not in all_used_negatives]
            
            if len(available_negatives) == 0:
                break
                
            # Sample negatives for this positive item
            sample_size = min(self.num_negatives, len(available_negatives))
            sampled_negatives = np.random.choice(
                available_negatives,
                size=sample_size,
                replace=False
            )
            
            # Add negatives and mark as used
            for neg_item in sampled_negatives:
                movie_feat = self.feature_processor.get_movie_features(neg_item)
                user_features_list.append(user_feat)
                movie_features_list.append(movie_feat)
                labels.append(0)
                all_used_negatives.add(neg_item)
    
    def _add_stratified_negatives(self, user_features_list, movie_features_list, labels, user_id, user_feat, positive_items):
        """Add stratified negatives from different candidate methods"""
        total_negatives_needed = len(positive_items) * self.num_negatives
        
        # Get negatives from different strategies
        methods = ["popularity", "collaborative", "content"]
        negatives_per_method = total_negatives_needed // len(methods)
        
        all_negatives = []
        
        for method in methods:
            method_candidates = self.candidate_generator.generate_candidates(
                user_id,
                method=method,
                num_candidates=negatives_per_method * 2
            )
            # Filter out positives and already selected negatives
            method_candidates = [item for item in method_candidates 
                               if item not in positive_items and item not in all_negatives]
            
            # Sample from this method
            if len(method_candidates) > 0:
                sample_size = min(negatives_per_method, len(method_candidates))
                sampled = np.random.choice(method_candidates, size=sample_size, replace=False)
                all_negatives.extend(sampled)
        
        # Fill remaining slots with random negatives if needed
        remaining_slots = total_negatives_needed - len(all_negatives)
        if remaining_slots > 0:
            available_items = self.candidate_generator.get_available_items(user_id)
            remaining_items = [item for item in available_items if item not in all_negatives]
            
            if len(remaining_items) > 0:
                sample_size = min(remaining_slots, len(remaining_items))
                additional_negatives = np.random.choice(remaining_items, size=sample_size, replace=False)
                all_negatives.extend(additional_negatives)
        
        # Add all negatives for this user
        for neg_item in all_negatives:
            movie_feat = self.feature_processor.get_movie_features(neg_item)
            user_features_list.append(user_feat)
            movie_features_list.append(movie_feat)
            labels.append(0)

In [11]:
from functools import lru_cache
from collections import defaultdict
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Global data structures for fast candidate generation
class CandidateGenerator:
    def __init__(self, train_ratings, movies, all_movieIds):
        self.train_ratings = train_ratings
        self.movies = movies
        self.all_movieIds = all_movieIds
        self.user_interacted_items = train_ratings.groupby('user_id')['movie_id'].apply(list).to_dict()
        
        # Pre-compute global data structures
        self._precompute_popularity()
        self._precompute_movie_genres()
        self._precompute_user_similarity()
        self._precompute_genre_profiles()
        
    def _precompute_popularity(self):
        """Pre-compute item popularity ranking"""
        self.item_popularity = self.train_ratings['movie_id'].value_counts()
        self.popular_items = self.item_popularity.index.tolist()
        
    def _precompute_movie_genres(self):
        """Create movie-to-genres dictionary for O(1) lookups"""
        self.movie_to_genres = {}
        for _, row in self.movies.iterrows():
            self.movie_to_genres[row['movie_id']] = row['genres'].split('|')
            
    def _precompute_user_similarity(self):
        """Pre-compute user similarity matrix using vectorized operations"""
        # Create user-item matrix
        users = list(self.user_interacted_items.keys())
        user_to_idx = {user: idx for idx, user in enumerate(users)}
        
        # Build sparse matrix
        rows, cols = [], []
        for user_id, items in self.user_interacted_items.items():
            user_idx = user_to_idx[user_id]
            for item in items:
                rows.append(user_idx)
                cols.append(item)
        
        # Create binary user-item matrix
        data = np.ones(len(rows))
        self.user_item_matrix = csr_matrix((data, (rows, cols)), 
                                          shape=(len(users), max(self.all_movieIds) + 1))
        
        # Compute user similarity (cosine similarity for efficiency)
        self.user_similarity = cosine_similarity(self.user_item_matrix)
        self.user_to_idx = user_to_idx
        self.idx_to_user = {idx: user for user, idx in user_to_idx.items()}
        
    def _precompute_genre_profiles(self):
        """Pre-compute genre profiles for users"""
        self.user_genre_profiles = {}
        all_genres = set()
        
        # Extract all unique genres
        for genres_list in self.movie_to_genres.values():
            all_genres.update(genres_list)
        
        self.all_genres = list(all_genres)
        self.genre_to_idx = {genre: idx for idx, genre in enumerate(self.all_genres)}
        
        # Build user genre profiles
        for user_id, items in self.user_interacted_items.items():
            genre_counts = defaultdict(int)
            for item in items:
                if item in self.movie_to_genres:
                    for genre in self.movie_to_genres[item]:
                        genre_counts[genre] += 1
            self.user_genre_profiles[user_id] = dict(genre_counts)
            
    @lru_cache(maxsize=1000)
    def get_available_items(self, user_id):
        """Cache available items for users"""
        interacted_items = set(self.user_interacted_items.get(user_id, []))
        all_items_set = set(self.all_movieIds)
        return list(all_items_set - interacted_items)
    
    def generate_popularity_candidates(self, user_id, num_candidates=100):
        """Optimized popularity-based candidate generation"""
        available_items = set(self.get_available_items(user_id))
        
        candidates = []
        for item in self.popular_items:
            if item in available_items:
                candidates.append(item)
                if len(candidates) >= num_candidates:
                    break
        
        return candidates[:num_candidates]
    
    def generate_collaborative_candidates(self, user_id, num_candidates=100):
        """Optimized collaborative filtering using pre-computed similarity"""
        if user_id not in self.user_to_idx:
            return self.get_available_items(user_id)[:num_candidates]
        
        user_idx = self.user_to_idx[user_id]
        available_items = set(self.get_available_items(user_id))
        
        # Get most similar users
        similarities = self.user_similarity[user_idx]
        similar_user_indices = np.argsort(similarities)[-51:-1]  # Top 50 similar users
        
        # Score candidates based on similar users
        candidate_scores = defaultdict(float)
        for similar_idx in similar_user_indices:
            similar_user_id = self.idx_to_user[similar_idx]
            similarity_score = similarities[similar_idx]
            
            if similarity_score > 0.1:  # Threshold
                for item in self.user_interacted_items.get(similar_user_id, []):
                    if item in available_items:
                        candidate_scores[item] += similarity_score
        
        # Sort by score and return top candidates
        sorted_candidates = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)
        return [item for item, score in sorted_candidates[:num_candidates]]
    
    def generate_content_candidates(self, user_id, num_candidates=100):
        """Optimized content-based filtering using pre-computed genre profiles"""
        user_genres = self.user_genre_profiles.get(user_id, {})
        if not user_genres:
            return self.get_available_items(user_id)[:num_candidates]
        
        available_items = set(self.get_available_items(user_id))
        
        # Score items by genre overlap
        candidate_scores = {}
        for item in available_items:
            if item in self.movie_to_genres:
                score = 0
                for genre in self.movie_to_genres[item]:
                    score += user_genres.get(genre, 0)
                if score > 0:
                    candidate_scores[item] = score
        
        # Sort by score and return top candidates
        sorted_candidates = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)
        return [item for item, score in sorted_candidates[:num_candidates]]
    
    def generate_hybrid_candidates(self, user_id, num_candidates=100):
        """Optimized hybrid candidate generation"""
        # Get candidates from each method
        pop_candidates = self.generate_popularity_candidates(user_id, num_candidates//3)
        collab_candidates = self.generate_collaborative_candidates(user_id, num_candidates//3)
        content_candidates = self.generate_content_candidates(user_id, num_candidates//3)
        
        # Combine and deduplicate using set operations
        hybrid_candidates = list(dict.fromkeys(pop_candidates + collab_candidates + content_candidates))
        
        # Fill remaining slots with random items
        remaining_slots = num_candidates - len(hybrid_candidates)
        if remaining_slots > 0:
            available_items = set(self.get_available_items(user_id))
            remaining_items = list(available_items - set(hybrid_candidates))
            if remaining_items:
                random_indices = np.random.choice(len(remaining_items), 
                                                size=min(remaining_slots, len(remaining_items)), 
                                                replace=False)
                hybrid_candidates.extend([remaining_items[i] for i in random_indices])
        
        return hybrid_candidates[:num_candidates]
    
    def generate_candidates(self, user_id, method="hybrid", num_candidates=100):
        """Main interface for candidate generation"""
        if method == "popularity":
            return self.generate_popularity_candidates(user_id, num_candidates)
        elif method == "collaborative":
            return self.generate_collaborative_candidates(user_id, num_candidates)
        elif method == "content":
            return self.generate_content_candidates(user_id, num_candidates)
        elif method == "hybrid":
            return self.generate_hybrid_candidates(user_id, num_candidates)
        else:
            # Random fallback
            available_items = self.get_available_items(user_id)
            indices = np.random.choice(len(available_items), 
                                     size=min(num_candidates, len(available_items)), 
                                     replace=False)
            return [available_items[i] for i in indices]

# Initialize the optimized candidate generator
print("Initializing optimized candidate generator...")
candidate_gen = CandidateGenerator(ratings, movies, all_movieIds)
print("Candidate generator initialized!")

# Backward compatibility function
def generate_candidates(user_id, method="hybrid", num_candidates=100):
    """Backward compatible interface"""
    return candidate_gen.generate_candidates(user_id, method, num_candidates)

Initializing optimized candidate generator...
Candidate generator initialized!


In [12]:
class NCF(nn.Module):
    """ Neural Collaborative Filtering (NCF) with Features
    
        Args:
            user_feature_dim (int): Dimension of user features
            movie_feature_dim (int): Dimension of movie features
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            feature_processor (FeatureProcessor): Feature processor for user and movie features
            candidate_generator (CandidateGenerator): Candidate generator for negative sampling
    """
    
    def __init__(self, user_feature_dim, movie_feature_dim, ratings, feature_processor, candidate_generator):
        super().__init__()
        
        # Store parameters
        self.user_feature_dim = user_feature_dim
        self.movie_feature_dim = movie_feature_dim
        self.ratings = ratings
        self.feature_processor = feature_processor
        self.candidate_generator = candidate_generator
        
        # Feature processing layers
        self.user_fc = nn.Linear(user_feature_dim, 64)
        self.movie_fc = nn.Linear(movie_feature_dim, 64)
        
        # NCF layers
        self.fc1 = nn.Linear(in_features=128, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, user_features, movie_features):
        """
        Forward pass with user and movie feature vectors
        
        Args:
            user_features: Tensor of user features [batch_size, user_feature_dim]
            movie_features: Tensor of movie features [batch_size, movie_feature_dim]
        """
        
        # Process features through initial layers
        user_processed = nn.ReLU()(self.user_fc(user_features))
        movie_processed = nn.ReLU()(self.movie_fc(movie_features))
        
        # Concat the processed features
        vector = torch.cat([user_processed, movie_processed], dim=-1)
        
        # Pass through dense layers
        vector = self.dropout(nn.ReLU()(self.fc1(vector)))
        vector = self.dropout(nn.ReLU()(self.fc2(vector)))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def compute_loss(self, batch):
        user_features, movie_features, labels = batch
        predicted_labels = self(user_features, movie_features)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def get_dataloader(self, batch_size=512, num_workers=4, num_negatives=4):
        dataset = OptimizedMovieLensTrainDataset(
            self.ratings, 
            self.candidate_generator,
            self.feature_processor,
            num_negatives=num_negatives,
            negative_method="hybrid",
            sampling_strategy="unique_per_user"
        )
        return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)

In [13]:
class OptimizedNCF(nn.Module):
    """ Optimized Neural Collaborative Filtering (NCF) with Features and Smart Negative Sampling
    
        Args:
            user_feature_dim (int): Dimension of user features
            movie_feature_dim (int): Dimension of movie features
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            feature_processor (FeatureProcessor): Feature processor for user and movie features
            candidate_generator (CandidateGenerator): Optimized candidate generator
            negative_method (str): Method for negative sampling
            sampling_strategy (str): Strategy for negative sampling
    """
    
    def __init__(self, user_feature_dim, movie_feature_dim, ratings, feature_processor, candidate_generator, 
                 negative_method="hybrid", sampling_strategy="unique_per_user"):
        super().__init__()
        
        # Store parameters
        self.user_feature_dim = user_feature_dim
        self.movie_feature_dim = movie_feature_dim
        self.ratings = ratings
        self.feature_processor = feature_processor
        self.candidate_generator = candidate_generator
        self.negative_method = negative_method
        self.sampling_strategy = sampling_strategy
        
        # Feature processing layers with batch normalization
        self.user_fc1 = nn.Linear(user_feature_dim, 128)
        self.user_bn1 = nn.BatchNorm1d(128)
        self.user_fc2 = nn.Linear(128, 64)
        self.user_bn2 = nn.BatchNorm1d(64)
        
        self.movie_fc1 = nn.Linear(movie_feature_dim, 256)
        self.movie_bn1 = nn.BatchNorm1d(256)
        self.movie_fc2 = nn.Linear(256, 128)
        self.movie_bn2 = nn.BatchNorm1d(128)
        self.movie_fc3 = nn.Linear(128, 64)
        self.movie_bn3 = nn.BatchNorm1d(64)
        
        # NCF layers
        self.fc1 = nn.Linear(in_features=128, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, user_features, movie_features):
        """
        Forward pass with user and movie feature vectors
        
        Args:
            user_features: Tensor of user features [batch_size, user_feature_dim]
            movie_features: Tensor of movie features [batch_size, movie_feature_dim]
        """
        
        # Process user features
        user_x = self.dropout(nn.ReLU()(self.user_bn1(self.user_fc1(user_features))))
        user_processed = nn.ReLU()(self.user_bn2(self.user_fc2(user_x)))
        
        # Process movie features
        movie_x = self.dropout(nn.ReLU()(self.movie_bn1(self.movie_fc1(movie_features))))
        movie_x = self.dropout(nn.ReLU()(self.movie_bn2(self.movie_fc2(movie_x))))
        movie_processed = nn.ReLU()(self.movie_bn3(self.movie_fc3(movie_x)))
        
        # Concat the processed features
        vector = torch.cat([user_processed, movie_processed], dim=-1)
        
        # Pass through NCF layers
        vector = self.dropout(nn.ReLU()(self.fc1(vector)))
        vector = self.dropout(nn.ReLU()(self.fc2(vector)))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def compute_loss(self, batch):
        user_features, movie_features, labels = batch
        predicted_labels = self(user_features, movie_features)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def get_dataloader(self, batch_size=512, num_workers=4, num_negatives=4):
        """Get DataLoader with optimized negative sampling"""
        dataset = OptimizedMovieLensTrainDataset(
            self.ratings, 
            self.candidate_generator,
            self.feature_processor,
            num_negatives=num_negatives,
            negative_method=self.negative_method,
            sampling_strategy=self.sampling_strategy
        )
        return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)

In [14]:
# Precompute validation candidates for faster validation
def precompute_validation_candidates(validation_ratings, candidate_method="hybrid", num_candidates=100):
    """
    Precompute candidates for all validation users to speed up validation
    
    Returns:
        dict: {user_id: [candidate_items]}
    """
    validation_users = validation_ratings['user_id'].unique()
    precomputed_candidates = {}
    
    print(f"Precomputing candidates for {len(validation_users)} validation users...")
    
    for user_id in tqdm(validation_users, desc="Precomputing candidates"):
        candidates = generate_candidates(user_id, method=candidate_method, num_candidates=num_candidates)
        precomputed_candidates[user_id] = candidates
    
    print(f"Precomputed candidates for {len(precomputed_candidates)} users")
    return precomputed_candidates

# Precompute candidates before training
validation_candidates = precompute_validation_candidates(validation_ratings, candidate_method="hybrid", num_candidates=100)

Precomputing candidates for 604 validation users...


Precomputing candidates:   0%|          | 0/604 [00:00<?, ?it/s]

Precomputed candidates for 604 users


In [15]:
# Updated validation function for features-based models
def validate_model_with_features(model, test_ratings, precomputed_candidates, device, total_users_to_test=20, k=10):
    """
    Validation function that works with features instead of IDs
    
    Args:
        model: The trained model (with feature processor)
        test_ratings: DataFrame with validation ratings
        precomputed_candidates: Dict of {user_id: [candidate_items]}
        device: torch device
        total_users_to_test: Number of users to test
        k: Top-k for hit ratio calculation
    """
    test_user_item_set = list(set(zip(test_ratings['user_id'], test_ratings['movie_id'])))
    hits = []
    ranks = []
    skipped_cases = 0
    total_cases = 0

    for (u, i) in tqdm(test_user_item_set[:total_users_to_test]):
        total_cases += 1
        
        # Check if user and movie are in our feature caches
        if u not in model.feature_processor.user_features_cache:
            print(f"Skipping user {u} - not in feature cache")
            skipped_cases += 1
            continue
            
        if i not in model.feature_processor.movie_features_cache:
            print(f"Skipping movie {i} - not in feature cache")
            skipped_cases += 1
            continue
        
        # Use precomputed candidates
        candidate_items = precomputed_candidates.get(u, [])
        
        # Make sure the test item is included
        if i not in candidate_items:
            candidate_items = candidate_items + [i]
        
        # Filter candidates to only include movies with features
        valid_candidates = [
            movie_id for movie_id in candidate_items 
            if movie_id in model.feature_processor.movie_features_cache
        ]
        
        if len(valid_candidates) == 0:
            print(f"No valid candidates for user {u}")
            skipped_cases += 1
            continue
            
        if i not in valid_candidates:
            print(f"Target movie {i} not in valid candidates for user {u}")
            skipped_cases += 1
            continue
        
        # Get user features
        user_feat = model.feature_processor.get_user_features(u).unsqueeze(0).to(device)
        
        # Score candidates using features
        predicted_scores = []
        for movie_id in valid_candidates:
            movie_feat = model.feature_processor.get_movie_features(movie_id).unsqueeze(0).to(device)
            with torch.no_grad():
                score = model(user_feat, movie_feat).item()
            predicted_scores.append(score)
        
        # Find rank and hit
        sorted_indices = sorted(range(len(predicted_scores)), key=lambda idx: predicted_scores[idx], reverse=True)
        sorted_items = [valid_candidates[idx] for idx in sorted_indices]
        relevant_item_rank = sorted_items.index(i) + 1
        ranks.append(relevant_item_rank)
        
        if relevant_item_rank <= k:
            hits.append(1)
        else:
            hits.append(0)
            
        # Debug: Print first few cases
        if len(hits) <= 3:
            target_score = predicted_scores[valid_candidates.index(i)] if i in valid_candidates else "N/A"
            print(f"User {u}, Movie {i}: Score={target_score}, Rank={relevant_item_rank}/{len(valid_candidates)}")

    print(f"\nValidation Summary:")
    print(f"Total test cases: {total_cases}")
    print(f"Skipped cases: {skipped_cases}")
    print(f"Valid cases processed: {len(hits)}")
    
    if len(hits) == 0:
        print("Warning: No valid test cases found!")
        return 0.0, 0.0, 0.0

    # Calculate metrics
    hits_tensor = torch.tensor(hits, dtype=torch.float32)
    ranks_tensor = torch.tensor(ranks, dtype=torch.float32)
    
    hit_ratio = hits_tensor.mean().item()
    mrr = torch.mean(1.0 / ranks_tensor).item()
    mean_rank = ranks_tensor.mean().item()
    
    print(f"Hit Ratio @ {k}: {hit_ratio:.3f}")
    print(f"Mean Rank: {mean_rank:.1f}")
    print(f"MRR: {mrr:.3f}")
    
    return hit_ratio, mrr, mean_rank


In [16]:
sampling_strategies = [
    ("unique_per_user", "hybrid"),
]

results = {}

for sampling_strategy, neg_method in sampling_strategies:
    print(f"\n{'='*70}")
    print(f"Training with {sampling_strategy.upper()} sampling + {neg_method.upper()} negatives")
    print(f"{'='*70}")
    
    # Create optimized model with CORRECT feature dimensions
    model = OptimizedNCF(
        user_feature_dim=feature_processor.user_feature_dim,
        movie_feature_dim=feature_processor.movie_feature_dim,
        ratings=train_ratings, 
        feature_processor=feature_processor,
        candidate_generator=candidate_gen,
        negative_method=neg_method, 
        sampling_strategy=sampling_strategy
    )

    # Set up training parameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters())
    dataloader = model.get_dataloader(batch_size=512, num_workers=4, num_negatives=4)

    # Training loop
    num_epochs = 30
    
    epoch_results = []
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0
        num_batches = 0
        
        for batch in tqdm(dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
            # Move batch to device
            user_input, item_input, labels = [x.to(device) for x in batch]
            batch_device = (user_input, item_input, labels)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass and compute loss
            loss = model.compute_loss(batch_device)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            num_batches += 1
        
        avg_loss = total_loss / num_batches
        
        # Validation phase
        model.eval()
        with torch.no_grad():
            hit_ratio, mrr, mean_rank = validate_model_with_features(
                model, validation_ratings, validation_candidates, device, 
                total_users_to_test=20, k=10
            )
        
        epoch_results.append({
            'epoch': epoch + 1,
            'train_loss': avg_loss,
            'hit_ratio': hit_ratio,
            'mean_rank': mean_rank
        })
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Training Loss: {avg_loss:.4f}")
        print(f"  Hit Ratio @ 10: {hit_ratio:.3f}")
        print(f"  Mean Rank: {mean_rank:.1f}")
        print("-" * 50)

    # Store results
    strategy_name = f"{sampling_strategy}_{neg_method}"
    results[strategy_name] = epoch_results
    
    print(f"Training with {strategy_name} completed!")
    print(f"Final Hit Ratio @ 10: {hit_ratio:.3f}")
    print(f"Final Mean Rank: {mean_rank:.1f}")

# Summary of results
print(f"\n{'='*70}")
print("FINAL RESULTS SUMMARY")
print(f"{'='*70}")

for strategy_name, epoch_results in results.items():
    final_result = epoch_results[-1]
    print(f"{strategy_name:30} | Hit Ratio: {final_result['hit_ratio']:.3f} | Mean Rank: {final_result['mean_rank']:.1f}")

print("\nAll experiments completed!")



Training with UNIQUE_PER_USER sampling + HYBRID negatives
Generating training dataset with hybrid negative sampling
Sampling strategy: unique_per_user


Processing users:   0%|          | 0/604 [00:00<?, ?it/s]

Generated 6040 samples (1208 positive, 4832 negative)
Negative-to-positive ratio: 4.00


Training Epoch 1/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.3749134838581085, Rank=19/101
User 3912, Movie 318: Score=0.3827151656150818, Rank=26/101
User 1878, Movie 1920: Score=0.38291895389556885, Rank=33/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.150
Mean Rank: 39.5
MRR: 0.053
Epoch 1/30
  Training Loss: 0.5509
  Hit Ratio @ 10: 0.150
  Mean Rank: 39.5
--------------------------------------------------


Training Epoch 2/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.31022658944129944, Rank=13/101
User 3912, Movie 318: Score=0.30032438039779663, Rank=55/101
User 1878, Movie 1920: Score=0.3212267756462097, Rank=38/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.150
Mean Rank: 30.0
MRR: 0.101
Epoch 2/30
  Training Loss: 0.4361
  Hit Ratio @ 10: 0.150
  Mean Rank: 30.0
--------------------------------------------------


Training Epoch 3/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.4091329872608185, Rank=12/101
User 3912, Movie 318: Score=0.32761064171791077, Rank=71/101
User 1878, Movie 1920: Score=0.40299496054649353, Rank=27/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.200
Mean Rank: 28.1
MRR: 0.107
Epoch 3/30
  Training Loss: 0.3899
  Hit Ratio @ 10: 0.200
  Mean Rank: 28.1
--------------------------------------------------


Training Epoch 4/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.4547901451587677, Rank=9/101
User 3912, Movie 318: Score=0.30193737149238586, Rank=72/101
User 1878, Movie 1920: Score=0.43352991342544556, Rank=28/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.300
Mean Rank: 23.4
MRR: 0.096
Epoch 4/30
  Training Loss: 0.3703
  Hit Ratio @ 10: 0.300
  Mean Rank: 23.4
--------------------------------------------------


Training Epoch 5/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.47724881768226624, Rank=9/101
User 3912, Movie 318: Score=0.23362430930137634, Rank=70/101
User 1878, Movie 1920: Score=0.4771513044834137, Rank=7/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.500
Mean Rank: 21.5
MRR: 0.139
Epoch 5/30
  Training Loss: 0.3544
  Hit Ratio @ 10: 0.500
  Mean Rank: 21.5
--------------------------------------------------


Training Epoch 6/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.5017149448394775, Rank=13/101
User 3912, Movie 318: Score=0.11771272122859955, Rank=72/101
User 1878, Movie 1920: Score=0.5119661092758179, Rank=12/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.350
Mean Rank: 21.3
MRR: 0.116
Epoch 6/30
  Training Loss: 0.3454
  Hit Ratio @ 10: 0.350
  Mean Rank: 21.3
--------------------------------------------------


Training Epoch 7/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.573489248752594, Rank=7/101
User 3912, Movie 318: Score=0.19524624943733215, Rank=55/101
User 1878, Movie 1920: Score=0.5662173628807068, Rank=8/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.400
Mean Rank: 20.5
MRR: 0.181
Epoch 7/30
  Training Loss: 0.3413
  Hit Ratio @ 10: 0.400
  Mean Rank: 20.5
--------------------------------------------------


Training Epoch 8/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.5708229541778564, Rank=12/101
User 3912, Movie 318: Score=0.06519521027803421, Rank=65/101
User 1878, Movie 1920: Score=0.6183537840843201, Rank=2/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.350
Mean Rank: 17.4
MRR: 0.203
Epoch 8/30
  Training Loss: 0.3299
  Hit Ratio @ 10: 0.350
  Mean Rank: 17.4
--------------------------------------------------


Training Epoch 9/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.6082326173782349, Rank=14/101
User 3912, Movie 318: Score=0.07057066261768341, Rank=67/101
User 1878, Movie 1920: Score=0.6689422726631165, Rank=2/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.350
Mean Rank: 19.1
MRR: 0.180
Epoch 9/30
  Training Loss: 0.3269
  Hit Ratio @ 10: 0.350
  Mean Rank: 19.1
--------------------------------------------------


Training Epoch 10/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.634990394115448, Rank=10/101
User 3912, Movie 318: Score=0.09014565497636795, Rank=59/101
User 1878, Movie 1920: Score=0.6708040237426758, Rank=6/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.450
Mean Rank: 17.4
MRR: 0.175
Epoch 10/30
  Training Loss: 0.3162
  Hit Ratio @ 10: 0.450
  Mean Rank: 17.4
--------------------------------------------------


Training Epoch 11/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.626845121383667, Rank=17/101
User 3912, Movie 318: Score=0.09019966423511505, Rank=60/101
User 1878, Movie 1920: Score=0.7156438231468201, Rank=5/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.450
Mean Rank: 17.3
MRR: 0.173
Epoch 11/30
  Training Loss: 0.3016
  Hit Ratio @ 10: 0.450
  Mean Rank: 17.3
--------------------------------------------------


Training Epoch 12/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.41987335681915283, Rank=23/101
User 3912, Movie 318: Score=0.06340239197015762, Rank=65/101
User 1878, Movie 1920: Score=0.6922929883003235, Rank=8/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.650
Mean Rank: 15.9
MRR: 0.218
Epoch 12/30
  Training Loss: 0.2992
  Hit Ratio @ 10: 0.650
  Mean Rank: 15.9
--------------------------------------------------


Training Epoch 13/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.5880275964736938, Rank=18/101
User 3912, Movie 318: Score=0.05856947600841522, Rank=64/101
User 1878, Movie 1920: Score=0.7012561559677124, Rank=5/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.500
Mean Rank: 15.4
MRR: 0.201
Epoch 13/30
  Training Loss: 0.2933
  Hit Ratio @ 10: 0.500
  Mean Rank: 15.4
--------------------------------------------------


Training Epoch 14/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.6087841391563416, Rank=17/101
User 3912, Movie 318: Score=0.06736541539430618, Rank=63/101
User 1878, Movie 1920: Score=0.8232242465019226, Rank=2/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.550
Mean Rank: 16.0
MRR: 0.183
Epoch 14/30
  Training Loss: 0.2828
  Hit Ratio @ 10: 0.550
  Mean Rank: 16.0
--------------------------------------------------


Training Epoch 15/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.6367135643959045, Rank=15/101
User 3912, Movie 318: Score=0.05487886816263199, Rank=65/101
User 1878, Movie 1920: Score=0.8138770461082458, Rank=2/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.550
Mean Rank: 15.2
MRR: 0.153
Epoch 15/30
  Training Loss: 0.2772
  Hit Ratio @ 10: 0.550
  Mean Rank: 15.2
--------------------------------------------------


Training Epoch 16/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.6722294092178345, Rank=17/101
User 3912, Movie 318: Score=0.05882631614804268, Rank=61/101
User 1878, Movie 1920: Score=0.8332768678665161, Rank=2/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.500
Mean Rank: 17.4
MRR: 0.139
Epoch 16/30
  Training Loss: 0.2674
  Hit Ratio @ 10: 0.500
  Mean Rank: 17.4
--------------------------------------------------


Training Epoch 17/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.553834855556488, Rank=18/101
User 3912, Movie 318: Score=0.05139699950814247, Rank=61/101
User 1878, Movie 1920: Score=0.8670216798782349, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.400
Mean Rank: 16.9
MRR: 0.219
Epoch 17/30
  Training Loss: 0.2582
  Hit Ratio @ 10: 0.400
  Mean Rank: 16.9
--------------------------------------------------


Training Epoch 18/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.6991475224494934, Rank=10/101
User 3912, Movie 318: Score=0.05008773133158684, Rank=61/101
User 1878, Movie 1920: Score=0.9223535060882568, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.450
Mean Rank: 15.7
MRR: 0.192
Epoch 18/30
  Training Loss: 0.2636
  Hit Ratio @ 10: 0.450
  Mean Rank: 15.7
--------------------------------------------------


Training Epoch 19/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.7045149207115173, Rank=12/101
User 3912, Movie 318: Score=0.06868652999401093, Rank=59/101
User 1878, Movie 1920: Score=0.7712274193763733, Rank=6/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.600
Mean Rank: 16.0
MRR: 0.139
Epoch 19/30
  Training Loss: 0.2596
  Hit Ratio @ 10: 0.600
  Mean Rank: 16.0
--------------------------------------------------


Training Epoch 20/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.8025133609771729, Rank=9/101
User 3912, Movie 318: Score=0.04011140391230583, Rank=64/101
User 1878, Movie 1920: Score=0.9016074538230896, Rank=2/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.500
Mean Rank: 16.6
MRR: 0.162
Epoch 20/30
  Training Loss: 0.2531
  Hit Ratio @ 10: 0.500
  Mean Rank: 16.6
--------------------------------------------------


Training Epoch 21/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.6881752014160156, Rank=18/101
User 3912, Movie 318: Score=0.04775451496243477, Rank=58/101
User 1878, Movie 1920: Score=0.8799587488174438, Rank=3/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.450
Mean Rank: 16.2
MRR: 0.119
Epoch 21/30
  Training Loss: 0.2492
  Hit Ratio @ 10: 0.450
  Mean Rank: 16.2
--------------------------------------------------


Training Epoch 22/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.8704597353935242, Rank=6/101
User 3912, Movie 318: Score=0.04305202141404152, Rank=61/101
User 1878, Movie 1920: Score=0.8603580594062805, Rank=4/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.500
Mean Rank: 15.9
MRR: 0.147
Epoch 22/30
  Training Loss: 0.2427
  Hit Ratio @ 10: 0.500
  Mean Rank: 15.9
--------------------------------------------------


Training Epoch 23/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.8192331194877625, Rank=9/101
User 3912, Movie 318: Score=0.04926421865820885, Rank=55/101
User 1878, Movie 1920: Score=0.894798994064331, Rank=2/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.650
Mean Rank: 15.4
MRR: 0.144
Epoch 23/30
  Training Loss: 0.2381
  Hit Ratio @ 10: 0.650
  Mean Rank: 15.4
--------------------------------------------------


Training Epoch 24/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.655774712562561, Rank=15/101
User 3912, Movie 318: Score=0.05851385369896889, Rank=60/101
User 1878, Movie 1920: Score=0.9485018253326416, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.300
Mean Rank: 17.0
MRR: 0.160
Epoch 24/30
  Training Loss: 0.2435
  Hit Ratio @ 10: 0.300
  Mean Rank: 17.0
--------------------------------------------------


Training Epoch 25/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.7790047526359558, Rank=13/101
User 3912, Movie 318: Score=0.06829635053873062, Rank=52/101
User 1878, Movie 1920: Score=0.9454501867294312, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.550
Mean Rank: 14.4
MRR: 0.177
Epoch 25/30
  Training Loss: 0.2317
  Hit Ratio @ 10: 0.550
  Mean Rank: 14.4
--------------------------------------------------


Training Epoch 26/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.6422460675239563, Rank=15/101
User 3912, Movie 318: Score=0.07014188170433044, Rank=52/101
User 1878, Movie 1920: Score=0.9712156653404236, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.450
Mean Rank: 16.0
MRR: 0.178
Epoch 26/30
  Training Loss: 0.2320
  Hit Ratio @ 10: 0.450
  Mean Rank: 16.0
--------------------------------------------------


Training Epoch 27/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.7106415629386902, Rank=14/101
User 3912, Movie 318: Score=0.05613015219569206, Rank=61/101
User 1878, Movie 1920: Score=0.9705905914306641, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.450
Mean Rank: 16.5
MRR: 0.153
Epoch 27/30
  Training Loss: 0.2301
  Hit Ratio @ 10: 0.450
  Mean Rank: 16.5
--------------------------------------------------


Training Epoch 28/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.7642590999603271, Rank=14/101
User 3912, Movie 318: Score=0.05563516169786453, Rank=59/101
User 1878, Movie 1920: Score=0.9794411659240723, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.500
Mean Rank: 16.5
MRR: 0.156
Epoch 28/30
  Training Loss: 0.2245
  Hit Ratio @ 10: 0.500
  Mean Rank: 16.5
--------------------------------------------------


Training Epoch 29/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.7726737260818481, Rank=15/101
User 3912, Movie 318: Score=0.10774926841259003, Rank=50/101
User 1878, Movie 1920: Score=0.946782648563385, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.400
Mean Rank: 17.3
MRR: 0.150
Epoch 29/30
  Training Loss: 0.2225
  Hit Ratio @ 10: 0.400
  Mean Rank: 17.3
--------------------------------------------------


Training Epoch 30/30:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/20 [00:00<?, ?it/s]

User 5608, Movie 3160: Score=0.42996346950531006, Rank=16/101
User 3912, Movie 318: Score=0.04836602881550789, Rank=59/101
User 1878, Movie 1920: Score=0.9466111063957214, Rank=1/101

Validation Summary:
Total test cases: 20
Skipped cases: 0
Valid cases processed: 20
Hit Ratio @ 10: 0.450
Mean Rank: 15.0
MRR: 0.168
Epoch 30/30
  Training Loss: 0.2269
  Hit Ratio @ 10: 0.450
  Mean Rank: 15.0
--------------------------------------------------
Training with unique_per_user_hybrid completed!
Final Hit Ratio @ 10: 0.450
Final Mean Rank: 15.0

FINAL RESULTS SUMMARY
unique_per_user_hybrid         | Hit Ratio: 0.450 | Mean Rank: 15.0

All experiments completed!


In [17]:
print("Testing model performance on test set...")

def precompute_test_candidates(test_ratings, candidate_method="hybrid", num_candidates=100):
    test_users = test_ratings['user_id'].unique()
    precomputed_candidates = {}
    
    print(f"Precomputing candidates for {len(test_users)} test users...")
    
    for user_id in tqdm(test_users, desc="Precomputing test candidates"):
        candidates = generate_candidates(user_id, method=candidate_method, num_candidates=num_candidates)
        precomputed_candidates[user_id] = candidates
    
    return precomputed_candidates

test_candidates = precompute_test_candidates(test_ratings, candidate_method="hybrid", num_candidates=100)

model.eval()
with torch.no_grad():
    print(f"Evaluating on test set...")
    test_hit_ratio, test_mrr, test_mean_rank = validate_model_with_features(
        model, test_ratings, test_candidates, device, 
        total_users_to_test=50, k=10
    )

print(f"TEST SET RESULTS:")
print(f"Hit Ratio @ 10: {test_hit_ratio:.3f}")
print(f"Mean Rank: {test_mean_rank:.1f}")
print(f"MRR: {test_mrr:.3f}")


Testing model performance on test set...
Precomputing candidates for 604 test users...


Precomputing test candidates:   0%|          | 0/604 [00:00<?, ?it/s]

Evaluating on test set...


  0%|          | 0/50 [00:00<?, ?it/s]

User 2898, Movie 2502: Score=0.8982254266738892, Rank=4/101
User 4689, Movie 3793: Score=0.9394497275352478, Rank=1/101
User 3138, Movie 1580: Score=0.1508067548274994, Rank=38/101

Validation Summary:
Total test cases: 50
Skipped cases: 0
Valid cases processed: 50
Hit Ratio @ 10: 0.640
Mean Rank: 15.6
MRR: 0.287
TEST SET RESULTS:
Hit Ratio @ 10: 0.640
Mean Rank: 15.6
MRR: 0.287


In [18]:
class ColdStartRecommender:
    """
    Cold Start Recommendation System for new users
    
    Handles recommendations for users with limited or no interaction history
    by leveraging user demographics, initial ratings, and content-based filtering.
    """
    
    def __init__(self, trained_model, feature_processor, candidate_generator, movies_df):
        self.model = trained_model
        self.feature_processor = feature_processor
        self.candidate_generator = candidate_generator
        self.movies_df = movies_df
        self.device = next(trained_model.parameters()).device
        
    def create_user_features(self, user_demographics):
        """
        Create user feature vector from demographics
        
        Args:
            user_demographics: dict with keys: 'gender', 'age', 'occupation'
                - gender: 'M' or 'F'
                - age: int (1, 18, 25, 35, 45, 50, 56)
                - occupation: int (0-20)
        
        Returns:
            torch.Tensor: User feature vector
        """
        gender_encoded = 1.0 if user_demographics['gender'] == 'M' else 0.0
        
        # Create age one-hot (7 categories)
        age_categories = [1, 18, 25, 35, 45, 50, 56]
        age_onehot = [1.0 if user_demographics['age'] == cat else 0.0 for cat in age_categories]
        
        # Create occupation one-hot (21 categories: 0-20)
        occupation_onehot = [1.0 if user_demographics['occupation'] == i else 0.0 for i in range(21)]
        
        # Combine all features
        feature_vector = [gender_encoded] + age_onehot + occupation_onehot
        
        return torch.tensor(feature_vector, dtype=torch.float32)
    
    def get_similar_users_by_demographics(self, user_demographics, top_k=50):
        """
        Find users with similar demographics for collaborative filtering
        
        Args:
            user_demographics: dict with user demographic info
            top_k: number of similar users to return
            
        Returns:
            list: user_ids of similar users
        """
        similar_users = []
        
        # Simple demographic matching - can be made more sophisticated
        for user_id, cached_features in self.feature_processor.user_features_cache.items():
            # Check gender match (first feature)
            gender_match = (cached_features[0].item() == (1.0 if user_demographics['gender'] == 'M' else 0.0))
            
            # Check age category match (positions 1-7)
            age_categories = [1, 18, 25, 35, 45, 50, 56]
            user_age_idx = age_categories.index(user_demographics['age']) if user_demographics['age'] in age_categories else -1
            if user_age_idx >= 0:
                age_match = cached_features[1 + user_age_idx].item() == 1.0
            else:
                age_match = False
            
            # Check occupation match (positions 8-28)
            occ_match = False
            if 0 <= user_demographics['occupation'] <= 20:
                occ_match = cached_features[8 + user_demographics['occupation']].item() == 1.0
            
            # Score based on matches (prioritize age and occupation)
            score = 0
            if gender_match: score += 1
            if age_match: score += 2
            if occ_match: score += 3
            
            if score >= 2:  # Require at least age or occupation match
                similar_users.append((user_id, score))
        
        # Sort by score and return top_k
        similar_users.sort(key=lambda x: x[1], reverse=True)
        return [user_id for user_id, _ in similar_users[:top_k]]
    
    def generate_cold_start_candidates(self, user_demographics, user_ratings=None, num_candidates=100):
        """
        Generate candidate movies for cold start scenario
        
        Args:
            user_demographics: dict with user demographic info
            user_ratings: list of (movie_id, rating) tuples for initial ratings
            num_candidates: number of candidates to generate
            
        Returns:
            list: candidate movie IDs
        """
        candidates = []
        
        if user_ratings is None or len(user_ratings) == 0:
            # Pure cold start - no ratings yet
            # Use popularity + demographic-based recommendations
            
            # Get popular movies
            popular_candidates = self.candidate_generator.generate_popularity_candidates(
                user_id=-1,  # dummy user_id
                num_candidates=num_candidates//2
            )
            candidates.extend(popular_candidates)
            
            # Get recommendations based on similar users' preferences
            similar_users = self.get_similar_users_by_demographics(user_demographics)
            if similar_users:
                # Get popular movies among similar users
                similar_user_movies = []
                for similar_user_id in similar_users[:10]:  # Top 10 similar users
                    user_movies = self.candidate_generator.user_interacted_items.get(similar_user_id, [])
                    similar_user_movies.extend(user_movies)
                
                # Count frequency and get most popular among similar users
                from collections import Counter
                movie_counts = Counter(similar_user_movies)
                demographic_candidates = [movie_id for movie_id, _ in movie_counts.most_common(num_candidates//2)]
                candidates.extend(demographic_candidates)
        
        else:
            # Warm cold start - user has some initial ratings
            # Use content-based recommendations based on liked movies
            
            liked_movies = [movie_id for movie_id, rating in user_ratings if rating >= 4]
            
            if liked_movies:
                # Content-based recommendations using movie genres
                content_candidates = []
                
                # Get genres of liked movies
                liked_genres = []
                for movie_id in liked_movies:
                    if movie_id in self.candidate_generator.movie_to_genres:
                        liked_genres.extend(self.candidate_generator.movie_to_genres[movie_id])
                
                # Count genre preferences
                from collections import Counter
                genre_preferences = Counter(liked_genres)
                
                # Find movies with similar genres
                for movie_id, genres in self.candidate_generator.movie_to_genres.items():
                    if movie_id not in liked_movies:  # Don't recommend already rated movies
                        score = sum(genre_preferences.get(genre, 0) for genre in genres)
                        if score > 0:
                            content_candidates.append((movie_id, score))
                
                # Sort by content score and take top candidates
                content_candidates.sort(key=lambda x: x[1], reverse=True)
                candidates.extend([movie_id for movie_id, _ in content_candidates[:num_candidates//2]])
            
            # Add some popular movies as backup
            popular_candidates = self.candidate_generator.generate_popularity_candidates(
                user_id=-1,
                num_candidates=num_candidates//2
            )
            candidates.extend(popular_candidates)
        
        # Remove duplicates while preserving order
        seen = set()
        unique_candidates = []
        for movie_id in candidates:
            if movie_id not in seen:
                seen.add(movie_id)
                unique_candidates.append(movie_id)
        
        return unique_candidates[:num_candidates]
    
    def recommend_for_new_user(self, user_demographics, user_ratings=None, num_recommendations=10):
        """
        Generate recommendations for a new user
        
        Args:
            user_demographics: dict with keys 'gender', 'age', 'occupation'
            user_ratings: list of (movie_id, rating) tuples for initial ratings (optional)
            num_recommendations: number of recommendations to return
            
        Returns:
            list: list of (movie_id, title, predicted_score) tuples
        """
        # Create user feature vector
        user_features = self.create_user_features(user_demographics)
        user_features = user_features.unsqueeze(0).to(self.device)  # Add batch dimension
        
        # Generate candidate movies
        candidates = self.generate_cold_start_candidates(
            user_demographics, 
            user_ratings, 
            num_candidates=min(200, len(self.movies_df))
        )
        
        # Score candidates using the NCF model
        movie_scores = []
        
        self.model.eval()
        with torch.no_grad():
            for movie_id in candidates:
                if movie_id in self.feature_processor.movie_features_cache:
                    # Get movie features
                    movie_features = self.feature_processor.get_movie_features(movie_id)
                    movie_features = movie_features.unsqueeze(0).to(self.device)
                    
                    # Predict score using NCF model
                    score = self.model(user_features, movie_features).item()
                    
                    # Get movie title
                    movie_title = self.movies_df[self.movies_df['movie_id'] == movie_id]['title'].iloc[0]
                    
                    movie_scores.append((movie_id, movie_title, score))
        
        # Sort by predicted score and return top recommendations
        movie_scores.sort(key=lambda x: x[2], reverse=True)
        
        # Filter out movies user has already rated
        if user_ratings:
            rated_movie_ids = {movie_id for movie_id, _ in user_ratings}
            movie_scores = [(mid, title, score) for mid, title, score in movie_scores 
                           if mid not in rated_movie_ids]
        
        return movie_scores[:num_recommendations]
    
    def get_onboarding_movies(self, num_movies=10):
        """
        Get diverse, popular movies for new user onboarding/rating collection
        
        Args:
            num_movies: number of movies to return for rating
            
        Returns:
            list: list of (movie_id, title, genres) tuples
        """
        # Get popular movies from different genres for diversity
        popular_movies = self.candidate_generator.generate_popularity_candidates(
            user_id=-1, 
            num_candidates=100
        )
        
        # Group by genres to ensure diversity
        genre_movies = {}
        selected_movies = []
        
        for movie_id in popular_movies:
            if movie_id in self.candidate_generator.movie_to_genres:
                movie_genres = self.candidate_generator.movie_to_genres[movie_id]
                movie_title = self.movies_df[self.movies_df['movie_id'] == movie_id]['title'].iloc[0]
                movie_genres_str = self.movies_df[self.movies_df['movie_id'] == movie_id]['genres'].iloc[0]
                
                # Add to genre groups
                for genre in movie_genres:
                    if genre not in genre_movies:
                        genre_movies[genre] = []
                    genre_movies[genre].append((movie_id, movie_title, movie_genres_str))
        
        # Select diverse movies (one from each genre initially)
        used_genres = set()
        for genre, movies in genre_movies.items():
            if len(selected_movies) < num_movies and genre not in used_genres:
                selected_movies.append(movies[0])  # Take the most popular from this genre
                used_genres.add(genre)
        
        # Fill remaining slots with most popular movies
        for movie_id in popular_movies:
            if len(selected_movies) >= num_movies:
                break
            
            movie_title = self.movies_df[self.movies_df['movie_id'] == movie_id]['title'].iloc[0]
            movie_genres_str = self.movies_df[self.movies_df['movie_id'] == movie_id]['genres'].iloc[0]
            
            movie_tuple = (movie_id, movie_title, movie_genres_str)
            if movie_tuple not in selected_movies:
                selected_movies.append(movie_tuple)
        
        return selected_movies[:num_movies]

# Initialize the cold start recommender with the trained model
cold_start_recommender = ColdStartRecommender(
    trained_model=model,
    feature_processor=feature_processor,
    candidate_generator=candidate_gen,
    movies_df=movies
)

print("Cold Start Recommender initialized successfully!")


Cold Start Recommender initialized successfully!


In [19]:
# DEMONSTRATION: Cold Start Recommendation Examples

print("="*80)
print("COLD START RECOMMENDATION SYSTEM DEMONSTRATION")
print("="*80)

# Example 1: Pure Cold Start - New user with only demographics
print("\n1. PURE COLD START SCENARIO")
print("-" * 50)

new_user_demographics = {
    'gender': 'M',     # Male
    'age': 25,         # 25 years old  
    'occupation': 4    # College/grad student (based on MovieLens occupation codes)
}

print(f"New User Demographics: {new_user_demographics}")

# Get recommendations without any ratings
cold_start_recommendations = cold_start_recommender.recommend_for_new_user(
    user_demographics=new_user_demographics,
    user_ratings=None,  # No ratings yet
    num_recommendations=10
)

print(f"\nTop 10 Cold Start Recommendations:")
for i, (movie_id, title, score) in enumerate(cold_start_recommendations, 1):
    print(f"{i:2d}. {title:<50} (Score: {score:.3f})")

# Example 2: Get onboarding movies for initial rating collection
print(f"\n\n2. ONBOARDING MOVIES FOR RATING COLLECTION")
print("-" * 50)

onboarding_movies = cold_start_recommender.get_onboarding_movies(num_movies=8)

print("Movies to show new user for initial ratings (diverse genres):")
for i, (movie_id, title, genres) in enumerate(onboarding_movies, 1):
    print(f"{i}. {title:<40} | Genres: {genres}")

# Example 3: Warm Cold Start - User has provided some initial ratings
print(f"\n\n3. WARM COLD START SCENARIO")
print("-" * 50)

# Simulate user rating some of the onboarding movies
initial_ratings = [
    (onboarding_movies[0][0], 5),  # Loved the first movie
    (onboarding_movies[1][0], 4),  # Liked the second movie
    (onboarding_movies[2][0], 2),  # Didn't like the third movie
    (onboarding_movies[3][0], 4),  # Liked the fourth movie
]

print("User's initial ratings:")
for movie_id, rating in initial_ratings:
    movie_title = movies[movies['movie_id'] == movie_id]['title'].iloc[0]
    print(f"  {movie_title:<50} - Rating: {rating}/5")

# Get improved recommendations based on initial ratings
warm_recommendations = cold_start_recommender.recommend_for_new_user(
    user_demographics=new_user_demographics,
    user_ratings=initial_ratings,
    num_recommendations=10
)

print(f"\nTop 10 Recommendations after initial ratings:")
for i, (movie_id, title, score) in enumerate(warm_recommendations, 1):
    print(f"{i:2d}. {title:<50} (Score: {score:.3f})")

print(f"\n\n4. COMPARISON: Different User Demographics")
print("-" * 50)

# Example with different demographics
female_user_demographics = {
    'gender': 'F',     # Female
    'age': 45,         # 45 years old
    'occupation': 0    # Other/not specified
}

print(f"Different User Demographics: {female_user_demographics}")

female_recommendations = cold_start_recommender.recommend_for_new_user(
    user_demographics=female_user_demographics,
    user_ratings=None,
    num_recommendations=5
)

print(f"\nTop 5 Recommendations for different demographic:")
for i, (movie_id, title, score) in enumerate(female_recommendations, 1):
    print(f"{i}. {title:<50} (Score: {score:.3f})")

print(f"\n{'='*80}")
print("Cold Start Recommendation Demonstration Complete!")
print(f"{'='*80}")


COLD START RECOMMENDATION SYSTEM DEMONSTRATION

1. PURE COLD START SCENARIO
--------------------------------------------------
New User Demographics: {'gender': 'M', 'age': 25, 'occupation': 4}

Top 10 Cold Start Recommendations:
 1. Platoon (1986)                                     (Score: 0.991)
 2. Doctor Zhivago (1965)                              (Score: 0.988)
 3. Sling Blade (1996)                                 (Score: 0.981)
 4. Maltese Falcon, The (1941)                         (Score: 0.978)
 5. Apocalypse Now (1979)                              (Score: 0.977)
 6. Rebel Without a Cause (1955)                       (Score: 0.976)
 7. Producers, The (1968)                              (Score: 0.969)
 8. Life Is Beautiful (La Vita � bella) (1997)         (Score: 0.967)
 9. X-Men (2000)                                       (Score: 0.966)
10. Akira (1988)                                       (Score: 0.960)


2. ONBOARDING MOVIES FOR RATING COLLECTION
-------------------------