### Cold Start Analysis for Recommender Systems ###

Recommender systems struggle to provide meaningful recommendations in cold-start scenarios, where new users or items have little to no interaction data. This notebook explores strategies to address these challenges through simulations and performance analysis.


In [24]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import time

import os, sys
import time
import pandas as pd
import matplotlib.pyplot as plt
# Add the parent directory (one level up) to the Python path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

import pipeline

# Load datasets
# Neither file is read with a header row: column names are supplied via
# `names=`, so the layouts are spelled out here once.
RATING_COLUMNS = ['user', 'item', 'rating', 'timestamp']
ITEM_INFO_COLUMNS = ['item', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
GENRE_COLUMNS = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Tab-separated ratings, one row per (user, item) rating event
ratings = pd.read_csv('../storage/u.data', sep='\t', names=RATING_COLUMNS)

# Pipe-separated item metadata, decoded as latin-1
metadata = pd.read_csv(
    '../storage/u.item',
    sep='|',
    encoding='latin-1',
    names=ITEM_INFO_COLUMNS + GENRE_COLUMNS,
)

# Basic exploratory analysis
print(f"Number of users: {ratings['user'].nunique()}")
print(f"Number of movies: {ratings['item'].nunique()}")
print(f"Number of ratings: {len(ratings)}")

# Join ratings with metadata (default inner join on the shared 'item' key)
ratings_metadata = pd.merge(ratings, metadata, on='item')
ratings_metadata.head()


Number of users: 943
Number of movies: 1682
Number of ratings: 100000


Unnamed: 0,user,item,rating,timestamp,title,release_date,video_release_date,IMDb_URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,875747190,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,883888671,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,879138235,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,876503793,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Function to simulate a single new user's ratings
def generate_new_user_ratings(movie_ids, num_ratings=None, start_time=None, end_time=None, user_id=944, seed=None):
    """
    Generate ratings for a new user with random movies, ratings, and timestamps.

    Parameters
    ----------
    movie_ids : sequence
        Pool of item IDs to draw from. NumPy arrays and pandas Series are
        converted with ``.tolist()``; any other iterable (list, tuple, range)
        is converted with ``list()``. Movies are sampled without replacement
        by position, so a pool containing duplicate IDs can yield the same ID
        more than once.
    num_ratings : int, optional
        Number of ratings to generate. When omitted, a random integer between
        1 and 10 (inclusive) is used. Always capped at the size of the pool.
    start_time, end_time : int, optional
        Unix timestamps (seconds). Generated timestamps satisfy
        ``start_time <= t < end_time``. Defaults: five years before now, and now.
    user_id : int, optional
        Value written to the ``user`` column of every row (default 944).
    seed : int, optional
        When given, all random draws come from private generators seeded with
        this value, so the same arguments always produce the same frame and
        the global ``random`` / ``np.random`` state is left untouched. When
        omitted, the global generators are used.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``item``, ``rating`` (1-5) and ``timestamp``, sorted
        by ``timestamp`` ascending with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``start_time`` is not earlier than ``end_time``, or if
        ``num_ratings`` is negative.
    """
    # Read the clock once so both default bounds derive from the same instant
    now = int(time.time())
    if start_time is None:
        start_time = now - (5 * 365 * 24 * 60 * 60)  # 5 years ago
    if end_time is None:
        end_time = now

    # Validate timestamp range
    if start_time >= end_time:
        raise ValueError("start_time must be earlier than end_time")

    # The `random` module and `np.random` expose the same methods as their
    # seeded instance counterparts, so the sampling code below is shared.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    # Set default number of ratings
    if num_ratings is None:
        num_ratings = py_rng.randint(1, 10)
    elif num_ratings < 0:
        raise ValueError("num_ratings must be non-negative")

    # Materialise the pool as a plain list; plain lists/ranges have no .tolist()
    pool = movie_ids.tolist() if hasattr(movie_ids, 'tolist') else list(movie_ids)

    # Ensure we don't exceed available movies
    num_ratings = min(num_ratings, len(pool))

    # Randomly select movies (without replacement)
    selected_movies = py_rng.sample(pool, num_ratings)

    # Generate random ratings (1 to 5); the upper bound of randint is exclusive
    rating_values = np_rng.randint(1, 6, size=num_ratings)

    # Generate random timestamps within range. int64 is requested explicitly
    # because the platform default integer can be 32-bit, which is too narrow
    # for Unix timestamps past 2038.
    timestamps = np_rng.randint(start_time, end_time, size=num_ratings, dtype=np.int64)

    # Create DataFrame
    new_user_df = pd.DataFrame({
        'user': [user_id] * num_ratings,
        'item': selected_movies,
        'rating': rating_values,
        'timestamp': timestamps
    })

    # Sort by timestamp
    return new_user_df.sort_values('timestamp').reset_index(drop=True)

# Function to simulate multiple new users
def create_multiple_new_users(movie_ids, num_users, start_time=None, end_time=None,
                              first_user_id=944, num_ratings=None):
    """
    Create multiple new users, each with random ratings and timestamps.

    Parameters
    ----------
    movie_ids : sequence
        Pool of item IDs, forwarded unchanged to ``generate_new_user_ratings``.
    num_users : int
        Number of users to simulate; must be greater than 0.
    start_time, end_time : int, optional
        Timestamp bounds forwarded to ``generate_new_user_ratings``.
    first_user_id : int, optional
        ID given to the first simulated user; the following users receive
        consecutive IDs. Defaults to 944, the value that was previously
        hard-coded.
    num_ratings : int, optional
        Number of ratings per user, forwarded to ``generate_new_user_ratings``.
        ``None`` (the default) lets that function pick a count for each user
        independently.

    Returns
    -------
    pandas.DataFrame
        All users' rating frames concatenated in user-ID order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``num_users`` is not greater than 0.
    """
    # Validate inputs
    if num_users <= 0:
        raise ValueError("num_users must be greater than 0")

    # One frame per simulated user, with consecutive IDs from first_user_id
    per_user_frames = [
        generate_new_user_ratings(
            movie_ids,
            num_ratings=num_ratings,
            start_time=start_time,
            end_time=end_time,
            user_id=first_user_id + offset,
        )
        for offset in range(num_users)
    ]

    # Combine all users' ratings into a single DataFrame
    return pd.concat(per_user_frames, ignore_index=True)


In [34]:
# Seed both random sources used by the simulation functions (`random` and
# `np.random`) so this cell produces the same synthetic users on every
# Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Sorted array of the distinct item IDs present in the ratings data
movie_ids = np.unique(ratings['item'])

# Simulate a single new user
new_user_df = generate_new_user_ratings(movie_ids)

# Simulate multiple new users
multiple_users_df = create_multiple_new_users(movie_ids, num_users=5)

# Number of ratings generated for each simulated user
multiple_users_df['user'].value_counts()


user
948    10
947     8
944     6
946     5
945     4
Name: count, dtype: int64