In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

In [27]:
# Ratings matrix consumed by filter_users / filter_ratings below (the filter_users
# docstring describes it as movies on rows, users on columns, ratings as values).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('../sparse_ratings.pkl')

In [28]:
# Per-movie metadata; the code below uses its `movieId` and `suitable_genre` columns.
movies_info = pd.read_csv("../suitable_movies.csv")

In [29]:
# Index the metadata by movie id so rated movies can be matched with `movies_info.index.isin(...)`.
# NOTE(review): this mutates `movies_info` in place and is not idempotent - re-running this
# cell without re-reading the CSV fails because `movieId` is no longer a column.
movies_info.set_index("movieId", inplace=True)

In [30]:
def filter_users(df=None, n=1, categories=4, movies_info=None):
    """
    Find the users who have rated enough movies per genre to be used for evaluation.

    A user qualifies when both conditions hold:
      * at least `categories` distinct genres each contain at least `n` movies
        rated by the user, and
      * at least one genre contains more than `n` rated movies, so that something
        is left to predict after `n` movies per genre have been set aside.

    A rating of 0.0 means "not rated". Any other value counts as rated
    (note that NaN compares unequal to 0.0 and is therefore counted as rated).
    Only movies found in the index of `movies_info` are counted. Genres are taken
    from the data itself, so a genre label outside a fixed list (or a missing
    label) no longer raises KeyError; missing labels are simply not counted.

    Args:
    df (DataFrame): User-item matrix with users as columns and movies as rows, and ratings as values.
        When omitted, the notebook-level `df` is looked up at call time.
    n (int): Minimum number of rated movies a genre needs in order to count.
    categories (int): Minimum number of genres that must reach `n` rated movies.
    movies_info (DataFrame): Data about movies, indexed by movie id, with a
        'suitable_genre' column. When omitted, the notebook-level `movies_info`
        is looked up at call time.

    Returns:
    list: Column labels of `df` (user ids) that qualify, in column order.
    """
    # Resolve defaults at call time so a re-loaded `df` / `movies_info` is picked up
    # without having to re-run this definition cell.
    if df is None:
        df = globals()["df"]
    if movies_info is None:
        movies_info = globals()["movies_info"]

    genre_by_movie = movies_info['suitable_genre']
    valid_users = []
    for user in df.columns:
        # Select the user's column first, then filter it: this touches one column
        # instead of boolean-indexing the whole matrix for every user, and it
        # never modifies `df`, so no defensive copy is needed.
        user_column = df[user]
        rated_ids = user_column[user_column != 0.0].index

        # Number of rated movies per genre (NaN genres are dropped by value_counts).
        genre_counts = genre_by_movie[genre_by_movie.index.isin(rated_ids)].value_counts()

        has_enough_genres = int((genre_counts >= n).sum()) >= categories
        # Some genre exceeds n -> at least one rated movie remains to be predicted.
        has_spare_movie = bool((genre_counts > n).any())
        if has_enough_genres and has_spare_movie:
            valid_users.append(user)
    return valid_users

In [31]:
from sklearn.utils import shuffle


def filter_ratings(valid_users, df=None, n=2, categories=4, movies_info=None, random_state=None):
    """
    Split each user's rated movies into a small per-genre sample and the remainder.

    For every user, the genres in which the user rated at least `n` movies are
    collected, up to `categories` of them are chosen at random, and `n` random
    movies are drawn from each chosen genre. Those movies form `testMovies`;
    every other movie the user rated (including movies missing from
    `movies_info`) goes to `evalMovies`.

    Previously the genres were cut from a fixed-order list, so the first genres
    (Action, Adventure, ...) were always preferred; the genre choice is now
    actually random. A rating of 0.0 means "not rated".

    Args:
    valid_users (iterable): Column labels of `df` to process, e.g. the result of
        filter_users called with the same `n` and `categories`.
    df (DataFrame): User-item matrix with users as columns and movies as rows, and ratings as values.
        When omitted, the notebook-level `df` is looked up at call time.
    n (int): Number of movies drawn per chosen genre.
    categories (int): Maximum number of genres to draw from.
    movies_info (DataFrame): Data about movies, indexed by movie id, with a
        'suitable_genre' column. When omitted, the notebook-level `movies_info`
        is looked up at call time.
    random_state (int | numpy.random.Generator | None): Seed for the random
        choices; pass a value to make the split reproducible.

    Returns:
    DataFrame: Indexed by 'userId', with list-valued columns 'testMovies' and
        'evalMovies' holding movie ids.
    """
    # Resolve defaults at call time so a re-loaded `df` / `movies_info` is picked up
    # without having to re-run this definition cell.
    if df is None:
        df = globals()["df"]
    if movies_info is None:
        movies_info = globals()["movies_info"]

    rng = np.random.default_rng(random_state)
    genre_by_movie = movies_info['suitable_genre']

    records = []
    for user_id in valid_users:
        user_column = df[user_id]
        rated_ids = user_column[user_column != 0.0].index
        user_genres = genre_by_movie[genre_by_movie.index.isin(rated_ids)]

        # Genres with at least n rated movies. They are sorted first so that a
        # given seed always shuffles the same starting order.
        genre_counts = user_genres.value_counts()
        eligible_genres = sorted(genre_counts[genre_counts >= n].index)

        # Shuffle the eligible genres themselves before cutting to `categories`;
        # this is what makes the genre choice random.
        order = rng.permutation(len(eligible_genres))
        chosen_genres = [eligible_genres[i] for i in order[:categories]]

        # Every chosen genre contributes exactly n randomly picked movies.
        test_movies = []
        for genre in chosen_genres:
            candidates = user_genres[user_genres == genre].index.to_numpy()
            test_movies.extend(rng.permutation(candidates)[:n].tolist())

        # Set membership instead of scanning an array for every rated movie.
        held_out = set(test_movies)
        eval_movies = [movie_id for movie_id in rated_ids.tolist() if movie_id not in held_out]

        records.append([user_id, test_movies, eval_movies])

    return_df = pd.DataFrame(data=records, columns=['userId', 'testMovies', 'evalMovies'])
    return return_df.set_index('userId')

In [None]:
import time

# Settings grid: n = movies sampled per genre, cat = number of genres required.
all_ns = list(range(2, 5))
all_cats = list(range(5, 12))
for n in all_ns:
    # The timer restarts for every n, so the times printed below are cumulative
    # over the categories processed so far for that n.
    beginning = time.time()
    print(beginning)
    for cat in all_cats:
        # Step 1: users that qualify for this (n, cat) combination.
        valid_users = filter_users(df=df, n=n, categories=cat)
        # Step 2: per-user split into sampled movies and remaining movies.
        eval_df = filter_ratings(valid_users, df=df, n=n, categories=cat)
        # The split is pickled into the working directory under this name.
        split_name = f'n{n}_c{cat}'
        eval_df.to_pickle(split_name)
        print(f'{split_name}: {eval_df}')
        print(f'time: {time.time() - beginning}')

1714557091.9434223
