<h1>Assignment 2</h1>

In [95]:
import os
import pandas as pd
import numpy as np

LINKS_PATH = os.path.join(os.getcwd(), 'movie', 'links.csv')
TAGS_PATH = os.path.join(os.getcwd(), 'movie', 'tags.csv')
MOVIES_PATH = os.path.join(os.getcwd(), 'movie', 'movies.csv')
RATINGS_PATH = os.path.join(os.getcwd(), 'movie', 'ratings.csv')

def load_data(path):
    return pd.read_csv(path)

<h4>Loading data</h4>

In [96]:
ratings = load_data(RATINGS_PATH)

In [97]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


<h2>Task 1 + 2</h2>

<p>1. The first aggregation approach is the average method. The main idea behind this
approach is that all members are considered equals. So, the rating of an item for a group
of users will be given be averaging the scores of an item across all group members.</p>

<p>2. The second aggregation method is the least misery method, where one member can act
as a veto for the rest of the group. In this case, the rating of an item for a group of users is
computed as the minimum score assigned to that item in all group members
recommendations.</p>

<p>First let's get helper functions for predicting movie ratings from the previous assignment and prepare data</p>

In [98]:
# Helper functions from the previous assignment

# min_common_percentage = 0.1
min_common_items = 2

# Implementation of Pearson correlation
def get_similarity_between_two_items(user1, user2):
    # Find common items
    common_items = user1.notna() & user2.notna()
    
    common_items_count = common_items.sum()
    if common_items_count == 0:
        return 0  # No common items, no correlation
    
    # We implement a treshold to avoid meaningless correlations
    # Approach 1: Common items percentage
    # total_items_user1 = user1.count()
    # total_items_user2 = user2.count()
    # common_percentage_user1 = common_items_count / total_items_user1
    # common_percentage_user2 = common_items_count / total_items_user2
    # if common_percentage_user1 < min_common_percentage or common_percentage_user2 < min_common_percentage:
    #     return 0  # Not enough common items for a meaningful correlation

    # Approach 2: Common items count
    if common_items_count < min_common_items:
        return 0  # Not enough common items for a meaningful correlation
    
    # Get the common items
    user1_common = user1[common_items]
    user2_common = user2[common_items]
    
    # Pearson correlation requires at least 2 common items
    if len(user1_common) < 2:
        return 0 
    
    # Calculate the Pearson correlation coefficient
    correlation = user1_common.corr(user2_common)
    
    if np.isnan(correlation):
        return 0  # Handle NaN values
    
    return max(correlation, 0)  # Return a non-negative correlation

def predict_rating(user_id, movie_id, ratings_by_users):
    # If the user has already rated the movie, return the known rating
    if not np.isnan(ratings_by_users.loc[user_id, movie_id]):
        return ratings_by_users.loc[user_id, movie_id]

    # Get the users who rated the movie
    users_who_rated = ratings_by_users[ratings_by_users[movie_id].notna()].index

    # Calculate the similarities and the weighted ratings
    similarities = [get_similarity_between_two_items(ratings_by_users.loc[user_id], ratings_by_users.loc[other_user_id]) for other_user_id in users_who_rated]
    weighted_ratings = [similarity * (ratings_by_users.loc[other_user_id, movie_id] - ratings_by_users.loc[other_user_id].mean()) for other_user_id, similarity in zip(users_who_rated, similarities)]

    # If no one else rated the movie, return the mean rating of the user
    if sum(similarities) == 0:
        return ratings_by_users.loc[user_id].mean()

    # Return the weighted average rating, ensuring it is within the range of 0 - 5
    return max(min((sum(weighted_ratings) / sum(similarities)) + ratings_by_users.loc[user_id].mean(), 5), 0)


In [99]:
# Copying the ratings dataframe to a new dataframe for further processing
movie_ratings = ratings.copy()

In [100]:
# Making a pivot table to get the ratings of each movie by each user
ratings_by_users = movie_ratings.pivot_table(index='userId', columns='movieId', values='rating', aggfunc='first')
ratings_by_users.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


<p>Implementing average agregation and least misery approaches for a set of users<p>

In [101]:
def group_rating_prediction(userIds, ratings_by_users, strategy='average'):
    # Get only the data about the users we are interested in
    ratings_by_users = ratings_by_users.loc[userIds]

    # Remove the movies that no one has rated
    ratings_by_users = ratings_by_users.dropna(axis=1, how='all')

    # Fill the missing ratings of every user by predicting them
    for user_id in userIds:
        for movie_id in ratings_by_users.columns:
            ratings_by_users.loc[user_id, movie_id] = predict_rating(user_id, movie_id, ratings_by_users)
    

    if(strategy == 'average'):
        # Get the average rating for every movie
        movie_ratings_average = ratings_by_users.mean(axis=0)

    if(strategy == 'least_misery'):
        # Get the least misery rating for every movie
        movie_ratings_average = ratings_by_users.min(axis=0)

    # Sort the movies by their average rating
    movie_ratings_average = movie_ratings_average.sort_values(ascending=False)

    return movie_ratings_average


<p>Produce a group of 3 users, and for this group, show the top-10 recommendations, i.e.,
the 10 movies with the highest prediction scores that (i) the average method suggests,
and (ii) the least misery method suggest</p>

In [102]:
# Predictions for the given group of users
userIds = [1, 2, 3]
average_predictions = group_rating_prediction(userIds, ratings_by_users, strategy='average')
least_misery_predictions = group_rating_prediction(userIds, ratings_by_users, strategy='least_misery')

  c /= stddev[:, None]
  c /= stddev[None, :]


  c /= stddev[:, None]
  c /= stddev[None, :]


In [103]:
# Show the top 10 movies for the given group of users by average rating
average_predictions.head(10)

movieId
4518     5.000000
5181     5.000000
7991     5.000000
2851     5.000000
5919     5.000000
70946    5.000000
5746     5.000000
3703     5.000000
6835     5.000000
26409    4.833333
dtype: float64

In [104]:
# Show the top 10 movies for the given group of users by least misery rating
least_misery_predictions.head(10)

movieId
2851     5.0
5746     5.0
5919     5.0
6835     5.0
7991     5.0
4518     5.0
3703     5.0
5181     5.0
70946    5.0
26409    4.5
dtype: float64