<h1>Assignment 2</h1>

In [1]:
import os
import pandas as pd
import numpy as np

LINKS_PATH = os.path.join(os.getcwd(), 'movie', 'links.csv')
TAGS_PATH = os.path.join(os.getcwd(), 'movie', 'tags.csv')
MOVIES_PATH = os.path.join(os.getcwd(), 'movie', 'movies.csv')
RATINGS_PATH = os.path.join(os.getcwd(), 'movie', 'ratings.csv')

def load_data(path):
    return pd.read_csv(path)

<h4>Loading data</h4>

In [2]:
ratings = load_data(RATINGS_PATH)

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


<h2>Part A</h2>

<p>1. The first aggregation approach is the average method. The main idea behind this
approach is that all members are considered equals. So, the rating of an item for a group
of users will be given be averaging the scores of an item across all group members.</p>

<p>2. The second aggregation method is the least misery method, where one member can act
as a veto for the rest of the group. In this case, the rating of an item for a group of users is
computed as the minimum score assigned to that item in all group members
recommendations.</p>

<p>First let's get helper functions for predicting movie ratings from the previous assignment and prepare data</p>

In [4]:
# Helper functions from the previous assignment

# min_common_percentage = 0.1
min_common_items = 2

# Implementation of Pearson correlation
def get_similarity_between_two_items(user1, user2):
    # Find common items
    common_items = user1.notna() & user2.notna()
    
    common_items_count = common_items.sum()
    if common_items_count == 0:
        return 0  # No common items, no correlation
    
    # We implement a treshold to avoid meaningless correlations
    # Approach 1: Common items percentage
    # total_items_user1 = user1.count()
    # total_items_user2 = user2.count()
    # common_percentage_user1 = common_items_count / total_items_user1
    # common_percentage_user2 = common_items_count / total_items_user2
    # if common_percentage_user1 < min_common_percentage or common_percentage_user2 < min_common_percentage:
    #     return 0  # Not enough common items for a meaningful correlation

    # Approach 2: Common items count
    if common_items_count < min_common_items:
        return 0  # Not enough common items for a meaningful correlation
    
    # Get the common items
    user1_common = user1[common_items]
    user2_common = user2[common_items]
    
    # Pearson correlation requires at least 2 common items
    if len(user1_common) < 2:
        return 0 
    
    # Calculate the Pearson correlation coefficient
    correlation = user1_common.corr(user2_common)
    
    if np.isnan(correlation):
        return 0  # Handle NaN values
    
    return max(correlation, 0)  # Return a non-negative correlation

def predict_rating(user_id, movie_id, ratings_by_users):
    # If the user has already rated the movie, return the known rating
    if not np.isnan(ratings_by_users.loc[user_id, movie_id]):
        return ratings_by_users.loc[user_id, movie_id]

    # Get the users who rated the movie
    users_who_rated = ratings_by_users[ratings_by_users[movie_id].notna()].index

    # Calculate the similarities and the weighted ratings
    similarities = [get_similarity_between_two_items(ratings_by_users.loc[user_id], ratings_by_users.loc[other_user_id]) for other_user_id in users_who_rated]
    weighted_ratings = [similarity * (ratings_by_users.loc[other_user_id, movie_id] - ratings_by_users.loc[other_user_id].mean()) for other_user_id, similarity in zip(users_who_rated, similarities)]

    # If no one else rated the movie, return the mean rating of the user
    if sum(similarities) == 0:
        return ratings_by_users.loc[user_id].mean()

    # Return the weighted average rating, ensuring it is within the range of 0 - 5
    return max(min((sum(weighted_ratings) / sum(similarities)) + ratings_by_users.loc[user_id].mean(), 5), 0)


In [5]:
# Copying the ratings dataframe to a new dataframe for further processing
movie_ratings = ratings.copy()

In [6]:
# Making a pivot table to get the ratings of each movie by each user
ratings_by_users = movie_ratings.pivot_table(index='userId', columns='movieId', values='rating', aggfunc='first')
ratings_by_users.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


<p>Implementing average agregation and least misery approaches for a set of users<p>

In [7]:
# The function has some extensive parameters to allow for more flexibility
# userIds: The ids of the users we want to predict the ratings for
# all_users_ratings: The movie ratings by all users
# predict_na: If True, the function will predict the ratings of the movies that the users haven't rated (True by default)
# strategy: The strategy to use for the prediction. Can be 'average' or 'least_misery'
# min_num_of_rated_movies: The minimum number rates a movie should have to be considered

def group_rating_prediction(userIds, all_users_ratings, strategy='average', useFullDataset=False):
    # Get only the data about the users we are interested in
    group_users_ratings = all_users_ratings.loc[userIds]

    # Remove the movies that no one has rated
    group_users_ratings = group_users_ratings.dropna(axis=1, how='all')

    # Dataset to use for prediction
    if(useFullDataset):
        dataset_for_prediction = all_users_ratings.copy()
    else:
        dataset_for_prediction = group_users_ratings.copy()

    # Predict the individual ratings of the movies that the users haven't rated
    for user_id in userIds:
        for movie_id in group_users_ratings.columns:
            group_users_ratings.loc[user_id, movie_id] = predict_rating(user_id, movie_id, dataset_for_prediction)
    
    if(strategy == 'average'):
        # Get the average rating for every movie
        movie_ratings_average = group_users_ratings.mean(axis=0)

    if(strategy == 'least_misery'):
        # Get the least misery rating for every movie
        movie_ratings_average = group_users_ratings.min(axis=0)

    # Sort the movies by their average rating
    movie_ratings_average = movie_ratings_average.sort_values(ascending=False)

    return movie_ratings_average


<p>Produce a group of 3 users, and for this group, show the top-10 recommendations, i.e.,
the 10 movies with the highest prediction scores that (i) the average method suggests,
and (ii) the least misery method suggest</p>

In [8]:
# Predictions for the given group of users
userIds = [1, 2, 3]
average_predictions = group_rating_prediction(userIds, ratings_by_users, strategy='average')
least_misery_predictions = group_rating_prediction(userIds, ratings_by_users, strategy='least_misery')

In [9]:
# Show the top 10 movies for the given group of users by average rating
average_predictions.head(10)

movieId
2851     4.649425
70946    4.649425
5181     4.649425
849      4.649425
5746     4.649425
5919     4.649425
6835     4.649425
7991     4.649425
3703     4.649425
4518     4.649425
dtype: float64

In [10]:
# Show the top 10 movies for the given group of users by least misery rating
least_misery_predictions.head(10)

movieId
7991     3.948276
3703     3.948276
5919     3.948276
5764     3.948276
5746     3.948276
26409    3.948276
849      3.948276
5181     3.948276
2851     3.948276
1587     3.948276
dtype: float64

<p> The ratings are the same for all of the top 10 suggestions. This is caused by the fact that we used the specified group of users to predict missing movie values. There were a lot of movies rated only by one user and in this case the prediction fucntion returns the mean rating of the user<p>

<p>To avoid this kind of situation we can use the whole dataset to predict the missing movie ratings(since we use weighted ratings based on person correlation they should be accurate)</p>

<p> Let's do it </p>

In [11]:
# Using the full dataset for prediction
userIds = [1, 2, 3]
q = group_rating_prediction(userIds, ratings_by_users, strategy='average', useFullDataset=True)
least_misery_predictions = group_rating_prediction(userIds, ratings_by_users, strategy='least_misery', useFullDataset=True)


In [12]:
average_predictions.head(10)

movieId
2851     4.649425
70946    4.649425
5181     4.649425
849      4.649425
5746     4.649425
5919     4.649425
6835     4.649425
7991     4.649425
3703     4.649425
4518     4.649425
dtype: float64

In [13]:
least_misery_predictions.head(10)

movieId
70946    5.000000
3703     4.980555
1587     4.500000
4518     4.026903
2288     4.000000
2851     3.948276
6835     3.948276
5181     3.948276
5919     3.948276
5764     3.948276
dtype: float64

<p>As we can see now the ratings look better, but it required more computing to produce the results</p>

<h2>Part B</h2>

Define a way for counting the disagreements between the users in a group, and propose a method that takes disagreements into account when computing suggestions for the group

 <h3>Idea:</h3>

 We propose a user-based way of computing suggestions with considering disagreements. The basic idea is to assign weights to each user in a group based on how closely their preferences align with others. The rationale behind this is that users who frequently disagree with the majority of the group should have less influence on the final recommendations. 

 By assigning higher importance to users whose ratings align with the majority, we aim to filter out the impact of individuals who may provide random or intentionally false ratings, or those whose preferences significantly differs from the majority of the group. So with this, this method is able to ignore outliers, as it will assign near-zero weight for these users.

In this way, the ratings from these users may not significantly impact the outcome, which is a potential drawback for them. However, this approach could prove advantageous for the majority of the group, ultimately leading to higher overall satisfaction.

<h3>Details of implementation</h3>

One way to capture disagreements is to consider the variance or diversity in user ratings within the group. The idea is to, calculate a disagreement matrix, for a group of users, that represents the differences in ratings between each pair of users. One way to measure this difference is by computing the squared differences between their ratings for common items. When computing recommendations for the group, we use a weighted averaging aggregiation, where the weights are inversely proportional to the disagreement factor for a user, which is the average of the mentioned squared differences with all other users. The rationale is that users who tend to disagree less should have more influence on the group recommendations.


Before calculating this disagreement matrix, we make the individual predictions for all users in the group.

In [14]:
def calculate_disagreement_matrix(ratings_by_relevant_user_matrix: pd.DataFrame) -> np.ndarray:
    # Get the number of users and movies
    number_of_users = ratings_by_relevant_user_matrix.shape[0]

    # Initialize the disagreement matrix
    disagreement_matrix = np.zeros((number_of_users, number_of_users))

    # Calculate the disagreement matrix
    for user1 in range(number_of_users):
        for user2 in range(user1 + 1, number_of_users):
            # Get the ratings of the two users
            user1_ratings = ratings_by_relevant_user_matrix.iloc[user1]
            user2_ratings = ratings_by_relevant_user_matrix.iloc[user2]

            # Get the common items (there should be no NaN values because we predict them, but we check just in case)
            common_items = user1_ratings.notna() & user2_ratings.notna()

            if common_items.sum() == 0:
                continue
            
            # Calculate the squared difference between the ratings
            disagreement = ((user1_ratings[common_items] - user2_ratings[common_items]) ** 2).sum() / common_items.sum()

            # Add the disagreement to the matrix
            disagreement_matrix[user1, user2] = disagreement
            disagreement_matrix[user2, user1] = disagreement

    return disagreement_matrix

def group_rating_prediction_with_disagreement(userIds, ratings_by_users):
    # Get only the data about the users we are interested in
    ratings_by_relevant_users = ratings_by_users.loc[userIds]

    # Remove the movies that no one has rated
    ratings_by_relevant_users = ratings_by_relevant_users.dropna(axis=1, how='all')

    # Predict the individual ratings of the movies that the users haven't rated
    for user_id in userIds:
        for movie_id in ratings_by_relevant_users.columns:
            ratings_by_relevant_users.loc[user_id, movie_id] = predict_rating(user_id, movie_id, ratings_by_users.copy())
    
    # Calculate the disagreement matrix
    disagreement_matrix = calculate_disagreement_matrix(ratings_by_relevant_users)

    # user weights will be inversely proportional to the average of their disagreements
    weights = 1 / (1 + disagreement_matrix.mean(axis=1))

    # the more disagreed the user was with the others the less weight his rating will have
    weighted_average = pd.Series(index=ratings_by_relevant_users.columns)
    for movie_id in ratings_by_relevant_users.columns:
        weighted_average[movie_id] = (ratings_by_relevant_users[movie_id] * weights).sum() / len(userIds)
    
    # transform the ratings to the range of 0 - 5
    weighted_average = weighted_average * 5 / weighted_average.max()
    
    return weighted_average.sort_values(ascending=False)

In [15]:
group_rating_prediction_with_disagreement(userIds, ratings_by_users).head(10)

movieId
70946    5.000000
3703     4.992078
1587     4.824318
2288     4.714071
101      4.577329
6835     4.571495
5746     4.571495
5181     4.571495
5919     4.571495
2502     4.460394
dtype: float64