# Assignment 2 - DATA.ML.360

In [176]:
%pip install pandas
%pip install scipy

import pandas as pd
import numpy as np
from functools import lru_cache
from scipy.spatial.distance import cosine

You should consider upgrading via the '/Users/laurira/uni/recsys/assignment2/a2_venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/laurira/uni/recsys/assignment2/a2_venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


#### We start by creating the rating matrix and adding fuctionalities from assignment 1.

In [177]:
# Read the data file and see how it looks like
df = pd.read_csv('u.data', sep='\t', header=None)

# Add column names and check few rows of the dataset
df.columns = ["user_id", "item_id", "rating", "timestamp"]
df = df.drop("timestamp", axis=1)

# Create a matrix from the ratings. Each row represents an user and each column an item/movie.
rating_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

# Cahce the results of the pearson correlations so that they don't need to be calculated each time.
@lru_cache(maxsize=None)
# Calculate the Pearson correlation between two users ratings
def pearson_correlation(user1_id, user2_id):
    # Get common movies that both users have rated
    common_movies = rating_matrix.columns[(rating_matrix.loc[user1_id] > 0) & (rating_matrix.loc[user2_id] > 0)]

    # Check if the common_movies array is empty
    if len(common_movies) == 0:
        return 0 

    # Get the ratings for the given users matching the common movies
    user1_data = rating_matrix.loc[user1_id, common_movies]
    user2_data = rating_matrix.loc[user2_id, common_movies]

    # Compute the Pearson correlation between the ratings of the two users
    pearson_correlation = np.corrcoef(user1_data, user2_data)[0][1]

    # Check if the correlation is a valid number, else return 0 (no correlation)
    # (cases where all the values are the same -> division happens with 0 -> we get nan)
    if np.isnan(pearson_correlation):
        return 0

    return pearson_correlation

def predict_rating(user_id, item_id):
    # Get the users' ratings for the active user
    active_user_ratings = rating_matrix.loc[user_id]

    # Find neighbor ids who have rated the same item
    item_ratings = rating_matrix[item_id]
    neighbors = item_ratings[item_ratings > 0].index

    # Calculate Pearson correlations for each neighbor
    neighbor_correlations = {}
    for neighbor_id in neighbors:
        correlation = pearson_correlation(user_id, neighbor_id)
        neighbor_correlations[neighbor_id] = correlation
    
    # Get the top 10 closest matching neighbors
    top_neighbors = sorted(neighbor_correlations, key=neighbor_correlations.get, reverse=True)[:10]

    # Initialize variables for prediction
    weighted_rating_sum = 0
    similarity_sum = 0

    # Calculate the prediction for active user
    for neighbor_id in top_neighbors:

        # Get the neighbors rating for the chosen item
        neighbor_item_rating = rating_matrix.at[neighbor_id, item_id]

        # Get every given rating of the neighbor
        all_neighbor_ratings = rating_matrix.loc[neighbor_id][rating_matrix.loc[neighbor_id] > 0]

        # Calculate the difference between the chosen item rating and the user average
        neighbor_mean = np.mean(all_neighbor_ratings)
        rating_difference = neighbor_item_rating - neighbor_mean

        # Combine the rating differences.
        # The neighbor similarity calculated earlier is used as the weight.
        weighted_rating_sum += rating_difference * neighbor_correlations[neighbor_id]
        similarity_sum += abs(neighbor_correlations[neighbor_id])

    # Avoid the case where we might divide by zero
    if similarity_sum == 0:
        return 0
    
    # Calculate the prediction for the active user
    active_user_mean = np.mean(active_user_ratings)
    prediction = active_user_mean + (weighted_rating_sum / similarity_sum)

    return prediction

def create_recommendations(target_user):
    # movie_recommendations = {}
    recommendations_data = []

    # Go through all the movies
    for movie_id in rating_matrix.columns:
        # Calculate a prediction for a movie that the target user hasn't rated yet
        if rating_matrix.at[target_user, movie_id] == 0:
            prediction = predict_rating(target_user, movie_id)
            # movie_recommendations[movie_id] = prediction
            recommendations_data.append((target_user, movie_id, prediction))

    # Get the top 10 matching movies from the recommendations and include the ratings
    # top_recommendations = sorted(movie_recommendations, key=movie_recommendations.get, reverse=True)[:10]
    # top_recommendations = sorted(movie_recommendations.items(), key=lambda x: x[1], reverse=True)[:10]
    # recommendations_df = pd.DataFrame(recommendations_data, columns=['user_id', 'movie_id', 'rating'])
    # top_recommendations = recommendations_df.sort_values(by='rating', ascending=False).head(10)
    top_recommendations = sorted(recommendations_data, key=lambda x: x[2], reverse=True)[:10]


    return top_recommendations

### Part A: Group recommendations with aggregation methods

First we get a set of users and create recommendations for each one.

In [178]:
# Get user & item -> get existing rating or predict one -> calculate average rating for each item

# Select a set of users
users = [1, 2, 4]

# Compute movie recommendations for each user in the group
recommendations = []

for user_id in users:
    recommendations.extend(create_recommendations(user_id))

# Create a dataframe from the ratings
rating_df = pd.DataFrame(recommendations, columns=["user_id", "movie_id", "predicted_rating"])

display(rating_df)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0,user_id,movie_id,predicted_rating
0,1,1309,2.96064
1,1,1308,2.628168
2,1,814,2.486344
3,1,1536,2.288455
4,1,851,2.11894
5,1,1467,2.095579
6,1,1500,2.009591
7,1,1599,2.00208
8,1,408,1.9951
9,1,1642,1.888174


#### Average Aggregation

For the average aggregation method we group the dataframe, get a single list of movies and calculate the average ratings for each item.

In [179]:
# Group the recommendations by movies, and calculate the average rating for each
average_ratings_df = rating_df.groupby('movie_id')['predicted_rating'].mean()

# Sort the values in decreasing oreder
average_ratings_df = average_ratings_df.sort_values(ascending=False)

display(average_ratings_df)

movie_id
1659    3.116439
599     3.027092
1309    2.960640
1621    2.901682
1304    2.864226
1308    2.628168
1661    2.603498
1502    2.533295
1486    2.329112
1493    2.329112
1494    2.329112
1536    2.288455
814     2.262801
1678    2.220782
851     2.118940
1500    2.009591
408     1.995100
1467    1.890090
1599    1.778536
1643    1.753606
1642    1.682665
1639    1.662766
1463    1.640278
1653    1.430860
Name: predicted_rating, dtype: float64

Get the top 10 recommendations with the average method for the three selected users

In [180]:
average_ratings_df.head(10)

movie_id
1659    3.116439
599     3.027092
1309    2.960640
1621    2.901682
1304    2.864226
1308    2.628168
1661    2.603498
1502    2.533295
1486    2.329112
1493    2.329112
Name: predicted_rating, dtype: float64

#### Least Misery method

In the least misery method, instead of calculating averages we just get the minimum rating from each user.

In [181]:
# Group the recommendations by movies, and get the min rating for each
miserable_ratings_df = rating_df.groupby('movie_id')['predicted_rating'].min()

# Sort the values in descending order
miserable_ratings_df = miserable_ratings_df.sort_values(ascending=False)

display(miserable_ratings_df)

movie_id
1659    3.116439
599     3.027092
1309    2.960640
1621    2.864226
1304    2.864226
1308    2.628168
1661    2.603498
1502    2.533295
1486    2.329112
1493    2.329112
1494    2.329112
1536    2.288455
1678    2.183326
851     2.118940
814     2.039258
1500    2.009591
408     1.995100
1643    1.753606
1467    1.684601
1639    1.662766
1463    1.640278
1599    1.554993
1642    1.477156
1653    1.430860
Name: predicted_rating, dtype: float64

Get the top 10 recommendations with the least misery method for the three selected users

In [182]:
miserable_ratings_df.head(10)

movie_id
1659    3.116439
599     3.027092
1309    2.960640
1621    2.864226
1304    2.864226
1308    2.628168
1661    2.603498
1502    2.533295
1486    2.329112
1493    2.329112
Name: predicted_rating, dtype: float64

### Part B: Counting Disagreements

Using cosinine similarity to calulcate disagreements.

In this implementation I chose to calculate similairties between two users using the cosinine similarity. It's a fairly straightforward way of calculating how similar two users are. 0 means no similarity, 1 means perfect similarity. To measure disagreements, I inverted the similarity score by subtracting it from 1. That way we get a number that indicates the differnce between the users.

To compute the disagreements modify the predicted ratings for the selected user group. We form all the possible pairs from the users and calculate a disimilarity score for each. Then this score is used as a weight to find a disagreement rating for each movie we have created predicitons for.

This method is useful when creating group recommendations, because it takes the difference between the users ratings into account. This way we get more balanced recommendations, they arent't just formatted by highest / lowest scroes in a group.

In [183]:
# Create a matrix where user_id is the index, movie_id as columns, and predicted_rating as values
prediction_matrix = rating_df.pivot(index='user_id', columns='movie_id', values='predicted_rating').fillna(0)

# Calculate cosine similarity between two users
def cosine_similarity(user1_id, user2_id):
    # Get the predicted ratings for the two users
    user1_ratings = prediction_matrix.loc[user1_id]
    user2_ratings = prediction_matrix.loc[user2_id]

    # Calculate cosine similarity
    similarity = 1 - cosine(user1_ratings, user2_ratings)

    return similarity

# Calculate disagreement score based on cosine similarity
def calculate_cosinine_disagreement(user1_id, user2_id):
    # Get the cosine similarity result
    cosine_sim = cosine_similarity(user1_id, user2_id)

    # Calculate disagreement score as 1 - cosine similarity
    disagreement_score = 1 - cosine_sim

    return disagreement_score


In [201]:
# Modify the predictd ratings to take disagreements between users into account
def modify_predicted_ratings(users, rating_df):
    # total disagreement score
    disagreement_score = 0

    # Go through each user pair and calucalte a disagreement score
    for i in range(len(users)):
        for j in range(i + 1, len(users)):
            user1_id = users[i]
            user2_id = users[j]

            # Get the disagreement score for the current user pair and add it to the total
            disagreement_score = disagreement_score + calculate_cosinine_disagreement(user1_id, user2_id)

    if len(users) > 1:
        # Get the number of unique pairs
        unique_user_pairs = (len(users) * (len(users) - 1)) / 2

        # Calculate the average disagreement score
        average_disagreement_score = disagreement_score / unique_user_pairs
    else:
        # With only one user -> reurnt the just the score
        average_disagreement_score = disagreement_score

    # Use the average disagreement score as a weight to adjust the predicted ratings
    disagreement_weight = 1 -average_disagreement_score
    rating_df['disagreement_rating'] = rating_df['predicted_rating'] * disagreement_weight

    return rating_df

disagreement_rating_df = modify_predicted_ratings(users, rating_df)
display(disagreement_rating_df)


Unnamed: 0,user_id,movie_id,predicted_rating,disagreement_rating
0,1,1309,2.96064,0.596247
1,1,1308,2.628168,0.52929
2,1,814,2.486344,0.500728
3,1,1536,2.288455,0.460875
4,1,851,2.11894,0.426736
5,1,1467,2.095579,0.422031
6,1,1500,2.009591,0.404714
7,1,1599,2.00208,0.403201
8,1,408,1.9951,0.401796
9,1,1642,1.888174,0.380262


In [203]:
# Get the highest scoring movies
top_disagreement_movies = disagreement_rating_df.sort_values(by='disagreement_rating', ascending=False)

# Show the top 10 for the chosen group of users
top_disagreement_movies[['movie_id', 'disagreement_rating']].head(10)

Unnamed: 0,movie_id,disagreement_rating
20,1659,0.627624
21,599,0.60963
0,1309,0.596247
10,1621,0.591917
23,1621,0.57683
22,1304,0.57683
1,1308,0.52929
24,1661,0.524322
25,1502,0.510184
2,814,0.500728
