In [3]:
import pandas as pd
import numpy as np

(a) Download the MovieLens 100K rating dataset from
https://grouplens.org/datasets/movielens/ (the small dataset recommended for
education and development). Read the dataset, display the first few rows to understand
it, and display the count of ratings (rows) in the dataset to be sure that you download it
correctly.

In [4]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

print("Links Dataset:")
display(links.head())

print("\nMovies Dataset:")
display(movies.head())

print("\nRatings Dataset:")
display(ratings.head())

print("\nTags Dataset:")
display(tags.head())

rating_count = ratings.shape[0]
print(f"\nTotal number of ratings: {rating_count}")

Links Dataset:


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0



Movies Dataset:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



Ratings Dataset:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931



Tags Dataset:


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200



Total number of ratings: 100836


(b) Implement the user-based collaborative filtering approach, using the Pearson
correlation function for computing similarities between users (4 points),

In [None]:

user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

centered_user_item_matrix = user_item_matrix.sub(user_item_matrix.mean(axis=1), axis=0)

def pearson(user1, user2):
    common_ratings = ~user1.isna() & ~user2.isna()
    user1_common = user1[common_ratings]
    user2_common = user2[common_ratings]
    
    if len(user1_common) == 0:
        return 0

    numerator = np.sum((user1_common - user1_common.mean()) * (user2_common - user2_common.mean()))
    denominator = np.sqrt(np.sum((user1_common - user1_common.mean()) ** 2) * np.sum((user2_common - user2_common.mean()) ** 2))
    
    if denominator == 0:
        return 0

    return numerator / denominator

user_similarity_matrix = pd.DataFrame(index=user_item_matrix.index, columns=user_item_matrix.index)

#Dataframes for user matrixes
for user_a in user_item_matrix.index:
    for user_b in user_item_matrix.index:
        if user_a == user_b:
            user_similarity_matrix.loc[user_a, user_b] = 1  
        elif pd.isna(user_similarity_matrix.loc[user_a, user_b]):
            similarity = pearson(centered_user_item_matrix.loc[user_a], centered_user_item_matrix.loc[user_b])
            user_similarity_matrix.loc[user_a, user_b] = similarity
            user_similarity_matrix.loc[user_b, user_a] = similarity

user_similarity_matrix = user_similarity_matrix.astype(float)

print(user_similarity_matrix.head())




userId       1    2         3         4         5             6         7    \
userId                                                                        
1       1.000000  0.0  0.079819  0.207983  0.268749 -2.916358e-01 -0.118773   
2       0.000000  1.0  0.000000  0.000000  0.000000  0.000000e+00 -0.991241   
3       0.079819  0.0  1.000000  0.000000  0.000000  7.850462e-17  0.000000   
4       0.207983  0.0  0.000000  1.000000 -0.336525  1.484982e-01  0.542861   
5       0.268749  0.0  0.000000 -0.336525  1.000000  4.316590e-02  0.158114   

userId       8         9         10   ...       601           602       603  \
userId                                ...                                     
1       0.469668  0.918559 -0.037987  ...  0.091574 -1.183502e-17 -0.061503   
2       0.000000  0.000000  0.037796  ... -0.387347  0.000000e+00 -1.000000   
3       0.000000  0.000000  0.000000  ...  0.000000  0.000000e+00  0.433200   
4       0.117851  0.000000  0.485794  ... -0.222113

(c) the prediction function presented in class for predicting movies scores (4 points).

In [None]:
def predict_rating(user_id, movie_id, n_neighbors=5):
    user_ratings = user_item_matrix.loc[user_id]
    user_mean_rating = user_ratings.mean()
    similarities = user_item_matrix[user_id].sort_values(ascending=False)
    neighbors = similarities.iloc[1:n_neighbors+1].index
    num = 0
    den = 0
    for n in neighbors:
        neighbor_ratings = user_item_matrix.loc[n]
        if pd.notna(neighbor_ratings[movie_id]):
            neighbor_mean_rating = neighbor_ratings.mean()
            similarity = similarities[n]
            rating_diff = neighbor_ratings[movie_id] - neighbor_mean_rating
            num += similarity * rating_diff
            den += abs(similarity)
    if den == 0:
        return user_mean_rating
    
    predicted_rating = user_mean_rating + (num / den)
    return predicted_rating

#Parameters to get the predicted rating for a user
user_id = 1
movie_id = 47
predicted_rating = predict_rating(user_id, movie_id)
print(f"Predicted rating for User {user_id} on Movie {movie_id}: {predicted_rating}")

Predicted rating for User 1 on Movie 47: 4.619712643678161


(d) Design and implement a new similarity function for computing similarities between
users. Explain why this similarity function is useful for the collaborative filtering
approach. Hint: Exploiting ideas from related works are highly encouraged. 4 points

In [19]:
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# Calculated Adjusted Cosine Similarity
def adjusted_cosine_similarity(user1, user2):
    # Identify movies that both users have rated
    common_items = ~user1.isna() & ~user2.isna()
    user1_common = user1[common_items]
    user2_common = user2[common_items]
    
    # If no common ratings, return 0 (no similarity)
    if len(user1_common) == 0:
        return 0

    user1_mean = user1_common.mean()
    user2_mean = user2_common.mean()
    
    user1_adj = user1_common - user1_mean
    user2_adj = user2_common - user2_mean
    # Calculate the dot product of the two users' ratings
    # Calculate the denominator by taking the product of each user's rating vector magnitudes
    numerator = np.dot(user1_adj, user2_adj)
    denominator = np.sqrt(np.sum(user1_adj ** 2)) * np.sqrt(np.sum(user2_adj ** 2))
    
    # Return 0 if the denominator is zero (to avoid division by zero), otherwise return cosine similarity
    if denominator == 0:
        return 0

    return numerator / denominator

# Initialize an empty matrix to store user-user similarity scores
user_similarity_matrix_adjusted_cosine = pd.DataFrame(index=user_item_matrix.index, columns=user_item_matrix.index)

# Populate the user similarity matrix using cosine similarity
for user_a in user_item_matrix.index:
    for user_b in user_item_matrix.index:
        if user_a == user_b:
            user_similarity_matrix_adjusted_cosine.loc[user_a, user_b] = 1  
        elif pd.isna(user_similarity_matrix_adjusted_cosine.loc[user_a, user_b]):
            similarity = adjusted_cosine_similarity(user_item_matrix.loc[user_a], user_item_matrix.loc[user_b])
            user_similarity_matrix_adjusted_cosine.loc[user_a, user_b] = similarity
            user_similarity_matrix_adjusted_cosine.loc[user_b, user_a] = similarity

user_similarity_matrix_adjusted_cosine = user_similarity_matrix_adjusted_cosine.astype(float)

print(user_similarity_matrix_adjusted_cosine.head())




userId       1    2         3         4         5         6         7    \
userId                                                                    
1       1.000000  0.0  0.079819  0.207983  0.268749 -0.291636 -0.118773   
2       0.000000  1.0  0.000000  0.000000  0.000000  0.000000 -0.991241   
3       0.079819  0.0  1.000000  0.000000  0.000000  0.000000  0.000000   
4       0.207983  0.0  0.000000  1.000000 -0.336525  0.148498  0.542861   
5       0.268749  0.0  0.000000 -0.336525  1.000000  0.043166  0.158114   

userId       8         9         10   ...       601       602       603  \
userId                                ...                                 
1       0.469668  0.918559 -0.037987  ...  0.091574  0.000000 -0.061503   
2       0.000000  0.000000  0.037796  ... -0.387347  0.000000 -1.000000   
3       0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.433200   
4       0.117851  0.000000  0.485794  ... -0.222113  0.396641  0.090090   
5       0.028347  0.0000

Cosine similarity is a useful measure of user similarity in collaborative filtering systems because it focuses on the direction of user ratings rather than their absolute values. This means that it can identify users with similar tastes regardless of how high or low their ratings tend to be. In addition, cosine similarity works well on sparse datasets (e.g. when users have not rated the same items) and is computationally lightweight, so it scales to large user and product datasets.

(e) Use the user-based collaborative filtering approach to produce group
recommendations. Specifically, first compute the movies recommendations for each
user in the group, and then aggregate the lists of the individual users, to produce a
single list of movies for the group. You will implement two well established aggregation
methods for producing the group recommendations.

The second aggregation method is the least misery method, where one member can act
as a veto for the rest of the group. In this case, the rating of an item for a group of user is computed as the minimum score assigned to that item in all group members
recommendations. 3 points. 

Use the MovieLens 100K rating dataset for checking the correctness of your solutions.

In [None]:
# Assuming `user_similarity_matrix_adjusted_cosine` and `user_similarity_matrix_pearson` are defined
# We combine the matrices (e.g., as an average) or choose one, depending on your design preferencey

# Merge movies and ratings on 'movieId' to get movie titles alongside user ratings
movies_ratings_df = pd.merge(movies, ratings, on='movieId')

# Create the pivot table, filling in missing ratings with 0 or NaN as needed
movies_ratings_pivot = pd.pivot_table(movies_ratings_df, index='userId', columns='title', values='rating', fill_value=0)

# Function to get individual user recommendations based on the similarity matrix
def get_user_recommendations(user_id, top_n=10):
    # Assuming `movies_ratings_pivot` contains movie ratings for each user (rows: userId, columns: movies)
    user_ratings = movies_ratings_pivot.loc[user_id]
    similar_users = user_similarity_matrix[user_id].drop(index=user_id)  # exclude self
    similar_users = similar_users[similar_users > 0]  # consider only positive similarity scores
    
    # Calculate weighted average ratings
    weighted_ratings = movies_ratings_pivot.mul(similar_users, axis=0).sum() / similar_users.sum()
    return weighted_ratings.nlargest(top_n)

# Example users in a group
# Choosing the target user
selected_user = 1  

# Look for the similar user and make group of ten
similar_users = user_similarity_matrix.loc[selected_user].sort_values(ascending=False).iloc[1:11].index

# Collect recommendations for each group member
group_recommendations = {}
for user in similar_users:
    group_recommendations[user] = get_user_recommendations(user, top_n=10)

# Aggregate recommendations for the group

# Average method
average_recommendations = pd.concat(group_recommendations.values(), axis=1).mean(axis=1).nlargest(10)

# Least Misery method
least_misery_recommendations = pd.concat(group_recommendations.values(), axis=1).min(axis=1).nlargest(10)

# Display the top 10 movies for each aggregation method
print("Top 10 Group Recommendations (Average Method):")
print(average_recommendations)

print("\nTop 10 Group Recommendations (Least Misery Method):")
print(least_misery_recommendations)


Top 10 Group Recommendations (Average Method):
title
Matrix, The (1999)                               2.940061
Shawshank Redemption, The (1994)                 2.929792
Forrest Gump (1994)                              2.824209
Pulp Fiction (1994)                              2.748806
Gladiator (2000)                                 2.687751
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)        2.652910
Godfather, The (1972)                            2.508568
Fight Club (1999)                                2.485033
Lord of the Rings: The Two Towers, The (2002)    2.452373
Star Wars: Episode IV - A New Hope (1977)        2.451160
dtype: float64

Top 10 Group Recommendations (Least Misery Method):
title
Gladiator (2000)                                 2.687751
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)        2.652910
Forrest Gump (1994)                              2.606086
Matrix, The (1999)                               2.541704
Shawshank Redemption, The (1994)                 2.513323
God


(f) Define a way for counting the disagreements between the users in a group and
propose a method that takes disagreements into account when computing suggestions
for the group. Implement your method and explain why it is useful when producing
group recommendations. Prepare also a short presentation (about 5 slides) to show
how your method works. 6 points

In [None]:
def form_group(user_id, group_size=5, similarity_matrix='adjusted_cosine'):
    if similarity_matrix == 'pearson':
        user_similarities = user_similarity_matrix.loc[user_id].drop(index=user_id)
    elif similarity_matrix == 'adjusted_cosine':
        user_similarities = user_similarity_matrix_adjusted_cosine.loc[user_id].drop(index=user_id)
    else:
        raise ValueError("Invalid similarity_matrix parameter. Use 'pearson' or 'adjusted_cosine'.")

    similar_users = user_similarities.nlargest(group_size).index
    group_user_ids = [user_id] + list(similar_users) 
    
    return group_user_ids


def calculate_disagreements(group_user_ids):
    group_ratings = user_item_matrix.loc[group_user_ids]
    disagreements = group_ratings.std(axis=0, skipna=True)
    return disagreements


def group_recommendations(group_user_ids, disagreement_threshold=1.0):
    disagreements = calculate_disagreements(group_user_ids)
    agreed_movies = disagreements[disagreements <= disagreement_threshold].index
    group_ratings = user_item_matrix.loc[group_user_ids, agreed_movies]
    average_ratings = group_ratings.mean(axis=0, skipna=True)
    recommendations = pd.DataFrame({'movieId': average_ratings.index, 'average_rating': average_ratings.values})
    recommendations = recommendations.merge(movies[['movieId', 'title']], on='movieId', how='left')
    recommendations = recommendations.sort_values(by='average_rating', ascending=False).reset_index(drop=True)
    recommendations = recommendations[['title', 'movieId', 'average_rating']]
    
    return recommendations


#Forms a group around a target user, e.g., user ID 1, using adjusted cosine or pearson
group_user_ids = form_group(user_id=1, group_size=5, similarity_matrix='adjusted_cosine')
print("Formed Group User IDs:", group_user_ids)

#Generates recommendations for the group, considering disagreement
recommendations = group_recommendations(group_user_ids, disagreement_threshold=1.0)
print("Group Recommendations:")
print(recommendations.head(10))


Formed Group User IDs: [1, 146, 550, 106, 333, 598]
Group Recommendations:
                                          title  movieId  average_rating
0                              Inception (2010)    79132        5.000000
1                            Matrix, The (1999)     2571        4.833333
2              Shawshank Redemption, The (1994)      318        4.750000
3                              Gladiator (2000)     3578        4.500000
4                              Toy Story (1995)        1        4.000000
5                           Forrest Gump (1994)      356        4.000000
6                                   Thor (2011)    86332        4.000000
7                Guardians of the Galaxy (2014)   112852        4.000000
8  I Still Know What You Did Last Summer (1998)     2338        2.000000
