# Assignment 1 - DATA.ML.360

In [47]:
%pip install pandas

import pandas as pd
import numpy as np

You should consider upgrading via the '/Users/laurira/uni/recsys/assignment1/a1_venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


### Part A: Understanding the dataset

MovieLens 100K dataset downloaded from https://grouplens.org/datasets/movielens/100k/

Frist we download the dataset and check what it looks like to get a general understanding.

In [48]:
# Read the data file and see how it looks like
df = pd.read_csv('u.data', sep='\t', header=None)
df.head()

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [49]:
# Add column names and check few rows of the dataset
df.columns = ["user_id", "item_id", "rating", "timestamp"]
df = df.drop("timestamp", axis=1)
df.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [50]:
# Make sure there are 100k rows in the dataset
rows = len(df)
print("Total rows in the dataset: " + str(rows))

Total rows in the dataset: 100000


### Part B: User-Based Collaborative Filtering

Next we implement the user-based collaborative filtering using the Pearson correlation

In [51]:
# Create a matrix from the ratings. Each row represents an user and each column an item/movie.
rating_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

# Make sure that the matrix looks correct
rating_matrix.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Calculate the Pearson correlation between two users ratings
def pearson_correlation(user1_id, user2_id):
    # Get common movies that both users have rated
    common_movies = rating_matrix.columns[(rating_matrix.loc[user1_id] > 0) & (rating_matrix.loc[user2_id] > 0)]

    # Get the ratings for the given users matching the common movies
    user1_data = rating_matrix.loc[user1_id, common_movies]
    user2_data = rating_matrix.loc[user2_id, common_movies]

    # Compute the Pearson correlation between the ratings of the two users
    pearson_correlation = np.corrcoef(user1_data, user2_data)[0][1]

    return pearson_correlation

# Check the result
corr = pearson_correlation(244, 327)
print(corr)

0.22944594807279053


### Part C: Making Predictions

In [53]:
def predict_rating(user_id, item_id):
    # Get the users' ratings for the active user
    active_user_ratings = rating_matrix.loc[user_id]

    # Find neighbor ids who have rated the same item
    item_ratings = rating_matrix[item_id]
    neighbors = item_ratings[item_ratings > 0].index

    # Calculate Pearson correlations for each neighbor
    neighbor_correlations = {}
    for neighbor_id in neighbors:
        correlation = pearson_correlation(user_id, neighbor_id)
        neighbor_correlations[neighbor_id] = correlation
    
    # Get the top 10 closest matching neighbors
    top_neighbors = sorted(neighbor_correlations, key=neighbor_correlations.get, reverse=True)[:10]

    # Initialize variables for prediction
    weighted_rating_sum = 0
    similarity_sum = 0

    # Calculate the prediction for active user
    for neighbor_id in top_neighbors:

        # Get the neighbors rating for the chosen item
        neighbor_item_rating = rating_matrix.at[neighbor_id, item_id]

        # Get every given rating of the neighbor
        all_neighbor_ratings = rating_matrix.loc[neighbor_id][rating_matrix.loc[neighbor_id] > 0]

        # Calculate the difference between the chosen item rating and the user average
        neighbor_mean = np.mean(all_neighbor_ratings)
        rating_difference = neighbor_item_rating - neighbor_mean

        # Combine the rating differences.
        # The neighbor similarity calculated earlier is used as the weight.
        weighted_rating_sum += rating_difference * neighbor_correlations[neighbor_id]
        similarity_sum += abs(neighbor_correlations[neighbor_id])

    # Avoid the case where we might divide by zero
    if similarity_sum == 0:
        return 0

    # Calculate the prediction for the active user
    active_user_mean = np.mean(active_user_ratings)
    prediction = active_user_mean + (weighted_rating_sum / similarity_sum)

    return prediction


# Create a test prediction
pred = predict_rating(1,10)
print("Prediction result: " + str(pred))

Prediction result: 0.4887040270688252


### Part D: Creating Recommendations

We start by picking an user

In [54]:
# Initialize target user
target_user = 1

Then we get 10 most similar users to our target user

In [58]:
user_correlations = {}

# Calculate pearson correlations to each user 
for user_id in rating_matrix.index:
    correlation = pearson_correlation(target_user, user_id)
    user_correlations[user_id] = correlation

# Sort the dataset so that we get the top 10 most similar users
similar_users = sorted(user_correlations, key=user_correlations.get, reverse=True)[:10]

# Check the result
print("Ten most similar users to user " + str(target_user) + ": ")
print(similar_users)
    

Ten most similar users to user 1: 
[1, 39, 531, 511, 520, 34, 485, 510, 240, 134]


Next we get 10 most relevant movies for a target user

In [60]:
movie_recommendations = {}

# Go through all the movies
for movie_id in rating_matrix.columns:
    # Calculate a prediction for a movie that the target user hasn't rated yet
    if rating_matrix.at[target_user, movie_id] == 0:
        prediction = predict_rating(target_user, movie_id)
        movie_recommendations[movie_id] = prediction

# Get the top 10 matching movies from the recommendations
top_recommendations = sorted(movie_recommendations, key=movie_recommendations.get, reverse=True)[:10]

print("Ten best recommendations for user " + str(target_user) + ": ")
print(top_recommendations)

Ten best recommendations for user 1: 
[275, 285, 430, 654, 603, 632, 963, 651, 656, 487]


### Part E: Jaccard Similarity

Jaccard Similarity is another way of calculating similarities betweent two users. It's useful in collaborative filtering because its a very simple, yet effective way of measuring the similarity between two users preferences. Essentially it takes two sets, in this case movie ratings by two different users. Then it calculates the ratio between the movies that both users have rated and those that they have not rated.

There are lot of discussion and articles about the method. This is what I based my implementation off of: https://www.learndatasci.com/glossary/jaccard-similarity/

In [61]:
# Calculate the Jaccard Similarity
def jaccard_similarity(user1_id, user2_id):
    # Get the movies rated by each user
    user1_movies = set(rating_matrix.loc[user1_id][rating_matrix.loc[user1_id] > 0].index)
    user2_movies = set(rating_matrix.loc[user2_id][rating_matrix.loc[user2_id] > 0].index)
    
    # Intersection of movies both users have rated
    intersection = len(user1_movies.intersection(user2_movies))

    # Union of movies that neither of the users have rated
    union = len(user1_movies) + len(user2_movies) - intersection

    # Calculate similarity
    similarity = intersection / union
    return similarity

jacc_sim = jaccard_similarity(244, 327)
print(jacc_sim)

0.2669902912621359
