# Install and load necesary packages

In [1]:


import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [2]:

df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [3]:


from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
item_popularity = np.zeros(n_items)
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
    item_popularity[row[2]-1] =  item_popularity[row[2]-1] + 1
#train_ds = pd.DataFrame(train_ds)

# Testing Dataset
testsize = 0
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    if item_popularity[row[2]-1] > 30:
        test_ds[row[1]-1, row[2]-1] = row[3]
        testsize = testsize + 1
#test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

print("Testsize = " + str(testsize))

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Testsize = 17678


# Utils

In [4]:

# you can use this devaluate Utils here, and you can also implement your own MAE and RMSE calculation. 

EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

In [5]:

MAE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
RMSE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.



In [8]:
# Write your code here
# You are required to implement the required solution here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 
#################################### ADJUSTED EUCLIDEAN DISTANCE ################
# defining adjusted Euclidean Distance function
def adjusted_euclidean_distance(user1, user2, V_max, V_min):
    # common items rated by 2 users
    common_items = np.where((user1 > 0) & (user2 > 0))[0]
    # if there are no common items 
    if len(common_items) == 0:
        return 0
    
    # Ratings for the common items
    r_u_s = user1[common_items]
    r_v_s = user2[common_items]
    
    # adjusted Euclidean distance using the formula given 
    numerator = np.sqrt(np.sum((r_u_s - r_v_s) ** 2))
    denominator = np.sqrt(len(common_items) * (V_max - V_min) ** 2)
    
    return 1 - (numerator / denominator)

#  finding K nearest neighbors
def find_k_nearest_neighbors(user_id, k, train_ds, V_max, V_min):
    similarities = []
    # Calculate adjusted Euclidean distance between the user and all other users
    for other_user_id in range(train_ds.shape[0]):
        if user_id != other_user_id:
            sim = adjusted_euclidean_distance(train_ds[user_id], train_ds[other_user_id], V_max, V_min)
            similarities.append((other_user_id, sim))
    # Sort users based on similarity
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:k]

# Function to predict rating
def predict_rating(user_id, item_id, k, train_ds, V_max, V_min):
    neighbors = find_k_nearest_neighbors(user_id, k, train_ds, V_max, V_min)
    total_sim = weighted_sum = 0
    # if we get C_0 then we have perfect similarity
    C_0 = [(neighbor_id, sim) for neighbor_id, sim in neighbors if sim == 1]
    
    if len(C_0) > 0:
        # Calculate weighted sum for users who have rated the same items with identical ratings.
        k_0 = 1 / sum([abs(sim) for _, sim in C_0])
        for neighbor_id, sim in C_0:
            if train_ds[neighbor_id, item_id] > 0:
                weighted_sum += train_ds[neighbor_id, item_id] * k_0
                total_sim += k_0
    else:
        # Calculate weighted sum for other neighbors
        k = 1 / sum([abs(sim) for _, sim in neighbors])
        for neighbor_id, sim in neighbors:
            #If the neighbor has rated the item, their rating is multiplied by their similarity score
            if train_ds[neighbor_id, item_id] > 0:
                weighted_sum += train_ds[neighbor_id, item_id] * sim
                # the absolute value of similarity score is then added to is added to total_sim 
                total_sim += abs(sim)
    
    # Calculate the user's mean rating
    user_mean = np.mean(train_ds[user_id, np.where(train_ds[user_id] > 0)])
    # Return predicted rating (average if total similarity is 0)
    return user_mean if total_sim == 0 else weighted_sum / total_sim

# Main collaborative filtering function
def user_knn_cf(train_ds, test_ds, k, V_max, V_min):
    
    # Initialize an array to store the predicted ratings
    predictions = np.zeros_like(test_ds)
    
    # Predict ratings for each user-item pair in the test dataset
    for user_id in range(test_ds.shape[0]):
        for item_id in range(test_ds.shape[1]):
            # Check if the user has rated the item in the test dataset
            if test_ds[user_id, item_id] > 0:
                # Predict the rating for the user-item pair using the predict_rating function
                predicted_rating = predict_rating(user_id, item_id, k, train_ds, V_max, V_min)
                # Store the predicted rating in the predictions array
                predictions[user_id, item_id] = predicted_rating
    
    # Return the array of predicted ratings
    return predictions


# Provided evaluate function
def evaluate(test_ds, predicted_ds):
    mask_test_ds = test_ds > 0
    # Calculate Mean Absolute Error (MAE) and Root Mean Square Error (RMSE)
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))
    return MAE, RMSE

k = 3
V_max = np.max(train_ds)
V_min = np.min(train_ds)

# Perform collaborative filtering using user-based k-NN with adjusted Euclidean distance
predicted_ds = user_knn_cf(train_ds, test_ds, k, V_max, V_min)
# Evaluate the performance of the model
MAE, RMSE = evaluate(test_ds, predicted_ds)

print(f'MAE: {MAE}')
print(f'RMSE: {RMSE}')



MAE: 0.8406522030083707
RMSE: 1.064624030504147


In [9]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8406522030083707, RMSE: 1.064624030504147


In [7]:

################################### Just to see how pearson correlation performs and to use in the presentation #######

# we first start by defining the pearson correlation function 

def pearson_correlation(user1, user2):
    # we then find the common items rated by 2 users 
    common_items = np.where((user1 > 0) & (user2 > 0))[0]
    # if no common item between the users, then return 0 
    if len(common_items) == 0:
        return 0

    # if there are common items then following loop 
    
    # Rating for the common items from both the users are taken 
    user1_ratings = user1[common_items]
    user2_ratings = user2[common_items]

    # we calculate the means of the common items rated by the users
    mean_user1 = np.mean(user1_ratings)
    mean_user2 = np.mean(user2_ratings)

    # numerator and denominator for Pearson correlation
    numerator = np.sum((user1_ratings - mean_user1) * (user2_ratings - mean_user2))
    denominator = np.sqrt(np.sum((user1_ratings - mean_user1) ** 2)) * np.sqrt(np.sum((user2_ratings - mean_user2) ** 2))
    
    # if denom = 0 then the frac is invalid so return 0 
    if denominator == 0:
        return 0

    # Calculate Pearson correlation coefficient
    return numerator / denominator

# Define function to find K nearest neighbors
def find_k_nearest_neighbors(user_id, k, train_ds):
    similarities = []
    # Calculate similarity the users and all the other users using pearson correlation
    for other_user_id in range(train_ds.shape[0]):
        if user_id != other_user_id:
            sim = pearson_correlation(train_ds[user_id], train_ds[other_user_id])
            similarities.append((other_user_id, sim))
            
    # Similarities are sorted in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Return the top K most similar neighbors
    return similarities[:k]

# Define function to predict rating
def predict_rating(user_id, item_id, k, train_ds):
    
    # Find K nearest neighbors for the user
    neighbors = find_k_nearest_neighbors(user_id, k, train_ds)
    total_sim = 0
    weighted_sum = 0
    # Iterate through neighbors
    for neighbor_id, sim in neighbors:
        # Check if the neighbor rated the item
        if train_ds[neighbor_id, item_id] > 0:
            # Add weighted rating to weighted_sum
            weighted_sum += train_ds[neighbor_id, item_id] * sim
            total_sim += abs(sim)
    # Predict the rating for the item
    if total_sim == 0:
        # If no similar neighbors, return the mean rating of the user
        return np.mean(train_ds[user_id, np.where(train_ds[user_id] > 0)])
    return weighted_sum / total_sim

# Define the main collaborative filtering function
def user_knn_cf(train_ds, test_ds, k):
    # Initialize an array to store predictions
    predictions = np.zeros_like(test_ds)
    # Iterate through each user in the test set
    for user_id in range(test_ds.shape[0]):
        # Iterate through each item for the user
        for item_id in range(test_ds.shape[1]):
            # Check if the user rated the item
            if test_ds[user_id, item_id] > 0:
                # Predict the rating for the item using KNN
                predicted_rating = predict_rating(user_id, item_id, k, train_ds)
                # Store the predicted rating in the predictions array
                predictions[user_id, item_id] = predicted_rating
    # Return the predictions
    return predictions

# Define the value of K (number of neighbors)
k = 3

# Generate predictions using the KNN collaborative filtering method
predicted_ds = user_knn_cf(train_ds, test_ds, k)

# Evaluate the predictions using the provided evaluation function
MAE, RMSE = evaluate(test_ds, predicted_ds)

# Print the results
print(f'MAE: {MAE}')
print(f'RMSE: {RMSE}')


MAE: 0.8524958864884756
RMSE: 1.0844743305110824
