# Install and load necesary packages

In [1]:
# Install and load these

import pandas as pd
import numpy as np  
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [3]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
# train_ds = pd.DataFrame(train_ds)

# Testing Dataset
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
# test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [4]:
# Evaluate using this function
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Implementation of Model

In [5]:


MAE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
RMSE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.

# predefined parameter for lambda value
# Higher values of lamda - more weight to the average rating differences. i.e, more towards personalised CF
# Lower values of lamda - more weight to user similarities. i.e, more towards slope one
lamnbda = 0.2

# Picking one active user to compare. Dataset is too huge, so to illustrate personalised weighted schema one active user is picked
active_user = 0
EPSILON = 1e-9


# To store centered cosine similarity
# As only calculating for one active user1-d array is used 
cos_similarities = np.zeros(train_ds.shape[0])

# masking active user
active_user_ratings = train_ds[active_user]

for i in range(train_ds.shape[0]):
    # Checking similarity with only other user, checking with same will always have high similarity
    if i != active_user:
        other_user_ratings = train_ds[i]
        
        # corrated ratings index, np.logical - https://numpy.org/doc/stable/reference/generated/numpy.logical_and.html and https://www.geeksforgeeks.org/numpy-logical_and-python/
        corrated_ratings = np.logical_and(active_user_ratings > 0, other_user_ratings > 0)
        
        # Atleast one item should be corrated
        if np.sum(corrated_ratings) > 0:
            
            # pearson correlation 
            sub_mean_active_ratings = active_user_ratings[corrated_ratings] - np.mean(active_user_ratings[corrated_ratings])
            sub_mean_other_user_ratings = other_user_ratings[corrated_ratings] - np.mean(other_user_ratings[corrated_ratings])
            
            sub_mean_active_ratings_sq = np.square(sub_mean_active_ratings)
            sub_mean_other_user_ratings_sq = np.square(sub_mean_other_user_ratings)
            
            sub_mean_active_ratings_sq_sumof_sq =  np.sqrt(np.sum(sub_mean_active_ratings_sq))
            sub_mean_other_user_ratings_sq_sumof_sq =  np.sqrt(np.sum(sub_mean_other_user_ratings_sq))
            
            # formula from lectorial and lab to calculate centered cosine similarity
            sim = np.sum(sub_mean_active_ratings * sub_mean_other_user_ratings) / (sub_mean_active_ratings_sq_sumof_sq * sub_mean_other_user_ratings_sq_sumof_sq + EPSILON)
            
            # Add to array
            cos_similarities[i] = sim


# compute deviations and cardinality
# size -item*item
dev = np.zeros((n_items, n_items))
cardinality = np.zeros((n_items, n_items))

for item_j in range(n_items):
    for item_i in range(n_items):
        
        # every item i the user u has not rated
        if item_i != item_j:
            corrated_index = np.logical_and(train_ds[:, item_i] > 0, train_ds[:, item_j] > 0)
            # compute cardinality 
            card_S = np.sum(corrated_index)
            # skip if card_S is 0
            if card_S == 0:
                continue
            # u_i - rating to item i, and u_j - rating to item j
            u_j_ratings = train_ds[corrated_index, item_j]
            u_i_ratings = train_ds[corrated_index, item_i]
            
            # computing given equation for deviation
            slope_one_lamnbda = lamnbda * np.sum((u_j_ratings - u_i_ratings) / card_S)
            
            # exponet - cosine similarity value, base -2
            expo = cos_similarities[np.where(corrated_index)[0]]
            personalised_lamnbda = (1 - lamnbda)* (np.sum((u_j_ratings - u_i_ratings) * np.power(2, expo))/(np.sum(np.power(2,expo))*card_S)+EPSILON)
            
            # deviation as per given equation
            dev[item_j, item_i] = slope_one_lamnbda + personalised_lamnbda
            cardinality[item_j, item_i] = card_S


# Predict Ratings
prediction_matrix =np.zeros((n_users, n_items))
for user in range(n_users):
    for item in range(n_items):
        # only rated by active user and only co rated items
        mask_items = np.where(np.logical_and(train_ds[user] > 0, cardinality[item] > 0))[0]
        
        # If null, cannot predict
        if len(mask_items) > 0:
            # prediction accornding to given equation
            prediction = np.sum((dev[mask_items, item] + train_ds[user, mask_items]) * cardinality[
                mask_items, item]) / np.sum(cardinality[mask_items, item])
            prediction_matrix[user, item] = prediction
            # clip, lesser than zero to zero, larger than 5 to 5
            prediction_matrix[user, item] = np.clip(prediction_matrix[user, item], 0, 5)

MAE, RMSE = evaluate(test_ds, prediction_matrix)
print("MAE:", MAE) # got 0.87
print("RMSE:", RMSE) # got 1.11

MAE: 0.8743892654312901
RMSE: 1.1141573193432484


In [6]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8743892654312901, RMSE: 1.1141573193432484
