# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [2]:
# Please don't change this cell

df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

# obtain top 500 users and top 500 items
user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(500).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(500).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1,186,302,3,891717742
3,244,51,2,880606923
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467


# Split dataset

## Randomly select one rating from each user as test set

In [3]:
# Please don't change this cell

# remap user and item ID
df['user_id'] = df.groupby('user_id').ngroup()
df['item_id'] = df.groupby('item_id').ngroup()

test_df = df.groupby('user_id').sample(1, random_state=1024)
train_df = df[~df.index.isin(test_df.index)]

In [4]:
# Please don't change this cell

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
avg_num = df.groupby('user_id').size().mean()
density = df.shape[0] / (n_users * n_items)
min_ratings = df.rating.min()
max_ratings = df.rating.max()

print("The number of users: {}" .format(n_users))
print("The number of items: {}" .format(n_items))
print("Avg. # of rated Items/User: {}" .format(avg_num))
print("Density of data: {}" .format(density))
print("Ratings Range: {} - {}" .format(min_ratings, max_ratings))

The number of users: 500
The number of items: 500
Avg. # of rated Items/User: 129.914
Density of data: 0.259828
Ratings Range: 1 - 5


In [5]:
# Please don't change this cell

# Convert the format of datasets to matrices
# Train dataset
df_zeros = pd.DataFrame({
    'user_id': np.tile(np.arange(0, n_users), n_items), 
    'item_id': np.repeat(np.arange(0, n_items), n_users), 
    'rating': 0})
train_ds = df_zeros.merge(train_df, 
                          how='left', 
                          on=['user_id', 'item_id']).fillna(0.).pivot_table(
                              values='rating_y', 
                              index='user_id', 
                              columns='item_id').values
                           
# Test dataset
test_ds = df_zeros.merge(test_df, 
                         how='left', 
                         on=['user_id', 'item_id']).fillna(0.).pivot_table(
                             values='rating_y', 
                             index='user_id', 
                             columns='item_id').values

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

Construct the rating matrix based on train_df:
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [4. 3. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 4. 0.]]
Construct the rating matrix based on test_df:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [6]:
# Please don't change this cell
EPSILON = 1e-9

def user_corr(imputed_train_ds):
    '''
    Function for calculating user's similarity
    '''
    active_user_pearson_corr = np.zeros((imputed_train_ds.shape[0], imputed_train_ds.shape[0]))

    # Compute Pearson Correlation Coefficient of All Pairs of Users between active set and training dataset
    for i, user_i_vec in enumerate(imputed_train_ds):
        for j, user_j_vec in enumerate(imputed_train_ds):

            # ratings corated by the current pair od users
            mask_i = user_i_vec > 0
            mask_j = user_j_vec > 0

            # corrated item index, skip if there are no corrated ratings
            corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
            if len(corrated_index) == 0:
                continue

            # average value of user_i_vec and user_j_vec
            mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
            mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

            # compute pearson corr
            user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
            active_user_pearson_corr[i][j] = sim

    return active_user_pearson_corr

def predict(test_ds, imputed_train_ds, user_corr, k=20):
    '''
    Function for predicting ratings in test_ds
    '''

    # Predicting ratings of test set
    predicted_ds = np.zeros_like(test_ds)

    for (i, j), rating in np.ndenumerate(test_ds):

        if rating > 0:

            # only predict ratings on test set
            sim_user_ids = np.argsort(user_corr[i])[-1:-(k + 1):-1]

            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = user_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds[sim_user_ids]
            
            mask_rateditem_user = imputed_train_ds[i] != 0
            num_rated_items = mask_rateditem_user.astype(np.float32)
            user_mean = np.sum(imputed_train_ds[i, mask_rateditem_user]) / (num_rated_items.sum() + EPSILON)

            mask_nei_rated_items = sim_users != 0
            num_rated_per_user = mask_nei_rated_items.astype(np.float32)
            num_per_user = num_rated_per_user.sum(axis=1)

            sum_per_user = sim_users.sum(axis=1)
            sim_user_mean = sum_per_user / (num_per_user + EPSILON)
            
            mask_rated_j = sim_users[:, j] > 0
                            
            # sim(u, v) * (r_vj - mean_v)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)

            predicted_ds[i, j] = np.clip(user_based_pred, 0, 5)
            
    return predicted_ds

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Baseline - KNN based recommendation (Similarity Metric: Pearson Correlation Coefficient)

In [7]:
# Please don't change this cell

user_pearson_corr = user_corr(train_ds)
predicted_ds = predict(test_ds, train_ds, user_pearson_corr, k=20)

In [8]:
# Please don't change this cell

MAE, RMSE = evaluate(test_ds, predicted_ds)

print("===================== Baseline Result =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8471711011333851, RMSE: 1.092846045041526


# Your Solution
(Put all your implementation for your solution in the following cell only)

In [9]:
from sklearn.model_selection import train_test_split
import math


#====================Split Dataset ===================

train_df, test_df = train_test_split(df, test_size=0.2)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
train_ds = pd.DataFrame(train_ds)

# Testing Dataset
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
test_ds = pd.DataFrame(test_ds)

train_ds, test_ds

print(test_ds)

#======================IMPLEMENT THE EXISTING SOLUTION ===================

# User-based
# P(t) is item's popularity: P(t)=|U(t)| where U(t) is number of users rating on item t
P = 500

# m is the total numbers of users in database
m= 1000

# w(t) = log (m/P(t)) 
# popularity significance weight of item t :
w = math.log (m/P)



#================= Calculate the similarity  (centered cosine similarity) between every two users ===============

#Gamma is for significant weighting
GAMMA = 30

#Epsilon is a very small number to prevent the error of division by zero
EPSILON = 1e-9

#define user-user similarity
np_user_pearson_corr = np.zeros((n_users, n_users))

#calcualate the similarity for every pair of users by using nested for loop statement
for a, user_a_vec in enumerate(train_ds.values):
    for u, user_u_vec in enumerate(train_ds.values):

        # define the masks to find the set of items rated by users a and u, respectively
        mask_a = user_a_vec > 0
        mask_u = user_u_vec > 0

        #find the co-rated items set which voted by user a or user u
        corrated_index = np.union1d(np.where(mask_a), np.where(mask_u))
        
        # skip if there are no co-rated ratings a and u 
        # then leave the default zero as the similarity between a and u
        if len(corrated_index) == 0:
            continue
      
        # calculate average rating for row (user a and u)
        mean_user_a = np.sum(user_a_vec) / (np.sum(np.clip(user_a_vec, 0, 1)) + EPSILON)
        mean_user_u = np.sum(user_u_vec) / (np.sum(np.clip(user_u_vec, 0, 1)) + EPSILON)
        
        # perform the centered tranformation for ratings in the co-rated item set by subtracting the avergae rating of a and u, respectively
        user_a_sub_mean = user_a_vec[corrated_index] - mean_user_a
        user_u_sub_mean = user_u_vec[corrated_index] - mean_user_u
        
        # perform the centred cosine similarity betweeb user a and u
        # perform the sqaure part
        r_ua_sub_r_a_sq = np.square(user_a_sub_mean)
        r_uu_sub_r_u_sq = np.square(user_u_sub_mean)
        
        # perform the square root part
        r_ua_sum_sqrt = np.sqrt(np.sum(r_ua_sub_r_a_sq * w * w))
        r_uu_sum_sqrt = np.sqrt(np.sum(r_uu_sub_r_u_sq * w * w))
        
        # perform similarity
        sim = np.sum(w * w * user_a_sub_mean * user_u_sub_mean) / (r_ua_sum_sqrt * r_uu_sum_sqrt + EPSILON)

        # calculate significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim
        np_user_pearson_corr[a][u] = weighted_sim
np_user_pearson_corr

print(np_user_pearson_corr)


#========= EVALUATATE THE IMPLEMENTATION BY PREDICTIING THE RATINGS IN TEST SET (test_ds) ==========

# define the prediction matrix to store the predicted ratings
np_predictions = np.zeros((n_users, n_items))

# K is the number of neighbours
K = 20

# Epsilon is a very small number to prevent the error of division by zero
EPSILON = 1e-9

# predict every rating in test_ds using the for loop statement 
for (a, u), rating in np.ndenumerate(test_ds.values):
    if rating > 0:
        
        # find top-k most similar users as the current user, remove itself
        sim_user_ids = np.argsort(np_user_pearson_corr[a])[-(K + 1):-1]
        
        # the actual similarity values for the top-k neighbours
        sim_val = np_user_pearson_corr[a][sim_user_ids]
        
        #find the rows for the top-k neighbours
        sim_users = train_ds.values[sim_user_ids]
        
        #calculate the average rating for the target user a, and the top-k neighbours (users)
        user_mean = np.sum(train_ds.values[a]) / (np.sum(np.clip(train_ds.values[a], 0, 1)) + EPSILON)
        
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_u = sim_users[:, u] > 0
        
   
        # calcualte the predicted rating: sim(a,u) * (r_ui - mean_u)
        sim_r_sum_mean = sim_val[mask_rated_u] * (sim_users[mask_rated_u, u] - sim_user_mean[mask_rated_u])
        
        np_predictions[a][u] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_u]) + EPSILON)
        
        # clip the predicted rating into the correct range of the ratings (from 0 to 5)
        np_predictions[a][u] = np.clip(np_predictions[a][u], 0, 5)
        
    np_predictions[a][u]    

#========== generate the recommendation items list for user a, based on the Top-N highest prediction value======

# define a composite recommendation value for every item t
# ComRV(t) = P(t) * mean_R(t)
comRV= 500 * mean_user_a
print("Composite Recommendation Value is", comRV)

    

#================== EVALUATE THE PREDICTION BY USING MAE =====================

labels = test_ds.values    

#calculate the absoblute error between every prediction and every rating correspondingly
absolute_error = np.abs(np_predictions - labels)

# clip the rating in the labels (test set) into 1 and 0
# then use this to  calculate the number of rating in the test set
weight = np.clip(labels, 0, 1)

# absoulte error on rated ratings
abs_error = absolute_error * weight

# calcualte MAE
MAE = np.sum(abs_error) / np.sum(weight)



#================= EVALUATE THE PREDICTION BY USING RMSE ======================
labels = test_ds.values

# calculate squared values for evry prediction and every rating (in the test set) correspondingly. 
squared_error = np.square(np_predictions - labels)

# clip the rating in the labels (test set) into 1 and 0
# then use this to  calculate the number of rating in the test set
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# calculate RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))


     0    1    2    3    4    5    6    7    8    9    ...  490  491  492  \
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0  3.0  0.0  ...  0.0  0.0  0.0   
4    0.0  0.0  4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
495  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  5.0  0.0   
496  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0  ...  0.0  0.0  0.0   
497  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
498  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
499  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

     493  494  495  496  497  498  499  
0    0.0  0.0  0.0  0.0  0.0  0.0 

## Print the MAE and RMSE of Your Implementation

In [10]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7654304847345743, RMSE: 0.9846462286564435
