# Assignment 3

In [1]:
import pandas as pd
import numpy as np

# Load MovieLens 100K dataset into a dataframe of pandas
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
# Select 500 most active users and 500 most active items from the dataset
n_most_active_users = 500
n_most_active_items = 500

user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(n_most_active_users).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(n_most_active_items).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

In [3]:
# Map new internal ID for items
i_ids = df['item_id'].unique().tolist()
item_dict = dict(zip(i_ids, [i for i in range(len(i_ids))]))
df['item_id'] = df['item_id'].map(item_dict)

# Split Dataset

In [4]:
# The number of training users and active users
n_training_users = 300
n_active_users = n_most_active_users - n_training_users

# The number of GIVEN ratings for active users
GIVEN = 20

# Randomly select users from the most active users as training set
random_uids = np.random.choice(df.user_id.unique(), n_training_users, replace=False)
train_df = df[df['user_id'].isin(random_uids)]
# Map new internal ID for all users in the training set
u_ids = train_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
train_df['user_id'] = train_df['user_id'].map(user_dict)

# The rest of users are active users for testing
remain_df = df[~df['user_id'].isin(random_uids)]
# Map new internal ID for all active users
u_ids = remain_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
remain_df['user_id'] = remain_df['user_id'].map(user_dict)

# Randomly select GIVEN ratings for active users
active_df = remain_df.groupby('user_id').sample(n=GIVEN, random_state=1024)

test_df = remain_df[~remain_df.index.isin(active_df.index)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = train_df['user_id'].map(user_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain_df['user_id'] = remain_df['user_id'].map(user_dict)


In [5]:
# Convert the format of datasets to matrices
df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_training_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_training_users), 'rating': 0})
train_ds = df_zeros.merge(train_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_active_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_active_users), 'rating': 0})
active_ds = df_zeros.merge(active_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')
test_ds = df_zeros.merge(test_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

train_ds, active_ds, test_ds

(item_id  0    1    2    3    4    5    6    7    8    9    ...  490  491  492  \
 user_id                                                    ...                  
 0        0.0  2.0  0.0  4.0  0.0  4.0  4.0  0.0  0.0  2.0  ...  0.0  4.0  4.0   
 1        0.0  0.0  4.0  4.0  4.0  0.0  0.0  4.0  0.0  0.0  ...  0.0  0.0  0.0   
 2        4.0  0.0  0.0  2.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 3        4.0  0.0  5.0  0.0  1.0  0.0  3.0  2.0  0.0  0.0  ...  0.0  0.0  0.0   
 4        0.0  0.0  0.0  0.0  5.0  5.0  0.0  3.0  5.0  0.0  ...  0.0  4.0  0.0   
 ...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 295      0.0  0.0  0.0  0.0  0.0  0.0  0.0  5.0  4.0  5.0  ...  0.0  0.0  4.0   
 296      0.0  0.0  5.0  4.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0   
 297      0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 298      0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  3.0  4.0  0.0   
 299      0.0  0

In [6]:
# Predicting All Missing Data in training set
imputed_train_ds = train_ds.values.copy()
imputed_train_ds

array([[0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Implementation to predict the missing values

In [7]:
## The following parameters are required in the given report
LAMBDA = 0.7    # λ
GAMMA = 10      # γ
DELTA = 10      # δ
ITA = 0.7       # η
THETA = 0.7     # θ
EPSILON = 1e-9

## Get the size of user-item matrix
n_users = imputed_train_ds.shape[0]
n_items = imputed_train_ds.shape[1]


## Compute User-based Pearson Correlation Coefficient

# Create the user-user matrix
user_pearson_corr = np.zeros((n_users, n_users))

# Loop to calculate similarity for each pair of users
for i, user_i_vec in enumerate(imputed_train_ds):
    for j, user_j_vec in enumerate(imputed_train_ds):
        
        # In case of the user versus themselves, the similarity is 1
        if i == j:
            user_pearson_corr[i][j] = 0
            
        # In case the similarity is already computed, simply assign its value to the cell
        elif j < i:
            user_pearson_corr[i][j] = user_pearson_corr[j][i]
            
        # Otherwise, calculate the similarity
        else:
            # Find a list of indices that two users both rated
            user_i_rated_indices = np.array(user_i_vec) > 0
            user_j_rated_indices = np.array(user_j_vec) > 0
            corrated_items_index = np.intersect1d(np.where(user_i_rated_indices), np.where(user_j_rated_indices))
            
            length = len(corrated_items_index)
            
            if length == 0:
                user_pearson_corr[i][j] = 0
            else:   
                # Extract an array of ratings from two users for corrated items
                corrated_items_user_i_rated = user_i_vec[corrated_items_index]
                corrated_items_user_j_rated = user_j_vec[corrated_items_index]
                
                # Calculate average rating for two users
                user_i_avg_rating = sum(corrated_items_user_i_rated) / length
                user_j_avg_rating = sum(corrated_items_user_j_rated) / length

                # Calculate the subtraction of ratings and average rating for two users
                user_i_rating_sub = corrated_items_user_i_rated - user_i_avg_rating
                user_j_rating_sub = corrated_items_user_j_rated - user_j_avg_rating

                # Calculate the covariance
                covariance = sum(user_i_rating_sub*user_j_rating_sub)

                # Calculate the sum of square for rating subtraction for two users
                user_i_sq_sum = sum(np.square(user_i_rating_sub))
                user_j_sq_sum = sum(np.square(user_j_rating_sub))

                # Calculate the standard deviation for two users
                user_i_sqrt = np.sqrt(user_i_sq_sum)
                user_j_sqrt = np.sqrt(user_j_sq_sum)
                
                user_std = EPSILON
                
                if user_i_sqrt != 0 and user_j_sqrt != 0:
                    user_std = user_i_sqrt * user_j_sqrt

                # Calculate the similarity
                sim = covariance / user_std

                # Re-calculate with the correlation significance weighting factor
                weighted_sim = (min(len(corrated_items_index), GAMMA)/GAMMA) * sim

                # Assign value in the user-user matrix
                user_pearson_corr[i][j] = weighted_sim
                
                
## Compute Item-based Pearson Correlation Coefficient

# Create the item-item matrix
item_pearson_corr = np.zeros((n_items, n_items))

# Loop to calculate similarity for each pair of items
for i, item_i_vec in enumerate(imputed_train_ds.T):
    for j, item_j_vec in enumerate(imputed_train_ds.T):
        
        # In case of the item versus itself, the similarity is 1
        if i == j:
            item_pearson_corr[i][j] = 0
            
        # In case the similarity is already computed, simply assign its value to the cell
        elif j < i:
            item_pearson_corr[i][j] = item_pearson_corr[j][i]

        # Otherwise, calculate the similarity
        else:
            # Find a list of indices that two items were both rated
            item_i_rated_indices = np.array(item_i_vec) > 0
            item_j_rated_indices = np.array(item_j_vec) > 0
            corrated_users_index = np.intersect1d(np.where(item_i_rated_indices), np.where(item_j_rated_indices))
            
            length = len(corrated_users_index)
            
            if length == 0:
                item_pearson_corr[i][j] = 0
            else:
                # Extract an array of ratings for two items by corrated users
                item_i_rating_by_corrated_users = item_i_vec[corrated_users_index]
                item_j_rating_by_corrated_users = item_j_vec[corrated_users_index]
                
                # Calculate average rating for two items
                item_i_avg_rating = sum(item_i_rating_by_corrated_users) / length
                item_j_avg_rating = sum(item_j_rating_by_corrated_users) / length

                # Calculate the subtraction of ratings and average rating for two items
                item_i_rating_sub = item_i_rating_by_corrated_users - item_i_avg_rating
                item_j_rating_sub = item_j_rating_by_corrated_users - item_j_avg_rating

                # Calculate the covariance
                covariance = sum(item_i_rating_sub*item_j_rating_sub)

                # Calculate the sum of square for rating subtraction for two items
                item_i_sq_sum = sum(np.square(item_i_rating_sub))
                item_j_sq_sum = sum(np.square(item_j_rating_sub))

                # Calculate the standard deviation for two items
                item_i_sqrt = np.sqrt(item_i_sq_sum)
                item_j_sqrt = np.sqrt(item_j_sq_sum)
                
                item_std = EPSILON
                if item_i_sqrt != 0 and item_j_sqrt != 0:
                    item_std = item_i_sqrt * item_j_sqrt

                # Calculate the similarity
                sim = covariance / item_std

                # Re-calculate with the correlation significance weighting factor
                weighted_sim = (min(len(corrated_users_index), DELTA)/DELTA) * sim
                
                # Assign value in the item-item matrix
                item_pearson_corr[i][j] = weighted_sim
            
            
## Prediction of missing values

np_predictions = imputed_train_ds.copy()

# Loop to predict ratings in user-item matrix
for (i, j), rating in np.ndenumerate(imputed_train_ds):
    
    # If a rating data is missing, predict its value
    if rating == 0:
        # Initialize prediction value
        pred_value = 0
        
        # Calculate set of similar users for user i
        mask_sim_users = user_pearson_corr[i] > ITA
        users_indices = [index for index, x in enumerate(mask_sim_users) if x]
        
        # Remove the user from set of similar users if that user did not rate item j
        sim_users_indices = []
        for index in users_indices:
            if imputed_train_ds[index][j] > 0:
                sim_users_indices.append(index)

        sim_users = user_pearson_corr[i][sim_users_indices]
        sim_users_count = len(sim_users)
        
        # Calculate set of similar items for item j
        mask_sim_items = item_pearson_corr[j] > THETA
        items_indices = [index for index, x in enumerate(mask_sim_items) if x]
        
        # Remove the item from set of similar items if that item was not rated by user i
        sim_items_indices = []
        for index in items_indices:
            if imputed_train_ds[i][index] > 0:
                sim_items_indices.append(index)
                
        sim_items = item_pearson_corr[j][sim_items_indices]
        sim_items_count = len(sim_items)
        
        # Case 1: Both S(u) and S(j) are not empty
        if sim_users_count != 0 and sim_items_count != 0:
            
            # Calculate average rating for current user and item
            avg_rating_i = sum(imputed_train_ds[i])/sum(np.clip(imputed_train_ds[i], 0, 1))
            avg_rating_j = sum(imputed_train_ds.T[j])/sum(np.clip(imputed_train_ds.T[j], 0, 1))
            
            # Extract a rating list of user i for set of similar items and item j for set of similar 
            rating_item_j_sim_users = imputed_train_ds.T[j][sim_users_indices]
            rating_user_i_sim_items = imputed_train_ds[i][sim_items_indices]
            
            # Get the list of items that user i rated and list of users rated item j
            user_i_rated_indices = imputed_train_ds[i] > 0
            item_j_rated_indices = imputed_train_ds.T[j] > 0
            
            # Calculate average rating for each user in set of similar users and for each item in set of similar items
            avg_rating_sim_users = []
            for index, element in enumerate(imputed_train_ds):
                if index in sim_users_indices:
                    user_index_rated_indices = element > 0
                    corrated_items = np.intersect1d(np.where(user_i_rated_indices), np.where(user_index_rated_indices))
                    avg_rating_user = sum(element[corrated_items]) / len(corrated_items)
                    avg_rating_sim_users.append(avg_rating_user)
                    
            avg_rating_sim_items = []
            for index, element in enumerate(imputed_train_ds.T):
                if index in sim_items_indices:
                    item_index_rated_indices = element > 0
                    corrated_users = np.intersect1d(np.where(item_j_rated_indices), np.where(item_index_rated_indices))
                    avg_rating_item = sum(element[corrated_users]) / len(corrated_users)
                    avg_rating_sim_items.append(avg_rating_item)
                    
            # Calculate subtraction of rating and average rating
            rating_sub_avg_sim_users = rating_item_j_sim_users - avg_rating_sim_users
            rating_sub_avg_sim_items = rating_user_i_sim_items - avg_rating_sim_items
            
            # Calculate the sum of sim multiply by sub
            sum_sim_multiply_sub_user = sum(sim_users*rating_sub_avg_sim_users)
            sum_sim_multiply_sub_item = sum(sim_items*rating_sub_avg_sim_items)
            
            # Calculate prediction value for item and user
            pred_user = avg_rating_i + sum_sim_multiply_sub_user / sum(sim_users)
            pred_item = avg_rating_j + sum_sim_multiply_sub_item / sum(sim_items)
            
            # Calculate the prediction value
            pred_value = LAMBDA * pred_user + (1 - LAMBDA) * pred_item
            
        # Case 2: S(u) not empty and S(j) empty
        elif sim_users_count != 0 and sim_items_count == 0:
            
            # Calculate average rating for the current user
            avg_rating_i = sum(imputed_train_ds[i])/sum(np.clip(imputed_train_ds[i], 0, 1))
            
            # Get the list of items that user i rated
            user_i_rated_indices = imputed_train_ds[i] > 0
            
            # Calculate average rating for each user in set of similar users
            avg_rating_sim_users = []
            for index, element in enumerate(imputed_train_ds):
                if index in sim_users_indices:
                    user_index_rated_indices = element > 0
                    corrated_items = np.intersect1d(np.where(user_i_rated_indices), np.where(user_index_rated_indices))
                    avg_rating_user = sum(element[corrated_items])/len(corrated_items)
                    avg_rating_sim_users.append(avg_rating_user)
                    
            # Extract a rating list of item j for set of similar users
            rating_item_j_sim_users = imputed_train_ds.T[j][sim_users_indices]
            
            # Calculate subtraction of rating and average rating
            rating_sub_avg_sim_users = rating_item_j_sim_users - avg_rating_sim_users
            
            # Calculate the sum of sim multiply by sub
            sum_sim_multiply_sub_user = sum(sim_users*rating_sub_avg_sim_users)
            
            # Calculate the prediction value
            pred_value = avg_rating_i + sum_sim_multiply_sub_user/sum(sim_users)
            
        # Case 3: S(u) empty and S(j) not empty
        elif sim_users_count == 0 and sim_items_count != 0:
            
            # Calculate average rating for the current item
            avg_rating_j = sum(imputed_train_ds.T[j])/sum(np.clip(imputed_train_ds.T[j], 0, 1))
            
            # Get the list of users rated item j
            item_j_rated_indices = imputed_train_ds.T[j] > 0
            
            # Calculate average rating for each item in set of similar items
            avg_rating_sim_items = []
            for index, element in enumerate(imputed_train_ds.T):
                if index in sim_items_indices:
                    item_index_rated_indices = element > 0
                    corrated_users = np.intersect1d(np.where(item_j_rated_indices), np.where(item_index_rated_indices))
                    avg_rating_item = sum(element[corrated_users])/len(corrated_users)
                    avg_rating_sim_items.append(avg_rating_item)
                    
            # Extract a rating list of user i for set of similar items
            rating_user_i_sim_items = imputed_train_ds[i][sim_items_indices]
            
            # Calculate subtraction of rating and average rating
            rating_sub_avg_sim_items = rating_user_i_sim_items - avg_rating_sim_items
            
            # Calculate the sum of sim multiply by sub
            sum_sim_multiply_sub_item = sum(sim_items*rating_sub_avg_sim_items)
            
            # Calculate the prediction value
            pred_value = avg_rating_j + sum_sim_multiply_sub_item/sum(sim_items)
            
        # Case 4: Both S(u) and S(j) are empty, do nothing as prediction value is already initialized to 0
        
        # Assign prediction value to the according cell
        np_predictions[i][j] = pred_value
        np_predictions[i][j] = np.clip(np_predictions[i][j],0,5)
        
imputed_train_ds = np_predictions.copy()

# Evaluation

### Compute Pearson Correlation Coefficient of All Pairs of Items between active set and imputed training set

In [8]:
imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,4.425074,2.000000,3.402373,4.0,4.351020,4.000000,4.000000,0.000000,0.000000,2.000000,...,2.645597,4.000000,4.000000,3.483082,3.000000,3.000000,2.907920,5.000000,2.351318,1.496572
1,5.000000,0.000000,4.000000,4.0,4.000000,0.000000,0.000000,4.000000,0.000000,0.000000,...,3.474811,0.000000,0.000000,4.689655,2.990424,0.000000,4.033333,3.554829,3.079832,3.282890
2,4.000000,3.872283,4.256618,2.0,0.000000,2.986031,0.000000,0.000000,0.000000,3.498245,...,1.729128,0.000000,0.000000,4.778645,3.990424,2.751974,1.663048,4.373419,0.000000,2.507190
3,4.000000,0.000000,5.000000,0.0,1.000000,1.067479,3.000000,2.000000,0.000000,0.000000,...,2.261941,3.128846,4.241216,4.000000,1.000000,0.000000,3.033333,4.296300,0.000000,2.000000
4,5.000000,3.753006,0.000000,0.0,5.000000,5.000000,0.000000,3.000000,5.000000,0.000000,...,3.243736,4.000000,4.034025,3.403172,4.541226,0.250000,3.000000,5.000000,3.000000,2.811735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.000000,4.601449,0.000000,0.0,4.393126,4.067479,0.000000,5.000000,4.000000,5.000000,...,2.053308,0.000000,4.000000,5.000000,4.000000,2.000000,3.000000,5.000000,0.000000,3.257939
296,3.104861,1.434783,5.000000,4.0,0.000000,1.000000,3.182234,2.305640,0.806383,1.000000,...,2.764476,0.000000,2.182234,2.294313,0.541226,0.000000,0.000000,2.933278,1.000000,2.617279
297,5.000000,3.628358,0.000000,5.0,0.000000,4.628358,4.098765,0.000000,1.628358,0.000000,...,3.654323,0.000000,5.000000,3.522989,4.055631,2.628358,3.057699,0.000000,0.000000,3.806232
298,2.522727,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,2.522727,0.806383,0.000000,...,3.000000,4.000000,2.948716,2.522989,2.990424,2.369048,0.000000,3.757143,3.404626,3.234472


In [9]:
active_user_pearson_corr = np.zeros((active_ds.shape[0], train_ds.shape[0]))

# Compute Pearson Correlation Coefficient of All Pairs of Users between active set and imputed training set
for i, user_i_vec in enumerate(active_ds.values):
    for j, user_j_vec in enumerate(imputed_train_ds.values):
        
        # ratings corated by the current pair od users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        active_user_pearson_corr[i][j] = weighted_sim

active_user_pearson_corr

array([[ 0.06834807, -0.11825281, -0.3618097 , ...,  0.05649737,
        -0.21006067,  0.55411922],
       [ 0.02085776, -0.35505123,  0.45130864, ..., -0.15909952,
         0.29091363,  0.07351821],
       [-0.05241235,  0.21306032,  0.19074118, ..., -0.07058337,
         0.17362642, -0.22867732],
       ...,
       [ 0.3885444 ,  0.6166051 ,  0.06399615, ...,  0.54401705,
        -0.2083534 ,  0.58643886],
       [ 0.22896284,  0.48473701,  0.0974579 , ...,  0.57082826,
         0.65231431,  0.03444186],
       [ 0.23649629, -0.23892767, -0.06896578, ..., -0.14897401,
         0.10426395,  0.05929731]])

## Predict Ratings of Testing Set

In [10]:
K = 10

test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        sim_user_ids = np.argsort(active_user_pearson_corr[i])[-1:-(K + 1):-1]

        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = active_user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        sim_users = imputed_train_ds.values[sim_user_ids]
        user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        test_ds_pred[i][j] = user_based_pred
        
test_ds_pred


array([[0.        , 0.        , 0.        , ..., 0.        , 2.36458673,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 3.85921733, ..., 3.38491988, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [4.24176149, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Compute MAE and RMSE

In [11]:
# MAE
MAE = np.sum(np.abs(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1))

# RMSE
RMSE = np.sqrt(np.sum(np.square(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1)))

print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7706912964162808, RMSE: 0.9890086344496307
