# Install and load necesary packages

In [None]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [None]:
# Please don't change this cell

df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

# obtain top 500 users and top 500 items
user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(500).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(500).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

df.head()

# Split dataset

## Randomly select one rating from each user as test set

In [None]:
# Please don't change this cell

# remap user and item ID
df['user_id'] = df.groupby('user_id').ngroup()
df['item_id'] = df.groupby('item_id').ngroup()

test_df = df.groupby('user_id').sample(1, random_state=1024)
train_df = df[~df.index.isin(test_df.index)]

In [None]:
# Please don't change this cell

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
avg_num = df.groupby('user_id').size().mean()
density = df.shape[0] / (n_users * n_items)
min_ratings = df.rating.min()
max_ratings = df.rating.max()

print("The number of users: {}" .format(n_users))
print("The number of items: {}" .format(n_items))
print("Avg. # of rated Items/User: {}" .format(avg_num))
print("Density of data: {}" .format(density))
print("Ratings Range: {} - {}" .format(min_ratings, max_ratings))

The number of users: 500
The number of items: 500
Avg. # of rated Items/User: 129.914
Density of data: 0.259828
Ratings Range: 1 - 5


In [None]:
# Please don't change this cell

# Convert the format of datasets to matrices
# Train dataset
df_zeros = pd.DataFrame({
    'user_id': np.tile(np.arange(0, n_users), n_items), 
    'item_id': np.repeat(np.arange(0, n_items), n_users), 
    'rating': 0})
train_ds = df_zeros.merge(train_df, 
                          how='left', 
                          on=['user_id', 'item_id']).fillna(0.).pivot_table(
                              values='rating_y', 
                              index='user_id', 
                              columns='item_id').values
                           
# Test dataset
test_ds = df_zeros.merge(test_df, 
                         how='left', 
                         on=['user_id', 'item_id']).fillna(0.).pivot_table(
                             values='rating_y', 
                             index='user_id', 
                             columns='item_id').values

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

Construct the rating matrix based on train_df:
[[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [4. 3. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 4. 0.]]
Construct the rating matrix based on test_df:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Utils

In [None]:
# Please don't change this cell
EPSILON = 1e-9

def user_corr(imputed_train_ds):
    '''
    Function for calculating user's similarity
    '''
    active_user_pearson_corr = np.zeros((imputed_train_ds.shape[0], imputed_train_ds.shape[0]))

    # Compute Pearson Correlation Coefficient of All Pairs of Users between active set and training dataset
    for i, user_i_vec in enumerate(imputed_train_ds):
        for j, user_j_vec in enumerate(imputed_train_ds):

            # ratings corated by the current pair od users
            mask_i = user_i_vec > 0
            mask_j = user_j_vec > 0

            # corrated item index, skip if there are no corrated ratings
            corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
            if len(corrated_index) == 0:
                continue

            # average value of user_i_vec and user_j_vec
            mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
            mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

            # compute pearson corr
            user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
            user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

            r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
            r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
            active_user_pearson_corr[i][j] = sim

    return active_user_pearson_corr

def predict(test_ds, imputed_train_ds, user_corr, k=20):
    '''
    Function for predicting ratings in test_ds
    '''

    # Predicting ratings of test set
    predicted_ds = np.zeros_like(test_ds)

    for (i, j), rating in np.ndenumerate(test_ds):

        if rating > 0:

            # only predict ratings on test set
            sim_user_ids = np.argsort(user_corr[i])[-1:-(k + 1):-1]

            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = user_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds[sim_user_ids]
            
            mask_rateditem_user = imputed_train_ds[i] != 0
            num_rated_items = mask_rateditem_user.astype(np.float32)
            user_mean = np.sum(imputed_train_ds[i, mask_rateditem_user]) / (num_rated_items.sum() + EPSILON)

            mask_nei_rated_items = sim_users != 0
            num_rated_per_user = mask_nei_rated_items.astype(np.float32)
            num_per_user = num_rated_per_user.sum(axis=1)

            sum_per_user = sim_users.sum(axis=1)
            sim_user_mean = sum_per_user / (num_per_user + EPSILON)
            
            mask_rated_j = sim_users[:, j] > 0
                            
            # sim(u, v) * (r_vj - mean_v)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)

            predicted_ds[i, j] = np.clip(user_based_pred, 0, 5)
            
    return predicted_ds

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Baseline - KNN based recommendation (Similarity Metric: Pearson Correlation Coefficient)

In [None]:
# Please don't change this cell

user_pearson_corr = user_corr(train_ds)
predicted_ds = predict(test_ds, train_ds, user_pearson_corr, k=20)

In [None]:
# Please don't change this cell

MAE, RMSE = evaluate(test_ds, predicted_ds)

print("===================== Baseline Result =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8471711011333851, RMSE: 1.092846045041526


# Your Solution
(Put all your implementation for your solution in the following cell only)

In [None]:
# Write your code here
# You are required to implement the existing solution in the given report here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

MAE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
RMSE = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.

# To avoid messing up anything above
dfc = df.drop(columns=['timestamp'])
dfoutput = dfc.copy(deep=True)

# Iterate through list of items
for item in df.item_id.unique():

  # Iterate through list of users
  for user in df.user_id.unique():

    # If a user does not have a rating for a particular item
    if item not in dfc[dfc.user_id == user]['item_id'].values:

      # Give the user a new rating of the item, where rating = average rating given out by that user
      add = {'user_id': user, 'item_id': item, 'rating':dfc[dfc.user_id == user].rating.mean()}
      dfoutput = dfoutput.append(add, ignore_index=True)

# Data density verification, should be 1.0 assuming all the missing ratings are filled
density = dfoutput.shape[0] / (dfoutput.user_id.unique().shape[0] * dfoutput.item_id.unique().shape[0])
avg_num = dfoutput.groupby('user_id').size().mean()
density = dfoutput.shape[0] / (n_users * n_items)
min_ratings = dfoutput.rating.min()
max_ratings = dfoutput.rating.max()

print("The number of users: {}" .format(n_users))
print("The number of items: {}" .format(n_items))
print("Avg. # of rated Items/User: {}" .format(avg_num))
print("Density of data: {}" .format(density))
print("Ratings Range: {} - {}" .format(min_ratings, max_ratings))

# Operate on the new data
dfoutput['user_id'] = dfoutput.groupby('user_id').ngroup()
dfoutput['item_id'] = dfoutput.groupby('item_id').ngroup()

test_dfo = dfoutput.groupby('user_id').sample(1, random_state=1024)
train_dfo = dfoutput[~dfoutput.index.isin(test_dfo.index)]

dfo_zeros = pd.DataFrame({
    'user_id': np.tile(np.arange(0, n_users), n_items), 
    'item_id': np.repeat(np.arange(0, n_items), n_users), 
    'rating': 0})
train_dso = dfo_zeros.merge(train_dfo, 
                          how='left', 
                          on=['user_id', 'item_id']).fillna(0.).pivot_table(
                              values='rating_y', 
                              index='user_id', 
                              columns='item_id').values
                           
test_dso = dfo_zeros.merge(test_dfo, 
                         how='left', 
                         on=['user_id', 'item_id']).fillna(0.).pivot_table(
                             values='rating_y', 
                             index='user_id', 
                             columns='item_id').values

user_pearson_corr = user_corr(train_dso)
predicted_dso = predict(test_dso, train_dso, user_pearson_corr, k=20)

# Write new values in
MAE, RMSE = evaluate(test_dso, predicted_dso)





Density of data: 1.0


## Print the MAE and RMSE of Your Implementation

In [None]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.263879522269567, RMSE: 0.4723215815156996
