# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
df = pd.read_csv('~/ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [3]:
from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)

# Training Dataset
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
train_ds = pd.DataFrame(train_ds)

# Testing Dataset
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

943 users
1682 items
Construct the rating matrix based on train_df:
     0     1     2     3     4     5     6     7     8     9     ...  1672  \
0     0.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   0.0   
1     4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   0.0   
2     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4     4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
938   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   0.0   
939   0.0   0.0   0.0   2.0   0.0   0.0   4.0   5.0   3.0   0.0  ...   0.0   
940   5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
941   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
942   0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0   3.0   0.0  ...   0.0

# Utils

In [4]:
# Please don't change this cell
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution

In [5]:
# Reference: Dr.Yongli Ren (2013), Lecture Class 10 example, KNN_based_CF_Demo.ipynb
# Prepare centered cosin similarity
GAMMA = 25
EPSILON = 1e-9


np_user_pearson_corr = np.zeros((n_users, n_users))
for i, user_i_vec in enumerate(train_ds.values):
    for j, user_j_vec in enumerate(train_ds.values):
        
        # ratings corated by the current pair of users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)
                
        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j
        
        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))
        
        #  centered cosin similarity
        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting - Use significance weighting if the size of co-rated item set is too small, the corresponding similarity is likely not that reliable.
        weighted_sim = sim
        #weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim
        #print("len(corrated_index) " + " is " + str(len(corrated_index)))
        
        np_user_pearson_corr[i][j] = weighted_sim
        
print('user_pearson_corr:')        
print(np_user_pearson_corr)


user_pearson_corr:
[[ 1.          0.5199339   0.16829646 ...  0.50529681 -0.04560263
  -0.05569836]
 [ 0.5199339   1.          0.14726926 ... -0.11386218  0.0846679
   0.76751649]
 [ 0.16829646  0.14726926  1.         ...  0.85049107 -0.31133671
   1.        ]
 ...
 [ 0.50529681 -0.11386218  0.85049107 ...  1.         -1.
   0.01272331]
 [-0.04560263  0.0846679  -0.31133671 ... -1.          1.
   0.4021779 ]
 [-0.05569836  0.76751649  1.         ...  0.01272331  0.4021779
   1.        ]]


In [6]:
KNN_LAMBDA = 0
LAMBDA = 0.5

# Slope One Predictors    
def slope_one_predictors(i_input, j_input):
    dev1 = np.zeros((n_items, n_items))
    dev2 = np.zeros((n_items, n_items))
    dev = np.zeros((n_items, n_items))
    dev_user = np.zeros((n_items, n_items))
    for i, item_i_vec in enumerate(train_ds.T.values):
        if i == j_input:  # only perfrom prediction for the test_df items
            for j, item_j_vec in enumerate(train_ds.T.values):
        
                if i != j:
                    # only include rated users
                    mask_i = item_i_vec > 0
                    mask_j = item_j_vec > 0
                    corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
                    # filter users with KNN concept take users with sim lager than definded value only 
                    # comment the follow 2 lines if not apply the KNN concept 
                    corrated_index2 = np.where(np_user_pearson_corr[i_input][corrated_index]>KNN_LAMBDA)[0]
                    corrated_index = corrated_index[corrated_index2]
                    if len(corrated_index) == 0:
                        continue
                    
                    item_corrated_diff = item_i_vec[corrated_index] - item_j_vec[corrated_index]
                    item_corrated_user = len(corrated_index)
                    
                    # DEV part 1 - get the average difference between the ratings of one item and another
                    dev1[i][j] = np.sum(item_corrated_diff)/len(corrated_index)
                    dev_user[i][j] = len(corrated_index)
        
                    # DEV part 2 -  with Centered Cosine Similarity and Rating-Based Collaborative Filtering ( active user = i_input)
                    dev2[i][j] = np.sum(np.multiply(item_corrated_diff, np.exp(np_user_pearson_corr[i_input][corrated_index])))/np.sum(np.multiply(len(corrated_index), np.exp(np_user_pearson_corr[i_input][corrated_index])))
                    
                    # DEV
                    dev[i][j] = (LAMBDA * dev1[i][j]) + ((1-LAMBDA) * dev2[i][j])
    
    
    # prediction
    pred_wso = np.zeros((n_users, n_items))
    for i, user_i_vec in enumerate(train_ds.values):
        if i == i_input:  ## only perfrom prediction for the test_df users
            rated_index = np.where(user_i_vec>0)[0]
        
            for n in range(n_items):
                if n == j_input:  ## only perfrom prediction for the test_df items
                    pred_wso[i][n] = np.sum((dev[n][rated_index] + user_i_vec[rated_index]) * dev_user[n][rated_index]) / np.sum(dev_user[n][rated_index])
            
    return pred_wso

train_test_predictions_result = np.zeros((n_users, n_items))
for (i, j), rating in np.ndenumerate(test_ds.values):
    if rating > 0:
        predictions_result = slope_one_predictors(i,j)
        train_test_predictions_result[i][j] = predictions_result[i][j]
        
print('prediction:')
print(train_test_predictions_result)

prediction:
[[3.90957736 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [7]:
# Reference: Dr.Yongli Ren (2013), Lecture Class 10 example, KNN_based_CF_Demo.ipynb
EPSILON = 1e-9
weight = np.zeros((n_users, n_items))
absolute_error = np.zeros((n_users, n_items))
abs_error = np.zeros((n_users, n_items))

# MAE and RMSE on Testing set
print('prediction:')
print(train_test_predictions_result)

labels = test_ds.values
print('labels:')
print(labels)

absolute_error = np.abs(train_test_predictions_result - labels)
print('absolute_error:')
print(absolute_error)

# weight
weight = np.clip(labels, 0, 1)
print('weight:')
print(weight)

# absoulte error on rated ratings
abs_error = absolute_error * weight
print('abs_error:')
print(abs_error)
abs_error = np.nan_to_num(abs_error, copy=True, nan=0.0, posinf=None, neginf=None)

# MAE
MAE = np.sum(abs_error) / np.sum(weight)
print('MAE:')
print(MAE)


# squared error on all ratings
squared_error = np.square(train_test_predictions_result - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight
squared_error = np.nan_to_num(squared_error, copy=True, nan=0.0, posinf=None, neginf=None)

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print('RMSE:')
print(RMSE)

#MAERMSE = evaluate(labels, train_test_predictions_result)
#MAE = MAERMSE[0] 
#RMSE = MAERMSE[1]

prediction:
[[3.90957736 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
labels:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
absolute_error:
[[1.09042264 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.    

In [8]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7461683679701425, RMSE: 0.9495673617313544
