In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
names = ['user_id', 'item_id', 'rating']
df = pd.read_csv('rating.csv', sep='\t', names=names)
df.head()

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')
df

5 users
4 items


Unnamed: 0,user_id,item_id,rating
0,0,0,5
1,0,1,3
2,0,2,5
3,0,3,1
4,1,0,5
5,1,1,4
6,1,2,3
7,1,3,1
8,2,0,4
9,2,1,4


In [4]:
train_df, test_df = train_test_split(df, test_size=0.1)
train_df, test_df

(    user_id  item_id  rating
 2         0        2       5
 11        2        3       1
 1         0        1       3
 4         1        0       5
 18        4        2       3
 14        3        3       5
 5         1        1       4
 12        3        0       4
 19        4        3       5
 6         1        2       3
 9         2        1       4
 7         1        3       1
 17        4        1       2
 10        2        2       5
 16        4        0       1
 8         2        0       4
 13        3        2       5
 3         0        3       1,
     user_id  item_id  rating
 15        3        1       3
 0         0        0       5)

In [7]:
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1], row[2]] = row[3]
train_ds = pd.DataFrame(train_ds)   

In [8]:
train_ds

Unnamed: 0,0,1,2,3
0,0.0,3.0,5.0,1.0
1,5.0,4.0,3.0,1.0
2,4.0,4.0,5.0,1.0
3,4.0,0.0,5.0,5.0
4,1.0,2.0,3.0,5.0


In [9]:
# Testing Dataset
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1], row[2]] = row[3]
test_ds = pd.DataFrame(test_ds)

test_ds

Unnamed: 0,0,1,2,3
0,5.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,3.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [41]:
# user based similarity

GAMMA = 5
EPSILON = 1e-9
np_user_pearson_corr = np.zeros((n_users, n_users))

In [42]:
np_user_pearson_corr.shape

(5, 5)

In [50]:
# user * user

for i, user_i_vec in enumerate(train_ds.values):  # train_ds.T.values for Item based
    for j, user_j_vec in enumerate(train_ds.values):  # train_ds.T.values for Item based
        # print(i,j) # 0,0 0,1 .. 4,4
        # print(str(i) + " user: " + str(user_i_vec) + "; " + str(j) + " user: " + str(user_j_vec))
        
        # find corrare users - intersection. Before mask, as zero is not considered intersect
        
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0
        #print(mask_i, mask_j)
        
        # find co rate index. Skip if no intersecrtion
        
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        # print("co-reated item index",i," ", j,str(corrated_index))
        
        if len(corrated_index) == 0:
            continue
        # centering by subracting with mean - (a-mean(a))*(b-mean(b))
        
        # finding mean
        # np.sum(user_i_vec) total rating of user_i eg 3+5+1
        
        # print(np.sum(np.clip(user_i_vec,0,1))) # gives total, 3,5,1 all become 1. So 1+1+1 = 3
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec,0,1))+EPSILON)
        #print(np.sum(np.clip(user_i_vec,0,1)))
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)
        
        # print("np.clip(user_i_vec, 0, 1)" + " is " + str(np.clip(user_i_vec, 0, 1)))
        
        # compute pearson corr -  (a-mean(a))*(b-mean(b)) only for correlated. So zero doesn't mean negative
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j
        
        #print("mean_user_i " + " is " + str(mean_user_i))
        #print("user_i_sub_mean " + " is " + str(user_i_sub_mean))
        
        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))
        
        sim = np.sum(user_i_sub_mean*user_j_sub_mean)/(r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON) # similarity formula to code
        #print(sim)
        
        # significance weighting
        #weighted_sim = sim
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim
        #print("len(corrated_index) " + " is " + str(len(corrated_index)))
        #print(weighted_sim)
        np_user_pearson_corr[i][j] = weighted_sim
np_user_pearson_corr

array([[ 6.00000000e-01,  3.55799672e-01,  5.73709732e-01,
         2.00000015e-10, -3.55799672e-01],
       [ 3.55799672e-01,  8.00000000e-01,  5.85973617e-01,
        -5.13630990e-01, -8.00000000e-01],
       [ 5.73709732e-01,  5.85973617e-01,  8.00000000e-01,
        -1.65615734e-01, -5.85973617e-01],
       [ 2.00000015e-10, -5.13630990e-01, -1.65615734e-01,
         5.99999999e-01,  5.13630990e-01],
       [-3.55799672e-01, -8.00000000e-01, -5.85973617e-01,
         5.13630990e-01,  8.00000000e-01]])

In [64]:
# rating prediction

np_predictions = np.zeros((n_users, n_items))

K = 3
EPSILON = 1e-9

for (i, j), rating in np.ndenumerate(test_ds.values): 
    # i, j have index 
    # rating have value
    # print(rating)
    if rating>0:
        # print("---"+str(i) + " " + str(j))
        
        # find top-k most similar users as the current user, remove itself
        #print(np_user_pearson_corr[i])
        sim_user_ids = np.argsort(np_user_pearson_corr[i])[-(K + 1):-1] # take top neighbours, but not itself. Comparing it with it have heighest values so removing
        #print(np.argsort(np_user_pearson_corr[i]))
        #print(np.argsort(np_user_pearson_corr[i])[-(K + 1):-1]) # a = [0,1,2,3,4,5] - a[3:-1] -> 3,4
        
        # save those values in sim_user_ids
        sim_val = np_user_pearson_corr[i][sim_user_ids]
        
        # similar user / similar item -  train_ds.T.values for Item based
        sim_users = train_ds.values[sim_user_ids] 
        user_mean =  np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)
        
        #print(sim_user_mean)
        
        
        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        np_predictions[i][j] = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        np_predictions[i][j] = np.clip(np_predictions[i][j], 0, 5) # ensure it is between 0 to 5
        
np_predictions

array([[3.97847777, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 3.32180864, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ]])

In [66]:
#==================MAE on Testing set===================#
labels = test_ds.values

# absolute error on all ratings
absolute_error = np.abs(np_predictions - labels)
print(absolute_error)

# weight
weight = np.clip(labels, 0, 1)
print(weight)

# absoulte error on rated ratings
abs_error = absolute_error * weight

# MAE
MAE = np.sum(abs_error) / np.sum(weight)

print("MAE on Tesing set (User-based): " + str(MAE));

[[1.02152223 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.32180864 0.         0.        ]
 [0.         0.         0.         0.        ]]
[[1. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]]
MAE on Tesing set (User-based): 0.6716654364542862


In [67]:
#==================RMSE on Testing set===================
labels = test_ds.values

# squared error on all ratings
squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

# squared error on rated ratings
squared_error = squared_error * weight

# RMSE
RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (User-based): " + str(RMSE));

RMSE on Tesing set (User-based): 0.757320429823951
