In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Import necessary modules

In [3]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

items.head()

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

943 users
1682 items


name the features
check the number of users and items

In [4]:
train_df, test_df = train_test_split(df, test_size=0.2)
train_df, test_df

(       user_id  item_id  rating  timestamp
 84196      629      632       3  880117031
 87893      940       98       4  885921421
 46671      639      796       1  891240805
 67505      744      340       3  881171820
 31329      328       44       3  885047864
 ...        ...      ...     ...        ...
 52239      627      810       3  879531459
 55169      710      335       1  882063564
 70002      458      762       3  886395065
 87627      344      117       3  884899767
 55861      752     1105       3  891207983
 
 [80000 rows x 4 columns],
        user_id  item_id  rating  timestamp
 66807      843      447       2  879443297
 84389      907     1284       5  881030348
 83678      715       31       4  875963692
 97427      759      237       3  881476891
 49817      681      259       2  885409882
 ...        ...      ...     ...        ...
 56790      413       14       5  879969513
 64497      395      186       5  883764817
 78371      144     1028       3  888104495
 74

split the data into training and testing data.

In [5]:
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
train_ds = pd.DataFrame(train_ds)

test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
test_ds = pd.DataFrame(test_ds)

train_ds, test_ds

(     0     1     2     3     4     5     6     7     8     9     ...  1672  \
 0     5.0   3.0   4.0   0.0   3.0   5.0   0.0   1.0   0.0   3.0  ...   0.0   
 1     4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   0.0   
 2     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 3     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 4     4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 ..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
 938   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   0.0   
 939   0.0   0.0   0.0   2.0   0.0   0.0   4.0   0.0   3.0   0.0  ...   0.0   
 940   5.0   0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0  ...   0.0   
 941   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 942   0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
 
      1673  1674  1675  1676  1677  1678  1679  16

Create a user-item table with the corresponding ratings for training data set
Create a user-item table with the corresponding ratings for test data set

In [None]:
DELTA = 25
EPSILON = 1e-9

np_item_pearson_corr = np.zeros((n_items, n_items))

#for every item (T is the transpose of train_ds and values)to make rows items.
for i, item_i_vec in enumerate(train_ds.T.values):
    #for every item.
    for j, item_j_vec in enumerate(train_ds.T.values):

        mask_i = item_i_vec > 0
        mask_j = item_j_vec > 0

        #np.where(mask_i) will return indexes of the columns where there are values (the users that have rated i)
        #np.where(mask_j) will return indexes of the columns where there are values (the users that have rated j)
        #np.intersect will return the indexes of common users, that have rated both i and j.
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        
        #if there are no common ratings, do nothing.
        if len(corrated_index) == 0:
            continue
        
        #the numerator is the sum of the ratings for i and j
        #np.clip will remap values smaller than 0 to 0 and values larger than 1, to 1, for all ratings of item i and j.
            #as the minimum rating is 1, this will provide a count for how many total ratings there are.
        #np.sum will sum how many ratings there are for i and j.
        #EPSILON will make sure that there is no null divisor.
        #The result is the mean rating for i and j.
        mean_item_i = np.sum(item_i_vec) / (np.sum(np.clip(item_i_vec, 0, 1)) + EPSILON)
        mean_item_j = np.sum(item_j_vec) / (np.sum(np.clip(item_j_vec, 0, 1)) + EPSILON)

        #take the mean ratings of i and j from each rating for i and j
        item_i_sub_mean = item_i_vec[corrated_index] - mean_item_i
        item_j_sub_mean = item_j_vec[corrated_index] - mean_item_j
        
        #square the ratings for i and j
        r_ui_sub_ri_sq = np.square(item_i_sub_mean)
        r_uj_sub_rj_sq = np.square(item_j_sub_mean)

        #root the sum of the squares of the ratings for i and j.
        r_ui_sub_ri_sq_sum_sqrt = np.sqrt(np.sum(r_ui_sub_ri_sq))
        r_uj_sub_rj_sq_sum_sqrt = np.sqrt(np.sum(r_uj_sub_rj_sq))

        #numerator
            #multiply all of the normalised ratings for i and j
            #sum the result
        #denominator
            #Multiply the root of the sum of the squares for i and j
            #Add epsilon to stop divisions by 0
        #This yields the centered cosine similarity between i and j
        sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sub_ri_sq_sum_sqrt * r_uj_sub_rj_sq_sum_sqrt + EPSILON)

        #If the size of the co-rated item set is too small, the corresponding similarity is likely not that reliable.
        #To weight it, we take in to account how many items are in the co-rated set.
        #If there are less than 25 (DELTA) items, it will get less weight.
        #If there are more than 25 items, the weight will stay the same
        weighted_sim = (min(len(corrated_index), DELTA) / DELTA) * sim

        np_item_pearson_corr[i][j] = weighted_sim

#This now holds the correlations between each item.
np_item_pearson_corr

In [None]:
#create a matrix to store the predictions for each user-item pair
np_predictions = np.zeros((n_users, n_items))

K = 10
EPSILON = 1e-9

#for each user-item pair in the test dataset
for (user, item), rating in np.ndenumerate(test_ds.values):
    if rating > 0:
        #if there is an existing rating, 
        #What is being sorted:
            #The ids of similarities between item and the other items
        #Result of the sorted:
            #Ids of similarity between item and other items , sorted ascendingly by similarity value.
        #Sim_item_ids:
            #The ids of the top-ten most similar items with the current item.
        sim_item_ids = np.argsort(np_item_pearson_corr[item])[-(K + 1):-1]
        
        #The similarities of the top ten most similar items to the current item.
        sim_val = np_item_pearson_corr[item][sim_item_ids]
    
        #sim_items is the item vectors of the top ten most similar items to the current item
        sim_items = train_ds.T.values[sim_item_ids]
        
        #Numerator
            #The sum of the ratings of the current item
        #Denominator
            #The count of ratings of the current item
        #Result
            #The average rating of the current item
        item_mean = np.sum(train_ds.T.values[item]) / (np.sum(np.clip(train_ds.T.values[item], 0, 1)) + EPSILON)
        
        #Numerator
            #The sum of the ratings for each item
        #Denominator
            #The count of the ratings for each item
        #Result
            #The average rating of each item
        sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)

        #What is being multiplied
            #The similarities of the top ten most similar items to the current item.
            #The ratings of the user for the most similar items, subtracted by the average rating for that item.
        #Result
            #The weighted ratings of the current user to the top ten most similar items, weighted by similarity. (Centering)
        sim_r_sum_mean = sim_val * (sim_items[:, user] - sim_item_mean) 

        #w is the boolean of whether the current user rated the 10 most similar items
        w = np.clip(sim_items[:, user], 0, 1)
        
        #Only the weighted ratings that are actually items rated by user are kept.
        sim_r_sum_mean *= w

        #The prediction is
            #The average rating for the current item (calculated with all users)
            # +
            #Numerator
                #The sum of the weighted ratings that the user has rated, of the top ten most similar items
            #Denominator 
                #The sum of the similarities of the items that the user has rated, of the top ten most similar items.
            #Result
                # The weighted rating 
        np_predictions[user][item] = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)  
        
        #The prediction is clipped at 0 and 5.
            #Should this not be 1 and 5? It can be, but does not have to be.
        np_predictions[user][item] = np.clip(np_predictions[user][item], 0, 5)
    

In [18]:
labels = test_ds.values

absolute_error = np.abs(np_predictions - labels)

weight = np.clip(labels, 0, 1)

abs_error = absolute_error * weight

MAE = np.sum(abs_error) / np.sum(weight)

print("MAE on Tesing set (Item-based): " + str(MAE));

MAE on Tesing set (Item-based): 0.8129100318152723


In [16]:
labels = test_ds.values

squared_error = np.square(np_predictions - labels)
weight = np.clip(labels, 0, 1)

squared_error = squared_error * weight

RMSE = np.sqrt(np.sum(squared_error) / np.sum(weight))

print("RMSE on Tesing set (Item-based): " + str(RMSE));

RMSE on Tesing set (Item-based): 1.0504356631295801
