In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd

## Read Data

In [2]:
data =  pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
data

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


## User-Item Matrix

In [3]:
user_ids, item_ids, ratings = data['user_id'].to_list(), data['item_id'].to_list(), data['rating'].to_list()
user_item_matrix = np.zeros((max(user_ids), max(item_ids)))
print(user_item_matrix.shape)

for user_id, item_id, rating in zip(user_ids, item_ids, ratings):
    row, col = user_id - 1, item_id - 1
    user_item_matrix[row, col] = rating

(943, 1682)


array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

## Normalization

In [176]:
def normalize(user_item_matrix):
    mean = np.mean(user_item_matrix, axis=1)
    std = np.std(user_item_matrix, axis=1)
    normalized_table = user_item_matrix - mean[:, None]
    normalized_table /= std[:, None]
    return normalized_table, mean, std

normalized_table, mean, std = normalize(user_item_matrix)

## User-User Similarity Matrix

In [14]:
num_users = user_item_matrix.shape[0]
similarity_matrix = np.zeros((num_users, num_users))

for user_id, user_ratings in tqdm(enumerate(user_item_matrix), position=0):
    for comp_user_id, comp_user_ratings in enumerate(user_item_matrix):
        user_nonzero_ratings = [index for index, rating in enumerate(user_ratings) if rating != 0]
        comp_user_nonzero_ratings = [index for index, rating in enumerate(comp_user_ratings) if rating != 0]
        common_rating_indices = np.intersect1d(user_nonzero_ratings, comp_user_nonzero_ratings)

        score = .0
        num_common_ratings = len(common_rating_indices)
        if num_common_ratings != 0:
            for movie_rating_index in common_rating_indices:
                score += normalized_table[user_id][movie_rating_index] * normalized_table[comp_user_id][movie_rating_index]
            score /= num_common_ratings
        
        similarity_matrix[user_id, comp_user_id] = score

943it [17:09,  1.16s/it]
943it [17:09,  1.09s/it]


## Predictions & Performance(RMSE)

In [89]:
def calculate_rmse(predictions, user_ratings):
    return np.sqrt(np.mean((predictions - user_ratings) ** 2))

In [174]:
def movie_recommendation(user_id, k):
    movie_ids = np.nonzero(user_item_matrix[user_id])[0] # ids of movies rated by the user
    user_ratings = user_item_matrix[user_id][movie_ids]  # ratings given by the user. paired with movie_ids

    predictions = []
    for movie_id in movie_ids:
        common_users_ids = np.nonzero(user_item_matrix[:, movie_id])[0]             # ids of users who rated specific movie
        common_users_ratings = normalized_table[:, movie_id][common_users_ids]      # ratings of users who rated specific movie
        user_similarity_scores = similarity_matrix[user_id, :][common_users_ids]    # similarity scores paired with common_users_ratings

        val_sum = 0
        for rating, similarity_score in zip(common_users_ratings, user_similarity_scores):
            val_sum += rating * similarity_score
        prediction = val_sum / sum(user_similarity_scores)
        prediction = prediction * std[user_id] + mean[user_id]                      # compute prediction rating
        predictions.append(prediction)

    sorted_predictions = np.argsort(predictions)[::-1][:k]
    recommendations = [movie_ids[movie_id] for movie_id in sorted_predictions]      # get top k recommendations based off the predictions
    return recommendations, calculate_rmse(predictions, user_ratings)

recommendations, rmse = movie_recommendation(user_id=25, k=5)

In [216]:
for test_user_id in range(0, 1000, 100):
    recommendations, rmse = movie_recommendation(test_user_id, 5)
    recommendations_string = ','.join(str(v) for v in recommendations)
    print(f'user id: {test_user_id}\t recommendations: {recommendations_string}\t rmse: {rmse}')

user id: 0	 recommendations: 241,271,268,257,252	 rmse: 2.585239943137318
user id: 100	 recommendations: 303,49,287,180,0	 rmse: 0.8557054742806952
user id: 200	 recommendations: 312,301,271,241,314	 rmse: 3.0890860468716643
user id: 300	 recommendations: 268,299,257,339,332	 rmse: 2.2377882376023956
user id: 400	 recommendations: 271,301,314,315,285	 rmse: 1.7022862136834105
user id: 500	 recommendations: 312,306,126,99,287	 rmse: 0.8908815972715709
user id: 600	 recommendations: 257,49,126,99,8	 rmse: 1.4846059674128074
user id: 700	 recommendations: 312,271,314,268,315	 rmse: 0.9737524448656566
user id: 800	 recommendations: 312,301,299,332,306	 rmse: 1.4551148911358651
user id: 900	 recommendations: 49,274,180,747,14	 rmse: 1.3967622905159345
