## Implement a movie recommendation system and run it on the movie lens dataset 

(train vs test)

Measure performance on test set using RMSE (Root mean squared deviation)



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os

import torch.nn as nn

In [None]:
# first, generate train and test sets of data:
filepath = 'ml-100k/u'
#pd.read_csv(filepath, sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
u = ['1', '2', '3', '4', '5']
    # base = train
    # test = test
for i in u:
    train = pd.read_csv(f'{filepath}{i}.base', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
    test = pd.read_csv(f'{filepath}{i}.test', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
    train['user'] = train['user'].astype('category')
    test['user'] = test['user'].astype('category')
    train['item'] = train['item'].astype('category')
    test['item'] = test['item'].astype('category')

# observe data:
# print(train.head())
# print(test.head())
print(train.info())
print(test.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   user       80000 non-null  category
 1   item       80000 non-null  category
 2   rating     80000 non-null  int64   
 3   timestamp  80000 non-null  int64   
dtypes: category(2), int64(2)
memory usage: 1.6 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   user       20000 non-null  category
 1   item       20000 non-null  category
 2   rating     20000 non-null  int64   
 3   timestamp  20000 non-null  int64   
dtypes: category(2), int64(2)
memory usage: 473.6 KB
None


In [23]:
# unique users and items:
n_users = train['user'].nunique()
print(f'Number of unique users: {n_users}')

n_items = train['item'].nunique()
print(f'Number of unique movies: {n_items}')

Number of unique users: 943
Number of unique movies: 1650


In [27]:
# First, required to compute a user-user similarity based on ratings and movies in common. 

# dot product, average over common ratings
    # Pearsons:
def user_similarity(train, user1, user2):
    # get ratings for both users
    user1_ratings = train[train['user'] == user1]
    user2_ratings = train[train['user'] == user2]

    # merge ratings on item
    merged_ratings = pd.merge(user1_ratings, user2_ratings, on='item', suffixes=('_user1', '_user2'))

    # calculate similarity based on common ratings
    if len(merged_ratings) == 0:
        return 0  # no common items

    # calculate the dot product of the ratings
    dot_product = np.dot(merged_ratings['rating_user1'], merged_ratings['rating_user2'])

    # calculate the average rating for each user
    avg_user1 = merged_ratings['rating_user1'].mean()
    avg_user2 = merged_ratings['rating_user2'].mean()

    # calculate the similarity score
    similarity_score = dot_product / (avg_user1 * avg_user2)

    return similarity_score

# Example usage:
user1 = 1
user2 = 2
similarity = user_similarity(train, user1, user2)
print(f'Similarity between user {user1} and user {user2}: {similarity}')


Similarity between user 1 and user 2: 15.207373271889399


In [29]:
# Second, make rating predictions on the test set following the KNN idea: 
# a prediction (user, movie) is the weighted average of other users' rating for the movie, weighted by user-similarity to the given user. 
def predict_rating(train, test, user1, item):
    # get ratings for the item from all users
    item_ratings = train[train['item'] == item]

    # calculate similarity for each user who rated the item
    similarities = []
    for user2 in item_ratings['user'].unique():
        if user2 != user1:
            similarity = user_similarity(train, user1, user2)
            similarities.append((user2, similarity))

    # calculate the weighted average rating
    if len(similarities) == 0:
        return 0  # no similar users

    weighted_sum = 0
    total_weight = 0
    for user2, similarity in similarities:
        rating = item_ratings[item_ratings['user'] == user2]['rating'].values[0]
        weighted_sum += rating * similarity
        total_weight += abs(similarity)

    predicted_rating = weighted_sum / total_weight

    return predicted_rating

# Example usage:
user1 = 1
item = 1
predicted_rating = predict_rating(train, test, user1, item)
print(f'Predicted rating for user {user1} on item {item}: {predicted_rating}')

Predicted rating for user 1 on item 1: 3.8850903599452957
