## Implement a movie recommendation system and run it on the movie lens dataset 

(train vs test)

Measure performance on test set using RMSE (Root mean squared deviation)



In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
from tqdm import tqdm
import torch.nn as nn

In [47]:
# first, generate train and test sets of data:
filepath = 'ml-100k/u'
#pd.read_csv(filepath, sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
u = ['1', '2', '3', '4', '5']
    # base = train
    # test = test
for i in u:
    train = pd.read_csv(f'{filepath}{i}.base', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
    test = pd.read_csv(f'{filepath}{i}.test', sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
    train['user'] = train['user'].astype('category')
    test['user'] = test['user'].astype('category')
    train['item'] = train['item'].astype('category')
    test['item'] = test['item'].astype('category')

# observe data:
# print(train.head())
# print(test.head())
print(train.info())
print(test.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   user       80000 non-null  category
 1   item       80000 non-null  category
 2   rating     80000 non-null  int64   
 3   timestamp  80000 non-null  int64   
dtypes: category(2), int64(2)
memory usage: 1.6 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   user       20000 non-null  category
 1   item       20000 non-null  category
 2   rating     20000 non-null  int64   
 3   timestamp  20000 non-null  int64   
dtypes: category(2), int64(2)
memory usage: 473.6 KB
None


In [22]:
# unique users and items:
n_users = train['user'].nunique()
print(f'Number of unique users: {n_users}')

n_items = train['item'].nunique()
print(f'Number of unique movies: {n_items}')

Number of unique users: 943
Number of unique movies: 1650


In [23]:
# First, required to compute a user-user similarity based on ratings and movies in common. 

# dot product, average over common ratings
    # Pearsons:
def user_similarity(train, user1, user2):
    # get ratings for both users
    user1_ratings = train[train['user'] == user1]
    user2_ratings = train[train['user'] == user2]

    # merge ratings on item
    merged_ratings = pd.merge(user1_ratings, user2_ratings, on='item', suffixes=('_user1', '_user2'))

    # calculate similarity based on common ratings
    if len(merged_ratings) == 0:
        return 0  # no common items

    # calculate the dot product of the ratings
    dot_product = np.dot(merged_ratings['rating_user1'], merged_ratings['rating_user2'])

    # calculate the average rating for each user
    avg_user1 = merged_ratings['rating_user1'].mean()
    avg_user2 = merged_ratings['rating_user2'].mean()

    # calculate the similarity score
    similarity_score = dot_product / (avg_user1 * avg_user2)

    return similarity_score

# Example usage:
user1 = 1
user2 = 2
similarity = user_similarity(train, user1, user2)
print(f'Similarity between user {user1} and user {user2}: {similarity}')


Similarity between user 1 and user 2: 15.207373271889399


# Pearson correlation coefficient
# product of standard deviations 

# hybrid approach
    # combine collaborative with content-based methods: 
    # default recommendation
        # baseline recommendation for "most" users -> 
    # collecting demographic information 
        # uses this baseline to recommend 

content-based methods:
tempo,
contextual-based:
mood, activity 

In [None]:
# Second, make rating predictions on the test set following the KNN idea: 
# a prediction (user, movie) is the weighted average of other users' rating for the movie, weighted by user-similarity to the given user. 
def predict_rating(train, user1, item):
    # get ratings for the item from all users
    item_ratings = train[train['item'] == item]

    # calculate similarity for each user who rated the item
    similarities = []
    for user2 in item_ratings['user'].unique():
        if user2 != user1:
            similarity = user_similarity(train, user1, user2)
            similarities.append((user2, similarity))

    # calculate the weighted average rating
    if len(similarities) == 0:
        return 0  # no similar users

    weighted_sum = 0
    total_weight = 0
    for user2, similarity in similarities:
        rating = item_ratings[item_ratings['user'] == user2]['rating'].values[0]
        # print(f'Rating for user {user2} on item {item}: {rating}')
        # print(f'Similarity between user {user1} and user {user2}: {similarity}')
        weighted_sum += rating * similarity
        total_weight += abs(similarity)
        # user 2: 4 and 15
            # wisdom of the crowd

    predicted_rating = weighted_sum / total_weight

    return predicted_rating

# Example usage:
user1 = 1
item = 1
predicted_rating = predict_rating(train, user1, item)
print(f'Predicted rating for user {user1} on item {item}: {predicted_rating}')

Rating for user 2 on item 1: 4
Similarity between user 1 and user 2: 15.207373271889399
Rating for user 5 on item 1: 4
Similarity between user 1 and user 5: 73.68794565915312
Rating for user 6 on item 1: 4
Similarity between user 1 and user 6: 85.85488958990537
Rating for user 10 on item 1: 4
Similarity between user 1 and user 10: 65.93407606607761
Rating for user 13 on item 1: 3
Similarity between user 1 and user 13: 155.01671234802163
Rating for user 15 on item 1: 1
Similarity between user 1 and user 15: 27.59481499513145
Rating for user 16 on item 1: 5
Similarity between user 1 and user 16: 63.400072735410944
Rating for user 17 on item 1: 4
Similarity between user 1 and user 17: 14.41496598639456
Rating for user 18 on item 1: 5
Similarity between user 1 and user 18: 97.09614039179102
Rating for user 20 on item 1: 3
Similarity between user 1 and user 20: 19.978835978835978
Rating for user 23 on item 1: 5
Similarity between user 1 and user 23: 75.61539242562698
Rating for user 25 on i

In [None]:
# Second, make rating predictions on the test set following the KNN idea: 
# a prediction (user, movie) is the weighted average of other users' rating for the movie, weighted by user-similarity to the given user. 
def predict_rating(train, user1, item):
    # get ratings for the item from all users
    item_ratings = train[train['item'] == item]

    # calculate similarity for each user who rated the item
    similarities = []
    for user2 in item_ratings['user'].unique():
        if user2 != user1:
            similarity = user_similarity(train, user1, user2)
            similarities.append((user2, similarity))

    # calculate the weighted average rating
    if len(similarities) == 0:
        return 0  # no similar users

    weighted_sum = 0
    total_weight = 0
    for user2, similarity in similarities:
        rating = item_ratings[item_ratings['user'] == user2]['rating'].values[0]
        # print(f'Rating for user {user2} on item {item}: {rating}')
        # print(f'Similarity between user {user1} and user {user2}: {similarity}')
        weighted_sum += rating * similarity # where similarity acts as a weight, scaling the affect of the rating
        total_weight += abs(similarity)

    predicted_rating = weighted_sum / total_weight # approach different from average?

    return predicted_rating

# Example usage:
user1 = 1
item = 1
predicted_rating = predict_rating(train, user1, item)
print(f'Predicted rating for user {user1} on item {item}: {predicted_rating}')

Predicted rating for user 1 on item 1: 3.8850903599452957


In [25]:
# observe test set:
# we can drop timestamp, as it is not needed for the analysis:
test = test.drop(columns=['timestamp'])
print(test)



      user  item  rating
0        1     3       4
1        1    13       5
2        1    15       5
3        1    18       4
4        1    19       5
...    ...   ...     ...
19995  943  1028       2
19996  943  1044       3
19997  943  1047       2
19998  943  1228       3
19999  943  1330       3

[20000 rows x 3 columns]


In [48]:
# now, we can make predictions for all users and items in the test set:

# create a dataframe to store predictions
predictions = pd.DataFrame(columns=['user', 'item', 'predicted_rating'])

# truncate test set:
test = test.head(500)

# for each row (user,item)
for index, row in tqdm(test.iterrows(), total=test.shape[0]):
    user = row['user']
    item = row['item']
    predicted_rating = predict_rating(train, user, item)
    current_prediction = pd.DataFrame({'user': [user], 'item': [item], 'predicted_rating': [predicted_rating], 'actual_rating': [row['rating']], 'RMSE': [np.sqrt((np.mean([row['rating'] - predicted_rating]) ** 2))]})
    predictions = pd.concat([predictions, current_prediction], ignore_index=True)

print(predictions)

# sum of all rows:
rmse = (predictions['RMSE'].sum())/len(predictions['RMSE'])
print(f'RMSE: {rmse}')





  predictions = pd.concat([predictions, current_prediction], ignore_index=True)
100%|██████████| 500/500 [01:48<00:00,  4.61it/s]

    user item  predicted_rating  actual_rating      RMSE
0      1    3          3.010842            4.0  0.989158
1      1   13          3.422749            5.0  1.577251
2      1   15          3.688495            5.0  1.311505
3      1   18          3.182241            4.0  0.817759
4      1   19          3.570785            5.0  1.429215
..   ...  ...               ...            ...       ...
495   59  507          3.811905            4.0  0.188095
496   59  517          3.578268            5.0  1.421732
497   59  526          3.846545            4.0  0.153455
498   59  602          3.815881            2.0  1.815881
499   59  608          3.900662            4.0  0.099338

[500 rows x 5 columns]
RMSE: 0.8659811788395032



