# User-Based Collaborative Filtering

This notebook presents the most elementary solution to the problem. The solution is to use User-Based Collaborative Filtering. User-Based Collaborative Filtering is a recommendation system technology that suggests products based on the preferences of users similar to the target user. The solution is presented in the form of an algorithm that calculates cosine similarity between users and based on this predicts whether a user will like a particular book or not

## Data Loading

In [1]:
import pandas as pd

train = pd.read_csv('../data/interim/test.csv')

train.head()

Unnamed: 0,user_id,book_id,rating
0,42562,2757,3
1,43232,134,4
2,37244,1463,5
3,53366,71,2
4,29634,3339,4


In [2]:
from sklearn.model_selection import train_test_split

test_add, train_small = train_test_split(train, test_size=0.01, random_state=42)

In [3]:
test = pd.read_csv('../data/interim/train.csv')

test.head()

Unnamed: 0,user_id,book_id,rating
0,10714,7164,3
1,48091,2213,3
2,9809,5769,4
3,25191,86,5
4,25441,4884,3


In [4]:
test = pd.concat([test, test_add], ignore_index=True)

In [5]:
books_information = pd.read_csv('../data/interim/books_information.csv')

books_information.head()

Unnamed: 0,book_id,goodreads_book_id,authors,original_publication_year,title
0,1,2767052,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)"
1,2,3,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...
2,3,41865,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)"
3,4,2657,Harper Lee,1960.0,To Kill a Mockingbird
4,5,4671,F. Scott Fitzgerald,1925.0,The Great Gatsby


## Preparation data

In [6]:
# Create a user-item matrix
user_item_matrix = train_small.pivot(index='user_id', columns='book_id', values='rating')

user_item_matrix.head()

book_id,1,2,3,4,5,6,7,8,9,10,...,9957,9960,9962,9965,9966,9967,9968,9972,9979,9996
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
22,3.0,,,,,,,,,,...,,,,,,,,,,
27,,,,,,,,,,,...,,,,,,,,,,
36,,,,,,,,,,,...,,,,,,,,,,
39,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Fill missing values with 0 (assuming no rating means a rating of 0)
user_item_matrix = user_item_matrix.fillna(0)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)

In [9]:
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

user_similarity_df.head()

user_id,8,22,27,36,39,41,54,61,65,74,...,53362,53366,53386,53392,53395,53396,53408,53412,53419,53422
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## User-Based Collaborative Filtering

In [10]:
def true_lable(books_ids):
    
    rating_from_test = test[(test['user_id'] == user_id) & test['book_id'].isin(books_ids)][['book_id', 'rating']].values
    
    if len(rating_from_test) == 0:
        return []
        
    true_rating = [0 for _ in books_ids]
    
    for book in rating_from_test:
        true_rating[books_ids.index(book[0])] = book[1]
        
    return true_rating

In [11]:
import math

def RMSE(true_rating, predict_rating):
    
    mse = 0
            
    for i in range(len(true_rating)):
        for j in range(len(true_rating[i])):
            if true_rating[i][j] != 0:
                mse += (true_rating[i][j] - predict_rating[i][j])**2


    
    return math.sqrt(mse/len(true_rating))

In [12]:
def get_recommendations(user_id, n=5):

    user_ratings = user_item_matrix.loc[user_id].values.reshape(1, -1)

    # Calculate the similarity between the user and all other users
    similarities = user_similarity_df.loc[user_id].values.reshape(1, -1)
    sim_mask = similarities == 1
    similarities[sim_mask] = 0

    # Predict the user's ratings for all books
    if similarities.sum() != 0:
        predicted_ratings = similarities.dot(user_item_matrix.values) / similarities.sum()
    else:
        predicted_ratings = similarities.dot(user_item_matrix.values)

    # Mask out books the user has already rated
    user_ratings_mask = user_ratings != 0
    predicted_ratings[user_ratings_mask] = 0

    top_books_indices = predicted_ratings.argsort()[0, ::-1][:n]

    top_books_ids = user_item_matrix.columns[top_books_indices]

    return top_books_ids, predicted_ratings[0][top_books_indices]

## Test

In [13]:
arr_predict = []
arr_true = []

for user_id in train_small['user_id'].unique()[:500]:
    books_ids, predicted_rating = get_recommendations(user_id)
    arr_predict.append(list(predicted_rating))
    arr_true.append(true_lable(list(books_ids)))

In [14]:
arr_predict_clean = []
arr_true_clean = []

for i in range(len(arr_true)):
    if len(arr_true[i]) != 0 and sum(arr_predict[i]) != 0:
        arr_predict_clean.append(arr_predict[i])
        arr_true_clean.append(arr_true[i])
        
        

In [15]:
print('RMSE:', RMSE(arr_true_clean, arr_predict_clean))

RMSE: 3.5356917532945604


In [16]:
# Example
user_id = 22

recommendations, predicted_rating = get_recommendations(user_id)

books_information[books_information['book_id'].isin(recommendations)][['book_id', 'authors', 'original_publication_year', 'title']]

Unnamed: 0,book_id,authors,original_publication_year,title
2,3,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)"
134,135,George R.R. Martin,2000.0,"A Storm of Swords (A Song of Ice and Fire, #3)"
451,452,Rick Yancey,2013.0,"The 5th Wave (The 5th Wave, #1)"
4160,4161,G. Norman Lippert,2007.0,James Potter and the Hall of Elders' Crossing ...
4423,4424,Sherrilyn Kenyon,2010.0,"Infinity (Chronicles of Nick, #1)"
