# User-Based Book Recommendation System 

This notebook presents a User-Based book recommendation system using Keras, which is using collaborative filtering. The solution is presented as a model that uses embeddings to represent users and books in a low dimensional space, and then combines these embeddings and runs them through a neural network to predict books.

## Data Loading

In [1]:
import pandas as pd

train = pd.read_csv('../data/interim/train.csv')

train.head()

Unnamed: 0,user_id,book_id,rating
0,10714,7164,3
1,48091,2213,3
2,9809,5769,4
3,25191,86,5
4,25441,4884,3


In [2]:
test = pd.read_csv('../data/interim/test.csv')

test.head()

Unnamed: 0,user_id,book_id,rating
0,42562,2757,3
1,43232,134,4
2,37244,1463,5
3,53366,71,2
4,29634,3339,4


In [3]:
books_information = pd.read_csv('../data/interim/books_information.csv')

books_information.head()

Unnamed: 0,book_id,authors,original_publication_year,title,average_rating,image_url
0,1,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...
1,2,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,https://images.gr-assets.com/books/1474154022m...
2,3,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,https://images.gr-assets.com/books/1361039443m...
3,4,Harper Lee,1960.0,To Kill a Mockingbird,4.25,https://images.gr-assets.com/books/1361975680m...
4,5,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,https://images.gr-assets.com/books/1490528560m...


## Model

In [4]:
# !pip install tensorflow
# !pip install keras

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.optimizers import Adam




### Define the model

In [6]:
num_users = len(train['user_id'].unique())
num_books = len(train['book_id'].unique())
embedding_size = 10

In [7]:
# Define input layers

user_input = Input(shape=(1,), name='user_input')
book_input = Input(shape=(1,), name='book_input')




In [8]:
# Create embeddings for users and books

user_embedding = Embedding(input_dim=num_users+1, output_dim=embedding_size, input_length=1)(user_input)
book_embedding = Embedding(input_dim=num_books+1, output_dim=embedding_size, input_length=1)(book_input)

In [9]:
# Flatten the embeddings

user_flat = Flatten()(user_embedding)
book_flat = Flatten()(book_embedding)

In [10]:
# Concatenate user and book embeddings

concatenated = Concatenate()([book_flat, user_flat])

In [11]:
# Build a neural network

dense1 = Dense(128, activation='relu')(concatenated)
dense2 = Dense(32, activation='relu')(dense1)
output = Dense(1)(dense2)

In [12]:
model = Model(inputs=[user_input, book_input], outputs=output)

In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 book_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 1, 10)                100010    ['book_input[0][0]']          
                                                                                                  
 embedding (Embedding)       (None, 1, 10)                534250    ['user_input[0][0]']          
                                                                                              

In [14]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

### Train the model on train dataset

In [15]:
import keras.models
import os

if os.path.exists('../models/train_user_based_model.keras'):
    model = keras.models.load_model('../models/train_user_based_model.keras')
else:
    model.fit([train['user_id'], train['book_id']], train['rating'], epochs=5, batch_size=64, validation_split=0.2)
    model.save('../models/train_user_based_model.keras')

### Evaluation

In [16]:
from sklearn.metrics import mean_squared_error

# Function to calculate Root Mean Squared Error (RMSE)
def calculate_rmse(predictions, targets):
    return np.sqrt(mean_squared_error(predictions, targets))

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_users(user_ratings, train):
    # Filter train data for the specified books in user_ratings
    selected_books = user_ratings['book_id'].values
    train_subset = train[train['book_id'].isin(selected_books)]

    # Pivot the train_subset to have users as rows and books as columns
    user_book_matrix = train_subset.pivot_table(index='user_id', columns='book_id', values='rating', fill_value=0)

    # Create a user-book matrix for the target user
    target_user_ratings = pd.Series(user_ratings['rating'].values, index=user_ratings['book_id'])
    target_user_matrix = pd.DataFrame(target_user_ratings).transpose()

    # Calculate cosine similarity between the target user and all other users
    similarity_scores = cosine_similarity(target_user_matrix, user_book_matrix)

    # Get the top 3 most similar users
    similar_users_indices = similarity_scores.argsort()[0, ::-1][1:4]  # Exclude the target user
    similar_users = user_book_matrix.index[similar_users_indices]

    return list(similar_users)

In [18]:
import numpy as np


def predict_ratings(train, test, model):
    predicted_ratings = []
    
    for index, row in test.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']
        rating = row['rating']

        # Find similar users for the current test user
        similar_users = find_similar_users(test[test['user_id'] == user_id], train)

        predict_data = pd.DataFrame([(similar_user, book_id) for similar_user in similar_users],
                                    columns=['user_id', 'book_id'])

        # Use the model to predict ratings
        predictions = model.predict([np.array(predict_data['user_id']), np.array(predict_data['book_id'])], verbose=0)

        predicted_rating = np.mean(predictions)

        # Append the predicted rating to the list
        predicted_ratings.append(predicted_rating)


    return predicted_ratings


#### Evaluation on test data

In [19]:
# Evaluate the model on the test set
loss = model.evaluate([test['user_id'], test['book_id']], test['rating'])
print(f'RMSE: {loss**0.5}')


RMSE: 0.8322319540884523


In [20]:
test1 = test[:1000]

predicted_ratings = predict_ratings(train, test1, model)

# Add the predicted ratings to the test_data DataFrame
test1['predicted_rating'] = predicted_ratings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test1['predicted_rating'] = predicted_ratings


In [21]:
rmse = calculate_rmse(test1['predicted_rating'], test1['rating'])
print(f"RMSE: {rmse}")

RMSE: 0.9892375964637813


#### Evaluation on highly rated books

In [22]:
test1 = test[test["rating"] > 3]

loss = model.evaluate([test1['user_id'], test1['book_id']], test1['rating'])
print(f'RMSE: {loss**0.5}')

RMSE: 0.6786692874121824


In [23]:
test2 = test[test["rating"] > 3][:1000]

predicted_ratings = predict_ratings(train, test2, model)

test2['predicted_rating'] = predicted_ratings

In [24]:
rmse = calculate_rmse(test2['predicted_rating'], test2['rating'])
print(f"RMSE: {rmse}")

RMSE: 0.8197350469425796


## Test

### Prediction function

In [25]:
import numpy as np


def recommend_books_user_based(user_ratings, model):
    
    similar_users = find_similar_users(user_ratings, train)
    
    # Get the list of all books
    all_books = train['book_id'].unique()
    
    predict_data = pd.DataFrame([(similar_user, book_id) for similar_user in similar_users for book_id in all_books if book_id not in user_ratings['book_id']],
                                columns=['user', 'book'])


    # Use the model to predict ratings
    predictions = model.predict([np.array(predict_data['user']), np.array(predict_data['book'])])

    predict_data['predicted_rating'] = predictions
    
    
    top_books = predict_data.groupby('book')['predicted_rating'].mean().reset_index()

    return top_books[['book', 'predicted_rating']]
    

In [26]:
def top_recommend_books(user_ratings, model, num_recommendations=5):
    predict_data = recommend_books_user_based(user_ratings, model)
    top_recommendations = predict_data.sort_values(by='predicted_rating', ascending=False).head(num_recommendations)
    recommended_book_ids = top_recommendations['book']
    return books_information[books_information['book_id'].isin(recommended_book_ids)][['authors', 'original_publication_year', 'title']]

In [27]:
user_ratings = pd.DataFrame({  
    'book_id': [1, 12, 3, 4, 15, 4640, 3093, 8, 9, 10],
    'rating': [5, 5, 3, 5, 5, 1, 5, 5, 3, 4]
})

In [28]:
# Example: Get top 5 recommendations for the user's ratings
top_recommendations = top_recommend_books(user_ratings, model, num_recommendations=5)

print("Top 5 Recommended Books:")
top_recommendations

Top 5 Recommended Books:


Unnamed: 0,authors,original_publication_year,title
2100,"J.K. Rowling, Mary GrandPré",1999.0,"The Harry Potter Collection 1-4 (Harry Potter,..."
3752,J.K. Rowling,2005.0,"Harry Potter Collection (Harry Potter, #1-6)"
5989,"Radwa Ashour, رضوى عاشور",2010.0,الطنطورية
9359,Stephen King,1996.0,"The Green Mile, Part 6: Coffey on the Mile"
9922,Stephen King,1996.0,"The Green Mile, Part 5: Night Journey"
