# Hybrid model

This notebook presents a hybrid model that combines a content based system and a user based model. 

## Data Loading

In [14]:
import pandas as pd

train = pd.read_csv('../data/interim/train.csv')

train.head()

Unnamed: 0,user_id,book_id,rating
0,10714,7164,3
1,48091,2213,3
2,9809,5769,4
3,25191,86,5
4,25441,4884,3


In [15]:
test = pd.read_csv('../data/interim/test.csv')

test.head()

Unnamed: 0,user_id,book_id,rating
0,42562,2757,3
1,43232,134,4
2,37244,1463,5
3,53366,71,2
4,29634,3339,4


In [16]:
books_information = pd.read_csv('../data/interim/books_information.csv')

books_information.head()

Unnamed: 0,book_id,authors,original_publication_year,title,average_rating,image_url
0,1,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,https://images.gr-assets.com/books/1447303603m...
1,2,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,https://images.gr-assets.com/books/1474154022m...
2,3,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,https://images.gr-assets.com/books/1361039443m...
3,4,Harper Lee,1960.0,To Kill a Mockingbird,4.25,https://images.gr-assets.com/books/1361975680m...
4,5,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,https://images.gr-assets.com/books/1490528560m...


In [17]:
import pickle
import os

if os.path.exists('../data/interim/cosine_similarity_matrix.pkl'):
    with open("../data/interim/cosine_similarity_matrix.pkl", 'rb') as file:
        cosine_sim = pickle.load(file)  
else:
    print('Not found')

In [18]:
import keras.models

if os.path.exists('../models/train_user_based_model.keras'):
    model = keras.models.load_model('../models/train_user_based_model.keras')  
else:
    print('Not found')

## Model

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [20]:
def find_similar_users(user_ratings, train):
    # Filter train data for the specified books in user_ratings
    selected_books = user_ratings['book_id'].values
    train_subset = train[train['book_id'].isin(selected_books)]

    # Pivot the train_subset to have users as rows and books as columns
    user_book_matrix = train_subset.pivot_table(index='user_id', columns='book_id', values='rating', fill_value=0)

    # Create a user-book matrix for the target user
    target_user_ratings = pd.Series(user_ratings['rating'].values, index=user_ratings['book_id'])
    target_user_matrix = pd.DataFrame(target_user_ratings).transpose()

    # Calculate cosine similarity between the target user and all other users
    similarity_scores = cosine_similarity(target_user_matrix, user_book_matrix)

    # Get the top 3 most similar users
    similar_users_indices = similarity_scores.argsort()[0, ::-1][1:4]
    similar_users = user_book_matrix.index[similar_users_indices]

    return list(similar_users)

In [21]:
def recommend_books_user_based(user_ratings, model, train):
    
    similar_users = find_similar_users(user_ratings, train)
    
    # Get the list of all books
    all_books = train['book_id'].unique()
    
    predict_data = pd.DataFrame([(similar_user, book_id) for similar_user in similar_users for book_id in all_books if book_id not in user_ratings['book_id']],
                                columns=['user_id', 'book_id'])


    # Use the model to predict ratings
    predictions = model.predict([np.array(predict_data['user_id']), np.array(predict_data['book_id'])])

    predict_data['predicted_rating'] = predictions
    
    
    top_books = predict_data.groupby('book_id')['predicted_rating'].mean().reset_index()

    return top_books[['book_id', 'predicted_rating']]

In [22]:
def recommend_books_content_based(user_ratings, cosine_sim, books_information):
    # Get the list of all books
    all_books = books_information['book_id'].unique()

    predict_data = pd.DataFrame([book_id for book_id in all_books
                                 if book_id not in user_ratings['book_id']],
                                columns=['book_id'])

    # Use the cosine similarity to predict ratings
    for i, book_id in enumerate(predict_data['book_id']):
        
        similarity_scores = cosine_sim[book_id-1]
        
        weighted_sum = 0
        similarity_sum = 0

        for _, rated_book in user_ratings.iterrows():
            similarity = similarity_scores[rated_book['book_id'] - 1]
            weighted_sum += similarity * rated_book['rating']
            similarity_sum += abs(similarity)

        if similarity_sum != 0:
            #similarity_sum = max(0.2, similarity_sum)
            predict_data.at[i, 'predicted_rating'] = weighted_sum / similarity_sum
            
        else:
            predict_data.at[i, 'predicted_rating'] = 3
            
    predict_data = predict_data[~predict_data['book_id'].isin(user_ratings['book_id'])]

    return predict_data[['book_id', 'predicted_rating']]

In [23]:
def hybrid_model(user_ratings, model, cosine_sim, train, books_information):
    
    user_based_predict = recommend_books_user_based(user_ratings, model, train)
    content_based_predict = recommend_books_content_based(user_ratings, cosine_sim, books_information)
    
    # Merge the two DataFrames on 'book_id'
    merged_df = pd.merge(user_based_predict, content_based_predict, on='book_id', suffixes=('_user', '_content'))

    # Calculate the overall rating using the specified formula
    merged_df['predicted_rating'] = merged_df['predicted_rating_user'] * 0.6 + merged_df['predicted_rating_content'] * 0.4
    
    return merged_df[['book_id', 'predicted_rating']]

In [24]:
def top_recommend_books(user_ratings, model, cosine_sim, train, books_information, num_recommendations=5):
    
    predict_data = hybrid_model(user_ratings, model, cosine_sim, train, books_information)
    
    top_recommendations = predict_data.sort_values(by='predicted_rating', ascending=False).head(num_recommendations)
    
    recommended_book_ids = top_recommendations['book_id']
    
    return books_information[books_information['book_id'].isin(recommended_book_ids)][['authors', 'original_publication_year', 'title']]

## Evaluation

In [25]:
from sklearn.metrics import mean_squared_error

# Function to calculate Root Mean Squared Error (RMSE)
def calculate_rmse(predictions, targets):
    return np.sqrt(mean_squared_error(predictions, targets))

In [26]:
def predict_ratings_content(train, test, cosine_sim):
    predicted_ratings = []

    for _, row in test.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']

        # Extract ratings for the current user from the training data
        user_ratings = train[train['user_id'] == user_id]

        # Calculate the weighted sum of ratings based on user similarities
        weighted_sum = 0
        similarity_sum = 0

        for _, rated_book in user_ratings.iterrows():
            rated_book_id = rated_book['book_id']
            rated_book_rating = rated_book['rating']

            # Calculate similarity between the target book and already rated books
            similarity = cosine_sim[book_id-1][rated_book_id-1]

            # Add the weighted contribution to the sum
            weighted_sum += similarity * rated_book_rating
            similarity_sum += abs(similarity)

        # Avoid division by zero
        if similarity_sum > 0.25:
            predicted_rating = weighted_sum / similarity_sum
        else:
            predicted_rating = 3

        predicted_ratings.append(predicted_rating)

    return predicted_ratings

In [27]:
def predict_ratings_user(train, test, model):
    predicted_ratings = []
    
    for index, row in test.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']
        rating = row['rating']

        # Find similar users for the current test user
        similar_users = find_similar_users(test[test['user_id'] == user_id], train)

        predict_data = pd.DataFrame([(similar_user, book_id) for similar_user in similar_users],
                                    columns=['user_id', 'book_id'])

        # Use the model to predict ratings
        predictions = model.predict([np.array(predict_data['user_id']), np.array(predict_data['book_id'])], verbose=0)

        predicted_rating = np.mean(predictions)

        # Append the predicted rating to the list
        predicted_ratings.append(predicted_rating)


    return predicted_ratings


In [28]:
def predict_ratings(train, test, model, cosine_sim):
    
    user_based_predict = np.array(predict_ratings_user(train, test, model))
    content_based_predict = np.array(predict_ratings_content(train, test, cosine_sim))

    # Calculate the overall rating using the specified formula
    predict = user_based_predict * 0.6 + content_based_predict * 0.4
    
    return predict

### Evaluation on test data

In [29]:
test1 = test[:1000]

predicted_ratings = predict_ratings(train, test1, model, cosine_sim)

# Add the predicted ratings to the test_data DataFrame
test1['predicted_rating'] = predicted_ratings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test1['predicted_rating'] = predicted_ratings


In [30]:
rmse = calculate_rmse(test1['predicted_rating'], test1['rating'])
print(f"RMSE: {rmse}")

RMSE: 0.8994148212138502


### Evaluation on highly rated books

In [31]:
test2 = test[test["rating"] > 3][:1000]

predicted_ratings = predict_ratings(train, test2, model, cosine_sim)

test2['predicted_rating'] = predicted_ratings

In [32]:
rmse = calculate_rmse(test2['predicted_rating'], test2['rating'])
print(f"RMSE: {rmse}")

RMSE: 0.7683385046537304


## Test

In [33]:
user_ratings = pd.DataFrame({  
    'book_id': [1, 200, 295, 271, 12, 488, 4483],
    'rating': [5, 5, 5, 5, 5, 5, 1]
})

In [34]:
# Example: Get top 5 recommendations for the user's ratings
top_recommendations = top_recommend_books(user_ratings, model, cosine_sim, train, books_information, 5)

print("Top 5 Recommended Books:")
top_recommendations

Top 5 Recommended Books:


Unnamed: 0,authors,original_publication_year,title
306,Patrick Rothfuss,2011.0,"The Wise Man's Fear (The Kingkiller Chronicle,..."
1312,Stephen E. Ambrose,1992.0,"Band of Brothers: E Company, 506th Regiment, 1..."
2440,Barbara Demick,2009.0,Nothing to Envy: Ordinary Lives in North Korea
2506,Paullina Simons,2001.0,"The Bronze Horseman (The Bronze Horseman, #1)"
9140,Brandon Sanderson,2011.0,"The Way of Kings, Part 1 (The Stormlight Archi..."
