# Fine tuning of the hybrid model

## Data Loading

In [1]:
import pandas as pd

train = pd.read_csv('../data/interim/train.csv')

train.head()

Unnamed: 0,user_id,book_id,rating
0,10714,7164,3
1,48091,2213,3
2,9809,5769,4
3,25191,86,5
4,25441,4884,3


In [2]:
test = pd.read_csv('../data/interim/test.csv')

test.head()

Unnamed: 0,user_id,book_id,rating
0,42562,2757,3
1,43232,134,4
2,37244,1463,5
3,53366,71,2
4,29634,3339,4


In [3]:
import pickle
import os

if os.path.exists('../data/interim/cosine_similarity_matrix.pkl'):
    with open("../data/interim/cosine_similarity_matrix.pkl", 'rb') as file:
        cosine_sim = pickle.load(file)  
else:
    print('Not found')

In [4]:
import keras.models

if os.path.exists('../models/train_user_based_model.keras'):
    model = keras.models.load_model('../models/train_user_based_model.keras')  
else:
    print('Not found')





## Functions for evaluation

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [6]:
from sklearn.metrics import mean_squared_error

# Function to calculate Root Mean Squared Error (RMSE)
def calculate_rmse(predictions, targets):
    return np.sqrt(mean_squared_error(predictions, targets))

In [7]:
def find_similar_users(user_ratings, train):
    # Filter train data for the specified books in user_ratings
    selected_books = user_ratings['book_id'].values
    train_subset = train[train['book_id'].isin(selected_books)]

    # Pivot the train_subset to have users as rows and books as columns
    user_book_matrix = train_subset.pivot_table(index='user_id', columns='book_id', values='rating', fill_value=0)

    # Create a user-book matrix for the target user
    target_user_ratings = pd.Series(user_ratings['rating'].values, index=user_ratings['book_id'])
    target_user_matrix = pd.DataFrame(target_user_ratings).transpose()

    # Calculate cosine similarity between the target user and all other users
    similarity_scores = cosine_similarity(target_user_matrix, user_book_matrix)

    # Get the top 3 most similar users
    similar_users_indices = similarity_scores.argsort()[0, ::-1][1:4]
    similar_users = user_book_matrix.index[similar_users_indices]

    return list(similar_users)

In [8]:
def predict_ratings_content(train, test, cosine_sim, beta):
    predicted_ratings = []

    for _, row in test.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']

        # Extract ratings for the current user from the training data
        user_ratings = train[train['user_id'] == user_id]

        # Calculate the weighted sum of ratings based on user similarities
        weighted_sum = 0
        similarity_sum = 0

        for _, rated_book in user_ratings.iterrows():
            rated_book_id = rated_book['book_id']
            rated_book_rating = rated_book['rating']

            # Calculate similarity between the target book and already rated books
            similarity = cosine_sim[book_id-1][rated_book_id-1]

            # Add the weighted contribution to the sum
            weighted_sum += similarity * rated_book_rating
            similarity_sum += abs(similarity)

        # Avoid division by zero
        if similarity_sum > beta:
            predicted_rating = weighted_sum / similarity_sum
        else:
            predicted_rating = 3

        predicted_ratings.append(predicted_rating)

    return predicted_ratings

In [9]:
def predict_ratings_user(train, test, model):
    predicted_ratings = []
    
    for index, row in test.iterrows():
        user_id = row['user_id']
        book_id = row['book_id']
        rating = row['rating']

        # Find similar users for the current test user
        similar_users = find_similar_users(test[test['user_id'] == user_id], train)

        predict_data = pd.DataFrame([(similar_user, book_id) for similar_user in similar_users],
                                    columns=['user_id', 'book_id'])

        # Use the model to predict ratings
        predictions = model.predict([np.array(predict_data['user_id']), np.array(predict_data['book_id'])], verbose=0)

        predicted_rating = np.mean(predictions)

        # Append the predicted rating to the list
        predicted_ratings.append(predicted_rating)


    return predicted_ratings

In [10]:
def predict_ratings(train, test, model, cosine_sim, alpha, beta):
    
    user_based_predict = np.array(predict_ratings_user(train, test, model))
    content_based_predict = np.array(predict_ratings_content(train, test, cosine_sim, beta))

    # Calculate the overall rating using the specified formula
    predict = user_based_predict * alpha + content_based_predict * (1 - alpha)
    
    return predict

## Fine tuning

In [11]:
alpha_parm = [0.4, 0.5, 0.6, 0.7]
beta_parm = [0, 0.1, 0.25, 0.4]

In [14]:
result = []

In [17]:
for alpha in alpha_parm:
    for beta in beta_parm:
        
        test_data = test[:1000]
        
        predicted_ratings = predict_ratings(train, test_data, model, cosine_sim, alpha, beta)
        
        test_data['predicted_rating'] = predicted_ratings
        rmse = calculate_rmse(test_data['predicted_rating'], test_data['rating'])
        
        result.append([('alpha', alpha), ('beta', beta), ('rmse', rmse)])
        print(f"alpha: {alpha}, beta: {beta}, rmse: {rmse}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.4, beta: 0, rmse: 0.8777152105171364


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.4, beta: 0.1, rmse: 0.8732465324108785


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.4, beta: 0.25, rmse: 0.8954561205857657


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.4, beta: 0.4, rmse: 0.9252155712385228


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.5, beta: 0, rmse: 0.881568837771882


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.5, beta: 0.1, rmse: 0.8777887625267387


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.5, beta: 0.25, rmse: 0.8959509501386755


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.5, beta: 0.4, rmse: 0.920219096738721


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.6, beta: 0, rmse: 0.8921320995404428


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.6, beta: 0.1, rmse: 0.8890867214414189


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.6, beta: 0.25, rmse: 0.9032336768500735


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.6, beta: 0.4, rmse: 0.9220790287485634


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.7, beta: 0, rmse: 0.9091711795063687


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.7, beta: 0.1, rmse: 0.9068879814699183


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


alpha: 0.7, beta: 0.25, rmse: 0.9171426504424696
alpha: 0.7, beta: 0.4, rmse: 0.9307543158684762


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = predicted_ratings


In [18]:
result

[[('alpha', 0.4), ('beta', 0), ('rmse', 0.8784871114945239)],
 [('alpha', 0.4), ('beta', 0), ('rmse', 0.8777152105171364)],
 [('alpha', 0.4), ('beta', 0.1), ('rmse', 0.8732465324108785)],
 [('alpha', 0.4), ('beta', 0.25), ('rmse', 0.8954561205857657)],
 [('alpha', 0.4), ('beta', 0.4), ('rmse', 0.9252155712385228)],
 [('alpha', 0.5), ('beta', 0), ('rmse', 0.881568837771882)],
 [('alpha', 0.5), ('beta', 0.1), ('rmse', 0.8777887625267387)],
 [('alpha', 0.5), ('beta', 0.25), ('rmse', 0.8959509501386755)],
 [('alpha', 0.5), ('beta', 0.4), ('rmse', 0.920219096738721)],
 [('alpha', 0.6), ('beta', 0), ('rmse', 0.8921320995404428)],
 [('alpha', 0.6), ('beta', 0.1), ('rmse', 0.8890867214414189)],
 [('alpha', 0.6), ('beta', 0.25), ('rmse', 0.9032336768500735)],
 [('alpha', 0.6), ('beta', 0.4), ('rmse', 0.9220790287485634)],
 [('alpha', 0.7), ('beta', 0), ('rmse', 0.9091711795063687)],
 [('alpha', 0.7), ('beta', 0.1), ('rmse', 0.9068879814699183)],
 [('alpha', 0.7), ('beta', 0.25), ('rmse', 0.9171

In [22]:
test2 = test[test["rating"] > 3][:1000]

predicted_ratings = predict_ratings(train, test2, model, cosine_sim, 0.4, 0.1)

test2['predicted_rating'] = predicted_ratings

In [24]:
rmse = calculate_rmse(test2['predicted_rating'], test2['rating'])
print(f"RMSE: {rmse}")

RMSE: 0.7434915922177887
