In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For collaborative filtering
from surprise import SVD, Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate, train_test_split

# For neural network model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from tensorflow.keras.optimizers import Adam

# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')

# Load the datasets
books = pd.read_csv('data/Books.csv', sep=',', error_bad_lines=False, encoding='latin-1')
users = pd.read_csv('data/Users.csv', sep=',', error_bad_lines=False, encoding='latin-1')
ratings = pd.read_csv('data/Ratings.csv', sep=',', error_bad_lines=False, encoding='latin-1')

# Preview the data
print('Books Data:')
print(books.head())
print('\nUsers Data:')
print(users.head())
print('\nRatings Data:')
print(ratings.head())

Books Data:
         ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  \
0    Mark P. O. Morford                2002     Oxford University Press   
1  Richard Bruce Wright                2001       HarperFlamingo Canada   
2          Carlo D'Este                1991             HarperPerennial   
3      Gina Bari Kolata                1999        Farrar Straus Giroux   
4       E. J. W. Barber                1999  W. W. Norton &amp; Company   

                                         Image-URL-S  \
0  http://images.amazon.com/images/P/0195153448.0...   
1  http://images

In [ ]:
# Data Cleaning and Preprocessing

# Rename columns for ease of use
books.columns = ['ISBN', 'BookTitle', 'BookAuthor', 'YearOfPublication', 'Publisher', 'ImageURLS', 'ImageURLM', 'ImageURLL']
users.columns = ['UserID', 'Location', 'Age']
ratings.columns = ['UserID', 'ISBN', 'BookRating']

# Handle missing values
books['YearOfPublication'] = pd.to_numeric(books['YearOfPublication'], errors='coerce')
books['YearOfPublication'].fillna(int(books['YearOfPublication'].mean()), inplace=True)
books['YearOfPublication'] = books['YearOfPublication'].astype(int)

users['Age'].fillna(users['Age'].mean(), inplace=True)
users['Age'] = users['Age'].astype(int)

# Exploratory Data Analysis

# Distribution of Book Ratings
plt.figure(figsize=(10,6))
sns.countplot(ratings['BookRating'])
plt.title('Distribution of Book Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

# Number of Ratings per Book
ratings_per_book = ratings.groupby('ISBN')['BookRating'].count().reset_index().rename(columns={'BookRating': 'RatingCount'})
print('\nTop 5 Books with Most Ratings:')
print(ratings_per_book.sort_values('RatingCount', ascending=False).head())

In [12]:
# Collaborative Filtering using Matrix Factorization (SVD)

# Prepare data for Surprise library
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings[['UserID', 'ISBN', 'BookRating']], reader)

# Build and evaluate the SVD model
trainset, testset = train_test_split(data, test_size=0.2)

svd_model = SVD()
svd_model.fit(trainset)
predictions = svd_model.test(testset)

# Evaluate the model
from surprise import accuracy
rmse = accuracy.rmse(predictions)
print(f'\nSVD Model RMSE: {rmse}')

# Collaborative Filtering using KNN
knn_model = KNNBasic()
knn_model.fit(trainset)
predictions_knn = knn_model.test(testset)
rmse_knn = accuracy.rmse(predictions_knn)
print(f'KNN Model RMSE: {rmse_knn}')


KeyError: "['UserID', 'BookRating'] not in index"

In [ ]:
# Neural Collaborative Filtering using TensorFlow/Keras

# Prepare data for neural network model
user_ids = ratings['UserID'].unique().tolist()
user_id_to_index = {x: i for i, x in enumerate(user_ids)}
ratings['user_index'] = ratings['UserID'].map(user_id_to_index)

book_ids = ratings['ISBN'].unique().tolist()
book_id_to_index = {x: i for i, x in enumerate(book_ids)}
ratings['book_index'] = ratings['ISBN'].map(book_id_to_index)

# Define model architecture
num_users = ratings['user_index'].nunique()
num_books = ratings['book_index'].nunique()

# Input layers
user_input = Input(shape=(1,))
book_input = Input(shape=(1,))

# Embedding layers
user_embedding = Embedding(input_dim=num_users, output_dim=50, input_length=1)(user_input)
book_embedding = Embedding(input_dim=num_books, output_dim=50, input_length=1)(book_input)

# Flatten layers
user_vec = Flatten()(user_embedding)
book_vec = Flatten()(book_embedding)

# Concatenate user and book vectors
concat = Concatenate()([user_vec, book_vec])

# Add dense layers
dense = Dense(128, activation='relu')(concat)
dense = Dense(64, activation='relu')(dense)
output = Dense(1)(dense)

# Build and compile the model
model = Model(inputs=[user_input, book_input], outputs=output)
model.compile(optimizer=Adam(lr=0.001), loss='mean_squared_error')

In [ ]:
# Prepare training and testing data
from sklearn.model_selection import train_test_split

X = ratings[['user_index', 'book_index']]
y = ratings['BookRating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
history = model.fit([X_train['user_index'], X_train['book_index']], y_train,
                    batch_size=256, epochs=5, verbose=1,
                    validation_data=([X_test['user_index'], X_test['book_index']], y_test))

# Evaluate the model
y_pred = model.predict([X_test['user_index'], X_test['book_index']])
from sklearn.metrics import mean_squared_error
rmse_nn = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'\nNeural Network Model RMSE: {rmse_nn}')

# Plot training & validation loss
plt.figure(figsize=(10,6))
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [ ]:
def recommend_books(user_id, num_recommendations=5):
    # Check if user_id exists
    if user_id not in user_id_to_index:
        print("User ID not found.")
        return
    user_idx = user_id_to_index[user_id]
    # Books not yet rated by the user
    books_not_rated = ratings[~ratings['ISBN'].isin(
        ratings[ratings['UserID'] == user_id]['ISBN'].tolist()
    )]['ISBN'].unique()
    books_not_rated_idx = [book_id_to_index[x] for x in books_not_rated]
    user_idx_array = np.array([user_idx for _ in range(len(books_not_rated_idx))])
    predictions = model.predict([user_idx_array, np.array(books_not_rated_idx)])
    top_indices = predictions.flatten().argsort()[-num_recommendations:][::-1]
    recommended_isbns = [books_not_rated[i] for i in top_indices]
    recommended_books = books[books['ISBN'].isin(recommended_isbns)]
    return recommended_books[['BookTitle', 'BookAuthor', 'YearOfPublication']]

In [ ]:
# Example: Recommend books for a user
user_to_recommend = 276725  # Replace with a valid UserID from your dataset
recommended_books = recommend_books(user_to_recommend)
print(f'\nRecommended Books for User {user_to_recommend}:')
print(recommended_books)