# CP421 Final Project

Book Recommendation System

- Justin Medeiros (200744390) [Team Lead]
- Peranvan Paralogasingam (190867830)
- Jack Pham (200530610)
- Levi Van Veen (200852490)
- Ratul Sarker (203087260)


# Data Preprocessing

In [1]:
import pandas as pd
from collections import Counter

In [2]:
#DATA PRECPROCESSING-----------------------------------------------------------------------------------------------

# File paths
users_file = 'Users.csv'
books_file = 'Books.csv'
ratings_file = 'Ratings.csv'

# Load the datasets
users_df = pd.read_csv(users_file)
books_df = pd.read_csv(books_file)
ratings_df = pd.read_csv(ratings_file)

# Users Data Preprocessing
# Handle missing values in 'Age' and 'Location'
users_df['Age'].fillna(users_df['Age'].median(), inplace=True)
users_df['Location'].fillna('Unknown', inplace=True)


# Books Data Preprocessing
# Convert 'Year-Of-Publication' to numeric and handle non-numeric values
books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
books_df['Year-Of-Publication'].fillna(books_df['Year-Of-Publication'].median(), inplace=True)
# Handle missing values in 'Book-Author' and 'Publisher'
books_df['Book-Author'].fillna('Unknown', inplace=True)
books_df['Publisher'].fillna('Unknown', inplace=True)

# Drop the url columns as we will not be using them
books_df.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)

# Ratings Data Preprocessing
# Merge 'Ratings' with 'Books' to filter out books not in the 'Books' dataset
ratings_df = ratings_df[ratings_df['ISBN'].isin(books_df['ISBN'])]

# Merge books_df, users_df, and ratings_df
merged_df = pd.merge(ratings_df, books_df, on='ISBN', how='inner')
merged_df = pd.merge(merged_df, users_df, on='User-ID', how='inner')

# Filter ratings above 0
merged_df = merged_df[merged_df['Book-Rating'] > 0]

# Filter users aged less than 110
merged_df = merged_df[merged_df['Age'] < 110]

# Count the number of ratings by each user
user_ratings_count = merged_df['User-ID'].value_counts()

# Keep only users who have rated 3 or more books
valid_users = user_ratings_count[user_ratings_count >= 2].index
merged_df = merged_df[merged_df['User-ID'].isin(valid_users)]

# Use only a fraction of the dataset
merged_df = merged_df.sample(frac=0.01, random_state=42)

# Data Splitting
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(merged_df, test_size=0.7, random_state=42)

# Checking the transformations and missing values after preprocessing
print("\nData after preprocessing:")
print("Unique values in 'Year-Of-Publication':", books_df['Year-Of-Publication'].unique())
print("\nMissing values after preprocessing:")
print("Users:", users_df.isnull().sum())
print("Books:", books_df.isnull().sum())
print("Ratings:", ratings_df.isnull().sum())



  books_df = pd.read_csv(books_file)



Data after preprocessing:
Unique values in 'Year-Of-Publication': [2002. 2001. 1991. 1999. 2000. 1993. 1996. 1988. 2004. 1998. 1994. 2003.
 1997. 1983. 1979. 1995. 1982. 1985. 1992. 1986. 1978. 1980. 1952. 1987.
 1990. 1981. 1989. 1984.    0. 1968. 1961. 1958. 1974. 1976. 1971. 1977.
 1975. 1965. 1941. 1970. 1962. 1973. 1972. 1960. 1966. 1920. 1956. 1959.
 1953. 1951. 1942. 1963. 1964. 1969. 1954. 1950. 1967. 2005. 1957. 1940.
 1937. 1955. 1946. 1936. 1930. 2011. 1925. 1948. 1943. 1947. 1945. 1923.
 2020. 1939. 1926. 1938. 2030. 1911. 1904. 1949. 1932. 1928. 1929. 1927.
 1931. 1914. 2050. 1934. 1910. 1933. 1902. 1924. 1921. 1900. 2038. 2026.
 1944. 1917. 1901. 2010. 1908. 1906. 1935. 1806. 2021. 2012. 2006. 1909.
 2008. 1378. 1919. 1922. 1897. 2024. 1376. 2037.]

Missing values after preprocessing:
Users: User-ID     0
Location    0
Age         0
dtype: int64
Books: ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher           

# **Content Based Filtering**

### 1. Weighted rating scores of each book for prediction




In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

# Function that computes the weighted rating of each book
# We are utilizing the IMDB formula for weighted average ratings
def weighted_rating(dataframe, m, C):
    v = dataframe['Rating_Count']
    R = dataframe['Average_Rating']
    return (v / (v + m) * R) + (m / (m + v) * C)

# Count the amount of ratings
book_rated_count = ratings_df['ISBN'].value_counts().reset_index()
book_rated_count.columns = ['ISBN', 'Rating_Count']

# print(book_rated_count)
global_mean = merged_df['Book-Rating'].mean()

# Calculate the average of the ratings for each book
average_ratings = merged_df.groupby('ISBN')['Book-Rating'].mean().reset_index()

# Merge the book_means DataFrame with the original DataFrame to fill missing ratings in user-book matrix
merged_df = pd.merge(merged_df, average_ratings, on='ISBN', how='left', suffixes=('_original', '_mean'))
merged_df['Book-Rating'] = merged_df['Book-Rating_original'].combine_first(merged_df['Book-Rating_mean'])
merged_df.drop(['Book-Rating_original', 'Book-Rating_mean'], axis=1, inplace=True)

average_ratings.columns = ['ISBN', 'Average_Rating']
sorted_ratings = average_ratings.sort_values(by='Average_Rating', ascending=False)

# Merge the book count and average ratings
merged_data = pd.merge(book_rated_count, average_ratings, on='ISBN', how='inner')

# Calculate mean of 'Average_Rating' column
C = average_ratings['Average_Rating'].mean()
print("Mean Average Rating:", C)

# Calculate the minimum number of ratings required to be in the chart, m
m = book_rated_count['Rating_Count'].quantile(0.80)
print("Minimum ratings required:", m)

# Filter out all qualified books into a new DataFrame
q_books = merged_data.copy().loc[book_rated_count['Rating_Count'] >= m]

# Apply weighted rating function to calculate the weighted rating for each book
q_books['Weighted_Rating'] = q_books.apply(weighted_rating, axis=1, m=m, C=C)

# Sorting the DataFrame by 'Weighted_Rating' in descending order
sorted_q_books = q_books.sort_values(by='Weighted_Rating', ascending=False)

# Print the result
print(sorted_q_books)


Mean Average Rating: 7.665682370662014
Minimum ratings required: 3.0
           ISBN  Rating_Count  Average_Rating  Weighted_Rating
35   0385484518           391            10.0         9.982226
36   0446310786           389            10.0         9.982135
47   0440206154           365            10.0         9.980970
87   1558743669           262            10.0         9.973574
91   0345339703           257            10.0         9.973066
..          ...           ...             ...              ...
461  044011585X            76             2.0         2.215152
390  0060188731            91             2.0         2.180820
825  3442433495            36             1.0         1.512745
317  0679735771           110             1.0         1.176965
108  0064407667           230             1.0         1.085824

[3193 rows x 4 columns]


Improves accuracy of the model

### 2. Content Based Recommendation using book title and author
We will use content based filtering to create part of our recommendation system. We will use book information for this part. The book information includes: Book title and author

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Create content dataframe with only the book title and author
content_df = merged_df[['Book-Title', 'Book-Author', 'ISBN']].copy()

# Combine the title and author and store in Book-Information column
content_df['Book-Information'] = content_df['Book-Title'] + ' ' + content_df['Book-Author']

# Drop all duplicate rows based on the 'Book-Information' column
content_df = content_df.drop_duplicates(subset='Book-Information')

# Reset the index after dropping duplicates
content_df = content_df.reset_index(drop=True)

# Construct the required TF-IDF matrix for all books information
books_tfidf_matrix = tfidf.fit_transform(content_df['Book-Information'])

# Calculate cosine similarity of the books matrix
similarity = linear_kernel(books_tfidf_matrix, books_tfidf_matrix)

def content_based_recommendations(book_title, book_author, top_n, similarity_matrix=similarity, df=content_df):
    # Combine title and author for the given book
    input_book_info = book_title + ' ' + book_author

    # Get the index of the input book
    input_book_index = df[df['Book-Information'] == input_book_info].index[0]

    # Get the cosine similarity using the precomputed similarity matrix
    similarity_scores = similarity_matrix[input_book_index]

    # Get the indices of the top-n most similar books
    top_indices = similarity_scores.argsort()[-top_n-1:-1]

    # Return the top-n most similar books
    return df.loc[top_indices, ['ISBN', 'Book-Title', 'Book-Author']]


In [5]:
NUM_OF_RECOMMENDED_BOOKS = 5

# Use the random_state in the sample method
random_row = train_data.sample(n=1)

# Use recommend system to get a content based recommendation of the top 5 books for the user
book_title = random_row['Book-Title'].values[0]
book_author = random_row['Book-Author'].values[0]

recommendations_existing_user = content_based_recommendations(book_title, book_author, NUM_OF_RECOMMENDED_BOOKS)

print(f"Recommendations for '{book_title}' by {book_author} (Existing User):")
print(recommendations_existing_user)


Recommendations for 'Left Behind: A Novel of the Earth's Last Days (Left Behind #1)' by Tim Lahaye (Existing User):
            ISBN                                         Book-Title  \
2396  0500542384                    The Earth from the Air 365 Days   
532   0842329269  Apollyon: The Destroyer Is Unleashed (Left Beh...   
2272  0842332340  Armageddon: The Cosmic Battle of the Ages (Lef...   
1381  0842332286  The Mark: The Beast Rules the World (Left Behi...   
51    0842329129  Left Behind: A Novel of the Earth's Last Days ...   

               Book-Author  
2396  Yann Arthus-bertrand  
532          Jerry Jenkins  
2272         Tim F. LaHaye  
1381            Tim Lahaye  
51              Tim Lahaye  


Finds books with close consine similarity based on title/author

# **Collaborative Filtering**

In [6]:
ratings = pd.DataFrame(train_data.groupby('ISBN')['Book-Rating'].mean())

ratings['num_ratings'] = pd.DataFrame(train_data.groupby('ISBN')['Book-Rating'].count())
ratings.head()

bookmat = train_data.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')
bookmat.head()

ratings.sort_values('num_ratings', ascending=False).head(10)

Unnamed: 0_level_0,Book-Rating,num_ratings
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
044021145X,9.0,3
1400034779,7.0,3
0553279912,8.333333,3
0802135226,8.5,2
0060928336,7.0,2
0553801929,7.5,2
067976402X,8.5,2
0380769557,7.5,2
0679745203,9.0,2
0684874350,7.0,2


In [7]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Create a LabelEncoder object
isbn_encoder = LabelEncoder()

# Fit the encoder to the ISBN column
merged_df['ISBN_encoded'] = isbn_encoder.fit_transform(merged_df['ISBN'])
test_data['ISBN_encoded'] = isbn_encoder.transform(test_data['ISBN'])

# Convert the ISBN column to a numpy array
x_train_global_mean = merged_df[['User-ID', 'ISBN_encoded', 'Book-Rating']].to_numpy()
x_test = test_data[['User-ID', 'ISBN_encoded']].to_numpy()

# Specify parameter grid for KNN
param_grid = {
    'n_neighbors': [40, 45, 50],
    'algorithm': ['auto'],
    'metric': ['euclidean', 'manhattan', 'chebyshev'],
}

# Create KNN model
knn_model = KNeighborsRegressor()

# Use GridSearchCV to find the best parameters
gs = GridSearchCV(knn_model, param_grid, scoring='neg_mean_squared_error', cv=3)
try:
    gs.fit(x_train_global_mean[:, :-1], x_train_global_mean[:, -1])
except ValueError as e:
    print(f"Error during grid search. Error: {e}")

# Print the best parameters and corresponding RMSE
print("Best Parameters: ", gs.best_params_)
print("Best RMSE: ", (-gs.best_score_) ** 0.5)

# Predict on the test set
test_predictions = gs.predict(x_test)

# Evaluate the model
test_rmse = mean_squared_error(test_data['Book-Rating'], test_predictions) ** 0.5
print("Test RMSE: ", test_rmse)


Best Parameters:  {'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 40}
Best RMSE:  1.851416997740723
Test RMSE:  1.7830938716698912


Root-mean-square-error(RMSE) showing average deviation of prediction versus the actual rating

In [8]:
def collaborative_recommendations(user_id, book_isbn, isbn_encoder=isbn_encoder, model=gs):
    try:
        # Encode the ISBN for the given book
        book_encoded = isbn_encoder.transform([book_isbn])[0]

        # Predict the rating for the given user and book
        prediction = model.predict([[user_id, book_encoded]])[0]
        return prediction
    except ValueError:
        print("User ID or ISBN not found in dataset.")
        return global_mean

In [23]:

    # Get a random user from the test set
    user_id = test_data.sample(n=1)['User-ID'].values[0]
    # get a random book from the test set that the user has rated
    book_isbn = test_data[test_data['User-ID'] == user_id].sample(n=1)['ISBN'].values[0]

    predicted_rating = collaborative_recommendations(user_id, book_isbn)
    
    print(f"Predicted rating for user {user_id} and book {book_isbn}: {predicted_rating}")
    # Print actual rating for the book
    print(f"Actual rating for user {user_id} and book {book_isbn}: {test_data[(test_data['User-ID'] == user_id) & (test_data['ISBN'] == book_isbn)]['Book-Rating'].values[0]}")
    print()
 

Predicted rating for user 100906 and book 0446672289: 7.65
Actual rating for user 100906 and book 0446672289: 8



Collaborative Recommender predicts rating for specific book & user

### **Hybrid Model**

In [11]:
# Hybrid Recommendation System
"""
Generate hybrid book recommendations for a user.

Parameters:
- content_row: DataFrame row representing the user and book information.
- top_n: Number of top recommendations to return.
- collaborative_model: Function for collaborative filtering recommendations.
- content_model: Function for content-based recommendations.

Returns:
- DataFrame containing the top hybrid book recommendations.
"""
def hybrid_recommendations(content_row, top_n, collaborative_model=collaborative_recommendations, content_model=content_based_recommendations):
    # Extract data from row
    book_title = content_row['Book-Title'].values[0]
    book_author = content_row['Book-Author'].values[0]
    user_id = content_row['User-ID'].values[0]

    # Get content-based recommendations for the user
    book_recommendations = content_based_recommendations(book_title, book_author, top_n)

    # Create an empty DataFrame to store the predicted ratings
    predictions_df = pd.DataFrame()

    # Iterate through content-based recommendations
    for index, row in book_recommendations.iterrows():
        book_title = row['Book-Title']
        book_author = row['Book-Author']
        book_isbn = row['ISBN']

        # Predict the rating using collaborative filtering
        predicted_rating = collaborative_recommendations(user_id, book_isbn)

        # Append the results to the DataFrame
        predictions_df = pd.concat([predictions_df, pd.DataFrame({'Book-Title': [book_title], 'Book-Author': [book_author], 'Predicted-Rating': [predicted_rating]})], ignore_index=True)

    # Sort the DataFrame based on predicted ratings
    predictions_df = predictions_df.sort_values(by='Predicted-Rating', ascending=False)
    print('------------------------')
    print(predictions_df)

    # Get the top 5 books
    top_hybrid_books = predictions_df.head(5)

    return top_hybrid_books

# Assuming you have a DataFrame named test_data
random_row = merged_df.sample(n=1)  # Use a specific random_state for reproducibility

# Display the randomly selected row
print("Randomly Selected Row:")
print(random_row[['User-ID', 'Book-Title', 'Book-Author']])

hybrid_recommendations(random_row, 100)


Randomly Selected Row:
      User-ID                                         Book-Title  \
2909    31468  Permed to Death (Bad Hair Day Mysteries (Paper...   

         Book-Author  
2909  Nancy J. Cohen  
------------------------
                                           Book-Title             Book-Author  \
17  Other People's Skeletons (Rebecca Schwartz Mys...             Julie Smith   
3                  1984 (Signet Classics (Paperback))           George Orwell   
57                         The World's Last Mysteries         Reader's Digest   
74                                 Word for Every Day        Alvin N. Rogness   
55                                   Jewels of Elvish  Nancy Varian Berberick   
..                                                ...                     ...   
41   Have a Nice Day : A Tale of Blood and Sweatsocks              Mick Foley   
67  The Bad Beginning (A Series of Unfortunate Eve...          Lemony Snicket   
51                       The Death of Vi

Unnamed: 0,Book-Title,Book-Author,Predicted-Rating
17,Other People's Skeletons (Rebecca Schwartz Mys...,Julie Smith,7.275
3,1984 (Signet Classics (Paperback)),George Orwell,7.275
57,The World's Last Mysteries,Reader's Digest,7.275
74,Word for Every Day,Alvin N. Rogness,7.275
55,Jewels of Elvish,Nancy Varian Berberick,7.275


From random row, gives recommended books using content-based filtering, then uses collab. based filtering to give a predicted rating for the user. Orders from highest to lowest rating

# COLLABRATIVE FILTERING STATS CSV COMPILER


In [22]:
stats_df = pd.DataFrame({"User-ID": [],"ISBN": [],"Predicted-Rating":[],"Actual-Rating":[]})

for i in range(200):
    # Get a random user from the test set
    user_id = test_data.sample(n=1)['User-ID'].values[0]
    # get a random book from the test set that the user has rated
    book_isbn = test_data[test_data['User-ID'] == user_id].sample(n=1)['ISBN'].values[0]

    predicted_rating = collaborative_recommendations(user_id, book_isbn)

    stats_df.loc[len(stats_df.index)] = [user_id, book_isbn, predicted_rating,test_data[(test_data['User-ID'] == user_id) & (test_data['ISBN'] == book_isbn)]['Book-Rating'].values[0]] 

stats_csv_data = stats_df.to_csv('stats.csv', index = False) 
print('\nCSV String:\n', stats_csv_data) 


CSV String:
 None
