# EDA

This section includes the initial installs, imports and some basic exploratory data analysis.

In [1]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd

In [4]:
# Load the user ratings into a pandas DataFrame (path is relative to the working directory)
ratings_df = pd.read_csv('ratings1.csv')
# Load the book metadata into a pandas DataFrame (path is relative to the working directory)
books_df = pd.read_csv('books1.csv')

In [5]:
# Configure how pandas prints wide frames
pd.set_option('display.max_columns', None) # None removes the column limit, so every column is printed
pd.set_option('display.width', 1000) # allow up to 1000 characters per printed line

In [6]:
# Print dataset size and examine column data types
print("Number of ratings: ", len(ratings_df))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
ratings_df.info()

Number of ratings:  5976479
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5976479 entries, 0 to 5976478
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   user_id  int64
 1   book_id  int64
 2   rating   int64
dtypes: int64(3)
memory usage: 136.8 MB
None


In [7]:
# Preview the first rows of the ratings table
print(ratings_df.head())

   user_id  book_id  rating
0        1      258       5
1        2     4081       4
2        2      260       5
3        2     9296       5
4        2     2318       3


In [8]:
# Show the ratings ordered by book_id to see the span of book ids in the data
print(ratings_df.sort_values('book_id'))

         user_id  book_id  rating
2174136    29300        1       4
433265      6590        1       3
1907014     7546        1       5
3743260    43484        1       1
1266846    18689        1       5
...          ...      ...     ...
2366366    31293    10000       3
3376022    12272    10000       4
2811513    35330    10000       4
4134364    46337    10000       5
4047777    42537    10000       4

[5976479 rows x 3 columns]


In [9]:
# Columns of the book metadata that the cells shown in this notebook never read.
UNUSED_BOOK_COLUMNS = [
    "goodreads_book_id",
    "best_book_id",
    "work_id",
    "books_count",
    "isbn",
    "isbn13",
    "language_code",
    "work_ratings_count",
    "work_text_reviews_count",
]

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError on the second run.
books_df = books_df.drop(columns=UNUSED_BOOK_COLUMNS, errors="ignore")

In [10]:
# Display the book metadata table
print(books_df)

      book_id                      authors  original_publication_year                                     original_title                                              title  average_rating  ratings_count  ratings_1  ratings_2  ratings_3  ratings_4  ratings_5                                          image_url                                    small_image_url
0           1              Suzanne Collins                     2008.0                                   The Hunger Games            The Hunger Games (The Hunger Games, #1)            4.34        4780653      66715     127936     560092    1481305    2706317  https://images.gr-assets.com/books/1447303603m...  https://images.gr-assets.com/books/1447303603s...
1           2  J.K. Rowling, Mary GrandPré                     1997.0           Harry Potter and the Philosopher's Stone  Harry Potter and the Sorcerer's Stone (Harry P...            4.44        4602479      75504     101676     455024    1156318    3011543  https://images.gr-ass

# Train the Model

## train/test

You can try running this cell, but it may crash your session while trying to split the dataset.

In [None]:
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader, Dataset

# Define the reader object for Surprise (ratings are on a 1-5 scale)
reader = Reader(rating_scale=(1, 5))

# Load the ratings dataframe into Surprise's Dataset object
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)

# Split the data into a train set (75%) and a test set (25%).
# A fixed random_state makes the split, and so the metrics, reproducible.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Create the KNNBasic model with item-based similarities, like the other
# models in this notebook. KNNBasic is user-based by default.
# NOTE(review): the user x user similarity matrix is presumably what exhausts
# memory here, since the item-based k-fold cell below splits the same data and
# completes -- TODO confirm.
algo = KNNBasic(sim_options={'user_based': False})

# Train the model
algo.fit(trainset)

# Make predictions on the test set
predictions = algo.test(testset)

# Calculate evaluation metrics. verbose=False because accuracy.rmse/mae print
# the score themselves by default, which would duplicate the lines below.
rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

## k-fold

In [10]:
from surprise import KNNBaseline, Dataset, Reader
from surprise.model_selection import cross_validate

# Item-based KNNBaseline using the 'pearson_baseline' similarity measure
sim_options = {'name': 'pearson_baseline', 'user_based': False}
knn_model = KNNBaseline(sim_options=sim_options)

# Wrap the ratings dataframe in a Surprise Dataset (ratings run from 1 to 5)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)

# Run 5-fold cross-validation, reporting RMSE and MAE for each fold
cv_results = cross_validate(knn_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Average each metric over the folds
fold_rmse = cv_results['test_rmse']
fold_mae = cv_results['test_mae']
average_rmse = sum(fold_rmse) / len(fold_rmse)
average_mae = sum(fold_mae) / len(fold_mae)

print(f"Average RMSE (5-fold CV): {average_rmse}")
print(f"Average MAE (5-fold CV): {average_mae}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7975  0.7974  0.7969  0.7977  0.7979  0.7975  0.0003  
MAE (testset)     0.6056  0.6059  0.6052  0.6062  0.6058  0.6057  0.0003  
Fit time          94.56   97.50   103.81  102.27  101.01  99.83   3.36    
Test time         218.23  228.01  224.59  217.94  218

## build full trainset


In [21]:
from surprise import KNNBaseline, Reader, Dataset

# Wrap the ratings dataframe in a Surprise Dataset (ratings run from 1 to 5)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)

# Use every rating for training (no hold-out set)
trainset = data.build_full_trainset()

# Item-based KNNBaseline using the 'pearson_baseline' similarity measure
sim_options = dict(name="pearson_baseline", user_based=False)
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Leave the fitted model as the cell's last expression so its repr is displayed
algo

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fb23cab2530>

In [22]:
# Report how many distinct items (books) the fitted model's trainset contains
print("Number of items in the trainset:", algo.trainset.n_items)

Number of items in the trainset: 10000


In [23]:
def read_item_names(df):
    """Build lookup tables between raw book ids and book titles.

    Args:
        df: DataFrame with a ``book_id`` column and a ``title`` column.

    Returns:
        A ``(rid_to_name, name_to_rid)`` tuple of dicts. ``rid_to_name`` maps
        each raw book id, converted to ``str``, to its title; ``name_to_rid``
        maps each title back to that string id. If an id or a title occurs
        in several rows, the last row wins.

    Raises:
        KeyError: if ``df`` has rows but lacks one of the two columns.
    """
    rid_to_name = {}
    name_to_rid = {}
    # A frame without rows has nothing to map, whatever its columns are.
    if len(df) == 0:
        return rid_to_name, name_to_rid

    # Walk the two needed columns in parallel rather than using iterrows(),
    # which builds a full Series for every row.
    for book_id, name in zip(df["book_id"], df["title"]):
        rid = str(book_id)
        rid_to_name[rid] = name
        name_to_rid[name] = rid

    return rid_to_name, name_to_rid

In [24]:
# Build the mappings raw id <-> book title
rid_to_name, name_to_rid = read_item_names(books_df)

# Recommendations
After running all the previous cells, you can run this cell to interact with the model and receive book recommendations. Remember that there are only 10,000 books in this dataset, so make sure to input a book that is in the dataset (the title must match the dataset's `title` column exactly).

In [28]:
# Number of similar books to list
N_NEIGHBORS = 10


def print_book_details(book):
    """Print the title, cover image URL, authors and publication year of a book.

    Args:
        book: one row of ``books_df``, indexable by column name.
    """
    print(f"Book title: {book['original_title']}")
    print(f"Image URL: {book['image_url']}")
    print(f"Authors: {book['authors']}")
    print(f"Publication Year: {book['original_publication_year']}")


# Retrieve the raw id of the book; the lookup is keyed by the exact `title` value
book_title = input("Book title: ")
if book_title not in name_to_rid:
    # Same exception type as the plain dict lookup, but with an actionable message
    raise KeyError(f"No book titled {book_title!r} in the dataset; the title must match exactly.")
raw_id = int(name_to_rid[book_title])

# Translate the raw id to the model's inner id and fetch the nearest items
inner_id = algo.trainset.to_inner_iid(raw_id)
neighbors_inner = algo.get_neighbors(inner_id, k=N_NEIGHBORS)

# Get book information
book_data = books_df.loc[books_df['book_id'] == raw_id].iloc[0]

# Print book information and nearest neighbors
print()
print_book_details(book_data)
print()
print(f"The {N_NEIGHBORS} nearest neighbors to {book_title} are:")
for neighbor_inner in neighbors_inner:
    # Neighbors come back as inner ids; convert to raw ids to query books_df
    neighbor_raw = algo.trainset.to_raw_iid(neighbor_inner)
    neighbor_data = books_df.loc[books_df['book_id'] == neighbor_raw].iloc[0]
    print_book_details(neighbor_data)
    print()

Book title: The Alchemist

Book title: O Alquimista
Image URL: https://images.gr-assets.com/books/1483412266m/865.jpg
Authors: Paulo Coelho, Alan R. Clarke
Publication Year: 1988.0

The 10 nearest neighbors to The Alchemist are:
Book title: Na margem do rio Piedra eu sentei e chorei
Image URL: https://images.gr-assets.com/books/1466877798m/1428.jpg
Authors: Paulo Coelho, Alan R. Clarke
Publication Year: 1994.0

Book title: The Celestine Prophecy
Image URL: https://images.gr-assets.com/books/1341360412m/13103.jpg
Authors: James Redfield
Publication Year: 1993.0

Book title: Veronika decide morrer
Image URL: https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png
Authors: Paulo Coelho, Margaret Jull Costa
Publication Year: 1998.0

Book title: Illusions: The Adventures of a Reluctant Messiah
Image URL: https://images.gr-assets.com/books/1353964306m/29946.jpg
Authors: Richard Bach
Publication Year: 1977.0

Book title: O Diário de um Mago
Image URL: https://