<a href="https://colab.research.google.com/github/hlb-git/AI-ML-Tensorflow-Python/blob/main/book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

## Source for the dataset to be used.


The dataset used is a book rating and review dataset. It contains Users, ratings, ISBN and book Auhtors



In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-02-24 20:15:46--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-02-24 20:15:46 (64.4 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


### import csv data into dataframes

Using the ```pandas.read_csv()``` method

In [3]:

df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})


## A quick Summary of the tables shapes

In [62]:
print('Ratings table Shape:', df_ratings.shape)
print("\nBooks Table Shape:", df_books.shape)

Ratings table Shape: (1149780, 3)

Books Table Shape: (271377, 3)


In [5]:
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [6]:
df_books.head()


Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


## The code below removes the users with less than 200 ratings

In [7]:
df_ratings.isna().sum()

Unnamed: 0,0
user,0
isbn,0
rating,0


In [8]:
df_books.isna().sum()

Unnamed: 0,0
isbn,0
title,0
author,2


In [9]:
df_books.dropna(inplace=True)
df_books.isna().sum()

Unnamed: 0,0
isbn,0
title,0
author,0


## This also removes books with below 100

In [10]:
user_rating_count = df_ratings['user'].value_counts()

below_200_rating_users = user_rating_count[user_rating_count < 200]

filtered_ratings = df_ratings[~df_ratings['user'].isin(below_200_rating_users.index)]

filtered_ratings.shape


(527556, 3)

In [11]:
book_ratings = df_ratings.isbn.value_counts()
below_100_rating_books = book_ratings[book_ratings < 100]
below_100_rating_books

Unnamed: 0_level_0,count
isbn,Unnamed: 1_level_1
0375500510,99
0671727583,99
0425174271,99
1576737330,99
0425172996,99
...,...
1568656386,1
1568656408,1
1569551553,1
1570081808,1


In [12]:
book_ratings = df_ratings.isbn.value_counts()
below_100_rating_books = book_ratings[book_ratings < 100]
below_100_rating_books
filtered_ratings = filtered_ratings[~filtered_ratings.isbn.isin(below_100_rating_books.index)]
filtered_ratings.shape

(49781, 3)

In [13]:
print(filtered_ratings.isbn.isin(df_books[df_books.title=="Where the Heart Is (Oprah's Book Club (Paperback))"].isbn  ).sum())

183


## This portion of the code pivots the for better model perfomance

In [14]:
ratings_pivot = filtered_ratings.pivot_table(index=['user'], columns=['isbn'], values=['rating']).fillna(0).transpose()
ratings_pivot.head()

Unnamed: 0_level_0,user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Unnamed: 0_level_1,isbn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
rating,002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
rating,0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rating,0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rating,006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rating,0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
ratings_pivot.index = ratings_pivot.join(df_books.set_index('isbn'), how='left')['title']

In [16]:
ratings_pivot.sort_index().head()

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
knn = NearestNeighbors(metric='cosine')
model = knn.fit(ratings_pivot)

In [18]:
model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'cosine',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'radius': 1.0}

In [58]:
preference = "The Queen of the Damned (Vampire Chronicles (Paperback))"
distance, indeces = model.kneighbors([ratings_pivot.loc[preference]], n_neighbors=6)

print(distance)
print(indeces)

[[0.         0.51784116 0.53763384 0.73450685 0.74486566 0.7939835 ]]
[[137 127 153 128 152 644]]


In [20]:
ratings_pivot.iloc[indeces[0]].index.values

array(["Where the Heart Is (Oprah's Book Club (Paperback))",
       'The Lovely Bones: A Novel', 'I Know This Much Is True',
       'The Surgeon', 'The Weight of Water', "I'll Be Seeing You"],
      dtype=object)

In [61]:
from logging import exception
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  try:
    book_pvt_data = ratings_pivot.loc[book]
  except KeyError as e:
    print('The given book', e, 'does not exist')
    return

  distance, indice = model.kneighbors([book_pvt_data.values], n_neighbors=6)

  recommended_books = pd.DataFrame({
      'title'   : ratings_pivot.iloc[indice[0]].index.values,
      'distance': distance[0]
    }).sort_values(by='distance', ascending=False).head(5).values.tolist()
  # recommended_books.pop()
  recommended_books.insert(0,book)
  for i in range(1, len(recommended_books) - 1):
    try:
      recommended_books[i][1] = float(recommended_books[i][1])
    except exception:
      pass

  return recommended_books

books = get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")
books

['The Queen of the Damned (Vampire Chronicles (Paperback))',
 ['Catch 22', 0.793983519077301],
 ['The Witching Hour (Lives of the Mayfair Witches)', 0.7448656558990479],
 ['Interview with the Vampire', 0.7345068454742432],
 ['The Tale of the Body Thief (Vampire Chronicles (Paperback))',
  0.5376338362693787],
 ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.5178411602973938]]

In [None]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()