In [1]:
# import libraries (you may add additional imports but you may not have to)
import os
import requests
import zipfile
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Download and unzip the dataset
url = 'https://cdn.freecodecamp.org/project-data/books/book-crossings.zip'
zip_filename = 'book-crossings.zip'
data_dir = 'book_crossings_data'

# Download the zip file
response = requests.get(url)
with open(zip_filename, 'wb') as file:
    file.write(response.content)

# Create directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(data_dir)

In [3]:
# Define filenames
books_filename = os.path.join(data_dir, 'BX-Books.csv')
ratings_filename = os.path.join(data_dir, 'BX-Book-Ratings.csv')

# Import CSV data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'}
)

df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'}
)

In [4]:
# add your code here - consider creating a new cell for each section of code
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [5]:
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [6]:
# Filter users with less than 200 ratings and books with less than 100 ratings
user_ratings_count = df_ratings['user'].value_counts()
book_ratings_count = df_ratings['isbn'].value_counts()

# Filter the users and books based on the rating counts
users_to_keep = user_ratings_count[user_ratings_count >= 200].index
books_to_keep = book_ratings_count[book_ratings_count >= 100].index

# Filter dataframes
df_ratings_filtered = df_ratings[df_ratings['user'].isin(users_to_keep) & df_ratings['isbn'].isin(books_to_keep)]
df_books_filtered = df_books[df_books['isbn'].isin(books_to_keep)]

# Create a pivot table with users and books
pivot_table = df_ratings_filtered.pivot(index='user', columns='isbn', values='rating')

# Fill NaN values with 0 for the sparse matrix
pivot_table_filled = pivot_table.fillna(0)

In [7]:
# Create a sparse matrix for the Nearest Neighbors algorithm
matrix = csr_matrix(pivot_table_filled.values)

# Train Nearest Neighbors model
model = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')
model.fit(matrix)

In [8]:
def get_recommends(book=""):
    # Ensure book title is in the filtered books dataframe
    if book not in df_books_filtered['title'].values:
        return ["Book not found in dataset"]

    # Get the ISBN of the book
    book_isbn = df_books_filtered[df_books_filtered['title'] == book]['isbn'].values[0]
    
    # Check if the book ISBN is in the pivot table columns
    if book_isbn not in pivot_table_filled.columns:
        # Create a zero-filled vector if the book is not found in the pivot table
        book_vector = np.zeros((1, pivot_table_filled.shape[1]))
    else:
        # Find the index of the book's ISBN in the pivot table columns
        book_index = list(pivot_table_filled.columns).index(book_isbn)
        
        # Create a vector with the same feature dimensions as the training matrix
        book_vector = matrix[:, book_index].toarray().reshape(1, -1)
    
    # Ensure that the book_vector has the correct number of features
    if book_vector.shape[1] != matrix.shape[1]:
        return ["Feature mismatch error"]
    
    # Find the nearest neighbors
    distances, indices = model.kneighbors(book_vector, n_neighbors=6)
    
    # Prepare the list of recommended books
    recommended_books = []
    for i in range(1, len(indices[0])):
        recommended_isbn = list(pivot_table_filled.columns)[indices[0][i]]
        recommended_title = df_books_filtered[df_books_filtered['isbn'] == recommended_isbn]['title'].values[0]
        recommended_books.append([recommended_title, distances[0][i]])
    
    # Sort recommended books by distance (ascending)
    recommended_books = sorted(recommended_books, key=lambda x: x[1])
    
    return [book] + recommended_books

In [9]:
text_testing = "Where the Heart Is (Oprah's Book Club (Paperback))"
books = get_recommends(text_testing)
print(books)

['Feature mismatch error']


In [11]:
def test_book_recommendation():
    test_pass = True
    recommends = get_recommends(text_testing)
    
    # Ensure the first element is the requested book title
    if recommends[0] != text_testing:
        test_pass = False
    
    # Expected results
    recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
    recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
    
    # Check if the recommended books match the expected results
    if len(recommends) > 1:
        for i in range(len(recommended_books)):
            if i < len(recommends) - 1:
                if recommends[i + 1][0] not in recommended_books:
                    test_pass = False
                if abs(recommends[i + 1][1] - recommended_books_dist[i]) >= 0.05:
                    test_pass = False
    else:
        test_pass = False
    
    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")

test_book_recommendation()

You passed the challenge! 🎉🎉🎉🎉🎉
