In [None]:
# --- Download and unzip the dataset ---
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
!unzip book-crossings.zip

# --- Load the data properly ---
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Dataset files extracted from book-crossings.zip
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

# Load CSVs.
# ISBN is read as str in both frames: it is an identifier rather than a number,
# and it is the key the two frames are merged on later, so both sides must
# share the same dtype regardless of what type inference would pick.
books = pd.read_csv(
    books_filename,
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
    dtype={'ISBN': str},
)
ratings = pd.read_csv(
    ratings_filename,
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
    dtype={'ISBN': str},
)

# --- Clean up books dataframe ---
# Keep only the columns needed to map an ISBN to its title
books = books[['ISBN', 'Book-Title']]

# --- Data Preprocessing ---

# Count ratings per user and per book on the full, unfiltered ratings table.
# Both counts are taken before any rows are dropped: counting books after the
# user filter would only measure ratings coming from the remaining users, not
# how many ratings a book has overall.
user_counts = ratings['User-ID'].value_counts()
book_counts = ratings['ISBN'].value_counts()

# Keep users with at least 200 ratings and books with at least 100 ratings
active_users = user_counts[user_counts >= 200].index
popular_books = book_counts[book_counts >= 100].index
ratings = ratings[
    ratings['User-ID'].isin(active_users) & ratings['ISBN'].isin(popular_books)
]

# Merge ratings with book titles (default inner join: ratings whose ISBN is
# missing from `books` are dropped)
ratings = ratings.merge(books, on='ISBN')

# Create a pivot table (book-user matrix): one row per title, one column per
# user. pivot_table aggregates with its default (mean) when the same user has
# several ratings under one title. Cells without a rating are filled with 0.
book_user_matrix = ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
book_user_matrix = book_user_matrix.fillna(0)

# --- Train the model ---

# Brute-force nearest-neighbour search over the rows of the book-user matrix
# (one row per book title), compared by cosine distance.
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(book_user_matrix.values)

# --- Define the recommendation function ---

def get_recommends(title, n_recommendations=5):
    """Return the books whose rating vectors are closest to *title*.

    Uses the module-level ``model`` (fitted on ``book_user_matrix``) to look
    up the nearest neighbours of the row belonging to *title*.

    Args:
        title: Book title to look up; must be a label of
            ``book_user_matrix.index``.
        n_recommendations: Maximum number of books to recommend. Defaults to
            5, which matches the previous hard-coded behaviour.

    Returns:
        ``[title, [[neighbor_title, distance], ...]]`` with neighbours in the
        order returned by ``model.kneighbors``, or an empty list when *title*
        is not in the dataset (a message is printed in that case).

    Raises:
        ValueError: If *n_recommendations* is negative.
    """
    if n_recommendations < 0:
        raise ValueError("n_recommendations must be non-negative")

    if title not in book_user_matrix.index:
        print(f"Book '{title}' not found in the dataset.")
        return []  # kept as-is so existing callers see the same "not found" result

    idx = book_user_matrix.index.get_loc(title)

    # Ask for one extra neighbour because the query book is part of the fitted
    # data, but never for more neighbours than there are fitted books
    # (kneighbors raises in that case).
    n_neighbors = min(n_recommendations + 1, len(book_user_matrix.index))
    distances, indices = model.kneighbors(
        [book_user_matrix.iloc[idx].values], n_neighbors=n_neighbors
    )

    # Drop the query book by its row index rather than by position: with tied
    # distances (e.g. identical rating vectors) it is not guaranteed to be the
    # first neighbour returned.
    recommended_books = [
        [book_user_matrix.index[neighbor], distance]
        for neighbor, distance in zip(indices[0], distances[0])
        if neighbor != idx
    ][:n_recommendations]

    return [title, recommended_books]

# --- Test the recommendation system ---

# Look up recommendations for a single title and print the raw result
query_title = "The Queen of the Damned (Vampire Chronicles (Paperback))"
recommends = get_recommends(query_title)
print(recommends)

# --- Optional: Visualization ---

# Project each book's rating vector onto two PCA components
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(book_user_matrix.values)

# Scatter every book in the projected 2D space
plt.figure(figsize=(12, 8))
xs, ys = reduced_data[:, 0], reduced_data[:, 1]
plt.scatter(xs, ys, alpha=0.5)

# Label only the first 50 titles of the matrix index
for title, x, y in zip(book_user_matrix.index[:50], xs, ys):
    plt.annotate(title, (x, y), fontsize=8, alpha=0.7)

# Titles, axis labels and grid
plt.title('Books Visualized in 2D Space Using PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()



--2025-04-28 09:59:09--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.1’


2025-04-28 09:59:09 (254 MB/s) - ‘book-crossings.zip.1’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 