In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Cell 2: Load Data
# Fetch the goodbooks-10k tables from GitHub: book metadata first, then the
# user ratings (the generator reads them in exactly this order).
books, ratings = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv',
        'https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv',
    )
)

# Report the raw row counts before any filtering happens
print(f"Original data: {len(books)} books, {len(ratings)} ratings")

# Cell 3: Clean Data
# The thresholds below are deliberately lenient so most of the catalogue survives.
# Keep only users who have rated at least 50 books (was 200)
user_counts = ratings['user_id'].value_counts()
ratings = ratings.loc[
    ratings['user_id'].isin(user_counts.loc[user_counts >= 50].index)
]

# Keep only books that still have at least 20 ratings (was 100);
# the counts are taken after the user filter above.
book_counts = ratings['book_id'].value_counts()
ratings = ratings.loc[
    ratings['book_id'].isin(book_counts.loc[book_counts >= 20].index)
]

# Summarise what remains after both filters
print(f"After filtering: {len(ratings['book_id'].unique())} books, {len(ratings['user_id'].unique())} users, {len(ratings)} ratings")

# Build the book x user rating matrix; cells with no rating become 0
ratings_pivot = ratings.pivot(index='book_id', columns='user_id', values='rating')
ratings_pivot = ratings_pivot.fillna(0)
print(f"Pivot table shape: {ratings_pivot.shape}")

# Stop early if the filters removed every book or every user
if 0 in ratings_pivot.shape:
    raise ValueError("The pivot table is empty. Try reducing the filtering thresholds.")

# Lookup tables: book_id -> title for every book, and title -> book_id
# restricted to books that actually have a row in the pivot table.
book_id_to_title = dict(zip(books['book_id'], books['title']))
title_to_book_id = {
    name: bid
    for bid, name in book_id_to_title.items()
    if bid in ratings_pivot.index
}

# Cell 4: Develop KNN Model
# Force every cell of the rating matrix to float before scaling
ratings_pivot = ratings_pivot.astype(float)

# Standardise each column (one column per user_id) to zero mean / unit variance.
# NOTE(review): this materialises a dense books x users float array on top of
# the pivot table itself; memory use may be substantial -- confirm it fits.
scaler = StandardScaler()
ratings_normalized = scaler.fit_transform(ratings_pivot.to_numpy())

# Brute-force nearest-neighbour search over the book rows using cosine distance
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(ratings_normalized)

# Cell 5: Create get_recommends Function
def get_recommends(book_title, n_recommendations=5):
    """Recommend the books whose rating vectors are nearest to ``book_title``.

    Args:
        book_title: Exact title, as stored in ``title_to_book_id``.
        n_recommendations: Maximum number of neighbours to return. Defaults
            to 5, which matches the previously hard-coded behaviour.

    Returns:
        ``[book_title, [[neighbour_title, cosine_distance], ...]]`` in the
        order returned by ``model.kneighbors`` (nearest first), or a plain
        message string when the title is unknown or the book has no row in
        ``ratings_pivot``.
    """
    if book_title not in title_to_book_id:
        return f"Book '{book_title}' not found in the dataset."

    book_id = title_to_book_id[book_title]

    # get_indexer yields -1 for a label that is absent from the index, so a
    # single lookup doubles as the "book is in the pivot table" check.
    book_idx = ratings_pivot.index.get_indexer([book_id])[0]
    if book_idx == -1:
        return f"Book '{book_title}' (ID: {book_id}) has insufficient ratings data."

    # Ask for one extra neighbour because the query book is normally among
    # its own nearest neighbours (distance 0).
    n_neighbors = min(n_recommendations + 1, len(ratings_pivot))
    distances, indices = model.kneighbors(
        ratings_normalized[book_idx].reshape(1, -1),
        n_neighbors=n_neighbors,
    )

    # Drop the query book by row position instead of assuming it is first:
    # when another book has an identical vector the tie can put the query at
    # any position, and blindly skipping slot 0 would recommend the book to
    # itself while discarding a genuine neighbour.
    recommended_books = [
        [
            book_id_to_title.get(ratings_pivot.index[idx], "Unknown Title"),
            float(dist),
        ]
        for dist, idx in zip(distances[0], indices[0])
        if idx != book_idx
    ]

    # If the query was not among the results at all, trim the farthest entry
    # so the number of recommendations stays the same as before.
    return [book_title, recommended_books[:n_neighbors - 1]]

# Cell 6: Test the Function
# Titles of every book that has a row in the pivot table and a known title
available_books = [
    book_id_to_title[pivot_book_id]
    for pivot_book_id in ratings_pivot.index
    if pivot_book_id in book_id_to_title
]

# Smoke-test with the first surviving book, if there is one
if not available_books:
    print("No books available for testing after filtering.")
else:
    print(f"\nTesting with an available book: {available_books[0]}")
    result = get_recommends(available_books[0])
    print(result)

# Try a few well-known titles, but only those that survived the filtering
test_titles = [
    "The Queen of the Damned (Vampire Chronicles (Paperback))",
    "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
]

for title in test_titles:
    if title not in title_to_book_id:
        print(f"\n{title} is not in the filtered dataset.")
        continue
    print(f"\nTesting with: {title}")
    result = get_recommends(title)
    print(result)



Original data: 10000 books, 5976479 ratings
After filtering: 9980 books, 52525 users, 5942599 ratings
Pivot table shape: (9980, 52525)
