In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# 1. Load the pre-split data
train_df = pd.read_csv('/kaggle/input/book-recommend/train.csv')
test_df = pd.read_csv('/kaggle/input/book-recommend/test.csv')

# 2. Create Lookup Dictionaries (Same as NCF)
df_full = pd.concat([train_df, test_df])
user_id_to_encoded = dict(zip(df_full['User_ID'], df_full['user_encoded']))
book_title_to_encoded = dict(zip(df_full['Book_Title'], df_full['book_encoded']))
encoded_to_title = dict(zip(df_full['book_encoded'], df_full['Book_Title']))

# 3. Model Constants
n_users = df_full['user_encoded'].max() + 1
n_books = df_full['book_encoded'].max() + 1

# 4. Create Sparse Matrices
# KNN requires a matrix (Rows=Users, Cols=Books)
train_m = csr_matrix(
    (train_df['Book_Rating'].values, (train_df['user_encoded'].values, train_df['book_encoded'].values)), 
    shape=(n_users, n_books)
)

test_m = csr_matrix(
    (test_df['Book_Rating'].values, (test_df['user_encoded'].values, test_df['book_encoded'].values)), 
    shape=(n_users, n_books)
)

# Popularity Dictionary for Scorecard
book_pop_dict = df_full.groupby('book_encoded')['Book_Rating'].count().to_dict()

print(f"Data Loaded. Users: {n_users}, Books: {n_books}")

Data Loaded. Users: 10697, Books: 4106


In [2]:
# User-Based Collaborative Filtering
# We look for users with similar rating patterns (Cosine Similarity)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

# Fit on the Training Matrix
model_knn.fit(train_m)

print("KNN (Cosine) Model Trained!")

KNN (Cosine) Model Trained!


In [3]:
def predict_rating_cosine(user_id, book_title):
    # 1. Check if User/Book exists
    if (user_id not in user_id_to_encoded) or (book_title not in book_title_to_encoded):
        return 7.5 # Default

    u_enc = user_id_to_encoded[user_id]
    b_enc = book_title_to_encoded[book_title]
    
    # 2. Find Nearest Neighbors for this User
    # Returns distances and indices of the K nearest users
    distances, indices = model_knn.kneighbors(train_m[u_enc], n_neighbors=20)
    
    # Flatten
    neighbor_indices = indices.flatten()
    neighbor_distances = distances.flatten()
    
    # 3. Calculate Weighted Average
    # We only care about neighbors who actually rated the target book 'b_enc'
    # Get ratings of neighbors for this specific book
    neighbor_ratings = train_m[neighbor_indices, b_enc].toarray().flatten()
    
    # Filter out zeros (neighbors who didn't read this book)
    mask = neighbor_ratings > 0
    relevant_ratings = neighbor_ratings[mask]
    relevant_distances = neighbor_distances[mask]
    
    if len(relevant_ratings) == 0:
        return 7.5 # None of the similar users read this book
        
    # Weight by similarity (1 - distance) because Cosine Distance = 1 - Cosine Similarity
    similarities = 1 - relevant_distances
    
    if np.sum(similarities) == 0:
        return np.mean(relevant_ratings)
        
    prediction = np.dot(relevant_ratings, similarities) / np.sum(similarities)
    
    return np.clip(prediction, 1, 10)

# Test
print(predict_rating_cosine(276747, 'The Lovely Bones: A Novel'))

7.5


In [4]:
def recommend_cosine(user_id_original, n_recommendations=5):
    if user_id_original not in user_id_to_encoded:
        return []

    u_enc = user_id_to_encoded[user_id_original]
    
    # 1. Find Neighbors
    distances, indices = model_knn.kneighbors(train_m[u_enc], n_neighbors=10)
    neighbor_indices = indices.flatten()
    
    # 2. Get Neighbors' Top Books
    # We sum up the ratings from all neighbors for all books
    neighbor_ratings_sum = np.array(train_m[neighbor_indices].sum(axis=0)).flatten()
    
    # 3. Filter out books user has already read
    user_history = train_m[u_enc].indices
    neighbor_ratings_sum[user_history] = 0 # Zero out read books
    
    # 4. Get Top N
    top_indices = neighbor_ratings_sum.argsort()[-n_recommendations:][::-1]
    
    # 5. Decode
    print(f"--- Cosine Recommendations for User {user_id_original} ---")
    results = []
    for book_int in top_indices:
        # Only recommend if score > 0
        if neighbor_ratings_sum[book_int] == 0: continue
            
        title = encoded_to_title.get(book_int, "Unknown")
        print(f"Score {neighbor_ratings_sum[book_int]:.1f} | {title}")
        results.append(title)
        
    return results

# Test
recs = recommend_cosine(276747)

In [5]:
# Create Item-Item Model (Fit on Transpose)
# This finds books with similar rating patterns
model_item_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
model_item_knn.fit(train_m.T) # Note the .T (Transpose)

def recommend_similar_cosine(book_title):
    if book_title not in book_title_to_encoded:
        print("Book not found.")
        return

    b_enc = book_title_to_encoded[book_title]
    
    # Find Nearest Books
    distances, indices = model_item_knn.kneighbors(train_m.T[b_enc], n_neighbors=6)
    
    print(f"Cosine says these are similar to '{book_title}':")
    for i in range(1, len(indices.flatten())):
        idx = indices.flatten()[i]
        title = encoded_to_title.get(idx, "Unknown")
        print(f"{i}: {title}")

# Test
recommend_similar_cosine('Animal Farm')

Cosine says these are similar to 'Animal Farm':
1: 1984
2: The Green Mile: The Mouse on the Mile (Green Mile Series)
3: Tis: A Memoir
4: Losing Julia
5: The Green Mile: Coffey on the Mile (Green Mile Series)


In [6]:
from sklearn.metrics import ndcg_score, mean_squared_error
import numpy as np

# --- 1. FAST WRAPPER (Required for efficient scoring) ---
def predict_cosine_wrapper(u_enc, b_enc):
    """
    Fast prediction using Integers directly.
    Bypasses the dictionary lookups to speed up the loop.
    """
    # 1. Find neighbors for the User (u_enc)
    distances, indices = model_knn.kneighbors(train_m[u_enc], n_neighbors=20)
    neighbor_indices = indices.flatten()
    neighbor_distances = distances.flatten()
    
    # 2. Get ratings for the Book (b_enc) from these neighbors
    # We index the sparse matrix directly
    neighbor_ratings = train_m[neighbor_indices, b_enc].toarray().flatten()
    
    # 3. Filter: Keep only neighbors who actually rated this book
    mask = neighbor_ratings > 0
    relevant_ratings = neighbor_ratings[mask]
    relevant_distances = neighbor_distances[mask]
    
    # 4. Handle Edge Cases
    if len(relevant_ratings) == 0:
        return 7.5 # Default if no neighbors read it
        
    # 5. Weighted Average Calculation
    # Similarity = 1 - Distance (approx)
    similarities = 1 - relevant_distances
    
    if np.sum(similarities) == 0:
        return np.mean(relevant_ratings)
        
    pred = np.dot(relevant_ratings, similarities) / np.sum(similarities)
    return np.clip(pred, 1, 10)

# --- 2. THE SCORECARD FUNCTION ---
def get_model_scorecard(model_name, test_data_matrix, prediction_function, book_popularity_dict):
    print(f"--- Scoring Model: {model_name} ---")
    rmses = []
    ndcg_scores = []
    novelty_scores = []
    
    # Identify users in the test set
    test_users = np.unique(test_data_matrix.nonzero()[0])
    
    # Sample 100 users
    # (KNN is much slower than NCF, so we sample 100 instead of 200 to save time)
    n_sample = min(100, len(test_users))
    sample_users = np.random.choice(test_users, size=n_sample, replace=False)
    
    for u in sample_users:
        # Get Truth (What user really rated)
        true_book_ids = test_data_matrix[u].indices
        true_ratings = test_data_matrix[u].data
        
        if len(true_ratings) < 2: continue 
        
        # Get Predictions (Ask our model)
        pred_ratings = []
        for book_id in true_book_ids:
            pred = prediction_function(u, book_id)
            pred_ratings.append(pred)
            
        # --- Metric A: RMSE (Accuracy) ---
        rmses.append(np.sqrt(mean_squared_error(true_ratings, pred_ratings)))
        
        # --- Metric B: NDCG (Ranking) ---
        try:
            ndcg_scores.append(ndcg_score([true_ratings], [pred_ratings]))
        except: pass
        
        # --- Metric C: Novelty (Discovery) ---
        top_k_idx = np.argsort(pred_ratings)[::-1][:5]
        top_books = true_book_ids[top_k_idx]
        pop_score = np.mean([book_popularity_dict.get(b, 0) for b in top_books])
        novelty_scores.append(pop_score)

    return {
        "Model": model_name,
        "RMSE": np.mean(rmses),
        "NDCG": np.mean(ndcg_scores),
        "Novelty": np.mean(novelty_scores)
    }

# --- 3. RUN EVALUATION ---
# 'test_m' and 'book_pop_dict' were created in Cell 1
cosine_scores = get_model_scorecard("Cosine (KNN)", test_m, predict_cosine_wrapper, book_pop_dict)
print(cosine_scores)

--- Scoring Model: Cosine (KNN) ---
{'Model': 'Cosine (KNN)', 'RMSE': np.float64(1.967552140998551), 'NDCG': np.float64(0.9541488724623698), 'Novelty': np.float64(49.18641975308642)}
