In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy.sparse import csr_matrix

# 1. Load the pre-split data
# (Assumes train.csv and test.csv are in the input directory)
train_df = pd.read_csv('/kaggle/input/book-recommend/train.csv')
test_df = pd.read_csv('/kaggle/input/book-recommend/test.csv')

# 2. Recombine briefly to create Lookup Dictionaries
# We need these to map "User 276747" -> "ID 45" without the original LabelEncoder
df_full = pd.concat([train_df, test_df])

# Map: User_ID -> Encoded_ID
user_id_to_encoded = dict(zip(df_full['User_ID'], df_full['user_encoded']))

# Map: Book_Title -> Encoded_ID (CRITICAL: This solves your NameError)
book_title_to_encoded = dict(zip(df_full['Book_Title'], df_full['book_encoded']))

# Map: Encoded_ID -> Book_Title (for printing results)
encoded_to_title = dict(zip(df_full['book_encoded'], df_full['Book_Title']))

# 3. Prepare Arrays for the Neural Network
X_train = train_df[['user_encoded', 'book_encoded']].values
y_train = train_df['Book_Rating'].values

X_test = test_df[['user_encoded', 'book_encoded']].values
y_test = test_df['Book_Rating'].values

# 4. Set Model Architecture Constants
# The Neural Net needs to know the largest ID to set the Embedding size
n_users = df_full['user_encoded'].max() + 1
n_books = df_full['book_encoded'].max() + 1

print(f"Loaded Data! Num Users: {n_users}, Num Books: {n_books}")
print(f"Train rows: {len(X_train)}, Test rows: {len(X_test)}")

# 5. Helpers for the Scorecard (Referee)
test_m = csr_matrix(
    (y_test, (X_test[:, 0], X_test[:, 1])), 
    shape=(n_users, n_books)
)

# Create Popularity Dictionary (Encoded ID -> Count)
book_pop_dict = df_full.groupby('book_encoded')['Book_Rating'].count().to_dict()

2025-12-28 16:18:22.821814: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766938703.009841      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766938703.065690      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766938703.516284      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766938703.516325      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766938703.516328      24 computation_placer.cc:177] computation placer alr

Loaded Data! Num Users: 10697, Num Books: 4106
Train rows: 82234, Test rows: 20559


In [2]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def build_ncf_model(num_users, num_items, embedding_size=50):
    # Inputs
    user_input = Input(shape=(1,), name='user_input')
    book_input = Input(shape=(1,), name='book_input')

    # Embeddings
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding')(user_input)
    book_embedding = Embedding(input_dim=num_items, output_dim=embedding_size, name='book_embedding')(book_input)

    # Flatten
    user_vec = Flatten()(user_embedding)
    book_vec = Flatten()(book_embedding)

    # Concatenate & Dense Layers
    concat = Concatenate()([user_vec, book_vec])

    dense1 = Dense(128, activation='relu')(concat)
    dropout1 = Dropout(0.2)(dense1)
    dense2 = Dense(64, activation='relu')(dropout1)
    dense3 = Dense(32, activation='relu')(dense2)

    # Output
    output = Dense(1, activation='linear', name='output')(dense3)

    model = Model(inputs=[user_input, book_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    
    return model

# Initialize
ncf_model = build_ncf_model(n_users, n_books)
ncf_model.summary()

I0000 00:00:1766938716.970071      24 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [3]:
history = ncf_model.fit(
    [X_train[:, 0], X_train[:, 1]], 
    y_train,                        
    batch_size=64,
    epochs=5,
    validation_data=([X_test[:, 0], X_test[:, 1]], y_test),
    verbose=1
)

Epoch 1/5


I0000 00:00:1766938719.755788      65 service.cc:152] XLA service 0x7eab44009b20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1766938719.755825      65 service.cc:160]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1766938720.071946      65 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m  77/1285[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 2ms/step - loss: 51.2039

I0000 00:00:1766938721.295356      65 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 10.7124 - val_loss: 2.4828
Epoch 2/5
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.2589 - val_loss: 2.4464
Epoch 3/5
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.0611 - val_loss: 2.4418
Epoch 4/5
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.8666 - val_loss: 2.4868
Epoch 5/5
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.6412 - val_loss: 2.5653


In [4]:
def predict_rating_ncf(user_id, book_title):
    # 1. Check if User/Book exists in our lookups
    if (user_id not in user_id_to_encoded) or (book_title not in book_title_to_encoded):
        # Cold Start: Return global average if unknown
        return 7.5

    # 2. Get the Integers from the Dictionaries
    u_enc = user_id_to_encoded[user_id]
    b_enc = book_title_to_encoded[book_title]

    # 3. Predict
    prediction = ncf_model.predict([np.array([u_enc]), np.array([b_enc])], verbose=0)
    
    return np.clip(prediction[0][0], 1, 10)

# Test it
# (Make sure to use a User ID that exists in your train/test CSVs)
print(predict_rating_ncf(276747, 'The Lovely Bones: A Novel'))

7.5


In [5]:
def recommend_ncf(user_id_original, n_recommendations=5):
    # 1. Handle Cold Start
    if user_id_original not in user_id_to_encoded:
        print(f"User {user_id_original} not found (Cold Start).")
        return []

    # 2. Get Encoded User ID (Using Dictionary)
    user_int = user_id_to_encoded[user_id_original]

    # 3. Find Candidates (Books user has NOT read)
    all_books = np.arange(n_books)
    
    # Get user history from the full dataframe
    user_history = df_full[df_full['User_ID'] == user_id_original]['book_encoded'].values
    candidates = np.setdiff1d(all_books, user_history)
    
    # Speed limit: if >1000 candidates, pick random 1000 to score
    if len(candidates) > 1000:
        candidates = np.random.choice(candidates, size=1000, replace=False)

    # 4. Predict
    user_input_array = np.full(len(candidates), user_int)
    predictions = ncf_model.predict([user_input_array, candidates], batch_size=64, verbose=0).flatten()

    # 5. Top N
    top_indices = predictions.argsort()[-n_recommendations:][::-1]
    top_book_ints = candidates[top_indices]
    top_scores = predictions[top_indices]
    
    # 6. Decode (Using Dictionary)
    print(f"--- NCF Recommendations for User {user_id_original} ---")
    results = []
    for book_int, score in zip(top_book_ints, top_scores):
        title = encoded_to_title.get(book_int, "Unknown")
        print(f"{score:.2f} stars | {title}")
        results.append(title)
        
    return results

# Test
recs = recommend_ncf(276747)

User 276747 not found (Cold Start).


In [6]:
from sklearn.neighbors import NearestNeighbors

# 1. Extract Embeddings
book_weights = ncf_model.get_layer('book_embedding').get_weights()[0]

# 2. Fit KNN
model_knn_ncf = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn_ncf.fit(book_weights)

def recommend_similar_ncf(book_title):
    # Check if book exists
    if book_title not in book_title_to_encoded:
        print(f"Book '{book_title}' not found.")
        return

    # Get Integer ID
    book_int = book_title_to_encoded[book_title]
    
    # Find Neighbors
    distances, indices = model_knn_ncf.kneighbors(
        book_weights[book_int].reshape(1, -1), 
        n_neighbors=6
    )
    
    print(f"NCF says these are similar to '{book_title}':")
    for i in range(1, len(indices.flatten())):
        idx = indices.flatten()[i]
        # Decode using dictionary
        similar_title = encoded_to_title.get(idx, "Unknown")
        print(f"{i}: {similar_title}")

# Test
recommend_similar_ncf('Animal Farm')

NCF says these are similar to 'Animal Farm':
1: Frankenstein (Changing Our World)
2: Fox in Socks (I Can Read It All by Myself Beginner Books)
3: The Phantom Tollbooth
4: Dead Men Do Tell Tales: The Strange and Fascinating Cases of a Forensic Anthropologist
5: The Blue Day Book


In [7]:
from sklearn.metrics import ndcg_score, mean_squared_error

def get_model_scorecard(model_name, test_data_matrix, prediction_function, book_popularity_dict):
    """
    The Universal Referee. 
    It takes ANY prediction function and returns the 3 critical scores.
    """
    print(f"--- Scoring Model: {model_name} ---")
    
    rmses = []
    ndcg_scores = []
    novelty_scores = []
    
    test_users = np.unique(test_data_matrix.nonzero()[0])
    # Sample 200 users for speed
    sample_users = np.random.choice(test_users, size=min(200, len(test_users)), replace=False)
    
    for u in sample_users:
        true_book_ids = test_data_matrix[u].indices
        true_ratings = test_data_matrix[u].data
        
        if len(true_ratings) < 2: continue 
        
        pred_ratings = []
        for book_id in true_book_ids:
            # prediction_function must take (user_id, book_id) where these are integers
            pred = prediction_function(u, book_id)
            pred_ratings.append(pred)
            
        # RMSE
        rmses.append(np.sqrt(mean_squared_error(true_ratings, pred_ratings)))
        
        # NDCG
        try:
            ndcg_scores.append(ndcg_score([true_ratings], [pred_ratings]))
        except: pass
        
        # Novelty
        top_k_idx = np.argsort(pred_ratings)[::-1][:5]
        top_books = true_book_ids[top_k_idx]
        pop_score = np.mean([book_popularity_dict.get(b, 0) for b in top_books])
        novelty_scores.append(pop_score)

    return {
        "Model": model_name,
        "RMSE (Error)": np.mean(rmses),
        "NDCG (Ranking)": np.mean(ndcg_scores),
        "Novelty (Popularity)": np.mean(novelty_scores)
    }

# Wrapper for the scorecard
def predict_ncf_wrapper(user_int, book_int):
    # Predicts single rating for Encoded User u and Encoded Book b
    pred = ncf_model.predict([np.array([user_int]), np.array([book_int])], verbose=0)
    return np.clip(pred[0][0], 1, 10)

# Run Scorecard
ncf_scores = get_model_scorecard("NCF", test_m, predict_ncf_wrapper, book_pop_dict)
print(ncf_scores)

--- Scoring Model: NCF ---
{'Model': 'NCF', 'RMSE (Error)': np.float64(1.4189802808663492), 'NDCG (Ranking)': np.float64(0.9713533809614314), 'Novelty (Popularity)': np.float64(56.528440366972475)}
