In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
import torch

# --- 1. Load and Prepare the Data ---
# Load the full book dataset from books.csv. Rows pandas cannot parse are
# skipped (on_bad_lines='skip') instead of aborting the whole load.
try:
    df = pd.read_csv('books.csv', on_bad_lines='skip')
except FileNotFoundError:
    print("Error: 'books.csv' not found. Please make sure the file is in the same directory.")
    # Nothing below can run without the dataset. SystemExit is raised
    # directly instead of calling exit(): exit() is a convenience helper
    # installed by the `site` module for interactive use and reports
    # success (status 0) even though this is an error path.
    raise SystemExit(1)

# Drop the stray 'Unnamed: 12' column when the CSV produced one.
if 'Unnamed: 12' in df.columns:
    df = df.drop(columns=['Unnamed: 12'])

# --- Explicitly convert columns to numeric, coercing errors to NaN ---
# Values that are not valid numbers become NaN here and are removed by the
# dropna() below rather than raising a ValueError later on.
df['average_rating'] = pd.to_numeric(df['average_rating'], errors='coerce')
df['ratings_count'] = pd.to_numeric(df['ratings_count'], errors='coerce')

# Drop rows with any missing value, then renumber the rows 0..n-1.
# Without the reset, df keeps gaps in its index labels while the feature
# matrix built further down is numbered 0..n-1, so looking a book up by its
# df label would fetch another book's features (or raise KeyError).
df = df.dropna().reset_index(drop=True)

# Combine relevant text fields into a single column for the Sentence Transformer
df['combined_text'] = df['title'] + " by " + df['authors']
print("Combined Text Data Sample:")
print(df[['title', 'combined_text']].head())
print("-" * 50)


# --- 2. Advanced Feature Engineering using NLP (Sentence-BERT) ---
# Choose where the encoder runs: the GPU when CUDA is usable, else the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the pretrained sentence encoder and place it on the chosen device.
print("Loading Sentence-BERT model...")
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)
print(f"Using device: {device}")

# Encode every "<title> by <authors>" string; the result is kept as a
# tensor with one row per book.
text_embeddings = model.encode(
    list(df['combined_text']),
    convert_to_tensor=True,
)
print("Text Embeddings Shape:", text_embeddings.shape)
print("-" * 50)


# --- 3. Combine Advanced Features with Existing Features ---
# Extract numerical features
numerical_features = df[['average_rating', 'ratings_count']]

# Scale the numerical features into [0, 1] so the raw ratings_count
# magnitudes do not dominate the distance computation.
scaler = MinMaxScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)
# Reuse df's index: fit_transform returns a bare array, and a default
# 0..n-1 index would not match df's labels once rows have been dropped.
scaled_numerical_features_df = pd.DataFrame(
    scaled_numerical_features,
    columns=['average_rating_scaled', 'ratings_count_scaled'],
    index=df.index,
)
print("Scaled Numerical Features:")
print(scaled_numerical_features_df.head())
print("-" * 50)

# Concatenate all features: text embeddings + scaled numerical features.
# Both frames carry df's index, so row labels of feature_matrix are the
# same labels df uses and feature_matrix.loc[label] returns that book's row.
feature_matrix = pd.concat([
    pd.DataFrame(text_embeddings.cpu().numpy(), index=df.index),
    scaled_numerical_features_df,
], axis=1)

print("Final Feature Matrix Shape:", feature_matrix.shape)
print("-" * 50)

# The embedding columns are labelled with integers and the scaled columns
# with strings; convert all column names to strings to avoid a TypeError
# from mixed label types.
feature_matrix.columns = feature_matrix.columns.astype(str)

# --- 4. Train the Recommender Model on the New Feature Matrix ---
# Build and fit the neighbour index in one step (fit() returns the fitted
# estimator). Six neighbours are requested per query so that five remain
# once the queried book itself is left out.
model_nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=6).fit(
    feature_matrix.to_numpy()
)

# --- 5. Generate and Display Recommendations ---
def book_recom_advanced(book_title):
    """Recommend books similar to the one whose title matches *book_title*.

    The title is matched as a case-insensitive literal substring (no regex).
    When several titles contain the query, a single title that equals the
    query exactly (ignoring case and surrounding whitespace) is selected;
    otherwise the candidates are printed and nothing is recommended.

    Reads the module-level ``df``, ``feature_matrix`` and ``model_nn``.
    Row *i* of ``feature_matrix`` is taken to describe row *i* of ``df``
    by position, which is how the matrix is built and how the neighbour
    model was fitted.

    Args:
        book_title: Full or partial book title typed by the user.

    Returns:
        A list of ``"<title> by <authors>"`` strings, nearest first. The
        list is empty when the title is blank, not found, or ambiguous; in
        those cases a message is printed.
    """
    query = book_title.strip()
    if not query:
        # An empty substring is contained in every title, which would list
        # the entire dataset as "multiple matches".
        print("Book not found in the dataset. Please try another title.")
        return []

    titles = df['title']
    # Find all books that contain the user's input, ignoring case, and without using regex.
    mask = titles.str.contains(query, case=False, na=False, regex=False)
    # Work with row positions, not index labels: the neighbour model only
    # knows positions, and df's labels need not be 0..n-1.
    positions = mask.to_numpy(dtype=bool).nonzero()[0]

    if len(positions) == 0:
        print("Book not found in the dataset. Please try another title.")
        return []

    if len(positions) > 1:
        # A title that is itself a substring of other titles could otherwise
        # never be selected, however precisely the user types it.
        candidates = titles.iloc[positions]
        is_exact = candidates.str.strip().str.casefold() == query.casefold()
        exact_positions = positions[is_exact.to_numpy(dtype=bool)]
        if len(exact_positions) != 1:
            print("Multiple matches found. Please be more specific. Possible titles:")
            for title in candidates.tolist():
                print(f"- {title}")
            return []
        positions = exact_positions

    # Exactly one book selected: query the model with its feature row.
    book_pos = int(positions[0])
    target_features = feature_matrix.iloc[book_pos].to_numpy().reshape(1, -1)
    found = model_nn.kneighbors(target_features, return_distance=False).flatten()

    # Leave out the queried book by position instead of assuming it is the
    # first hit: with tied distances (identical feature rows) another book
    # can be returned ahead of it. Trim so the count is the same either way.
    keep = max(len(found) - 1, 0)
    neighbours = [int(i) for i in found if i != book_pos][:keep]

    recommended = df.iloc[neighbours]
    return [
        f"{title} by {authors}"
        for title, authors in zip(recommended['title'], recommended['authors'])
    ]

# --- Get user input and run in a continuous loop ---
# Provide some example titles to make the tool easier to use.
print("Here are some example books you can search for:")
# sample() raises ValueError when asked for more rows than exist, so cap
# the request at the dataset size.
for title in df['title'].sample(min(5, len(df)), random_state=1).tolist():
    print(f"- {title}")
print("-" * 50)

while True:
    try:
        user_book = input("Enter a book title to get recommendations (or 'quit' to exit): ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C: leave the loop cleanly instead of
        # ending with a traceback.
        print("\nExiting the application. Goodbye!")
        break

    # Ignore surrounding whitespace so that e.g. "quit " is still recognised.
    user_book = user_book.strip()
    if user_book.lower() in ('quit', 'exit'):
        print("Exiting the application. Goodbye!")
        break

    if not user_book:
        # A blank title would match every book in a substring search.
        print("Please enter a book title.")
        print("-" * 50)
        continue

    recommended_books = book_recom_advanced(user_book)

    # An empty list means the lookup already printed why nothing was found.
    if recommended_books:
        print(f"\nBooks recommended for '{user_book}':")
        for rec in recommended_books:
            print(f"- {rec}")
    print("-" * 50)


Combined Text Data Sample:
                                               title  \
4  Comoediae 1: Acharenses/Equites/Nubes/Vespae/P...   
5                  Willem de Kooning: Late Paintings   
6  Literature Circle Guide: Bridge to Terabithia:...   
7  Middlesex Borough (Images of America: New Jersey)   
8  Zone of the Enders: The 2nd Runner Official St...   

                                       combined_text  
4  Comoediae 1: Acharenses/Equites/Nubes/Vespae/P...  
5  Willem de Kooning: Late Paintings by Julie Syl...  
6  Literature Circle Guide: Bridge to Terabithia:...  
7  Middlesex Borough (Images of America: New Jers...  
8  Zone of the Enders: The 2nd Runner Official St...  
--------------------------------------------------
Loading Sentence-BERT model...
Using device: cpu
Text Embeddings Shape: torch.Size([11123, 384])
--------------------------------------------------
Scaled Numerical Features:
   average_rating_scaled  ratings_count_scaled
0                    1.0         