# Book Recommendation System

# Import and Load Data

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import joblib
from scipy import sparse

# Location of the Excel file with the book data (change this to match your setup)
DATA_PATH = "books.xlsx"

# Read the spreadsheet into a DataFrame
df = pd.read_excel(io=DATA_PATH)

# Sanity check: row count, column names, then a preview of the first rows
print(f"Rows: {len(df)}")
print(df.columns.to_list())
df.head(5)


Rows: 70
['Title', 'Author', 'Category', 'Rating', 'Cover_URL']


Unnamed: 0,Title,Author,Category,Rating,Cover_URL
0,Bossypants,Tina Fey,Funny,4.2,https://m.media-amazon.com/images/I/81vnzJvZZ-...
1,Is Everyone Hanging Out Without Me?,Mindy Kaling,Funny,4.1,https://m.media-amazon.com/images/I/81b3x7h5zV...
2,Yes Please,Amy Poehler,Funny,4.0,https://m.media-amazon.com/images/I/81kqrwS1nN...
3,Good Omens,Neil Gaiman & Terry Pratchett,Funny,4.3,https://m.media-amazon.com/images/I/91-d4hgLq-...
4,The Hitchhiker's Guide to the Galaxy,Douglas Adams,Funny,4.6,https://m.media-amazon.com/images/I/81WcnNQ-TB...


# Prepare text field & clean data

In [3]:
# Ensure Rating is numeric; values that cannot be parsed become NaN
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Columns that feed the TF-IDF text. A 'Description' column is put first when the
# data has one; otherwise only title, author and category are combined
# (you can later replace this fallback with real descriptions).
text_columns = ['Title', 'Author', 'Category']
if 'Description' in df.columns:
    df['Description'] = df['Description'].fillna('').astype(str)
    text_columns = ['Description'] + text_columns

# Build the Text column by joining the pieces with single spaces.
# Missing values are replaced with '' *before* the string cast: a plain astype(str)
# can turn NaN into the literal text "nan", which TF-IDF would then treat as a word.
text_series = df[text_columns[0]].fillna('').astype(str)
for text_column in text_columns[1:]:
    text_series = text_series + " " + df[text_column].fillna('').astype(str)
df['Text'] = text_series

# Stripped, lower-cased category for robust matching (missing category -> '')
df['Category'] = df['Category'].fillna('').astype(str)
df['Category_clean'] = df['Category'].str.strip().str.lower()

df.head()


Unnamed: 0,Title,Author,Category,Rating,Cover_URL,Text,Category_clean
0,Bossypants,Tina Fey,Funny,4.2,https://m.media-amazon.com/images/I/81vnzJvZZ-...,Bossypants Tina Fey Funny,funny
1,Is Everyone Hanging Out Without Me?,Mindy Kaling,Funny,4.1,https://m.media-amazon.com/images/I/81b3x7h5zV...,Is Everyone Hanging Out Without Me? Mindy Kali...,funny
2,Yes Please,Amy Poehler,Funny,4.0,https://m.media-amazon.com/images/I/81kqrwS1nN...,Yes Please Amy Poehler Funny,funny
3,Good Omens,Neil Gaiman & Terry Pratchett,Funny,4.3,https://m.media-amazon.com/images/I/91-d4hgLq-...,Good Omens Neil Gaiman & Terry Pratchett Funny,funny
4,The Hitchhiker's Guide to the Galaxy,Douglas Adams,Funny,4.6,https://m.media-amazon.com/images/I/81WcnNQ-TB...,The Hitchhiker's Guide to the Galaxy Douglas A...,funny


# TF-IDF vectorization (fit)

In [7]:
# Fit TF-IDF on the combined Text column (English stop words, unigrams and bigrams)
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.8,
    stop_words='english',
)
corpus = df['Text'].fillna("")
tfidf_matrix = tfidf.fit_transform(corpus)

# Shape is (n_books, n_features)
print(f"TF-IDF shape: {tfidf_matrix.shape}")


TF-IDF shape: (70, 535)


# Save vectorizer, TF-IDF matrix and prepared data

In [8]:
# Make sure the output folder exists before anything is written into it
os.makedirs(name="models", exist_ok=True)

# Persist three artifacts: the fitted vectorizer, the sparse TF-IDF matrix
# (for faster loading later), and the prepared DataFrame (Text column included)
joblib.dump(value=tfidf, filename="models/tfidf_vectorizer.joblib")
sparse.save_npz(file="models/tfidf_matrix.npz", matrix=tfidf_matrix)
df.to_csv(path_or_buf="models/books_prepared.csv", index=False)

print("Saved: tfidf vectorizer, tfidf_matrix and books_prepared.csv")


Saved: tfidf vectorizer, tfidf_matrix and books_prepared.csv


# Recommendation functions

In [9]:
# Cell 5
from sklearn.preprocessing import minmax_scale

# helper: stripped, lower-cased title -> row position (a repeated title keeps its last position)
normalized_titles = [raw_title.strip().lower() for raw_title in df['Title'].astype(str)]
title_to_index = {key: position for position, key in enumerate(normalized_titles)}

def recommend_by_category_baseline(category, n=10):
    """
    Rating-only baseline: the n highest-rated books of one category.

    category: string, matched against df['Category_clean'] after strip + lower-case
    n: maximum number of books to return
    Returns a DataFrame with Title, Author, Category, Rating and Cover_URL,
    sorted by Rating in descending order.
    Raises ValueError when no book belongs to the category.
    """
    wanted = category.strip().lower()
    in_category = df['Category_clean'] == wanted
    matches = df[in_category]
    if matches.empty:
        raise ValueError(f"No books found for category '{category}'")

    output_columns = ['Title', 'Author', 'Category', 'Rating', 'Cover_URL']
    best_first = matches.sort_values(by='Rating', ascending=False)
    return best_first.head(n)[output_columns]

def recommend_by_category_ml(category, n=10, alpha=0.6):
    """
    Recommend books within one category, ranked by a hybrid of rating and
    TF-IDF similarity to the category centroid.

    category: string, the chosen category (matched after strip + lower-case)
    n: number of books to return
    alpha: weight for rating in hybrid score (0..1). hybrid = alpha*rating_norm + (1-alpha)*sim_norm

    Returns a DataFrame (fresh 0..k-1 index) with Title, Author, Category, Rating,
    Similarity, Hybrid_Score and Cover_URL, best hybrid score first.
    Raises ValueError when no book belongs to the category.
    """
    cat = category.strip().lower()

    # Row *positions* of the category's books. tfidf_matrix is addressed by position,
    # so positions (not df index labels) keep the lookup correct even when df does
    # not carry the default 0..n-1 index.
    positions = np.flatnonzero((df['Category_clean'] == cat).to_numpy())
    if positions.size == 0:
        raise ValueError(f"No books found for category '{category}'")

    # Centroid of the category's TF-IDF rows. The sparse mean can come back as
    # np.matrix; convert it to a plain (1, n_features) ndarray before linear_kernel.
    centroid = np.asarray(tfidf_matrix[positions].mean(axis=0)).reshape(1, -1)

    # Similarity of the centroid against ALL books, then restricted to the category
    # (recommendations stay within the category).
    sim_cat = linear_kernel(centroid, tfidf_matrix).ravel()[positions]

    # Ratings of the candidates; a missing rating falls back to the overall median.
    ratings = df['Rating']
    ratings_cat = ratings.iloc[positions].fillna(ratings.median()).to_numpy(dtype=float)

    # Min-max normalise similarity within the category (all-equal -> zeros).
    sim_low, sim_high = sim_cat.min(), sim_cat.max()
    if sim_high != sim_low:
        sim_norm = (sim_cat - sim_low) / (sim_high - sim_low)
    else:
        sim_norm = np.zeros_like(sim_cat)

    # Min-max normalise ratings against the whole dataset. When the range is empty
    # or no rating exists at all, every candidate gets the neutral value 0.5 so the
    # hybrid score never turns into NaN.
    rating_min, rating_max = ratings.min(), ratings.max()
    if pd.notna(rating_min) and rating_max != rating_min:
        rating_norm = (ratings_cat - rating_min) / (rating_max - rating_min)
    else:
        rating_norm = np.full(ratings_cat.shape, 0.5)

    # hybrid score
    hybrid = alpha * rating_norm + (1.0 - alpha) * sim_norm

    # best n candidates, highest hybrid score first
    top = np.argsort(hybrid)[::-1][:n]

    results = df.iloc[positions[top]].copy()
    # attach similarity and hybrid for transparency
    results['Similarity'] = sim_cat[top]
    results['Hybrid_Score'] = hybrid[top]
    results = results[['Title','Author','Category','Rating','Similarity','Hybrid_Score','Cover_URL']]

    return results.reset_index(drop=True)

def recommend_similar_by_title(title, n=10):
    """
    Recommend books most similar to a given title (across entire dataset).

    title: string, looked up in title_to_index after strip + lower-case
    n: maximum number of books to return
    Returns a DataFrame (fresh 0..k-1 index) with Title, Author, Category, Rating
    and Cover_URL, most similar first. The queried book itself is never returned,
    so at most len(df) - 1 rows come back.
    Raises ValueError when the title is not known.
    """
    t = title.strip().lower()
    if t not in title_to_index:
        raise ValueError(f"Title '{title}' not found.")
    idx = title_to_index[t]  # row position of the queried book

    sim_scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix).ravel()
    sim_scores[idx] = -1  # push the queried book to the end of the ranking
    ranked = np.argsort(sim_scores)[::-1]
    # Drop the queried book explicitly; otherwise it would reappear as a
    # "recommendation" whenever n is at least the number of books.
    top_idx = ranked[ranked != idx][:n]

    # title_to_index and tfidf_matrix work with row positions, so select by
    # position (iloc) rather than by index label.
    return df.iloc[top_idx][['Title','Author','Category','Rating','Cover_URL']].reset_index(drop=True)


In [11]:
# Cell 6 - rating-only baseline: top 10 books of category "Money"
print("Baseline (top by rating) - Money:")
baseline_money = recommend_by_category_baseline("Money", n=10)
print(baseline_money.to_string(index=False))


# Cell 7 - heading for the ML hybrid recommendations for "Money" printed further down
print("\nML Hybrid recommendations - Money:")

# --- This cell redefines recommend_by_category_ml; the definition below is the one
# --- used by the calls that follow. The centroid is converted to an ndarray inside it.
def recommend_by_category_ml(category, n=10, alpha=0.6):
    """
    Recommend books within one category, ranked by a hybrid of rating and
    TF-IDF similarity to the category centroid.

    category: string, the chosen category (matched after strip + lower-case)
    n: number of books to return
    alpha: weight for rating in hybrid score (0..1). hybrid = alpha*rating_norm + (1-alpha)*sim_norm

    Returns a DataFrame (fresh 0..k-1 index) with Title, Author, Category, Rating,
    Similarity, Hybrid_Score and Cover_URL, best hybrid score first.
    Raises ValueError when no book belongs to the category.
    """
    cat = category.strip().lower()

    # Use row positions, not df index labels: tfidf_matrix is addressed by position,
    # so this stays correct even if df does not have the default 0..n-1 index.
    positions = np.flatnonzero((df['Category_clean'] == cat).to_numpy())
    if positions.size == 0:
        raise ValueError(f"No books found for category '{category}'")

    # FIX: convert centroid from np.matrix -> np.ndarray of shape (1, n_features)
    centroid = np.asarray(tfidf_matrix[positions].mean(axis=0)).reshape(1, -1)

    # similarity of centroid against ALL books, restricted to the same category
    sim_cat = linear_kernel(centroid, tfidf_matrix).ravel()[positions]

    # candidate ratings; a missing rating falls back to the overall median
    ratings = df['Rating']
    ratings_cat = ratings.iloc[positions].fillna(ratings.median()).to_numpy(dtype=float)

    # normalize similarity within the category (all-equal -> zeros)
    sim_low, sim_high = sim_cat.min(), sim_cat.max()
    if sim_high != sim_low:
        sim_norm = (sim_cat - sim_low) / (sim_high - sim_low)
    else:
        sim_norm = np.zeros_like(sim_cat)

    # normalize ratings against the whole dataset; with an empty range or no
    # ratings at all, use the neutral value 0.5 so the score never becomes NaN
    rating_min, rating_max = ratings.min(), ratings.max()
    if pd.notna(rating_min) and rating_max != rating_min:
        rating_norm = (ratings_cat - rating_min) / (rating_max - rating_min)
    else:
        rating_norm = np.full(ratings_cat.shape, 0.5)

    hybrid = alpha * rating_norm + (1.0 - alpha) * sim_norm
    top = np.argsort(hybrid)[::-1][:n]  # best n, highest hybrid score first

    results = df.iloc[positions[top]].copy()
    results['Similarity'] = sim_cat[top]
    results['Hybrid_Score'] = hybrid[top]
    return results[['Title','Author','Category','Rating','Similarity','Hybrid_Score','Cover_URL']].reset_index(drop=True)

# Run the hybrid recommender for "Money" and print the table
hybrid_money = recommend_by_category_ml("Money", n=10, alpha=0.6)
print(hybrid_money.to_string(index=False))


# Cell 8 - books closest to a given title (example)
print("\nBooks similar to 'Rich Dad Poor Dad':")
similar_to_rich_dad = recommend_similar_by_title("Rich Dad Poor Dad", n=6)
print(similar_to_rich_dad.to_string(index=False))



Baseline (top by rating) - Money:
                          Title            Author Category  Rating                                           Cover_URL
       The Intelligent Investor   Benjamin Graham    Money     4.8 https://m.media-amazon.com/images/I/81wKn4HMI8L.jpg
        The Psychology of Money     Morgan Housel    Money     4.7 https://m.media-amazon.com/images/I/81Lb75rUhLL.jpg
       The Total Money Makeover       Dave Ramsey    Money     4.6 https://m.media-amazon.com/images/I/91W5Zl1L1DL.jpg
            Think and Grow Rich     Napoleon Hill    Money     4.6 https://m.media-amazon.com/images/I/71UypkUjStL.jpg
    I Will Teach You to Be Rich       Ramit Sethi    Money     4.6 https://m.media-amazon.com/images/I/81GtwYt9bML.jpg
              Rich Dad Poor Dad   Robert Kiyosaki    Money     4.5 https://m.media-amazon.com/images/I/81bsw6fnUiL.jpg
Secrets of the Millionaire Mind      T. Harv Eker    Money     4.5 https://m.media-amazon.com/images/I/81df0BOfn-L.jpg
        Your M

# Save model artifacts

In [12]:
# Cell 9
# Write the fitted vectorizer and the prepared DataFrame to models/ once more
# (both files were already written in an earlier cell; this overwrites them)
joblib.dump(value=tfidf, filename="models/tfidf_vectorizer.joblib")
df.to_csv(path_or_buf="models/books_prepared.csv", index=False)
print("Artifacts saved to models/")


Artifacts saved to models/
