<a href="https://colab.research.google.com/github/lilswapnil/book-recommender/blob/main/notebook/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 3. Text Classification

## 3.0 Requirements, Credentials & Libraries

In [None]:
from google.colab import userdata

# Read API credentials from Colab's `userdata` store rather than hardcoding them.
# NOTE(review): neither name is referenced again in the visible part of this
# notebook — confirm they are still needed.
HF_TOKEN = userdata.get("HUGGINGFACEHUB_API_TOKEN")
OPENAI_KEY = userdata.get("OPENAI_API_KEY")

In [None]:
import numpy as np
import pandas as pd

In [None]:

# Mount Google Drive so the dataset stored there is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Location of books.csv inside the mounted Drive (the file must already exist there).
drive_path = '/content/drive/MyDrive/books.csv'

# Read the CSV into a DataFrame; later cells in this notebook read and modify this frame.
loaded_books_df = pd.read_csv(drive_path)

# Sanity check: report the frame's shape and preview the first rows.
print(f"Successfully loaded 'books.csv' from Google Drive. Shape: {loaded_books_df.shape}")
display(loaded_books_df.head())

## 3.1 Text Classification (Zero-Shot Classification on Genre)

In [None]:
# Count how many rows carry each raw `genre` value (the column has not been split or cleaned yet).
loaded_books_df['genre'].value_counts().reset_index()

In [None]:
# Keep only raw genre values that occur at least 50 times, most frequent first.
(
    loaded_books_df['genre']
    .value_counts()
    .reset_index()
    .loc[lambda counts: counts['count'] >= 50]
    .sort_values('count', ascending=False)
)

In [None]:
# Preview the first rows of the loaded frame.
loaded_books_df.head()

In [None]:
# Turn each comma-separated genre string into a list of genre names.
# Only real strings are split; values that are already lists (this cell was run
# before) or that are missing are passed through untouched. That makes the cell
# safe to re-run: with `.str.split(",")` a second run would silently replace
# every list with NaN, and the clean-up cell below would then empty all genres.
loaded_books_df["genre"] = loaded_books_df["genre"].apply(
    lambda value: value.split(",") if isinstance(value, str) else value
)
loaded_books_df

In [None]:
# Print column dtypes and non-null counts for the frame.
loaded_books_df.info()

In [None]:
# Normalise each genre list: strip whitespace, drop empty tokens (e.g. from a
# trailing comma) and remove duplicates while KEEPING the original order.
# Order matters: `simplify_to_primary` returns the first genre that has a
# mapping, and iterating a `set` of strings can yield a different order in each
# interpreter session, which would make the assigned category non-reproducible.
# `dict.fromkeys` de-duplicates and preserves first-seen order.
# Anything that is not a list (e.g. a missing value) becomes an empty list.
loaded_books_df["genre"] = loaded_books_df["genre"].apply(
    lambda x: list(dict.fromkeys(g.strip() for g in x if g.strip()))
    if isinstance(x, list) else []
)

In [None]:
# One row per (book, genre) pair, then count rows per genre, largest first.
_exploded_books = loaded_books_df.explode("genre")
_rows_per_genre = _exploded_books.groupby("genre").size()
genre_counts = _rows_per_genre.sort_values(ascending=False)

genre_counts.head()

In [None]:
# Source genres grouped under the coarse category each one collapses into.
_GENRES_BY_CATEGORY = {
    "Fiction": [
        "Fiction",
        "Literary Fiction",
        "Historical Fiction",
        "Romance",
        "Drama",
        "Thrillers",
        "Mystery",
        "Science Fiction",
        "Fantasy",
        "Horror",
        "Comics & Graphic Novels",
        "Poetry",
    ],
    "Children's Fiction": [
        "Juvenile Fiction",
        "Children",
        "Young Adult Fiction",
    ],
    "Children's Nonfiction": [
        "Juvenile Nonfiction",
        "Young Adult Nonfiction",
    ],
    "Nonfiction": [
        "Nonfiction",
        "Biography & Autobiography",
        "History",
        "Philosophy",
        "Religion",
        "Literary Criticism",
        "Science",
        "Mathematics",
        "Political Science",
        "Sociology",
        "Psychology",
    ],
    # Hobby / lifestyle
    "Lifestyle": [
        "Art",
        "Music",
        "Crafts",
        "Quilting",
        "Origami",
        "Games",
        "Chess",
        "Cooking",
        "Wine",
        "Alcohol",
    ],
    # Professional / tech
    "Professional": [
        "Business",
        "Economics",
        "Technology",
        "Computers",
        "Engineering",
        "Medical",
        "Nursing",
        "Law",
        "Education",
    ],
    "Spiritual & Esoteric": [
        "Occult",
        "Tarot",
        "Astrology",
        "Esoterica",
    ],
}

# Flat lookup table: source genre -> coarse category.
category_mapping = {
    genre: category
    for category, genres in _GENRES_BY_CATEGORY.items()
    for genre in genres
}

In [None]:
def simplify_to_primary(cat_value, mapping=None):
    """Collapse a book's genre value into one coarse category.

    Args:
        cat_value: Either a list/tuple of genre names or a single
            comma-separated string of genre names. Any other value
            (None, NaN, numbers, ...) is treated as "no genre".
        mapping: Optional dict of genre name -> category. When omitted, the
            notebook-level ``category_mapping`` table is used.

    Returns:
        The category of the first genre (in the given order) that appears in
        the mapping, or ``"Other"`` when there is no usable genre or none of
        them is mapped.
    """
    if mapping is None:
        mapping = category_mapping

    if isinstance(cat_value, str):
        candidates = cat_value.split(",")
    elif isinstance(cat_value, (list, tuple)):
        candidates = cat_value
    else:
        # None, NaN and any other non-text value carry no genre information.
        return "Other"

    for raw_name in candidates:
        # Skip non-text items instead of failing on them (an unhashable item
        # would raise on the dict lookup).
        if not isinstance(raw_name, str):
            continue
        # Strip in both branches so list items are matched the same way as
        # items parsed from a string.
        name = raw_name.strip()
        if name in mapping:
            return mapping[name]

    return "Other"


# Derive the coarse `category` column by running every genre value through simplify_to_primary.
loaded_books_df["category"] = loaded_books_df["genre"].map(simplify_to_primary)

In [None]:
# Row counts per mapped category, including the "Other" fallback.
loaded_books_df['category'].value_counts().reset_index()

In [None]:
# Rows whose `genre` value is not null.
loaded_books_df[loaded_books_df['genre'].notna()]

In [None]:
# (rows, columns) of the frame at this point.
loaded_books_df.shape

In [None]:
# Save the frame with the rule-based `category` column; index=False keeps the row index out of the file.
# NOTE(review): the last cell of this notebook writes to the same filename — confirm the overwrite is intended.
loaded_books_df.to_csv('categorized_books.csv', index=False)

## 3.2 Transformer Model

In [None]:
from transformers import pipeline
import torch

# Use the first CUDA device when one is available; otherwise pass -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Zero-shot classification pipeline backed by the BART-large MNLI checkpoint.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

In [None]:
# Candidate labels passed to the zero-shot classifier in the cells below.
fiction_categories = ['Fiction', 'Nonfiction']

In [None]:
# Take the first non-missing description among books categorised as "Fiction".
fiction_mask = loaded_books_df["category"].eq("Fiction")
sequence = loaded_books_df.loc[fiction_mask, "desc"].dropna().iloc[0]
print(sequence)


In [None]:
# Show the Python type of the sample description.
print(type(sequence))

In [None]:
# Score the sample description against the candidate labels; the raw result is the cell output.
classifier(sequence, fiction_categories)

In [None]:
# Classify the sample and report the label with the highest score.
result = classifier(sequence, fiction_categories)

max_index = np.asarray(result["scores"]).argmax()
max_label = result["labels"][max_index]

print(f"The book is {max_label}")

In [None]:
def generate_predictions(sequence, categories):
    """Return the candidate label that the zero-shot classifier scores highest.

    Args:
        sequence: Text to classify.
        categories: Candidate labels to score the text against.

    Returns:
        The entry of the classifier's ``labels`` list at the position of the
        largest value in its ``scores`` list.
    """
    output = classifier(sequence, categories)
    best_position = int(np.argmax(output["scores"]))
    return output["labels"][best_position]

In [None]:
# from tqdm import tqdm

# actual_cats = []
# predicted_cats = []

# for i in tqdm(range(0, 300)):
#   sequence = loaded_books_df.loc[loaded_books_df['category'] == 'Fiction', 'desc'].reset_index(drop=True)[i]

#   predicted_cats.append(generate_predictions(sequence, fiction_categories))
#   actual_cats += ['Fiction']

In [None]:
# for i in tqdm(range(0, 300)):
#   sequence = loaded_books_df.loc[loaded_books_df['category'] == 'Nonfiction', 'desc'].reset_index(drop=True)[i]
#   predicted_cats.append(generate_predictions(sequence, fiction_categories))
#   actual_cats += ['Nonfiction']

In [None]:
# Evaluation sample: the first 300 rows of each target class, Fiction first.
is_fiction = loaded_books_df["category"] == "Fiction"
is_nonfiction = loaded_books_df["category"] == "Nonfiction"
fiction_df = loaded_books_df[is_fiction].head(300)
nonfiction_df = loaded_books_df[is_nonfiction].head(300)

subset = pd.concat([fiction_df, nonfiction_df], axis=0).reset_index(drop=True)

# Classifier input is "<title> <desc>"; missing values become empty strings.
titles = subset["title"].fillna("").astype(str)
descriptions = subset["desc"].fillna("").astype(str)
texts = titles.str.cat(descriptions, sep=" ").tolist()

# Batched inference; runs on whichever device the pipeline was created with.
results = classifier(texts, fiction_categories, batch_size=16)

# Predicted label = first entry of each result's `labels`; ground truth = rule-based category.
predicted_cats = [output["labels"][0] for output in results]
actual_cats = subset["category"].tolist()

print(len(actual_cats), len(predicted_cats))  # both 600 when each class has at least 300 rows

In [None]:
# Pair each actual label with its prediction and flag the rows where they agree.
prediction_df = pd.DataFrame(
    {"actual_categories": actual_cats, "predicted_categories": predicted_cats}
).assign(
    correct_prediction=lambda frame: frame["actual_categories"].eq(
        frame["predicted_categories"]
    )
)

In [None]:
# Keep the misclassified rows for inspection, then show the correct/incorrect tally.
incorrect_results = prediction_df.loc[~prediction_df["correct_prediction"]]
prediction_df["correct_prediction"].value_counts()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare rule-based labels (truth) with zero-shot predictions.
y_true = prediction_df["actual_categories"]
y_pred = prediction_df["predicted_categories"]

print(confusion_matrix(y_true, y_pred))

print(classification_report(y_true, y_pred))

In [None]:
# Inspect the raw classifier output for one evaluation example (index 20 — presumably an arbitrary spot check).
results[20]

In [None]:
# Boolean mask of rows still categorised as "Other", and how many are True/False.
other_cats = loaded_books_df['category'].eq('Other')
other_cats.value_counts()

In [None]:
# Accumulators filled by the prediction loop in the next cell.
isbns = []
predicted_cats = []

# Books whose rule-based category is "Other" and that have a description.
# Rows with a missing `desc` are dropped here because the classifier needs text
# input; they receive no prediction and therefore keep the "Other" category
# when predictions are merged back.
missing_cats = (
    loaded_books_df.loc[
        loaded_books_df["category"] == "Other",
        ["isbn", "desc"]
    ]
    .dropna(subset=["desc"])
    .reset_index(drop=True)
)

In [None]:
from tqdm import tqdm

# Start from empty accumulators every time this cell runs. Without the reset a
# re-run appends a second copy of every prediction, and the duplicated isbns
# then multiply rows when the predictions are merged back into the books frame.
isbns = []
predicted_cats = []

for isbn, desc in tqdm(
    zip(missing_cats["isbn"], missing_cats["desc"]),
    total=len(missing_cats),
):
    # Skip rows without usable text (NaN or blank); they get no prediction
    # and keep their current category.
    if not isinstance(desc, str) or not desc.strip():
        continue
    predicted_cats.append(generate_predictions(desc, fiction_categories))
    isbns.append(isbn)

In [None]:
# Tabulate the predictions: one row per classified isbn.
missing_prediction_df = pd.DataFrame(
    dict(isbn=isbns, predicted_categories=predicted_cats)
)
missing_prediction_df

In [None]:
# find total books with category other
# Show every row whose category is currently "Other"; the row count in the output is the total.
loaded_books_df.loc[loaded_books_df['category'] == 'Other']

In [None]:
# 1) Remove helper columns left over from an earlier run of this cell
#    (a repeated merge would otherwise create suffixed duplicates).
loaded_books_df = loaded_books_df.drop(
    columns=[c for c in loaded_books_df.columns if c.startswith("predicted_categories")],
    errors="ignore"
)

# 2) Keep one prediction per isbn. Duplicate isbns on the right side of a
#    left merge would duplicate the matching book rows.
unique_predictions = missing_prediction_df.drop_duplicates(subset="isbn", keep="first")

# 3) Merge fresh. `validate` makes pandas raise if the prediction side is not
#    unique per isbn, so the number of book rows cannot grow.
loaded_books_df = pd.merge(
    loaded_books_df,
    unique_predictions,
    on="isbn",
    how="left",
    validate="many_to_one",
)

# 4) Overwrite the category only for rows that are still "Other" AND received
#    a prediction; all other rows keep their rule-based category.
mask = (loaded_books_df["category"] == "Other") & loaded_books_df["predicted_categories"].notna()
loaded_books_df.loc[mask, "category"] = loaded_books_df.loc[mask, "predicted_categories"]

# 5) Final frame without the helper column.
books = loaded_books_df.drop(columns=["predicted_categories"])

In [None]:
# Display the final frame (the `predicted_categories` helper column has been dropped).
books

In [None]:
# Write the final frame to disk without the row index.
# NOTE: this is the same filename used by the export in section 3.1, so that earlier file is overwritten.
books.to_csv('categorized_books.csv', index = False)