In [2]:
# %% IMPORT LIBRARIES
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import pipeline

# %% REPORT WORKING DIRECTORY (OPTIONAL)
# The directory is only echoed, not changed, so the reader can see where the
# notebook is running from.
current_dir = os.getcwd()
print("Current working directory:", current_dir)

# %% LOAD CLEANED BOOK DATA
# Absolute, machine-specific location of the cleaned dataset.
books_csv_path = r"C:\Users\MSI\Desktop\BookNavigator\data\books_cleaned.csv"
books = pd.read_csv(books_csv_path)

# Echo (rows, columns) as a quick sanity check on the load.
loaded_shape = books.shape
print("Books loaded:", loaded_shape)

# %% DEFINE CATEGORY MAPPING
# Raw `categories` labels, grouped by the simplified label they collapse into.
_raw_labels_by_simple_category = {
    "Fiction": ("Fiction", "Comics & Graphic Novels", "Drama", "Poetry"),
    "Children's Fiction": ("Juvenile Fiction",),
    "Nonfiction": (
        "Biography & Autobiography",
        "History",
        "Literary Criticism",
        "Philosophy",
        "Religion",
        "Science",
    ),
    "Children's Nonfiction": ("Juvenile Nonfiction",),
}

# Invert the grouping into the raw -> simplified lookup table.
category_mapping = {
    raw_label: simple_label
    for simple_label, raw_labels in _raw_labels_by_simple_category.items()
    for raw_label in raw_labels
}

# Raw labels that are not keys of the mapping come out as NaN here.
books["simple_categories"] = books["categories"].map(category_mapping)

# %% SET UP ZERO-SHOT CLASSIFICATION PIPELINE
# Candidate labels offered to the classifier.
categories = [
    "Fiction",
    "Nonfiction",
    "Children's Fiction",
    "Children's Nonfiction",
]

# device=-1 runs on the CPU; use device=0 for the GPU.
pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=-1,
)

# %% DEFINE PREDICTION FUNCTION
def generate_prediction(sequence, categories):
    """Return the highest-scoring candidate label for a piece of text.

    Args:
        sequence: Text to classify. Anything that is not a non-blank string
            (None, NaN, numbers, whitespace-only text) is treated as missing.
        categories: Candidate labels handed to the zero-shot pipeline.

    Returns:
        The entry of the pipeline's ``labels`` with the highest score, or
        ``np.nan`` when there is no usable text to classify.
    """
    # Checking the type first avoids calling ``.strip()`` on non-string values
    # (for example a numeric cell), which would raise AttributeError.
    if not isinstance(sequence, str) or not sequence.strip():
        return np.nan
    predictions = pipe(sequence, candidate_labels=categories)
    # Pick the label whose score is the largest.
    max_index = np.argmax(predictions["scores"])
    return predictions["labels"][max_index]

# %% PREDICT MISSING CATEGORIES
# Rows whose simplified category is still NaN need a predicted label.
missing_mask = books["simple_categories"].isna()
missing_cats = books.loc[missing_mask, ["isbn13", "description"]].reset_index(drop=True)

# Classify each description in row order. generate_prediction returns NaN for
# missing or blank descriptions, so those rows simply stay unlabeled.
predicted_categories = [
    generate_prediction(seq, categories)
    for seq in tqdm(missing_cats["description"], desc="Predicting missing categories")
]
missing_cats["predicted_categories"] = predicted_categories

# Write the predictions back through the same boolean mask instead of merging
# on isbn13. The mask preserves row order, so the i-th prediction lands on the
# i-th missing row, whereas a merge would duplicate rows of `books` whenever
# an isbn13 value is repeated among the missing rows.
if predicted_categories:
    books.loc[missing_mask, "simple_categories"] = predicted_categories

# %% SAVE FINAL DATASET
# Write the frame to disk without the index column, then confirm where it went.
output_path = r"C:\Users\MSI\Desktop\BookNavigator\data\books_with_categories_new.csv"
books.to_csv(path_or_buf=output_path, index=False)
print(f"✅ All categories processed and saved as {output_path}")

# %% OPTIONAL: CHECK CLASS DISTRIBUTION
# Row count per simplified category; value_counts leaves NaN entries out by default.
category_counts = books["simple_categories"].value_counts()
print(category_counts)


  from .autonotebook import tqdm as notebook_tqdm


Current working directory: C:\Users\MSI\Desktop\BookNavigator\notebooks
Books loaded: (5197, 13)


Device set to use cpu
Predicting missing categories: 100%|██████████| 1454/1454 [38:43<00:00,  1.60s/it]

✅ All categories processed and saved as C:\Users\MSI\Desktop\BookNavigator\data\books_with_categories_new.csv
simple_categories
Fiction                  2761
Nonfiction               1882
Children's Fiction        447
Children's Nonfiction     107
Name: count, dtype: int64



