In [1]:
"""

BERT_NLP_Project.ipynb

Project: Book Review Categorization using BERT
By: Kevin Lin


Objective:
  This project leverages a pretrained BERT model (DistilBERT) to classify Amazon book reviews
  into one of 16 predefined categories using the review text as input. Many of the methods
  and approaches used here are adapted from "Lab 7" in our coursework, including
  tokenization, label encoding, model fine-tuning with HuggingFace’s Trainer API, and evaluation.

  However, the original lab focused on binary classification for spam/not, so the code
  had to be adapted to support 16 unique categories, including manual label encoding, metric
  handling for each class, and configuring the model output size accordingly.

Data Source:
  The original dataset is available on Kaggle:
    https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews?select=books_data.csv

Data Preparation:
  The raw dataset was cleaned, stratified, and split using a separate preprocessing notebook:
    "NLP_Project_Data_Prep.ipynb"

  From that notebook, two stratified CSV files were generated and uploaded to Google Drive:

  We used a 90/10 split for training and testing, respectively

    - Training Dataset: "14400_strat_samp_training.csv"
      - 900 samples per category × 16 categories
      - Uses "review/text" to predict "categories"

    - Testing Dataset: "1600_strat_samp_test.csv"
      - 100 samples per category × 16 categories
      - Also uses "review/text" to predict "categories"

"""
# Note:
# The code below is commented out but can be used as a fallback to manually upload CSV files
# if the Google Drive links fail or are unavailable in Colab.


# from google.colab import files
# uploaded = files.upload()

'\n\nBERT_NLP_Project.ipynb\n\nProject: Book Review Categorization using BERT\nBy: Kevin Lin\n\n\nObjective:\n  This project leverages a pretrained BERT model (DistilBERT) to classify Amazon book reviews \n  into one of 16 predefined categories using the review text as input. Many of the methods \n  and approaches used here are adapted from "Lab 7" in our coursework, including \n  tokenization, label encoding, model fine-tuning with HuggingFace’s Trainer API, and evaluation.\n\n  However, the original lab focused on binary classification for spam/not, so the code \n  had to be adapted to support 16 unique categories, including manual label encoding, metric \n  handling for each class, and configuring the model output size accordingly.\n\nData Source:\n  The original dataset is available on Kaggle:\n    https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews?select=books_data.csv\n\nData Preparation:\n  The raw dataset was cleaned, stratified, and split using a separate prepr

In [2]:
"""
This cell is if additional file need to be uploaded
"""
# test_uploaded = files.upload()

'\nThis cell is if additional file need to be uploaded\n'

In [3]:
# Install necessary libraries for NLP model training and evaluation

!pip install datasets
!pip install transformers
!pip install evaluate

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [4]:
## Disable the Weights & Biases integration so that
## no W&B API key is needed to run this notebook

import os
os.environ["WANDB_DISABLED"] = "true"

# Load the tokenizer of the DistilBERT checkpoint ("distilbert-base-uncased");
# it is used below to tokenize the reviews and is also handed to the Trainer
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Tokenization helper (applied to the datasets with `map`); truncates long review texts
def preprocess_function(examples):
    """Tokenize the "review/text" field of `examples` with the notebook's tokenizer.

    Args:
        examples: Mapping that contains a "review/text" entry.

    Returns:
        The tokenizer output for that text, produced with `truncation=True`
        so that long reviews are truncated.
    """
    review_text = examples["review/text"]
    encoded = tokenizer(review_text, truncation=True)
    return encoded

In [6]:
# Training and testing CSVs are read directly from Google Drive.
# Both files are stratified so that the 16 categories are balanced.
csv_sources = {
    'train': 'https://drive.google.com/uc?export=download&id=1tSYiGAMF2Tcl3EogDEWdMn3-ftvqA4ZN',
    'test': 'https://drive.google.com/uc?export=download&id=1adXdX47B7jiikftExs-Q1GzHnU461JpX',
}
categorize_dataset = load_dataset('csv', data_files=csv_sources)


# Only the review text (input) and the category (target) are needed for modeling
columns_to_keep = ["review/text", "categories"]

# Drop every other column from each split
for split_name in ("train", "test"):
    split = categorize_dataset[split_name]
    unused_columns = [col for col in split.column_names if col not in columns_to_keep]
    categorize_dataset[split_name] = split.remove_columns(unused_columns)

Downloading data:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [7]:
# The data-cleaning notebook should already have shuffled the rows; shuffle again
# with a fixed seed (777) just in case.
categorize_train_dataset = categorize_dataset["train"].shuffle(seed=777)
categorize_test_dataset = categorize_dataset["test"].shuffle(seed=777)

# Tokenize (with truncation) every example of the training and test datasets.
# batched=True passes a batch of reviews to `preprocess_function` per call instead of
# a single review; the tokenizer accepts a list of texts, so the resulting
# "input_ids"/"attention_mask" columns are the same but are produced much faster.
categorize_tokenized_small_train = categorize_train_dataset.map(preprocess_function, batched=True)
categorize_tokenized_small_test = categorize_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/14400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [8]:
# Print the summary (features and row count) of the tokenized training dataset
print(categorize_tokenized_small_train)

# Fetch the first training example once and inspect its fields
first_example = categorize_tokenized_small_train[0]

# Original review text and category label, followed by the input token IDs
# and attention mask produced by the tokenizer
for caption, column in (
    ("Text:", "review/text"),
    ("Label:", "categories"),
    ("Input IDs:", "input_ids"),
    ("Attention Mask:", "attention_mask"),
):
    print(caption, first_example[column])

# Convert token IDs back to tokens
categorize_tokens = tokenizer.convert_ids_to_tokens(first_example["input_ids"])
print("Tokenized text:", categorize_tokens)

Dataset({
    features: ['review/text', 'categories', 'input_ids', 'attention_mask'],
    num_rows: 14400
})
Text: A good book for Audi owners, and fans in general.The book dosen't just focus on the quattro itself, but the surrounding models as well.Not too much of a technical book, but more of an enthusiasts read.
Label: Sports & Recreation
Input IDs: [101, 1037, 2204, 2338, 2005, 20075, 5608, 1010, 1998, 4599, 1999, 2236, 1012, 1996, 2338, 13004, 2078, 1005, 1056, 2074, 3579, 2006, 1996, 24209, 19321, 3217, 2993, 1010, 2021, 1996, 4193, 4275, 2004, 2092, 1012, 2025, 2205, 2172, 1997, 1037, 4087, 2338, 1010, 2021, 2062, 1997, 2019, 20305, 3191, 1012, 102]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokenized text: ['[CLS]', 'a', 'good', 'book', 'for', 'audi', 'owners', ',', 'and', 'fans', 'in', 'general', '.', 'the', 'book', 'dose', '##n', "'", 't', 'just', 'fo

In [9]:
# Step 1: Manually define a mapping from category names to numeric labels (0–15)
# This is necessary for multiclass classification with 16 categories.
# The position of a name in the list below is its numeric label.
print("Category-to-Label mapping...")
category_to_label = {
    name: index
    for index, name in enumerate([
        'Fiction',
        'Religion',
        'History',
        'Juvenile Fiction',
        'Biography & Autobiography',
        'Business & Economics',
        'Computers',
        'Social Science',
        'Juvenile Nonfiction',
        'Science',
        'Education',
        'Cooking',
        'Sports & Recreation',
        'Family & Relationships',
        'Literary Criticism',
        'Music',
    ])
}

# Show the mapping, one "label: category" line per entry
print("\n".join(f"  {label}: {category}" for category, label in category_to_label.items()))

# Step 2: Map each example's string category label to its numeric ID
# Ensures compatibility with the classification using BERT
def encode_labels(example):
    """Store the numeric label of `example["categories"]` under `example["labels"]`.

    Args:
        example: Mapping with a "categories" entry holding the category name.

    Returns:
        The same `example`, with the "labels" entry set from `category_to_label`.

    Raises:
        ValueError: If the category name is not a key of `category_to_label`.
    """
    category = example["categories"]
    if category in category_to_label:
        example["labels"] = category_to_label[category]
        return example
    raise ValueError(f"Unknown category: {category}")

# Step 3: Apply label encoding to the tokenized training and testing datasets.
# Each dataset variable is rebound to the result of `map(encode_labels)`, i.e. the
# same examples with the numeric "labels" entry that encode_labels adds.
# A message is printed before each pass so the progress bars below are identifiable.
print("\nEncoding training dataset...")
categorize_tokenized_small_train = categorize_tokenized_small_train.map(encode_labels)
print("\nEncoding testing dataset...")
categorize_tokenized_small_test = categorize_tokenized_small_test.map(encode_labels)

print("\n All categories encoded manually.")

Category-to-Label mapping...
  0: Fiction
  1: Religion
  2: History
  3: Juvenile Fiction
  4: Biography & Autobiography
  5: Business & Economics
  6: Computers
  7: Social Science
  8: Juvenile Nonfiction
  9: Science
  10: Education
  11: Cooking
  12: Sports & Recreation
  13: Family & Relationships
  14: Literary Criticism
  15: Music

Encoding training dataset...


Map:   0%|          | 0/14400 [00:00<?, ? examples/s]


Encoding testing dataset...


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]


 All categories encoded manually.


In [10]:
from evaluate import load
import numpy as np

# Category name -> numeric label. This repeats the mapping used for label encoding
# (and must stay identical to it) so that the metric code in this cell is self-contained.
category_to_label = dict(
    (name, label_id)
    for label_id, name in enumerate((
        'Fiction',
        'Religion',
        'History',
        'Juvenile Fiction',
        'Biography & Autobiography',
        'Business & Economics',
        'Computers',
        'Social Science',
        'Juvenile Nonfiction',
        'Science',
        'Education',
        'Cooking',
        'Sports & Recreation',
        'Family & Relationships',
        'Literary Criticism',
        'Music',
    ))
)

# Inverse lookup: numeric label -> category name
label_to_category = {label_id: name for name, label_id in category_to_label.items()}

# Metric objects already loaded by `_get_metric`, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute overall and per-category classification metrics for the Trainer.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class of each example is
            the argmax of its logits over the last axis.

    Returns:
        dict with the keys "accuracy", "f1_macro", "f1_weighted" and, for every
        category in `label_to_category`, "<category>_f1", "<category>_precision"
        and "<category>_recall".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Every metric is loaded once and reused, instead of calling `load` six times
    # on each evaluation.
    f1_metric = _get_metric("f1")
    precision_metric = _get_metric("precision")
    recall_metric = _get_metric("recall")

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]

    # Per class metrics.
    # The label set is passed explicitly so that position i of each returned array is
    # always class id i. Without it, only classes that occur in the labels/predictions
    # are scored, and a missing class would shift every later score onto the wrong
    # category name. A class with no examples and no predictions gets a score of 0.
    class_ids = sorted(label_to_category)
    f1 = f1_metric.compute(
        predictions=predictions, references=labels, average=None, labels=class_ids
    )["f1"]
    precision = precision_metric.compute(
        predictions=predictions, references=labels, average=None, labels=class_ids
    )["precision"]
    recall = recall_metric.compute(
        predictions=predictions, references=labels, average=None, labels=class_ids
    )["recall"]

    # Overall scores
    # Note to self: Macro and Weighted should be the same since the testing dataset is stratified
    f1_macro = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]
    f1_weighted = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]

    # Assemble all per-category metrics into a dict
    class_metrics = {}
    for position, class_id in enumerate(class_ids):
        name = label_to_category[class_id]
        class_metrics[f"{name}_f1"] = f1[position]
        class_metrics[f"{name}_precision"] = precision[position]
        class_metrics[f"{name}_recall"] = recall[position]

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        **class_metrics
    }

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import numpy as np
from evaluate import load

# Collator that pads the examples of each batch (the tokenization step only truncates)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The size of the classification head is derived from the category mapping instead of
# a hard-coded 16, and the id <-> name mappings are stored in the model config so that
# saved checkpoints report category names rather than generic LABEL_<n> ids.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(category_to_label),
    id2label=label_to_category,
    label2id=category_to_label,
)

# NOTE(review): output_dir is under /content/drive, but no Google Drive mount is visible
# in this notebook — confirm Drive is mounted, otherwise checkpoints are written to the
# local (temporary) Colab disk.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/NLP Final Project: Book Review Categorization/",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"  # no external experiment tracker
)

# Trainer wiring: train on the tokenized training split, evaluate on the tokenized
# test split, and report the metrics produced by compute_metrics.
categorize_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=categorize_tokenized_small_train,
    eval_dataset=categorize_tokenized_small_test,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
"""

Warning! This is a long training process. Using a dedicated GPU would speed it up a lot.
Using Google's T4, it took us ~1 hour.

"""
# Fine-tune the model; the value returned by train() is the last expression of the
# cell and is therefore shown as the cell output.
categorize_trainer.train()

Step,Training Loss
500,1.7057
1000,0.9861
1500,0.7021
2000,0.5947
2500,0.4595
3000,0.3639
3500,0.323
4000,0.2562
4500,0.243


TrainOutput(global_step=4500, training_loss=0.6260179036458333, metrics={'train_runtime': 2967.0931, 'train_samples_per_second': 24.266, 'train_steps_per_second': 1.517, 'total_flos': 8665998748534272.0, 'train_loss': 0.6260179036458333, 'epoch': 5.0})

In [13]:
import pandas as pd

# Evaluate the fine-tuned model on the evaluation dataset given to the Trainer
results = categorize_trainer.evaluate()

# Build one row per category. The Trainer results carry an "eval_" prefix, so each
# value is looked up as "eval_<category>_<metric>"; a missing entry becomes None.
metric_names = ("f1", "precision", "recall")
category_rows = [
    {
        "category": label_to_category[label_id],
        **{
            metric: results.get(f"eval_{label_to_category[label_id]}_{metric}", None)
            for metric in metric_names
        },
    }
    for label_id in range(len(category_to_label))
]

# Rows are categories, columns are the metrics
metrics_df = pd.DataFrame(category_rows)

# Write the table to a CSV file (without the index column)
filename = "per_category_metrics.csv"
metrics_df.to_csv(filename, index=False)

print("Results saved to per_category_metrics.csv")

"""

Reminder: Graphs are created on an online google sheet see the link below for reference
https://docs.google.com/spreadsheets/d/1M2TWhAH-0tazEH4AJgSOvjWm4w9YWnKaM0-hGu9ELtY/edit?usp=sharing

"""

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Results saved to per_category_metrics.csv


'\n\nReminder: Graphs are created on an online google sheet see the link below for reference\nhttps://docs.google.com/spreadsheets/d/1M2TWhAH-0tazEH4AJgSOvjWm4w9YWnKaM0-hGu9ELtY/edit?usp=sharing\n\n'