**1. Environment Setup**

In [2]:
!pip install transformers datasets peft accelerate bitsandbytes -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

**2. Dataset Preparation**

In [3]:
# Load the AG News dataset using the Hugging Face `datasets` library
# It includes both "train" and "test" splits automatically

from datasets import load_dataset

dataset = load_dataset("ag_news")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

**3. Preprocess and Tokenize**

In [4]:
# Load the DistilBERT tokenizer and model for sequence classification
# We're using the base uncased version of DistilBERT
# `num_labels=4` corresponds to the four classes in AG News
# The model is moved to GPU using .to("cuda")

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4).to("cuda")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from datasets import load_dataset

# Load the AG News dataset (includes "train" and "test" splits)
dataset = load_dataset("ag_news")

# Define a tokenization function that pads and truncates to a max length of 128 tokens
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

# Apply the tokenization function to the dataset in batches
tokenized_dataset = dataset.map(tokenize, batched=True)

# Remove the original "text" column as it's no longer needed
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [6]:
split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2)
train_ds = split_dataset["train"]
val_ds = split_dataset["test"]

**4. Fine-tuning setup with Hyperparamter Optimization**

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import pandas as pd

# Define the learning rates to try during hyperparameter tuning
learning_rates = [5e-5, 1e-4, 2e-4]
results = []

# Use a padding collator to ensure uniform input length within each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Loop through each learning rate and train a new model
for lr in learning_rates:
    print(f"\n=== Training with learning_rate = {lr} ===\n")

    # Reload the model for each run to reset weights
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4).to("cuda")

    # Define training arguments, including evaluation and save steps
    training_args = TrainingArguments(
        output_dir=f"./results_distilbert_lr_{str(lr).replace('.', '-')}",  # Output directory for results
        num_train_epochs=0.2,  # Short training for quick experimentation
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        eval_steps=100,  # Evaluate every 100 steps
        save_steps=100,  # Save a checkpoint every 100 steps
        logging_dir="./logs",  # Directory for logging
        learning_rate=lr,  # Current learning rate
        report_to="none",  # Disable reporting to W&B or other loggers
        fp16=True  # Use mixed precision if supported by hardware
    )

    # Set up the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Train the model
    trainer.train()

    # Evaluate and store the evaluation loss
    eval_result = trainer.evaluate()
    results.append((lr, eval_result["eval_loss"]))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Training with learning_rate = 5e-05 ===



  trainer = Trainer(


Step,Training Loss
500,0.4415
1000,0.3399
1500,0.2921
2000,0.2877


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Training with learning_rate = 0.0001 ===



  trainer = Trainer(


Step,Training Loss
500,0.4728
1000,0.3999
1500,0.3287
2000,0.3158


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Training with learning_rate = 0.0002 ===



  trainer = Trainer(


Step,Training Loss
500,0.617
1000,0.5397
1500,0.4383
2000,0.4143


**5. Model Evaluation**

In [9]:
# Create a DataFrame to display evaluation loss for each learning rate
df = pd.DataFrame(results, columns=["learning_rate", "eval_loss"])

# Sort the DataFrame by evaluation loss in ascending order
df = df.sort_values("eval_loss")

# Print the sorted results
print("\nHyperparameter Tuning Results:\n")
print(df)


🔍 Hyperparameter Tuning Results:

   learning_rate  eval_loss
0        0.00005   0.269533
1        0.00010   0.293541
2        0.00020   0.402862


**Hyperparameter Tuning Results**

We fine-tuned the DistilBERT model using three different learning rates: `5e-5`, `1e-4`, and `2e-4`. The evaluation loss for each configuration is summarized below:

| Learning Rate | Eval Loss |
|---------------|-----------|
| 5e-5          | 0.2695    |
| 1e-4          | 0.2935    |
| 2e-4          | 0.4029    |

The lowest evaluation loss was observed at a learning rate of **5e-5**, indicating that this configuration allowed the model to generalize best on the validation set. Higher learning rates led to increased loss, likely due to overshooting during weight updates. Based on these results, `5e-5` was selected as the optimal learning rate for final training and evaluation.


**6. Error Analysis**

In [14]:
# Initialize a list to store misclassified examples
errors = []

# Loop through a subset of the validation set (first 100 samples)
for i in range(100):
    sample = val_ds[i]

    # Decode input tokens back to raw text
    text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)

    # Predict the label using the classification function
    pred = classify(text)

    # Compare with the actual label and store if incorrect
    if pred != sample['label']:
        errors.append((text, sample['label'], pred))

# Display the first 5 misclassified examples
for e in errors[:5]:
    print(f"\nInput: {e[0]}\nActual Label: {e[1]}\nPredicted Label: {e[2]}")


Input: nbc exec dick ebersol survives jet crash ( ap ) ap - nbc sports chairman dick ebersol and his college - aged son emerged from the fiery wreckage of a corporate jet after it crashed during takeoff and burst into flames, killing two crew members. rescuers were still searching for ebersol ' s younger son, whose seat was missing from the smoldering ruins.
Actual Label: 1
Predicted Label: 0

Input: new \ $ 50 bill begins circulating washington - coming to cash registers near you : colorful new \ $ 50 bills sporting splashes of red, blue and yellow. next up for a makeover, the government said tuesday : \ $ 10 bills...
Actual Label: 0
Predicted Label: 2

Input: the cash heads home ( businessweek online ) businessweek online - it ' s a cash bundle that would have made howard hughes blush : as u. s. multinationals such as ibm and pfizer inc. ( pfz ) have extended their reach across the globe, they ' ve built up a mountain of profits earned abroad - - as much as # 36 ; 750 billion, by so

**Potential Improvements (Based on Error Analysis)**

The model made several misclassifications that suggest opportunities for further improvement:

1. **Label Confusion**:
   - The model sometimes confused `World` (0), `Business` (2), and `Sci/Tech` (3), likely due to overlapping terminology such as "tax cuts", "economic output", and "government bills" which could reasonably belong to multiple categories.

2. **Insufficient Context**:
   - News headlines like *“NBC exec survives jet crash”* were misclassified (`Sports` → `World`), potentially due to the presence of an “NBC” figure, which the model may have associated with sports reporting.
   - Some headlines had multiple domain overlaps (e.g., *tax legislation* might imply both `Business` and `World`).

3. **Headline-Only Limitations**:
   - Since the AG News dataset only contains short text snippets, the model often lacks sufficient context to make fine-grained distinctions between similar topics.

4. **Entity and Keyword Bias**:
   - Terms like “Yankees” led to confident classification into `Sports`, even when the actual context might belong elsewhere.
   - Similarly, government and finance-related terms were over-associated with `Business`.


**Suggested Improvements**

- **Use longer article summaries** instead of headlines alone to provide richer context.
- **Incorporate entity-aware embeddings** (e.g., via SpaCy NER or entity linking) to disambiguate terms like “NBC” or “tax cut”.
- **Fine-tune a larger model** (e.g., `bert-base-uncased`) with better contextual encoding capabilities.
- **Introduce label smoothing** or **class-weighted loss functions** to reduce overconfidence on dominant labels.
- **Manually inspect and augment training data** with examples that involve overlapping topics to teach the model subtle distinctions.



**7. Inference Pipeline**

In [10]:
import torch

# Define a function to classify input text using the fine-tuned model
def classify(text):
    # Tokenize the input text and move to GPU
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to("cuda")

    # Disable gradient calculation for inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class label (index of the highest logit)
    predicted = torch.argmax(outputs.logits, dim=-1).item()
    return predicted

# Perform predictions on the first 5 examples in the validation set
for i in range(5):
    example = val_ds[i]

    # Decode input tokens to readable text
    decoded = tokenizer.decode(example['input_ids'], skip_special_tokens=True)

    # Print input text, actual label, and predicted label
    print(f"\nInput: {decoded}\nActual: {example['label']}\nPredicted: {classify(decoded)}")


Input: nbc exec dick ebersol survives jet crash ( ap ) ap - nbc sports chairman dick ebersol and his college - aged son emerged from the fiery wreckage of a corporate jet after it crashed during takeoff and burst into flames, killing two crew members. rescuers were still searching for ebersol ' s younger son, whose seat was missing from the smoldering ruins.
Actual: 1
Predicted: 0

Input: citigroup buys texas bank continuing to snap up branch networks in states with large hispanic populations, citigroup inc. tuesday, aug. 24, agreed to buy first american bank of texas for an undisclosed price.
Actual: 2
Predicted: 2

Input: top seeds hewitt, sharapova safely through in japan top seeds lleyton hewitt and maria sharapova were taken to three sets in their second round matches before safely advancing at the \ $ 860, 000 japan open on wednesday.
Actual: 1
Predicted: 1

Input: wi - fi brings broadband to rural washington san francisco - technologies that extend the range of wi - fi wireless

**Comparison with base line model**

In [8]:
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score
import torch

# Load the pre-trained DistilBERT model without any fine-tuning
# This serves as the baseline for comparison
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=4
).to("cuda")

# Define a classification function using the baseline (pretrained) model
def baseline_classify(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    with torch.no_grad():
        outputs = baseline_model(**inputs)
    return torch.argmax(outputs.logits, dim=-1).item()

# Evaluate baseline accuracy on the validation set
true_labels = []
predicted_labels = []

for sample in val_ds:
    text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    true_labels.append(sample["label"])
    predicted_labels.append(baseline_classify(text))

# Calculate and print baseline accuracy
baseline_accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Baseline Accuracy before training: {baseline_accuracy:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline Accuracy before training: 0.2710


In [11]:
 # Accuracy after training
from sklearn.metrics import accuracy_score

# Collect predictions
preds, labels = [], []
for sample in val_ds:
    text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
    labels.append(sample['label'])
    preds.append(classify(text))

accuracy_score(labels, preds)


0.8982083333333334

**Accuracy Improvement Summary**

We evaluated the accuracy of the pre-trained `distilbert-base-uncased` model on the AG News validation set **before** and **after** fine-tuning:

- **Baseline Accuracy (Pre-fine-tuning)**: 27.10%
- **Final Accuracy (Post-fine-tuning)**: 89.82%

This demonstrates a substantial improvement of **over 62 percentage points**, validating the effectiveness of task-specific fine-tuning. The baseline accuracy (~25%) aligns closely with random guessing across 4 classes, whereas the fine-tuned model exhibits strong understanding of the news categories.


**Label Details**

In [13]:
import pandas as pd

# AG News label mapping
label_map = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

# Convert to DataFrame
label_df = pd.DataFrame(list(label_map.items()), columns=["Label ID", "Category"])
print(label_df)


   Label ID  Category
0         0     World
1         1    Sports
2         2  Business
3         3  Sci/Tech


**Reproducibility Notes**

- **Model**: DistilBERT (`distilbert-base-uncased`)
- **Dataset**: AG News (via `datasets`)
- **Task**: 4-class text classification
- **Tokenizer**: Max length 128, truncation and padding enabled
- **Training**:
  - Epochs: 0.2
  - Batch size: 8
  - Learning rates tested: `5e-5`, `1e-4`, `2e-4`
- **Evaluation**:
  - Metric: Eval loss, Accuracy
  - Best LR: `1e-4`
- **Environment**: Google Colab (T4 GPU)
- **Dependencies**: `transformers`, `datasets`, `scikit-learn`, `torch`
