In [9]:
import torch
# Query the CUDA runtime once and reuse the answer for both report lines.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
print("GPU:", torch.cuda.get_device_name(0) if cuda_ok else "None")


CUDA available: True
GPU: Tesla T4


In [10]:
!pip -q install -U transformers datasets accelerate evaluate scikit-learn

In [11]:
import transformers, datasets, evaluate
# Report the installed version of each core library, one per line.
for label, lib in (
    ("transformers:", transformers),
    ("datasets:", datasets),
    ("evaluate:", evaluate),
):
    print(label, lib.__version__)

transformers: 4.57.3
datasets: 4.4.2
evaluate: 0.4.6


In [12]:
from datasets import load_dataset

ds = load_dataset("ag_news")
train_split = ds["train"]

# Overview of the splits, their sizes and the available columns.
print(ds)
for heading, split_name in (("Train size:", "train"), ("Test size:", "test")):
    print(heading, len(ds[split_name]))
print("Columns:", train_split.column_names)

# Peek at a single raw record.
print("\nExample row:")
print(train_split[0])

# Human-readable class names, indexed by integer label id.
label_names = train_split.features["label"].names
print("\nLabel names:", label_names)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
Train size: 120000
Test size: 7600
Columns: ['text', 'label']

Example row:
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}

Label names: ['World', 'Sports', 'Business', 'Sci/Tech']


In [13]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer here and the classifier built later.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_fn(batch, max_length=128):
    """Tokenize the ``text`` entries of a batch into fixed-length encodings.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode,
            as passed by ``map(..., batched=True)``.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 128, the value used throughout this notebook.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize every split, drop the raw text, and rename the target column to
# 'labels' (required by Trainer).
tokenized_ds = (
    ds.map(tokenize_fn, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Return PyTorch tensors when rows are indexed.
tokenized_ds.set_format("torch")

print(tokenized_ds)
print("\nSample encoded item:")
first_row = tokenized_ds["train"][0]
print({name: tensor.shape for name, tensor in first_row.items()})

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 7600
    })
})

Sample encoded item:
{'labels': torch.Size([]), 'input_ids': torch.Size([128]), 'attention_mask': torch.Size([128])}


In [14]:
# Hold out 10% of the training data as a validation set (fixed seed so the
# split is reproducible); the official test split is left untouched.
split = tokenized_ds["train"].train_test_split(test_size=0.1, seed=42)

train_ds, val_ds = split["train"], split["test"]
test_ds = tokenized_ds["test"]

for title, subset in (("Train:", train_ds), ("Val:", val_ds), ("Test:", test_ds)):
    print(title, len(subset))

Train: 108000
Val: 12000
Test: 7600


In [15]:
import numpy as np
import evaluate

# Metric objects are loaded once and reused on every evaluation call.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)

    acc_result = accuracy.compute(predictions=predicted_ids, references=references)
    f1_result = f1.compute(
        predictions=predicted_ids, references=references, average="macro"
    )
    return {"accuracy": acc_result["accuracy"], "f1_macro": f1_result["f1"]}

In [17]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Derive the class count and the id <-> name mappings from the dataset's own
# label names instead of hard-coding them. Storing the mappings in the model
# config means the saved model reports real class names rather than LABEL_0..3.
num_labels = len(label_names)
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="models_artifacts/agnews_distilbert",
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    # Mixed precision needs a GPU (good on T4); fall back to full precision
    # on a CPU-only runtime instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    # After training, reload the checkpoint with the best validation macro-F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none"
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer;
# passing the tokenizer here also lets Trainer save it with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

print("Trainer ready ✅")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer ready ✅


  trainer = Trainer(


In [18]:
# Baseline: score the model on the Trainer's eval dataset (the validation
# split) before any fine-tuning, as a reference point for the trained model.
metrics = trainer.evaluate()
metrics

{'eval_loss': 1.3916208744049072,
 'eval_model_preparation_time': 0.0195,
 'eval_accuracy': 0.22316666666666668,
 'eval_f1_macro': 0.14913259737681242,
 'eval_runtime': 14.1256,
 'eval_samples_per_second': 849.52,
 'eval_steps_per_second': 26.547}

In [19]:
# Fine-tune with the settings in `training_args`: evaluation and a checkpoint
# every epoch, then the checkpoint with the best `f1_macro` is reloaded.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1 Macro
1,0.1996,0.186885,0.0195,0.942083,0.941764
2,0.1448,0.193537,0.0195,0.94575,0.945492


TrainOutput(global_step=13500, training_loss=0.1843505736456977, metrics={'train_runtime': 898.6564, 'train_samples_per_second': 240.359, 'train_steps_per_second': 15.022, 'total_flos': 7153494663168000.0, 'train_loss': 0.1843505736456977, 'epoch': 2.0})

In [20]:
# Final check on the held-out test split, which was not used for training
# or for selecting the best checkpoint.
test_metrics = trainer.evaluate(eval_dataset=test_ds)
test_metrics

{'eval_loss': 0.18877087533473969,
 'eval_model_preparation_time': 0.0195,
 'eval_accuracy': 0.9471052631578948,
 'eval_f1_macro': 0.947132949013737,
 'eval_runtime': 8.8067,
 'eval_samples_per_second': 862.978,
 'eval_steps_per_second': 27.025,
 'epoch': 2.0}

In [21]:
# Store the final model (the best checkpoint, reloaded at the end of training)
# and its tokenizer under the Trainer's output directory. Deriving the path
# from `training_args` keeps the two locations from drifting apart.
save_dir = f"{training_args.output_dir}/best"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("Saved to:", save_dir)

Saved to: models_artifacts/agnews_distilbert/best


In [22]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Get test predictions and turn the logits into class ids.
preds = trainer.predict(test_ds)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=-1)

# Pass the full list of class ids explicitly so the matrix always has one
# row/column per class and the report's `target_names` line up even if a
# class is missing from the labels or the predictions.
class_ids = list(range(len(label_names)))

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
print("Confusion matrix:\n", cm)

print("\nClassification report:")
print(classification_report(
    y_true, y_pred, labels=class_ids, target_names=label_names, digits=4
))

Confusion matrix:
 [[1811   11   43   35]
 [   9 1875   10    6]
 [  31    7 1738  124]
 [  27   11   88 1774]]

Classification report:
              precision    recall  f1-score   support

       World     0.9643    0.9532    0.9587      1900
      Sports     0.9848    0.9868    0.9858      1900
    Business     0.9250    0.9147    0.9198      1900
    Sci/Tech     0.9149    0.9337    0.9242      1900

    accuracy                         0.9471      7600
   macro avg     0.9472    0.9471    0.9471      7600
weighted avg     0.9472    0.9471    0.9471      7600



In [23]:
sample_texts = [
    "The stock market rallied today as major tech companies reported strong earnings.",
    "The local team won the championship after a dramatic final match.",
    "Scientists discovered a new particle that could change physics theories."
]

# Put the model in inference mode explicitly so dropout is disabled no matter
# which cell ran last.
model.eval()

inputs = tokenizer(sample_texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
with torch.no_grad():
    outputs = model(**inputs)
preds = torch.argmax(outputs.logits, dim=-1)

# Convert the predicted ids to plain ints before indexing the label list.
for text, label_id in zip(sample_texts, preds.tolist()):
    print(f"[{label_names[label_id]}] {text}")

[Business] The stock market rallied today as major tech companies reported strong earnings.
[Sports] The local team won the championship after a dramatic final match.
[Sci/Tech] Scientists discovered a new particle that could change physics theories.
