In [1]:
!pip install -q transformers datasets accelerate evaluate scikit-learn



[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: C:\Users\karan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
import evaluate





In [2]:
# Load the "ag_news" dataset through `datasets.load_dataset`.
dataset = load_dataset("ag_news")
# Display the loaded object so its splits, columns and row counts are visible.
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
# Seed PyTorch before building the model: the base checkpoint ships without a
# classification head, so `pre_classifier` / `classifier` are randomly
# initialised and would otherwise differ between runs.
torch.manual_seed(42)

# Tokenizer that matches the pretrained checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# DistilBERT encoder with a new 4-way sequence-classification head.
# NOTE(review): num_labels=4 is assumed to equal the number of classes in the dataset.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# NOTE(review): this cell repeats the tokenizer/model setup of the cell right
# before it; running it simply rebuilds both objects from the checkpoint.
# Consider deleting one of the two cells.

# Seed PyTorch before building the model: the base checkpoint ships without a
# classification head, so `pre_classifier` / `classifier` are randomly
# initialised and would otherwise differ between runs.
torch.manual_seed(42)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the module-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        example: Mapping with a ``"text"`` entry (a batch of rows when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    encoded = tokenizer(example["text"], **tokenizer_options)
    return encoded

# Tokenize every split in batches, then reshape the columns: drop the raw
# text and expose the target column under the name "labels".
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Switch the output format of the tokenized splits to "torch".
tokenized_dataset.set_format("torch")

train_dataset, test_dataset = tokenized_dataset["train"], tokenized_dataset["test"]


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [8]:
# Sub-sample sizes used to keep the run short.
N_TRAIN_SAMPLES = 3000
N_TEST_SAMPLES = 1000

# Always draw from the full tokenized splits (not from `train_dataset` /
# `test_dataset` themselves) so that re-running this cell yields the same
# subsets instead of re-shuffling an already reduced dataset.
train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(N_TRAIN_SAMPLES))
test_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(N_TEST_SAMPLES))


In [9]:
# Load the four evaluation metrics that `compute_metrics` relies on.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into a dict of classification metrics.

    The predicted class is the argmax over axis 1 of the logits. Accuracy is
    computed as-is; precision, recall and F1 use ``average="weighted"``.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    scores = {
        "accuracy": accuracy.compute(predictions=predicted, references=labels)["accuracy"],
    }
    # The remaining metrics share the same call shape, so compute them in a loop.
    for key, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
        scores[key] = metric.compute(
            predictions=predicted, references=labels, average="weighted"
        )[key]
    return scores


In [10]:
# Configuration for a short fine-tuning run with periodic evaluation.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",        # current name of the deprecated `evaluation_strategy` argument
    eval_steps=50,
    save_steps=50,                # same cadence as eval_steps, so each evaluation has a checkpoint
    logging_steps=25,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    max_steps=200,                # hard limit on the number of optimisation steps
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none"
)




In [11]:
# Wire the model, data and metrics into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [12]:
# NOTE(review): this cell repeats the Trainer construction of the cell right
# before it; running it simply builds the Trainer a second time. Consider
# deleting one of the two cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [13]:
# Run fine-tuning with the settings in `training_args`; the returned training
# summary is shown as the cell output.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.9847,0.768086,0.834,0.844866,0.834,0.827688
100,0.5489,0.468084,0.876,0.875211,0.876,0.874932
150,0.4694,0.410682,0.876,0.877389,0.876,0.875581
200,0.4729,0.395729,0.882,0.88272,0.882,0.881696


TrainOutput(global_step=200, training_loss=0.6525976133346557, metrics={'train_runtime': 755.1474, 'train_samples_per_second': 2.119, 'train_steps_per_second': 0.265, 'total_flos': 52988849356800.0, 'train_loss': 0.6525976133346557, 'epoch': 0.5333333333333333})

In [14]:
# Evaluate with the Trainer (no arguments, so it uses the evaluation data it
# was constructed with) and keep the returned metrics.
results = trainer.evaluate()
# Show the metrics as the cell output.
results


{'eval_loss': 0.3957291841506958,
 'eval_accuracy': 0.882,
 'eval_precision': 0.882720268904532,
 'eval_recall': 0.882,
 'eval_f1': 0.8816961307662811,
 'eval_runtime': 74.142,
 'eval_samples_per_second': 13.488,
 'eval_steps_per_second': 1.686,
 'epoch': 0.5333333333333333}

In [15]:
text = "The government announced new economic policies today."

# Tokenize the single example and move the tensors to the model's device:
# after training the model may live on an accelerator while freshly created
# encodings are on the CPU, which would make the forward pass fail.
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)

# Inference only: switch off dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Index of the highest-scoring class for the example.
prediction = torch.argmax(outputs.logits, dim=1).item()
prediction


0