In [2]:
# Step 2: Import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [3]:
# Step 3: Load dataset
dataset = load_dataset("imdb")

Downloading readme: 0.00B [00:00, ?B/s]

In [4]:
# Step 4: Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

# Apply tokenizer to dataset
encoded_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [5]:
# Step 5: Load pretrained DistilBERT for classification (2 classes)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Step 6: Training setup
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

In [8]:
# Step 7: Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"].shuffle(seed=42).select(range(5000)), # subset for quick demo
    eval_dataset=encoded_dataset["test"].shuffle(seed=42).select(range(2000)),
)

In [9]:
# Step 8: Train
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.285095
2,0.297900,0.257951


TrainOutput(global_step=626, training_loss=0.27347692056966666, metrics={'train_runtime': 148.1028, 'train_samples_per_second': 67.521, 'train_steps_per_second': 4.227, 'total_flos': 1324673986560000.0, 'train_loss': 0.27347692056966666, 'epoch': 2.0})

In [10]:
# Step 9: Evaluate
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.25795140862464905, 'eval_runtime': 14.7375, 'eval_samples_per_second': 135.708, 'eval_steps_per_second': 8.482, 'epoch': 2.0}


In [16]:
# Step 10: Make predictions
text = "The movie was absolutely fantastic, loved every moment!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
from torch import device

# Pick GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Send model to device
model.to(device)

# Send inputs to device
inputs = {k: v.to(device) for k, v in inputs.items()}

# Now safe to run
outputs = model(**inputs)
pred = torch.argmax(outputs.logits)
print("Prediction:", "Positive" if pred.item() == 1 else "Negative")

Prediction: Positive
