In [None]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score



1.   Prepare dataset
2.   Load pre-trained tokenizer, call it with dataset -> encoding
3.   Build PyTorch Dataset with encodings
4.   Load pre-trained model
1.   a) load trainer and train it  
b) or use native PyTorch training pipeline

In [None]:
model_name = 'distilbert-base-uncased'
from pathlib import Path

def read_imdb_split(split_dir):
  split_dir = Path(split_dir)
  texts = []
  labels = []

  for label_dir in ["pos", "neg"]:
    for text_file in (split_dir/label_dir).iterdir():
      texts.append(text_file.read_text())
      labels.append(0 if label_dir == "neg" else 1)

  return texts, labels

In [None]:
# Large movie review dataset
# http://ai.stanford.edu/~amaas/data/sentiment/

train_texts, train_labels = read_imdb_split(split_dir='aclImdb/train')
test_texts, test_labels = read_imdb_split(split_dir='aclImdb/test')

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


class IMDbDataset(Dataset):

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, index):
    item = {key: torch.tensor(val[index]) for key,val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[index])
    return item

  def __len__(self):
    return len(self.labels)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Ensure that all of our sequences are padded to the same length and are truncated to be no longer than model's maximum input length.
# this will allow us to feed batches of sequences into the model at the same time.

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
al_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    learning_rate=5e-5,
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10
)

model = DistilBertForSequenceClassification.from_pretrained(model_name)

trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()


FileNotFoundError: [Errno 2] No such file or directory: 'aclImdb/train/pos'

In [None]:
# Download the dataset
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

--2025-10-31 09:23:17--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: â€˜aclImdb_v1.tar.gzâ€™


2025-10-31 09:23:21 (21.5 MB/s) - â€˜aclImdb_v1.tar.gzâ€™ saved [84125825/84125825]



# Task
Replace the `Trainer` API with a native PyTorch training loop for the `DistilBertForSequenceClassification` model.

## Define optimizer and loss function

### Subtask:
Choose and define a suitable optimizer and loss function for the task.


**Reasoning**:
Import the necessary optimizer and define the loss function.



In [None]:
from transformers.optimization import AdamW
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

ImportError: cannot import name 'AdamW' from 'transformers.optimization' (/usr/local/lib/python3.12/dist-packages/transformers/optimization.py)

**Reasoning**:
The previous import of AdamW failed. The AdamW optimizer is available in `torch.optim`. Import AdamW from `torch.optim` instead and define the optimizer and loss function.



In [None]:
from torch.optim import AdamW
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

NameError: name 'model' is not defined

**Reasoning**:
The error indicates that the `model` variable is not defined. The model was defined in the previous code cell which failed. Re-execute the cell that defines the model and the necessary components before defining the optimizer.



In [None]:
from sklearn.model_selection import train_test_split

# Large movie review dataset
# http://ai.stanford.edu/~amaas/data/sentiment/

train_texts, train_labels = read_imdb_split(split_dir='aclImdb/train')
test_texts, test_labels = read_imdb_split(split_dir='aclImdb/test')

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


class IMDbDataset(Dataset):

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, index):
    item = {key: torch.tensor(val[index]) for key,val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[index])
    return item

  def __len__(self):
    return len(self.labels)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Ensure that all of our sequences are padded to the same length and are truncated to be no longer than model's maximum input length.
# this will allow us to feed batches of sequences intothe model at the same time.

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

model = DistilBertForSequenceClassification.from_pretrained(model_name)

from torch.optim import AdamW
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Create dataloaders

### Subtask:
Create PyTorch DataLoaders for the training and validation datasets.


**Reasoning**:
Import the DataLoader class and create DataLoaders for the training and validation datasets with the specified batch sizes and shuffling.



In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

## Implement training loop

### Subtask:
Write the code for the training loop, including iterating through epochs and batches, performing forward and backward passes, and updating model parameters.


**Reasoning**:
Implement the native PyTorch training loop as described in the instructions, iterating through epochs and batches, performing forward and backward passes, and updating model parameters.



In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

num_epochs = 3

for epoch in range(num_epochs):
  model.train()
  total_loss = 0

  for batch in train_dataloader:
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dataloader)}")

  model.eval()
  # Optional: Add evaluation on validation set here if needed

## Implement evaluation loop

### Subtask:
Write the code for the evaluation loop to assess the model's performance on the validation set.


**Reasoning**:
Write the code for the evaluation loop to assess the model's performance on the validation set.



In [None]:
total_eval_loss = 0
total_eval_correct = 0
total_eval_samples = 0

with torch.no_grad():
  for batch in val_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    logits = outputs.logits

    total_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    total_eval_correct += (predictions == labels).sum().item()
    total_eval_samples += labels.size(0)

avg_eval_loss = total_eval_loss / len(val_dataloader)
accuracy = total_eval_correct / total_eval_samples

print(f"Validation Loss: {avg_eval_loss}, Validation Accuracy: {accuracy}")

NameError: name 'torch' is not defined

**Reasoning**:
The previous code block failed because the `torch` library was not imported in the current session. Re-running the code block with the necessary import.



In [None]:
import torch

total_eval_loss = 0
total_eval_correct = 0
total_eval_samples = 0

with torch.no_grad():
  for batch in val_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    logits = outputs.logits

    total_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    total_eval_correct += (predictions == labels).sum().item()
    total_eval_samples += labels.size(0)

avg_eval_loss = total_eval_loss / len(val_dataloader)
accuracy = total_eval_correct / total_eval_samples

print(f"Validation Loss: {avg_eval_loss}, Validation Accuracy: {accuracy}")

NameError: name 'val_dataloader' is not defined

**Reasoning**:
The previous code block failed because `val_dataloader` was not defined in the current session. This variable was defined in a previous code block that was executed successfully. The current code block will be executed again along with the previous code blocks that define `val_dataloader` and other necessary variables.



In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW

# Large movie review dataset
# http://ai.stanford.edu/~amaas/data/sentiment/

train_texts, train_labels = read_imdb_split(split_dir='aclImdb/train')
test_texts, test_labels = read_imdb_split(split_dir='aclImdb/test')

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


class IMDbDataset(Dataset):

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, index):
    item = {key: torch.tensor(val[index]) for key,val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[index])
    return item

  def __len__(self):
    return len(self.labels)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Ensure that all of our sequences are padded to the same length and are truncated to be no longer than model's maximum input length.
# this will allow us to feed batches of sequences intothe model at the same time.

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

model = DistilBertForSequenceClassification.from_pretrained(model_name)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

total_eval_loss = 0
total_eval_correct = 0
total_eval_samples = 0

with torch.no_grad():
  for batch in val_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    logits = outputs.logits

    total_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    total_eval_correct += (predictions == labels).sum().item()
    total_eval_samples += labels.size(0)

avg_eval_loss = total_eval_loss / len(val_dataloader)
accuracy = total_eval_correct / total_eval_samples

print(f"Validation Loss: {avg_eval_loss}, Validation Accuracy: {accuracy}")

NameError: name 'read_imdb_split' is not defined

**Reasoning**:
The previous code block failed because the `read_imdb_split` function was not defined. This function was defined in a previous code block that was executed successfully. The current code block will be executed again along with the previous code blocks that define `read_imdb_split` and other necessary variables.



In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset
from pathlib import Path

def read_imdb_split(split_dir):
  split_dir = Path(split_dir)
  texts = []
  labels = []

  for label_dir in ["pos", "neg"]:
    for text_file in (split_dir/label_dir).iterdir():
      texts.append(text_file.read_text())
      labels.append(0 if label_dir == "neg" else 1)

  return texts, labels

model_name = 'distilbert-base-uncased'

# Large movie review dataset
# http://ai.stanford.edu/~amaas/data/sentiment/

train_texts, train_labels = read_imdb_split(split_dir='aclImdb/train')
test_texts, test_labels = read_imdb_split(split_dir='aclImdb/test')

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


class IMDbDataset(Dataset):

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, index):
    item = {key: torch.tensor(val[index]) for key,val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[index])
    return item

  def __len__(self):
    return len(self.labels)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Ensure that all of our sequences are padded to the same length and are truncated to be no longer than model's maximum input length.
# this will allow us to feed batches of sequences intothe model at the same time.

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

model = DistilBertForSequenceClassification.from_pretrained(model_name)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

total_eval_loss = 0
total_eval_correct = 0
total_eval_samples = 0

with torch.no_grad():
  for batch in val_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    logits = outputs.logits

    total_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    total_eval_correct += (predictions == labels).sum().item()
    total_eval_samples += labels.size(0)

avg_eval_loss = total_eval_loss / len(val_dataloader)
accuracy = total_eval_correct / total_eval_samples

print(f"Validation Loss: {avg_eval_loss}, Validation Accuracy: {accuracy}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Validation Loss: 0.6932007542139367, Validation Accuracy: 0.496
