In [None]:
# Colab setup: upgrade build tools, install libs
!pip install --upgrade pip setuptools wheel --quiet
!pip install transformers datasets scikit-learn torch --quiet


In [None]:
# Imports
import torch
from torch.utils.data import DataLoader
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup, # Removed AdamW from here
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.auto import tqdm
from torch.optim import AdamW # Import AdamW from torch.optim instead

In [None]:
# 1. Load Amazon Polarity
# Split sizes: roughly 3.6M training examples and 400K test examples.
dataset = load_dataset(path="amazon_polarity")

# Quick peek at the first training example.
# Each example is a dict with the keys 'label', 'title' and 'content'.
print(dataset["train"][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/260M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

{'label': 1, 'title': 'Stuning even for the non-gamer', 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}


In [None]:
# 2. Tokenization
# Checkpoint name; the tokenizer is loaded from it here and the model cell
# loads the classifier from the same name.
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

def tokenize_batch(batch, max_length=128):
    """Tokenize one batch of reviews with the module-level `tokenizer`.

    Args:
        batch: Mapping with parallel "title" and "content" sequences (the
            form `Dataset.map(..., batched=True)` passes in).
        max_length: Length every example is truncated or padded to.
            Defaults to 128.

    Returns:
        Whatever `tokenizer` returns for the joined texts.
    """
    # Combine title + content; you may drop title if too long.
    texts = []
    for title, content in zip(batch["title"], batch["content"]):
        # Treat a missing field as empty text instead of failing on `None + str`.
        title = title or ""
        content = content or ""
        # Skip the ". " separator when there is no title, so the text does not
        # start with a stray punctuation mark.
        texts.append(f"{title}. {content}" if title else content)
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Apply
# `map` drops the raw "title"/"content" columns and the result is bound back to
# `dataset`, so a second run of this cell would fail on the missing columns.
# Tokenize only while the raw columns are still present; that keeps the cell
# safe to re-run.
if "title" in dataset["train"].column_names:
    dataset = dataset.map(tokenize_batch, batched=True, remove_columns=["title", "content"])
# Expose only the columns the training/validation loops read, as torch tensors.
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [None]:
# Optional: use a smaller training subset for quick iteration.
# Set TRAIN_SUBSET_SIZE to an int (e.g. 200_000) to train on a random sample;
# leave it as None to train on the full split.
TRAIN_SUBSET_SIZE = None
SUBSET_SEED = 42

train_ds = dataset["train"]
if TRAIN_SUBSET_SIZE is not None:
    # Shuffle before selecting so the subset is a random sample rather than the
    # first N rows, and clamp so an oversized request cannot index past the end.
    subset_size = min(TRAIN_SUBSET_SIZE, len(train_ds))
    train_ds = train_ds.shuffle(seed=SUBSET_SEED).select(range(subset_size))

# The dataset's "test" split is used for validation.
val_ds = dataset["test"]


In [None]:
# 3. DataLoaders
# One batch size for both loaders; only the training loader reshuffles.
batch_size = 16
train_loader = DataLoader(dataset=train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_ds, shuffle=False, batch_size=batch_size)


In [None]:
# 4. Model & optimizer
# Seed PyTorch before the model is built: the classification head is newly
# initialised rather than loaded from the checkpoint, and the training
# DataLoader shuffles, so runs are only repeatable with a fixed seed.
SEED = 42
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# Scheduler: warmup over the first 10% of steps, then linear decay.
# The schedule length is derived from `epochs`; keep this value equal to the
# number of epochs the training loop runs, otherwise the learning rate reaches
# zero too early or never finishes decaying.
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_steps // 10,
    num_training_steps=total_steps,
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 5. Training loop
def _train_one_epoch(model, loader, optimizer, scheduler, device, desc):
    """Run one optimisation pass over `loader`.

    For every batch: clear gradients, compute the loss the model returns when
    given labels, backpropagate, then step the optimizer and the scheduler.
    The latest batch loss is shown on the progress bar.
    """
    model.train()
    progress = tqdm(loader, desc=desc)
    for batch in progress:
        optimizer.zero_grad()
        outputs = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["label"].to(device),
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per batch, not per epoch
        progress.set_postfix(loss=loss.item())


def _evaluate(model, loader, device, desc="Validation"):
    """Score `model` on `loader`.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`; precision, recall and F1 use
        `average="binary"`.
    """
    model.eval()
    all_preds, all_labels = [], []
    # A single no_grad context for the whole pass instead of one per batch.
    with torch.no_grad():
        for batch in tqdm(loader, desc=desc):
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            all_preds.extend(logits.argmax(dim=-1).cpu().tolist())
            all_labels.extend(batch["label"].cpu().tolist())

    acc = accuracy_score(all_labels, all_preds)
    # zero_division=0 reports 0.0 without a warning if a class is never predicted.
    prec, rec, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    return acc, prec, rec, f1


epochs = 3
for epoch in range(epochs):
    _train_one_epoch(
        model, train_loader, optimizer, scheduler, device,
        desc=f"Train Epoch {epoch+1}",
    )

    # 6. Validation
    acc, prec, rec, f1 = _evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1} ▶  Acc: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")


Train Epoch 1:   0%|          | 0/225000 [00:00<?, ?it/s]

KeyboardInterrupt: 