In [7]:
!pip install transformers datasets scikit-learn torch

# 1. Imports
import torch
from torch.utils.data import DataLoader
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.auto import tqdm
from torch.optim import AdamW

# 2. Load and subset the dataset
dataset = load_dataset("amazon_polarity")

# Subset to 100,000 training samples and 10,000 validation samples
train_ds = dataset["train"].shuffle(seed=42).select(range(100_000))
val_ds   = dataset["test"].shuffle(seed=42).select(range(10_000))

# 3. Tokenization
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    texts = [t + ". " + c for t, c in zip(batch["title"], batch["content"])]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=["title", "content"])
val_ds   = val_ds.map(tokenize_batch, batched=True, remove_columns=["title", "content"])

train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# 4. Dataloaders
batch_size = 16
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size)

# 5. Model, optimizer, scheduler
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# 🟡 Updated for 1 epoch only
epochs = 1
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_steps // 10,
    num_training_steps=total_steps
)

# 6. Training Loop
for epoch in range(epochs):
    model.train()
    train_pbar = tqdm(train_loader, desc=f"Train Epoch {epoch+1}")
    for batch in train_pbar:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        masks     = batch["attention_mask"].to(device)
        labels    = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_pbar.set_postfix(loss=loss.item())

    # Validation
    model.eval()
    all_preds, all_labels = [], []
    for batch in tqdm(val_loader, desc="Validation"):
        with torch.no_grad():
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device)
            ).logits
        preds = logits.argmax(dim=-1).cpu().tolist()
        labels = batch["label"].cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels)

    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")
    print(f"Epoch {epoch+1} ▶ Acc: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Map: 100%|██████████| 100000/100000 [00:13<00:00, 7362.25 examples/s]
Map: 100%|██████████| 10000/10000 [00:01<00:00, 7459.14 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train Epoch 1: 100%|██████████| 6250/6250 [3:43:52<00:00,  2.15s/it, loss=0.253]    
Validation: 100%|██████████| 625/625 [06:41<00:00,  1.56it/s]

Epoch 1 ▶ Acc: 0.9557 | Precision: 0.9558 | Recall: 0.9564 | F1: 0.9561





In [8]:
# Save directory
save_path = "saved_bert_sentiment_model2"

# Save the model
model.save_pretrained(save_path)

# Save the tokenizer
tokenizer.save_pretrained(save_path)

print("✅ Model and tokenizer saved to:", save_path)


✅ Model and tokenizer saved to: saved_bert_sentiment_model2


In [3]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Load the saved model and tokenizer
loaded_tokenizer = BertTokenizerFast.from_pretrained("saved_bert_sentiment_model")
loaded_model = BertForSequenceClassification.from_pretrained("saved_bert_sentiment_model")
loaded_model.to(device)  # Use "mps" or "cpu" based on your earlier code
loaded_model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
def predict_sentiment(text):
    # Tokenize the input
    tokens = loaded_tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Move tensors to device (MPS or CPU)
    input_ids = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)

    # Run prediction
    with torch.no_grad():
        logits = loaded_model(input_ids, attention_mask=attention_mask).logits
    prediction = torch.argmax(logits, dim=1).item()

    # Interpret result
    sentiment = "Positive" if prediction == 1 else "Negative"
    print(f"🧠 Sentiment: {sentiment} (label: {prediction})")

# Example usage:
predict_sentiment("This product is good! ok recommended.")
predict_sentiment("ok experience ever. ok the money.")


🧠 Sentiment: Positive (label: 1)
🧠 Sentiment: Positive (label: 1)
