In [None]:
pip install datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
pip install onnx onnxruntime

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 k

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel,BertForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
import onnx
import onnxruntime as ort
import os
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


In [None]:


# Use GPU if available, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# One seed for everything stochastic in this notebook: the train/validation split below and
# PyTorch's global RNG (used for the new classifier head and for DataLoader shuffling).
SEED = 42
torch.manual_seed(SEED)

# Subset sizes used to keep training time manageable (optional)
TRAIN_SUBSET_SIZE = 20000
TEST_SUBSET_SIZE = 5000

# Load Amazon Polarity dataset
dataset = load_dataset("amazon_polarity")

# Slice the rows first and pick the columns afterwards. Indexing a whole column
# (dataset["train"]["content"]) would materialise every review of the split in memory
# only to discard all but the first TRAIN_SUBSET_SIZE of them.
train_subset = dataset["train"][:TRAIN_SUBSET_SIZE]
test_subset = dataset["test"][:TEST_SUBSET_SIZE]
train_texts, train_labels = train_subset["content"], train_subset["label"]
test_texts, test_labels = test_subset["content"], test_subset["label"]

# Split the training data into train and validation sets (80% training, 20% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=SEED
)

# Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Create Dataset class for Amazon Polarity
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes a single review each time it is indexed.

    Args:
        texts: Indexable collection of raw review strings.
        labels: Indexable collection of class labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``; its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors together with its label.

        Returns:
            dict with keys ``"input_ids"``, ``"attention_mask"`` and ``"label"``
            (the label is a ``torch.long`` scalar tensor).
        """
        tokenize_kwargs = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenize_kwargs)

        # squeeze(0) drops the leading size-1 dimension the tokenizer adds for a single text
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap every split in a ReviewDataset (tokenization happens per item, on access)
train_dataset, val_dataset, test_dataset = (
    ReviewDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Batch the splits; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=16)
    for split_dataset in (val_dataset, test_dataset)
)

# Pretrained BERT with a 2-way classification head, moved to the selected device
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
).to(device)

# Replicate across GPUs when more than one is visible
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# Objective and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)


# Resume Training if Checkpoint Exists
# Checkpoints are written by the loop below as bert_model_epoch_<N>.pth with N starting at 1,
# so resume from the highest N found on disk (a fixed "epoch_0" file is never created).
checkpoint_dir = "./checkpoint"
checkpoint_prefix, checkpoint_suffix = "bert_model_epoch_", ".pth"
os.makedirs(checkpoint_dir, exist_ok=True)

saved_epochs = [
    int(fname[len(checkpoint_prefix):-len(checkpoint_suffix)])
    for fname in os.listdir(checkpoint_dir)
    if fname.startswith(checkpoint_prefix)
    and fname.endswith(checkpoint_suffix)
    and fname[len(checkpoint_prefix):-len(checkpoint_suffix)].isdigit()
]

checkpoint_path = None
start_epoch = 0
if saved_epochs:
    checkpoint_path = f"{checkpoint_dir}/{checkpoint_prefix}{max(saved_epochs)}{checkpoint_suffix}"
    print("Loading checkpoint...")
    # map_location lets a checkpoint written on GPU be restored on a CPU-only machine
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    # Checkpoints written by earlier versions of this cell carry no optimizer state
    if 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print("Checkpoint has no optimizer state; continuing with a fresh optimizer.")
    # 'epoch' stores the number of completed epochs, i.e. the index of the next one to run
    start_epoch = checkpoint['epoch']
    print(f"Resuming from Epoch {start_epoch}")

# Training Loop with Checkpoint Saving
epochs = 5
for epoch in range(start_epoch, epochs):
    model.train()
    total_loss = 0.0
    all_preds, all_labels = [], []

    # Training loop
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    # Calculate F1-Score (Not Accuracy) for training set; stored as a plain float so the
    # checkpoint contains only tensors and built-in Python types
    train_f1 = float(f1_score(all_labels, all_preds, average="weighted"))

    # Validation loop
    model.eval()
    val_loss = 0.0
    # Named val_targets (not val_labels) so the label list produced by train_test_split
    # is not overwritten at notebook scope
    val_preds, val_targets = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            val_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_targets.extend(labels.cpu().numpy())

    # Calculate F1-Score for validation set
    val_f1 = float(f1_score(val_targets, val_preds, average="weighted"))

    # Per-batch means: unlike the raw sums, these are comparable between the training and
    # validation sets (which have different numbers of batches) and match what is saved below
    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)

    # Save Checkpoint Every Epoch (optimizer state included so that resuming works;
    # note that this makes each checkpoint file considerably larger)
    checkpoint_path = f"./checkpoint/bert_model_epoch_{epoch+1}.pth"
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'train_f1': train_f1,
        'val_f1': val_f1,
    }, checkpoint_path)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Train F1-Score: {train_f1:.4f} | Val Loss: {avg_val_loss:.4f} | Val F1-Score: {val_f1:.4f}")



cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 | Train Loss: 258.9968 | Train F1-Score: 0.8927 | Val Loss: 56.1669 | Val F1-Score: 0.9111
Epoch 2/5 | Train Loss: 125.5399 | Train F1-Score: 0.9563 | Val Loss: 53.7263 | Val F1-Score: 0.9188
Epoch 3/5 | Train Loss: 64.2727 | Train F1-Score: 0.9802 | Val Loss: 80.6036 | Val F1-Score: 0.9124
Epoch 4/5 | Train Loss: 38.1195 | Train F1-Score: 0.9882 | Val Loss: 86.5619 | Val F1-Score: 0.9161
Epoch 5/5 | Train Loss: 30.7245 | Train F1-Score: 0.9910 | Val Loss: 90.3781 | Val F1-Score: 0.9146


In [None]:
# Export the trained classifier to ONNX for faster inference
onnx_path = "./bert_model.onnx"

# Example inputs that drive the export: one sequence of 128 random token ids (drawn from
# [0, 30522) -- presumably the bert-base-uncased vocabulary size) and an all-ones mask.
# torch.ones defaults to float32, so the exported graph takes a float attention mask.
dummy_input = tuple(
    example.to(device)
    for example in (torch.randint(0, 30522, (1, 128)), torch.ones(1, 128))
)

torch.onnx.export(
    # DataParallel is only a wrapper; export the module it wraps
    model.module if isinstance(model, nn.DataParallel) else model,
    dummy_input,
    onnx_path,
    opset_version=14,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
)

print("Model exported to ONNX format!")

# ONNX Runtime Inference
# ONNX Runtime sessions keyed by model path, so repeated calls do not re-read and
# re-initialise the model file. Re-running this cell resets the cache.
_ort_session_cache = {}


def onnx_infer(input_text):
    """Classify one text with the exported ONNX model.

    Uses the notebook-level ``tokenizer`` (loading it again from the hub on every call is
    unnecessary) and a cached ``InferenceSession`` for ``onnx_path``.

    Args:
        input_text: The text to classify.

    Returns:
        int: Index of the highest-scoring class.
    """
    encoding = tokenizer(input_text, truncation=True, padding="max_length", max_length=128, return_tensors="pt")

    ort_session = _ort_session_cache.get(onnx_path)
    if ort_session is None:
        ort_session = ort.InferenceSession(onnx_path)
        _ort_session_cache[onnx_path] = ort_session

    inputs = {
        "input_ids": encoding["input_ids"].to(dtype=torch.int64).numpy(),
        # float32 because the graph was exported with a float attention mask
        "attention_mask": encoding["attention_mask"].to(dtype=torch.float32).numpy(),
    }
    outputs = ort_session.run(None, inputs)

    # outputs[0] holds the logits for the single input row
    predicted_class = int(outputs[0].argmax(axis=-1)[0])
    return predicted_class

# Example ONNX Inference: smoke-test the exported model on a single hand-written review
test_text = "This is an amazing product!"
# onnx_infer returns the index of the highest-scoring class
predicted_class = onnx_infer(test_text)
print(f"ONNX Inference Result: Class {predicted_class}")


Model exported to ONNX format!
ONNX Inference Result: Class 1


In [None]:
# Checkpoint whose weights are evaluated on the test set
checkpoint_path = "./checkpoint/bert_model_epoch_5.pth"  # Change to the best epoch if needed

if os.path.exists(checkpoint_path):
    print(f"Loading checkpoint from {checkpoint_path}...")
    # map_location keeps the load working when the checkpoint was written on another device
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.to(device)
    print(f"Checkpoint loaded from epoch {checkpoint['epoch']}")
else:
    # Say so explicitly instead of silently evaluating whatever weights `model` holds
    print(f"Checkpoint {checkpoint_path} not found; evaluating the weights currently in memory.")

def evaluate(model, test_loader):
    """Score ``model`` on every batch of ``test_loader`` and report the weighted F1.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...)`` whose result
            exposes ``.logits``.
        test_loader: Iterable of batches with ``"input_ids"``, ``"attention_mask"`` and
            ``"label"`` tensors.

    Returns:
        The weighted F1-score over all batches (also printed).
    """
    # Evaluation mode; the model is left in this mode afterwards
    model.eval()
    predicted, expected = [], []

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        for batch in test_loader:
            token_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["label"].to(device)

            logits = model(token_ids, attention_mask=mask).logits
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    f1 = f1_score(expected, predicted, average="weighted")
    print(f"Test F1-Score: {f1:.4f}")
    return f1

# Run evaluation after training: prints the test F1-score; as the last expression of the
# cell, the returned value is also shown as the cell output
evaluate(model, test_loader)

Loading checkpoint from ./checkpoint/bert_model_epoch_5.pth...


  checkpoint = torch.load(checkpoint_path, map_location=device)


Checkpoint loaded from epoch 5
Test F1-Score: 0.9176


0.9176129765461719