<a href="https://colab.research.google.com/github/kawsarahmd/transformer-text-classifier/blob/main/text_multiclassification_hf_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers numpy pandas scikit-learn
!pip install datasets
!pip install tqdm



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
from tqdm import tqdm
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder

# =========================
# 1. Load HuggingFace dataset
# =========================
# This assumes the split is named "train". If the dataset has different splits,
# adjust split="train" accordingly (e.g., split="train[:90%]").
hf_dataset = load_dataset("okite97/news-data", split="train")

# Convert to pandas DataFrame
df = hf_dataset.to_pandas()

# Ensure columns exist (Title, Excerpt, Category)
# If needed, you can print(df.columns) to verify.
print("Columns:", df.columns)

# =========================
# 2. Build text_data and label
# =========================
# Combine Title + Excerpt into single text column
df["Title"] = df["Title"].fillna("")
df["Excerpt"] = df["Excerpt"].fillna("")
df["text_data"] = df["Title"] + " " + df["Excerpt"]

# Encode Category â†’ integer label
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Category"])

print("Unique categories:", label_encoder.classes_)
print("Label mapping:", {cls: idx for idx, cls in enumerate(label_encoder.classes_)})

# =========================
# 3. Shuffle and split
# =========================
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

train_data, val_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]  # keep class balance
)

# =========================
# 4. Initialize tokenizer
# =========================
# You are using bert-base-uncased in your code; keeping same.
# (For Bangla, you'd usually prefer a multilingual model like xlm-roberta-base.)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_seq_len = 128

def tokenize_data(data, tokenizer, max_seq_len):
    input_ids, attention_masks, labels = [], [], []

    for index, row in tqdm(data.iterrows(), total=len(data)):
        encoded = tokenizer.encode_plus(
            row["text_data"],
            add_special_tokens=True,     # [CLS], [SEP]
            max_length=max_seq_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])
        labels.append(row["label"])

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)

    return input_ids, attention_masks, labels

# =========================
# 5. Tokenize train & validation sets
# =========================
train_input_ids, train_attention_masks, train_labels = tokenize_data(
    train_data, tokenizer, max_seq_len
)
val_input_ids, val_attention_masks, val_labels = tokenize_data(
    val_data, tokenizer, max_seq_len
)

print("Train shapes:", train_input_ids.shape, train_attention_masks.shape, train_labels.shape)
print("Val   shapes:", val_input_ids.shape, val_attention_masks.shape, val_labels.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/4686 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/828 [00:00<?, ? examples/s]

Columns: Index(['Title', 'Excerpt', 'Category'], dtype='object')
Unique categories: ['business' 'entertainment' 'health' 'politics' 'sports' 'tech']
Label mapping: {'business': 0, 'entertainment': 1, 'health': 2, 'politics': 3, 'sports': 4, 'tech': 5}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3748/3748 [00:06<00:00, 584.19it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 938/938 [00:01<00:00, 659.58it/s]


Train shapes: torch.Size([3748, 128]) torch.Size([3748, 128]) torch.Size([3748])
Val   shapes: torch.Size([938, 128]) torch.Size([938, 128]) torch.Size([938])


In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Create a TensorDataset object for the training set
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
# Use RandomSampler to shuffle the samples in the dataset
train_sampler = RandomSampler(train_dataset)
# Create DataLoader for the training set using dataset, sampler, and batch size
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Create a TensorDataset object for the validation set
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
# Use SequentialSampler to process the validation dataset sequentially
val_sampler = SequentialSampler(val_dataset)
# Create DataLoader for the validation set using dataset, sampler, and batch size
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [5]:
from transformers import BertForSequenceClassification, BertConfig
from torch.optim import AdamW

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=20,  # Number of labels (20) corresponds to the 20 newsgroups dataset
    output_attentions=False,  # Do not output attention weights
    output_hidden_states=False,  # Do not output hidden states
)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# ðŸš‚ **BERT Model Training** ðŸš‚

This final part involves fine-tuning the BERT model on the provided dataset and evaluating its performance. We define functions to train and evaluate the model after every epoch and calculate loss and accuracy metrics during training and validation, respectively. Furthermore, components such as optimizer and scheduler are introduced for efficient model training to help improve the results on each step. This section helps you understand the overall process of training BERT for a classification task and assessing the model's performance.

In [6]:
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report

num_epochs = 3
total_steps = len(train_dataloader) * num_epochs

# Create the optimizer and scheduler for fine-tuning the model
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    # Use a progress bar during training
    progress_bar = tqdm(dataloader, desc="Training", position=0, leave=True)

    # Iterate through each batch in a training epoch
    for batch in progress_bar:
        input_ids, attention_masks, labels = [t.to(device) for t in batch]

        # Zero out gradients before each backward pass
        optimizer.zero_grad()

        # Forward pass to compute the outputs and loss
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        total_loss += loss.item()

        # Perform a backward pass and update optimizer/scheduler steps
        loss.backward()
        optimizer.step()
        scheduler.step()

        progress_bar.set_description(f"Training - Loss: {loss.item():.4f}")

    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_eval_accuracy = 0

    # Use a progress bar during evaluation
    progress_bar = tqdm(dataloader, desc="Evaluation", position=0, leave=True)

    # Iterate through each batch in a validation epoch
    for batch in progress_bar:
        input_ids, attention_masks, labels = [t.to(device) for t in batch]

        # Disable gradient calculations during evaluation
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)

        logits = outputs[0].detach().cpu().numpy()
        label_ids = labels.cpu().numpy()

        # Calculate accuracy for the current batch
        batch_accuracy = accuracy_score(label_ids, logits.argmax(axis=-1))
        total_eval_accuracy += batch_accuracy

        progress_bar.set_description(f"Evaluation - Batch Accuracy: {batch_accuracy:.4f}")

    return total_eval_accuracy / len(dataloader)

# Train and evaluate the model for 'num_epochs' times
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy = evaluate(model, val_dataloader, device)

    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")

Training - Loss: 0.1810: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 235/235 [01:18<00:00,  2.98it/s]
Evaluation - Batch Accuracy: 0.8000: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 59/59 [00:06<00:00,  9.21it/s]



Epoch 1/3
Loss: 0.8787 - Validation Accuracy: 0.8970


Training - Loss: 0.0397: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 235/235 [01:19<00:00,  2.96it/s]
Evaluation - Batch Accuracy: 0.8000: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 59/59 [00:06<00:00,  8.95it/s]



Epoch 2/3
Loss: 0.2511 - Validation Accuracy: 0.8981


Training - Loss: 0.0751: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 235/235 [01:19<00:00,  2.94it/s]
Evaluation - Batch Accuracy: 0.8000: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 59/59 [00:06<00:00,  9.24it/s]


Epoch 3/3
Loss: 0.1662 - Validation Accuracy: 0.8960





In [7]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def get_predictions(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    # Use tqdm for a progress bar
    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]

        # Disable gradient calculations during evaluation
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)

        logits = outputs[0].detach().cpu().numpy()
        label_ids = labels.cpu().numpy()

        # Add batch logits and labels to the list of predictions and true labels
        predictions.extend(logits.argmax(axis=-1))
        true_labels.extend(label_ids)

    # Convert the lists to NumPy arrays
    return np.array(predictions), np.array(true_labels)

# Obtain the model's predictions and true labels for the validation dataset
predictions, true_labels = get_predictions(model, val_dataloader, device)

# Calculate the accuracy on the validation dataset
accuracy = accuracy_score(true_labels, predictions)

# Generate a classification report with more detailed performance metrics
report = classification_report(true_labels, predictions, digits=4)

# Print the accuracy and classification report
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 59/59 [00:06<00:00,  9.26it/s]

Validation Accuracy: 0.8966
Classification Report:
              precision    recall  f1-score   support

           0     0.8429    0.8661    0.8544       254
           1     0.8919    0.8684    0.8800        76
           2     0.9196    0.9364    0.9279       110
           3     0.9481    0.9349    0.9415       215
           4     0.9909    0.9689    0.9798       225
           5     0.5593    0.5690    0.5641        58

    accuracy                         0.8966       938
   macro avg     0.8588    0.8573    0.8579       938
weighted avg     0.8980    0.8966    0.8972       938






# ðŸ’¾ **Save Model**

In [8]:
output_dir = "./model/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [18]:
def classify_news_article(model, tokenizer, device, text, max_len=128):
    # Tokenize the text using BERT tokenizer
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",  # Return tensors
    )

    # Move tensors to device (GPU or CPU)
    input_ids, attention_mask = encoded["input_ids"].to(device), encoded["attention_mask"].to(device)

    # Predict the label using the trained model
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)

    # Find the index of the class with the highest score
    predicted_label_index = logits[0].argmax(-1).item()

    return predicted_label_index


sample_news_article = "Messi won world cup"

# Get the predicted label index for the given sample article
predicted_label_index = classify_news_article(model, tokenizer, device, sample_news_article)

# Print the predicted label index and name
# Print the predicted label index and name
print(f"Predicted Label Index: {predicted_label_index}")
print(f"Predicted Class: {label_encoder.classes_[predicted_label_index]}")

Predicted Label Index: 4
Predicted Class: sports


In [13]:
label_encoder.classes_

array(['business', 'entertainment', 'health', 'politics', 'sports',
       'tech'], dtype=object)

In [10]:
predicted_label_index

5