### Training a BERT model on ABSA tasks with specific dataset

This notebook trains and evaluates a BERT model on ABSA (aspect-based sentiment analysis) tasks using pairwise classification: each input is a (sentence, aspect) pair with a corresponding sentiment label. Label = 2 is positive, label = 1 is neutral and label = 0 is negative. 

Referenced from the linked tutorial on training a BERT model with ABSA tasks 
(https://dzone.com/articles/aspect-based-sentiment-analysis-with-gpt-and-bert)

Code was generated with some aid of AI

In [None]:
# transforming data to be used for model training 

import re
from pathlib import Path

import pandas as pd

# Every accepted sentiment spelling (full word or three-letter abbreviation)
# mapped to its integer class id: 0 = negative, 1 = neutral, 2 = positive.
SENTIMENT2ID = dict(
    negative=0,
    neutral=1,
    positive=2,
    neg=0,
    neu=1,
    pos=2,
)

def _is_sentiment_line(s: str) -> bool:
    """Return True when `s`, trimmed and lower-cased, is a key of SENTIMENT2ID."""
    candidate = s.strip().lower()
    return candidate in SENTIMENT2ID

def _label_id(s: str) -> int:
    """Look up the integer class id for a sentiment string.

    The string is trimmed and lower-cased before the lookup; a string that is
    not a key of SENTIMENT2ID raises KeyError.
    """
    key = s.strip().lower()
    return SENTIMENT2ID[key]

def _inject_aspect(sentence_with_T: str, aspect: str) -> str:
    s = sentence_with_T.replace("$T$", aspect).replace("$T", aspect)
    s = s.replace("$t$", aspect).replace("$t", aspect)
    return " ".join(s.split())

def transform_edurabsa_file(path: str | Path, verbose: bool = True):
    """
    Parse a text file of 3-line records into a DataFrame of labelled pairs.

    Blank lines are dropped first. The remaining lines are scanned with a
    sliding window of three lines:
      [sentence_with_$T$, aspect, sentiment] where sentiment ∈ SENTIMENT2ID.
    If the third line of the window is a valid sentiment, the triple is
    accepted and the window jumps ahead by 3 lines; otherwise the window
    shifts by 1 line and keeps scanning, which lets the parser resynchronise
    after a malformed record.

    Parameters:
        path: location of the text file to parse.
        verbose: when True, print a short summary of the parse.

    Returns:
        (df, stats) where `df` has the columns "sentence" (placeholder
        replaced by the aspect), "aspect" and "label" (integer class id), and
        `stats` is a dict with the keys "parsed_triples", "skipped_windows",
        "total_nonempty_lines" and "orphan_lines_ignored".
    """
    path = Path(path)
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark, which would otherwise stay glued to the first
    # sentence (str.strip() does not treat U+FEFF as whitespace).
    raw = path.read_text(encoding="utf-8-sig", errors="ignore").splitlines()
    # strip surrounding whitespace and drop lines that end up empty
    stripped = (ln.strip() for ln in raw)
    lines = [ln for ln in stripped if ln]
    n = len(lines)

    rows = []
    i = 0
    skipped_windows = 0

    while i <= n - 3:
        s0, s1, s2 = lines[i], lines[i+1], lines[i+2]
        if _is_sentiment_line(s2):
            # accept this triple
            rows.append({
                "sentence": _inject_aspect(s0, s1),
                "aspect": s1,
                "label": _label_id(s2),
            })
            i += 3
        else:
            # not a valid triple end — shift window by one to resync
            skipped_windows += 1
            i += 1

    # lines left after the last full window (fewer than three remain)
    orphan_lines = max(0, (n - i))

    if verbose:
        print(f"[transform_edurabsa_file] Parsed {len(rows)} triples.")
        print(f"[transform_edurabsa_file] Skipped {skipped_windows} non-matching window(s).")
        if orphan_lines:
            print(f"[transform_edurabsa_file] {orphan_lines} trailing orphan line(s) ignored.")

    # explicit columns keep the frame's schema stable even when no triple was parsed
    df = pd.DataFrame(rows, columns=["sentence", "aspect", "label"])
    stats = {
        "parsed_triples": len(rows),
        "skipped_windows": skipped_windows,
        "total_nonempty_lines": n,
        "orphan_lines_ignored": orphan_lines
    }
    return df, stats



In [None]:
# Parse the raw corpus, show the parser statistics, then add a combined
# "<sentence> [SEP] <aspect>" text column next to the separate fields.
data, stats = transform_edurabsa_file(path="dataset/apc_25p.txt", verbose=True)
print(stats)
data["text_pair"] = data["sentence"] + (" [SEP] " + data["aspect"])

[transform_edurabsa_file] Parsed 3408 triples.
[transform_edurabsa_file] Skipped 24 non-matching window(s).
{'parsed_triples': 3408, 'skipped_windows': 24, 'total_nonempty_lines': 10248, 'orphan_lines_ignored': 0}


In [None]:
# save transformed data (all columns of `data`, without the index) to Excel
output_path = "dataset/absa_output_apc_v4.xlsx"
data.to_excel(excel_writer=output_path, engine="openpyxl", index=False)

print(f"Saved to {output_path}")

Saved to dataset/absa_output_apc_v4.xlsx


In [None]:
# model training 

import os
import numpy as np
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import (
    BertTokenizerFast,        
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)


# enable use of mps
# NOTE(review): this flag is set after `import torch` has already run in this
# cell; PyTorch may only read it at import time — confirm, and move the
# assignment above the torch import if the fallback is actually needed.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


# Seed torch before the model is created: the classification head is newly
# initialised (not loaded from the checkpoint), so without a seed its starting
# weights differ from run to run.
SEED = 42
torch.manual_seed(SEED)

# generating train and test data (80/20 split, fixed by SEED)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=SEED)

# initialising model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to(device)

# computing max token length
def choose_max_length(df, tokenizer, pct=95, floor=32, cap=128):
    """Pick a tokenised sequence length that covers `pct` percent of the rows.

    Every (sentence, aspect) pair in `df` is tokenised without truncation or
    padding, the `pct`-th percentile of the resulting token counts is taken
    (truncated to an int), and the value is clamped to the range [floor, cap].

    Parameters:
        df: frame with "sentence" and "aspect" columns.
        tokenizer: callable that accepts two lists of strings and returns a
            mapping with an "input_ids" entry (one id list per pair).
        pct: percentile of the token-count distribution to cover.
        floor: smallest length that may be returned.
        cap: largest length that may be returned.
    """
    sentences = df["sentence"].tolist()
    aspects = df["aspect"].tolist()
    encoded = tokenizer(
        sentences,
        aspects,
        add_special_tokens=True,
        truncation=False,
        padding=False,
    )
    token_counts = list(map(len, encoded["input_ids"]))
    chosen = int(np.percentile(token_counts, pct))
    if chosen < floor:
        chosen = floor
    if chosen > cap:
        return cap
    return chosen

# Sequence length is derived from the training split only.
max_len = choose_max_length(df=train_data, tokenizer=tokenizer)
print(f"Using max_length={max_len}")


class ABSADataset(Dataset):
    """Dataset that tokenises one (sentence, aspect) pair per item.

    Each item is a dict holding the tokenizer's output tensors (with the
    leading batch dimension removed) plus a "labels" tensor of dtype long.
    """

    def __init__(self, data, tokenizer, max_length):
        # A fresh 0..n-1 index is assigned so rows are addressed purely by position.
        self.data = data.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        text, target, label = record["sentence"], record["aspect"], int(record["label"])

        # Sentence and aspect go in as a pair; every item is padded/truncated
        # to the same max_length.
        encoded = self.tokenizer(
            text,
            target,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch axis of size 1; drop it.
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

# Both splits share the tokenizer and the sequence length chosen above.
train_dataset = ABSADataset(data=train_data, tokenizer=tokenizer, max_length=max_len)
test_dataset = ABSADataset(data=test_data, tokenizer=tokenizer, max_length=max_len)


def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction batch.

    `pred` must expose `label_ids` (true class ids) and `predictions` (scores
    whose last axis is argmax-ed into predicted class ids). Returns a dict with
    the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # zero_division=0 scores a class with no predicted samples as 0 instead of warning
    scores = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }

# Training configuration: 3 epochs, batch size 8 for train and eval, loss logged every 50 steps.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=0,
    weight_decay=0.01,
    logging_steps=50,
    # NOTE(review): `use_mps_device` may be deprecated in recent transformers
    # releases — confirm against the installed version.
    use_mps_device=(device == "mps"),
)

# The held-out split doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
print(trainer.evaluate())


Using device: mps


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using max_length=128




Step,Training Loss
50,0.9395
100,0.9731
150,0.8728
200,0.8437
250,0.7165
300,0.7877
350,0.7876
400,0.6086
450,0.7316
500,0.583




{'eval_loss': 0.6586470007896423, 'eval_accuracy': 0.7991202346041055, 'eval_f1': 0.7952269604804679, 'eval_precision': 0.792713990840949, 'eval_recall': 0.7991202346041055, 'eval_runtime': 7.1323, 'eval_samples_per_second': 95.622, 'eval_steps_per_second': 12.058, 'epoch': 3.0}


In [None]:
# model prediction 
import torch

def predict_sentiment(model, tokenizer, sentence, aspect, max_length=128):
    """Classify the sentiment of `aspect` within `sentence`.

    The model is switched to eval mode, the (sentence, aspect) pair is
    tokenised (padded/truncated to `max_length`), the tensors are moved to the
    device holding the model's parameters, and the arg-max class of the logits
    is mapped to a name.

    Returns:
        "Negative", "Neutral" or "Positive" for class ids 0, 1 and 2.
    """
    id_to_name = {0: "Negative", 1: "Neutral", 2: "Positive"}

    model.eval()
    # follow the model: use whichever device its first parameter lives on
    target_device = next(model.parameters()).device

    encoded = tokenizer(
        sentence,
        aspect,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(target_device)

    # inference only, so no gradients are tracked
    with torch.no_grad():
        logits = model(**batch).logits

    winner = int(logits.argmax(dim=1).item())
    return id_to_name[winner]

# Put the model on the device the Trainer used, then try one example.
model.to(trainer.args.device)

print(
    predict_sentiment(
        model=model,
        tokenizer=tokenizer,
        sentence="The instructor is not prepared for lesson and is moves a too slow pace",
        aspect="pace",
    )
)

Negative


In [None]:
# save model and tokeniser 

import os, json

save_dir = "artifacts/bert-absa-v1"
os.makedirs(save_dir, exist_ok=True)

# attach label names to the model config before the model is saved
id2label = dict(enumerate(["negative", "neutral", "positive"]))
label2id = {name: idx for idx, name in id2label.items()}
trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id

# save model (via the trainer) and tokenizer into the same directory
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

# save eval metrics (runs a fresh evaluation pass)
metrics = trainer.evaluate()
with open(os.path.join(save_dir, "eval_metrics.json"), "w") as f:
    f.write(json.dumps(metrics, indent=2))

# save run metadata for reproducibility
run_meta = dict(
    max_length=max_len,
    num_train_epochs=training_args.num_train_epochs,
    per_device_train_batch_size=training_args.per_device_train_batch_size,
    per_device_eval_batch_size=training_args.per_device_eval_batch_size,
    learning_rate=getattr(training_args, "learning_rate", None),
    weight_decay=training_args.weight_decay,
    seed=getattr(training_args, "seed", None),
    model_name="bert-base-uncased",
    labels=id2label,
)
with open(os.path.join(save_dir, "run_meta.json"), "w") as f:
    f.write(json.dumps(run_meta, indent=2))

# save trainer state
trainer.state.save_to_json(os.path.join(save_dir, "trainer_state.json"))





In [None]:
# load and test model 
from transformers import pipeline
import torch

# Pipeline device: 0 when CUDA is available, otherwise "mps" when available, otherwise -1.
if torch.cuda.is_available():
    device = 0
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = -1

# Reload model and tokenizer from the saved directory and classify one (text, text_pair) example.
clf = pipeline("text-classification", model=save_dir, tokenizer=save_dir, device=device)
print(
    clf(
        dict(
            text="Many others would say the instructor is really prepared for lesson but I personally do not agree. Otherwise, lesson moves at a decent pace",
            text_pair="preparedness",
        ),
        truncation=True,
    )
)


Device set to use mps


{'label': 'neutral', 'score': 0.4217434823513031}
