## Build Dataset

In [65]:
!pip install -q datasets==3.2.0

from datasets import load_dataset

ds = load_dataset("thainq107/abte-restaurants")

## Tokenization

In [66]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def tokenize_and_align_labels(examples):
    sentences, sentence_tags = [], []
    labels = []

    for tokens, pols in zip(examples['Tokens'], examples['Polarities']):
        # Clean and split tokens and tags
        tokens = str(tokens).replace("’", "").strip("][").split(", ")
        pols = [pol.strip("'\"") for pol in str(pols).strip("][").split(", ")]

        bert_tokens, bert_att = [], []
        pols_label = 0

        for i in range(len(tokens)):
            t = tokenizer.tokenize(tokens[i])  # Tokenize each token
            bert_tokens += t
            if int(pols[i]) != -1:  # Only add tokens with a valid polarity
                bert_att += t  # Add tokens to sentence_tags
                pols_label = int(pols[i])  # Use the last valid polarity as the label

        # Filter sentence_tags to only include unique aspect words
        sentence_tags.append(" ".join(set(bert_att)))  # Remove duplicates
        sentences.append(" ".join(bert_tokens))  # Full tokenized sentence
        labels.append(pols_label)  # Final polarity label

    # Tokenize and prepare tensors with padding and truncation
    tokenized_inputs = tokenizer(sentences, sentence_tags, padding=True, truncation=True, return_tensors="pt")
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and label alignment to the dataset
preprocessed_ds = ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [67]:
preprocessed_ds.column_names

{'train': ['Tokens',
  'Tags',
  'Polarities',
  'input_ids',
  'attention_mask',
  'labels'],
 'test': ['Tokens',
  'Tags',
  'Polarities',
  'input_ids',
  'attention_mask',
  'labels']}

In [68]:
print("Tokens:", preprocessed_ds['train']['Tokens'][0])
print("Tags:", preprocessed_ds['train']['Tags'][0])
print("Polarities:", preprocessed_ds['train']['Polarities'][0])
print("Input IDs:", preprocessed_ds['train']['input_ids'][0])
print("Attention Mask:", preprocessed_ds['train']['attention_mask'][0])
print("Labels:", preprocessed_ds['train']['labels'][0])

Tokens: ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.']
Tags: ['0', '0', '1', '0', '0', '0', '0', '0', '0']
Polarities: ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']
Input IDs: [101, 1005, 2021, 1005, 1005, 1996, 1005, 1005, 3095, 1005, 1005, 2001, 1005, 1005, 2061, 1005, 1005, 9202, 1005, 1005, 2000, 1005, 1005, 2149, 1005, 1005, 1012, 1005, 102, 1005, 3095, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Evaluate

In [69]:
!pip install -q evaluate==0.4.3

In [70]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Model

In [71]:
from transformers import AutoModelForSequenceClassification

id2label = {
    0: "Negative",
    1: "Neutral",
    2: "Positive"
}
label2id = {
    "Negative": 0,
    "Neutral": 1,
    "Positive": 2
}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

In [72]:
import os

os.environ["WANDB_DISABLED"] = "true"

In [73]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="abte-restaurants-distilbert-base-uncased",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.606808,0.756032
2,No log,0.554166,0.778374
3,No log,0.518584,0.793566
4,No log,0.500031,0.809651
5,0.569200,0.515907,0.807864




TrainOutput(global_step=565, training_loss=0.5443617018978153, metrics={'train_runtime': 329.331, 'train_samples_per_second': 54.687, 'train_steps_per_second': 1.716, 'total_flos': 1369959836789880.0, 'train_loss': 0.5443617018978153, 'epoch': 5.0})

In [74]:
trainer.save_model("abte-restaurants-distilbert-base-uncased")
tokenizer.save_pretrained("abte-restaurants-distilbert-base-uncased")

('abte-restaurants-distilbert-base-uncased/tokenizer_config.json',
 'abte-restaurants-distilbert-base-uncased/special_tokens_map.json',
 'abte-restaurants-distilbert-base-uncased/vocab.txt',
 'abte-restaurants-distilbert-base-uncased/added_tokens.json',
 'abte-restaurants-distilbert-base-uncased/tokenizer.json')

## Prediction

In [78]:
from transformers import pipeline

token_classifier = pipeline(
    "ner",
    model="abte-restaurants-distilbert-base-uncased",
    tokenizer="abte-restaurants-distilbert-base-uncased",
    aggregation_strategy="simple"  # This returns individual tokens
)

classifier = pipeline(
    "text-classification",
    model="abte-restaurants-distilbert-base-uncased",
    tokenizer="abte-restaurants-distilbert-base-uncased",
)


test_sentence = "The bread is top notch as well"
results = token_classifier(test_sentence)

# Joining recognized words from the NER pipeline
sentence_tags = " ".join([result["word"] for result in results])

# Classifying the sentence with additional sentence tags
pred_label = classifier(f"{test_sentence} [SEP] {sentence_tags}")

sentence_tags, pred_label

Device set to use cuda:0
Device set to use cuda:0


('the bread is top notch as well',
 [{'label': 'Positive', 'score': 0.9333567023277283}])