## Build Dataset

In [1]:
!pip install -q datasets==3.2.0

from datasets import load_dataset

ds = load_dataset("thainq107/abte-restaurants")

README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

## Tokenization

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = []
    labels = []

    for tokens, tags in zip(examples['Tokens'], examples['Tags']):
        # Clean and split tokens and tags
        tokens = str(tokens).replace("’", "").strip("][").split(", ")
        tags = str(tags).strip("][").split(", ")
        tags = [tag.strip("'\"") for tag in tags]
        
        bert_tokens = []
        bert_tags = []

        for i in range(len(tokens)):
            t = tokenizer.tokenize(tokens[i]) # Tokenize each token
            bert_tokens += t
            bert_tags += [int(tags[i])] * len(t) # Repeat the label for each sub-token

        # Convert tokens to input IDs
        bert_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        tokenized_inputs.append(bert_ids)
        labels.append(bert_tags)

    return {
        'input_ids': tokenized_inputs,
        'labels': labels
    }

# Apply the tokenization and label alignment to the dataset
preprocessed_ds = ds.map(tokenize_and_align_labels, batched=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [3]:
preprocessed_ds.column_names

{'train': ['Tokens', 'Tags', 'Polarities', 'input_ids', 'labels'],
 'test': ['Tokens', 'Tags', 'Polarities', 'input_ids', 'labels']}

In [4]:
print("Input IDs:", preprocessed_ds['train']['input_ids'][0])
print("Labels:", preprocessed_ds['train']['labels'][0])

Input IDs: [1005, 2021, 1005, 1005, 1996, 1005, 1005, 3095, 1005, 1005, 2001, 1005, 1005, 2061, 1005, 1005, 9202, 1005, 1005, 2000, 1005, 1005, 2149, 1005, 1005, 1012, 1005]
Labels: [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Data Collator

In [5]:
from transformers import DataCollatorForTokenClassification

# Create a data collator for token classification tasks
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Evaluate

In [6]:
!pip install -q seqeval==1.2.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [7]:
import numpy as np
from seqeval.metrics import accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Extract true predictions by filtering out tokens with label -100 (ignored during padding).
    true_predictions = [
        [str(p) for (p, l) in zip(prediction, label) if l != 100]
        for prediction, label in zip(predictions, labels)
    ]

    # Extract true labels by filtering out the ignored tokens (-100).
    true_labels = [
        [str(l) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = accuracy_score(true_predictions, true_labels)
    return {"accuracy": results}

## Model

In [8]:
from transformers import AutoModelForTokenClassification

id2label = {
    0: "O",
    1: "B-Term",
    2: "I-Term"
}
label2id = {
    "O": 0,
    "B-Term": 1,
    "I-Term": 2
}

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

In [9]:
import os

os.environ["WANDB_DISABLED"] = "true"

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="abte-restaurants-distilbert-base-uncased",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.193533,0.1966
2,No log,0.149018,0.197288
3,No log,0.152572,0.196709
4,No log,0.165556,0.196773
5,0.120300,0.168672,0.196882




TrainOutput(global_step=565, training_loss=0.11125038746183952, metrics={'train_runtime': 190.7199, 'train_samples_per_second': 94.432, 'train_steps_per_second': 2.962, 'total_flos': 730006263413100.0, 'train_loss': 0.11125038746183952, 'epoch': 5.0})

In [12]:
trainer.save_model("abte-restaurants-distilbert-base-uncased")
tokenizer.save_pretrained("abte-restaurants-distilbert-base-uncased")

('abte-restaurants-distilbert-base-uncased/tokenizer_config.json',
 'abte-restaurants-distilbert-base-uncased/special_tokens_map.json',
 'abte-restaurants-distilbert-base-uncased/vocab.txt',
 'abte-restaurants-distilbert-base-uncased/added_tokens.json',
 'abte-restaurants-distilbert-base-uncased/tokenizer.json')

## Prediction

In [22]:
from transformers import pipeline

token_classifier = pipeline(
    "ner",
    model="abte-restaurants-distilbert-base-uncased",
    tokenizer="abte-restaurants-distilbert-base-uncased",
    aggregation_strategy="simple"
)

test_sentence = "The bred is top notch as well"
result = token_classifier(test_sentence)
result

Device set to use cuda:0


[{'entity_group': 'Term',
  'score': 0.48610213,
  'word': 'bred',
  'start': 4,
  'end': 8},
 {'entity_group': 'Term',
  'score': 0.46678838,
  'word': 'top',
  'start': 12,
  'end': 15}]