In [None]:
!pip install -q seqeval==1.2.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import numpy as np
from google.colab import files
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Sequence
from transformers import PreTrainedTokenizerFast
import torch
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig, TrainingArguments, Trainer
from seqeval.metrics import f1_score, classification_report

# **Data**

In [None]:
# Load the "thainq107/abte-restaurants" dataset (train and test splits).
ds = load_dataset("thainq107/abte-restaurants")
# Bare expression: display the splits and their columns as the cell output.
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 1119
    })
})

# Tokenizer

In [None]:
# corpus = list of training sentences, rebuilt by joining the pre-split tokens
corpus = [" ".join(sentence_tokens) for sentence_tokens in ds["train"]["Tokens"]]

# Word-level model: every whole word is one vocabulary entry.
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))
# tokenize_and_align_labels looks words up lowercased, so the vocabulary has to be
# built from lowercased text as well.
tokenizer.normalizer = Lowercase()
# Split on whitespace only. The corpus was joined with single spaces, so this keeps
# each dataset token intact; pre_tokenizers.Whitespace() also splits on punctuation,
# which would leave a token such as "n't" without a vocabulary entry of its own.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

# Special tokens are listed first so that <pad> gets id 0 and <unk> gets id 1.
# Named word_level_trainer so it is not confused with the Hugging Face `trainer`
# created later in the notebook.
word_level_trainer = trainers.WordLevelTrainer(vocab_size=5000, special_tokens=["<pad>", "<unk>"])
tokenizer.train_from_iterator(corpus, trainer=word_level_trainer)

tokenizer.save("word_tokenizer.json")

# Padding

In [None]:
# Length (in tokens) of the longest training sentence; pad_and_truncate uses it
# as the fixed sequence length.
MAX_LEN = max(map(len, ds["train"]["Tokens"]))
print(MAX_LEN)

79


In [None]:
def pad_and_truncate(input, pad_id, max_len=None):
    """Return a copy of `input` that is exactly `max_len` items long.

    Shorter sequences are right-padded with `pad_id`; longer ones are cut off
    after the first `max_len` items.

    Args:
        input: List of ids (token ids or tag ids) for one sentence.
        pad_id: Value appended to fill the sequence up to `max_len`.
        max_len: Non-negative target length. When omitted, the notebook-level
            `MAX_LEN` is used, which is what existing two-argument calls rely on.

    Returns:
        A new list; `input` itself is never modified.
    """
    if max_len is None:
        max_len = MAX_LEN
    missing = max_len - len(input)
    if missing > 0:
        return input + [pad_id] * missing
    return input[:max_len]

def tokenize_and_align_labels(examples):
    """Convert a batch of token/tag sequences into fixed-length id sequences.

    Args:
        examples: Batch dict with "Tokens" (one list of word strings per example)
            and "Tags" (one list of tags per example, each convertible with int()).

    Returns:
        Dict with "input_ids" and "labels"; each holds one list per example,
        padded or truncated by pad_and_truncate. Labels are padded with -100.

    Raises:
        AssertionError: If an example has a different number of tokens and tags.
    """
    pad_id = tokenizer.token_to_id("<pad>")
    unk_id = tokenizer.token_to_id("<unk>")

    tokenized_inputs = []
    labels = []
    for tokens, tags in zip(examples["Tokens"], examples["Tags"]):
        token_ids = []
        for token in tokens:
            token_id = tokenizer.token_to_id(token.lower())
            # Out-of-vocabulary words must map to <unk>. The previous fallback of 0
            # is the id of <pad>, which made unknown words look like padding.
            token_ids.append(unk_id if token_id is None else token_id)

        tags = [int(tag) for tag in tags]
        assert len(token_ids) == len(tags), "each token needs exactly one tag"

        tokenized_inputs.append(pad_and_truncate(token_ids, pad_id))
        # -100 is the value compute_metrics filters out and the default
        # ignore_index of nn.CrossEntropyLoss, so padded positions are not scored.
        labels.append(pad_and_truncate(tags, -100))

    return {"input_ids": tokenized_inputs, "labels": labels}

In [None]:
# Run tokenize_and_align_labels over every split in batches; the returned
# "input_ids" and "labels" are added as columns next to the original ones.
preprocessed_ds = ds.map(tokenize_and_align_labels, batched=True)
# Bare expression: display the processed train split as the cell output.
preprocessed_ds['train']

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

Dataset({
    features: ['Tokens', 'Tags', 'Polarities', 'input_ids', 'labels'],
    num_rows: 3602
})

# Model

## CNN - Conv1D

In [None]:
class ABTEConv1DClassifier(PreTrainedModel):
    """Per-token classifier: embedding -> 1D convolution over the sequence -> linear layer.

    Args:
        config: Configuration object forwarded to PreTrainedModel.
        vocab_size: Number of rows in the embedding table.
        num_classes: Number of tag classes predicted for each token.
        embedding_dim: Size of each token embedding.
        num_filters: Number of convolution output channels.
        kernel_size: Width of the convolution window, in tokens.
        pad_idx: Token id treated as padding by the embedding layer.
    """

    def __init__(self, config, vocab_size, num_classes, embedding_dim=256, num_filters=256, kernel_size=3, pad_idx=0):
        super().__init__(config)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # padding="same" keeps the output as long as the input for any kernel_size
        # (stride is 1), so there is exactly one logit vector per token. The previous
        # hard-coded padding=1 only achieved that for kernel_size=3, and any other
        # width made the logits and labels disagree in length.
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=kernel_size, padding="same")
        self.fc = nn.Linear(num_filters, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        """Compute per-token logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids, batch first (batch, sequence).
            labels: Optional tag ids with the same shape as `input_ids`.

        Returns:
            SequenceClassifierOutput with `logits` of shape
            (batch, sequence, num_classes) and `loss` (None without labels).
        """
        embedded = self.embedding(input_ids)
        # Conv1d convolves over the last axis, so move the sequence axis there.
        embedded = embedded.permute(0, 2, 1)

        features = torch.relu(self.conv(embedded))

        # Back to (batch, sequence, channels) for the per-token linear layer.
        features = features.permute(0, 2, 1)
        logits = self.fc(features)

        loss = None
        if labels is not None:
            # CrossEntropyLoss expects the class axis second: (batch, classes, sequence).
            loss = self.loss_fn(logits.permute(0, 2, 1), labels)
        return SequenceClassifierOutput(loss=loss, logits=logits)

In [None]:
# Default config (the classifier only forwards it to PreTrainedModel) and a
# Conv1D tagger sized to the trained vocabulary, with three tag classes.
config = PretrainedConfig()
model = ABTEConv1DClassifier(
    config=config,
    vocab_size=len(tokenizer.get_vocab()),
    num_classes=3,
)

## LSTM

In [None]:
class ABTELSTMClassifier(PreTrainedModel):
    """Per-token classifier: embedding -> single LSTM -> linear layer.

    Args:
        config: Configuration object forwarded to PreTrainedModel.
        vocab_size: Number of rows in the embedding table.
        num_classes: Number of tag classes predicted for each token.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        pad_idx: Token id treated as padding by the embedding layer.
    """

    def __init__(self, config, vocab_size, num_classes,
                 embedding_dim=256, hidden_dim=256, pad_idx=0):
        super().__init__(config)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        """Return {"loss", "logits"}; loss is None when no labels are passed."""
        # The LSTM returns (per-step outputs, final states); only the per-step
        # outputs are needed for token-level predictions.
        hidden_states = self.lstm(self.embedding(input_ids))[0]
        logits = self.fc(hidden_states)

        if labels is None:
            return {"loss": None, "logits": logits}

        # CrossEntropyLoss expects the class axis second: (batch, classes, sequence).
        loss = self.loss_fn(logits.transpose(1, 2), labels)
        return {"loss": loss, "logits": logits}

In [None]:
# Default config (the classifier only forwards it to PreTrainedModel) and an
# LSTM tagger sized to the trained vocabulary, with three tag classes.
config = PretrainedConfig()
model = ABTELSTMClassifier(
    config=config,
    vocab_size=len(tokenizer.get_vocab()),
    num_classes=3,
)

## Transformer

In [None]:
class ABTETransformerClassifier(PreTrainedModel):
    """Per-token classifier: token + learned position embeddings -> Transformer encoder -> linear layer.

    Args:
        config: Configuration object forwarded to PreTrainedModel.
        vocab_size: Number of rows in the token embedding table.
        num_classes: Number of tag classes predicted for each token.
        max_len: Number of positions in the position embedding table; longer
            inputs cannot be embedded.
        embedding_dim: Size of token/position embeddings and encoder model width.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of the feed-forward sub-layer inside each encoder layer.
        pad_idx: Token id treated as padding, both by the embedding layer and
            when masking attention.
    """

    def __init__(self, config, vocab_size, num_classes,
                 max_len=512, embedding_dim=256, num_heads=8,
                 num_layers=6, hidden_dim=1024, pad_idx=0):

        super().__init__(config)
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx = pad_idx)
        self.position_embedding = nn.Embedding(max_len, embedding_dim)

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, num_layers=num_layers)

        self.fc = nn.Linear(embedding_dim, num_classes)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        """Compute per-token logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids, batch first (batch, sequence).
            labels: Optional tag ids with the same shape as `input_ids`.

        Returns:
            Dict with "logits" of shape (batch, sequence, num_classes) and
            "loss" (None without labels).
        """
        seq_length = input_ids.size(1)
        positions = torch.arange(
            seq_length, device=input_ids.device).unsqueeze(0)

        outputs = self.embedding(input_ids) + self.position_embedding(positions)

        # Mask padding so real tokens do not attend to <pad> positions. Without it
        # every padded slot (which still carries a position embedding) took part in
        # attention as a key.
        padding_mask = None
        pad_idx = self.embedding.padding_idx
        if pad_idx is not None:
            padding_mask = input_ids == pad_idx  # True = ignore this key
            # A row consisting only of padding would mask every key and yield NaNs;
            # leave such rows unmasked.
            padding_mask = padding_mask & ~padding_mask.all(dim=1, keepdim=True)

        # The encoder layers are built without batch_first, so they expect
        # (sequence, batch, embedding).
        outputs = outputs.permute(1, 0, 2)
        outputs = self.transformer_encoder(
            outputs, src_key_padding_mask=padding_mask)

        outputs = outputs.permute(1, 0, 2) #=> BxSxE
        logits = self.fc(outputs)

        loss = None
        if labels is not None:
            # CrossEntropyLoss expects the class axis second: (batch, classes, sequence).
            loss = self.loss_fn(logits.permute(0, 2, 1), labels)
        return {"loss": loss, "logits": logits}

In [None]:
# Default config (the classifier only forwards it to PreTrainedModel) and a
# Transformer tagger sized to the trained vocabulary, with three tag classes.
config = PretrainedConfig()
model = ABTETransformerClassifier(
    config=config,
    vocab_size=len(tokenizer.get_vocab()),
    num_classes=3,
)



# Evaluation

In [None]:
# Tag id <-> tag name lookups for the three token classes.
id2label = dict(enumerate(("O", "B-Term", "I-Term")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
def compute_metrics(p):
    """Compute the seqeval F1 score over the non-padding positions.

    Args:
        p: Pair (predictions, labels). `predictions` holds per-token class scores
            with the class on axis 2; `labels` holds tag ids, with -100 marking
            positions to skip.

    Returns:
        Dict {"f1-score": value returned by seqeval's f1_score}.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Drop padded positions once and reuse the filtered pairs for both lists.
        kept = [
            (predicted_id, label_id)
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        true_labels.append([id2label[label_id] for _, label_id in kept])
        true_predictions.append([id2label[predicted_id] for predicted_id, _ in kept])

    # seqeval takes (y_true, y_pred) in that order; the previous call passed them
    # swapped, which exchanges the roles of precision and recall.
    results = f1_score(true_labels, true_predictions)
    return {"f1-score": results}

# Trainer

In [None]:
import os
# Turn off the Weights & Biases integration before TrainingArguments/Trainer are built.
# NOTE(review): the output of the Trainer cell reports this variable as deprecated
# (to be removed in v5) and points to `report_to` as the replacement.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# Training configuration used for whichever classifier is currently bound to `model`.
training_args = TrainingArguments(
    # NOTE(review): the directory name says "Transformer" even when `model` is the
    # Conv1D or LSTM classifier; the trailing comment lists the names presumably
    # intended for the other runs.
    output_dir="abte-restaurants-Transformer", # "transformer-encoder", "lstm", "conv1d"
    logging_dir="logs",
    learning_rate=2e-5,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=100,
    weight_decay=0.01,
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # After training, reload the checkpoint selected by the metric below.
    load_best_model_at_end=True,
    # "f1-score" is the key returned by compute_metrics.
    metric_for_best_model="f1-score",
    # report_to="wandb",
)

# Train on the processed train split; the test split doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## CNN - Conv1D

In [None]:
# Build a fresh Conv1D model and Trainer for this section, so that running the
# notebook top to bottom trains the architecture named in the heading instead of
# whichever `model` happened to be created last.
# NOTE(review): training_args (including its output_dir) is shared by all three
# training sections; confirm writing their checkpoints to one directory is acceptable.
model = ABTEConv1DClassifier(PretrainedConfig(), len(tokenizer.get_vocab()), num_classes=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,F1-score
1,1.0202,0.992206,0.179278
2,0.9546,0.932082,0.224542
3,0.8952,0.878204,0.263747
4,0.8418,0.829877,0.304574
5,0.7961,0.786985,0.324314
6,0.7523,0.748592,0.339758
7,0.7144,0.714194,0.352072
8,0.6787,0.683451,0.361029
9,0.6502,0.655916,0.362347
10,0.62,0.630792,0.372032




TrainOutput(global_step=1500, training_loss=0.40054638767242434, metrics={'train_runtime': 1198.9685, 'train_samples_per_second': 300.425, 'train_steps_per_second': 1.251, 'total_flos': 33743172198000.0, 'train_loss': 0.40054638767242434, 'epoch': 100.0})

## LSTM

In [None]:
# Build a fresh LSTM model and Trainer for this section, so that running the
# notebook top to bottom trains the architecture named in the heading instead of
# whichever `model` happened to be created last.
# NOTE(review): training_args (including its output_dir) is shared by all three
# training sections; confirm writing their checkpoints to one directory is acceptable.
model = ABTELSTMClassifier(PretrainedConfig(), len(tokenizer.get_vocab()), num_classes=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,F1-score
1,1.073,1.058903,0.208113
2,1.0479,1.034591,0.210485
3,1.0238,1.010454,0.20953
4,0.999,0.986081,0.195829
5,0.9742,0.961075,0.183955
6,0.9479,0.935121,0.162724
7,0.9215,0.907805,0.159808
8,0.8911,0.878928,0.15033
9,0.8613,0.848191,0.147045
10,0.8259,0.815778,0.099225




TrainOutput(global_step=1500, training_loss=0.5242962853113811, metrics={'train_runtime': 2596.6048, 'train_samples_per_second': 138.72, 'train_steps_per_second': 0.578, 'total_flos': 89995508223600.0, 'train_loss': 0.5242962853113811, 'epoch': 100.0})

## Transformer

In [None]:
# Build a fresh Transformer model and Trainer for this section, so the run below
# starts from newly initialised weights rather than continuing to train a model
# that an earlier `trainer.train()` cell has already updated.
# NOTE(review): training_args (including its output_dir) is shared by all three
# training sections; confirm writing their checkpoints to one directory is acceptable.
model = ABTETransformerClassifier(PretrainedConfig(), len(tokenizer.get_vocab()), num_classes=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,F1-score
1,0.6492,0.638389,0.0
2,0.5508,0.601405,0.0
3,0.5204,0.581249,0.0
4,0.4924,0.554008,0.186293
5,0.4765,0.527038,0.282887
6,0.4518,0.506867,0.336779
7,0.4286,0.490607,0.36671
8,0.4072,0.481482,0.387796
9,0.3978,0.461415,0.444247
10,0.3811,0.470124,0.44642


