In [1]:
!pip install -q tensorflow
!pip install -q datasets
!pip install -q accelerate
!pip install -q transformers
!pip install -q emoji

In [3]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
# Add the parent directory to the module search path
sys.path.append('..')
# Last expression of the cell: displays the installed TensorFlow version
tf.__version__

'2.19.0'

In [None]:
if tf.config.list_physical_devices('GPU'):
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print('Using GPU:', tf.test.gpu_device_name())
    !nvcc -V
else: raise ValueError('Running on CPU is not recommended.')

In [None]:
!git clone https://github.com/khamdd/absa-2025
%cd ./absa-2025
!mkdir predictions
!ls

# Constants Setup

In [5]:
# --- Data locations: the raw XML and the CSV generated from it share one stem ---
_TRAIN_FILE_STEM = '../datasets/eval_2014/Restaurants_Train_v2'
TRAIN_PATH = _TRAIN_FILE_STEM + '.xml'
PREPROCESSED_TRAIN_PATH = _TRAIN_FILE_STEM + '.csv'

# --- Model identifiers ---
PRETRAINED_MODEL = 'bert-base-uncased'  # checkpoint name used for tokenizer and model
MODEL_NAME = "Restaurant-v1"            # name given to the model and its weights folder

# --- Tokenization / training hyper-parameters ---
MAX_LENGTH = 256
BATCH_SIZE = 21
EPOCHS = 20

In [7]:
from processors.eval2014_processor import Eval2014Loader

# Convert the raw XML training file into a CSV written to PREPROCESSED_TRAIN_PATH
Eval2014Loader.xmlToCSV(TRAIN_PATH, PREPROCESSED_TRAIN_PATH)

CSV file generated:../datasets/eval_2014/Restaurants_Train_v2.csv


In [9]:
# Load the generated CSV; in the recorded run this yields a DatasetDict with a
# single 'train' split holding the 'Review' text plus one column per aspect category.
raw_datasets = Eval2014Loader.load(PREPROCESSED_TRAIN_PATH)
raw_datasets

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Review', 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'],
        num_rows: 3041
    })
})

# Preprocess and Tokenize data

In [11]:
from processors.english_processor import EnglishTextPreprocessor

# Text preprocessor that is later handed to Eval2014Loader.preprocess_and_tokenize
eng_preprocessor = EnglishTextPreprocessor()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dangd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dangd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dangd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
from transformers import AutoTokenizer
# Load the tokenizer that matches the PRETRAINED_MODEL checkpoint
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# Smoke test: encode a sample sentence to token ids and decode it back to text
tokens = tokenizer.encode("This is a auto tokenizer test string")
print('Encode:', tokens, '\nDecode:', tokenizer.decode(tokens))
# Last expression of the cell: names of the inputs this tokenizer produces
tokenizer.model_input_names

Encode: [101, 2023, 2003, 1037, 8285, 19204, 17629, 3231, 5164, 102] 
Decode: [CLS] this is a auto tokenizer test string [SEP]


['input_ids', 'token_type_ids', 'attention_mask']

In [15]:
# Clean and tokenize every review.
# NOTE(review): the last two positional arguments are presumably (batch_size, max_length)
# — confirm against the signature of Eval2014Loader.preprocess_and_tokenize.
preprocessed_datasets = Eval2014Loader.preprocess_and_tokenize(raw_datasets, eng_preprocessor, tokenizer, BATCH_SIZE * 2, MAX_LENGTH)
# Persist the tokenized dataset so it can be reloaded with load_from_disk
preprocessed_datasets.save_to_disk('../datasets/preprocessed_restaurant')
display(preprocessed_datasets)
# Preview of the first 10 rows: raw review, token ids, and the ids decoded back to text
pd.DataFrame({
    'raw_datasets': raw_datasets['train']['Review'][:10],
    'encoded_input_ids': preprocessed_datasets['train']['input_ids'][:10],
    'decoded_input_ids': [tokenizer.decode(preprocessed_datasets['train'][i]['input_ids']) for i in range(10)]
})

[INFO] Tokenizing text data...


Map:   0%|          | 0/3041 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3041 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3041
    })
})

Unnamed: 0,raw_datasets,encoded_input_ids,decoded_input_ids
0,But the staff was so horrible to us.,"[101, 2021, 1996, 3095, 2001, 2061, 9202, 2000...",[CLS] but the staff was so horrible to us. [SE...
1,"To be completely fair, the only redeeming fact...","[101, 2000, 2022, 3294, 4189, 1010, 1996, 2069...","[CLS] to be completely fair, the only redeemin..."
2,"The food is uniformly exceptional, with a very...","[101, 1996, 2833, 2003, 27423, 11813, 1010, 20...","[CLS] the food is uniformly exceptional, with ..."
3,Where Gabriela personaly greets you and recomm...,"[101, 2073, 6127, 2050, 3167, 2100, 17021, 201...",[CLS] where gabriela personaly greets you and ...
4,"For those that go once and don't enjoy it, all...","[101, 2005, 2216, 2008, 2175, 2320, 1998, 2123...",[CLS] for those that go once and don ' t enjoy...
5,"Not only was the food outstanding, but the lit...","[101, 2025, 2069, 2001, 1996, 2833, 5151, 1010...","[CLS] not only was the food outstanding, but t..."
6,It is very overpriced and not very tasty.,"[101, 2009, 2003, 2200, 2058, 18098, 6610, 209...",[CLS] it is very overpriced and not very tasty...
7,Our agreed favorite is the orrechiete with sau...,"[101, 2256, 3530, 5440, 2003, 1996, 26914, 159...",[CLS] our agreed favorite is the orrechiete wi...
8,The Bagels have an outstanding taste with a te...,"[101, 1996, 4524, 9050, 2031, 2019, 5151, 5510...",[CLS] the bagels have an outstanding taste wit...
9,Nevertheless the food itself is pretty good.,"[101, 6600, 1996, 2833, 2993, 2003, 3492, 2204...",[CLS] nevertheless the food itself is pretty g...


In [None]:
from datasets import load_from_disk
# Reload the tokenized dataset from the directory written by save_to_disk
preprocessed_datasets = load_from_disk('../datasets/preprocessed_restaurant')
preprocessed_datasets

In [None]:
# Using Categories

# The aspect categories are the label columns of the dataset, i.e. every column
# except the review text. Reading them from `raw_datasets` keeps this cell runnable
# on a fresh kernel (no `df` is defined anywhere in this notebook).
categories = [column for column in raw_datasets['train'].column_names if column != 'Review']

print(categories)

In [None]:
class PolarityMapping:
    """Conversion tables between polarity names, class indices and one-hot vectors.

    Index 0 stands for "no polarity" (``None``); indices 1-3 are 'positive',
    'negative' and 'neutral'.
    """
    # Position in this tuple == class index
    _POLARITIES = (None, 'positive', 'negative', 'neutral')

    # class index -> polarity name
    INDEX_TO_POLARITY = dict(enumerate(_POLARITIES))
    # class index -> one-hot list with a single 1 at that index
    INDEX_TO_ONEHOT = {hot: [int(pos == hot) for pos in range(4)] for hot in range(4)}
    # polarity name -> class index (inverse of INDEX_TO_POLARITY)
    POLARITY_TO_INDEX = {name: idx for idx, name in enumerate(_POLARITIES)}

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the PRETRAINED_MODEL checkpoint (same call as in the earlier tokenizer cell)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

def preprocess_and_tokenize(data, tokenizer, max_length=128, batch_size=32):
    """Tokenize labelled sentences batch by batch and attach one-hot labels.

    Args:
        data: Sliceable sequence of dicts, each holding a "sentence" and a
            "label" that is a key of PolarityMapping.INDEX_TO_ONEHOT.
        tokenizer: Callable invoked with a list of sentences; its result must
            provide "input_ids" and "attention_mask".
        max_length: Forwarded to the tokenizer together with
            padding="max_length" and truncation=True.
        batch_size: Number of sentences passed to the tokenizer per call.

    Returns:
        dict with the keys "input_ids", "attention_mask" and
        "FlattenOneHotLabels", each a list with one entry per item of ``data``.
    """
    print("[INFO] Preprocessing and tokenizing data...")

    input_ids, attention_masks, onehot_labels = [], [], []

    for start in range(0, len(data), batch_size):
        chunk = data[start:start + batch_size]
        texts = [record["sentence"] for record in chunk]
        label_ids = [record["label"] for record in chunk]

        # Tokenize the sentences of this chunk in a single call
        encoded = tokenizer(
            texts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors=None,
        )
        # Translate each integer label into its one-hot vector
        chunk_onehots = [PolarityMapping.INDEX_TO_ONEHOT[label_id] for label_id in label_ids]

        input_ids.extend(encoded["input_ids"])
        attention_masks.extend(encoded["attention_mask"])
        onehot_labels.extend(chunk_onehots)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "FlattenOneHotLabels": onehot_labels,
    }

In [None]:
import xml.etree.ElementTree as ET

# Parse the raw XML annotations. `root` must be created here: no other cell
# defines it, so the loop below would raise a NameError on a fresh kernel.
root = ET.parse(TRAIN_PATH).getroot()

# Build one record per (sentence, aspect category) pair. "conflict" polarities
# are skipped because PolarityMapping.POLARITY_TO_INDEX has no entry for them.
data = []
for sentence in root.iter("sentence"):
    text = sentence.find("text").text
    aspect_categories = sentence.find("aspectCategories")
    if aspect_categories is not None:
        for category in aspect_categories.findall("aspectCategory"):
            polarity = category.attrib["polarity"]
            if polarity != "conflict":
                data.append({
                    "sentence": text,
                    "label": PolarityMapping.POLARITY_TO_INDEX[polarity]
                })

# Preprocess and tokenize
# NOTE(review): these two assignments override the BATCH_SIZE / MAX_LENGTH values
# from the constants cell for every cell executed after this one.
BATCH_SIZE = 32
MAX_LENGTH = 128
tokenized_data = preprocess_and_tokenize(data, tokenizer, max_length=MAX_LENGTH, batch_size=BATCH_SIZE)

# Save to disk
# NOTE(review): output_path is empty, so the file is written as ".csv" in the
# current working directory — set a real destination.
output_path = ''
pd.DataFrame(tokenized_data).to_csv(f"{output_path}.csv", index=False)

# Display a sample (first 10 records) using the notebook's rich DataFrame rendering
sample_df = pd.DataFrame({
    'raw_sentence': [item["sentence"] for item in data[:10]],
    'encoded_input_ids': tokenized_data["input_ids"][:10],
    'decoded_input_ids': [tokenizer.decode(ids) for ids in tokenized_data["input_ids"][:10]],
    'FlattenOneHotLabels': tokenized_data["FlattenOneHotLabels"][:10]
})
sample_df

In [None]:
# Aspect category names are the label columns of the dataset, i.e. every column
# except the review text. They are read from `raw_datasets` because no `df`
# is defined anywhere in this notebook.
ASPECT_CATEGORY_NAMES = [column for column in raw_datasets['train'].column_names if column != 'Review']
# Number of full batches per epoch (a trailing partial batch is not counted)
steps_per_epoch = len(tokenized_data['input_ids']) // BATCH_SIZE
total_steps = EPOCHS * steps_per_epoch
print(ASPECT_CATEGORY_NAMES)
print("Steps per epoch:", steps_per_epoch)
print("Total steps:", total_steps)

# Train the model

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import CosineDecay
from tensorflow.keras.callbacks import EarlyStopping

# Learning-rate schedule: warm up from 1e-4 to 2e-4, then cosine decay.
lr_schedule = CosineDecay(
    initial_learning_rate=1e-4,
    warmup_target=2e-4,
    warmup_steps=int(total_steps * 0.15),  # first 15% of total_steps
    decay_steps=int(total_steps * 0.3),    # following 30% of total_steps
    alpha=0.1,  # minimum lr for the decay, as a fraction of initial_learning_rate
)
optimizer = Adam(learning_rate=lr_schedule)

# Stop training once val_loss has not improved for 3 consecutive epochs.
#
# About restore_best_weights=True: setting it to False and training longer may give
# HIGHER metrics on the test set. In some experiments a higher val_loss still led to
# better test-set reports, possibly because the training and test sets resemble each
# other more than the validation data does. Exploiting that would not be fair, though:
# it tunes training on prior knowledge of the test set. In a real-world setting,
# training decisions should be based on validation performance only.
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
%%time
from acsa_model import VLSP2018MultiTask
from helper import plot_training_history
# Instantiate the project model from the pretrained checkpoint name, the aspect
# category names and the optimizer configured above.
model = VLSP2018MultiTask(PRETRAINED_MODEL, ASPECT_CATEGORY_NAMES, optimizer, name=MODEL_NAME)

# NOTE(review): train_tf_dataset and val_tf_dataset are not defined in any cell
# visible in this notebook — confirm they are created before this cell runs.
# `.history` presumably is the per-epoch metrics dict of a Keras History object.
history = model.fit(
    train_tf_dataset,
    validation_data = val_tf_dataset,
    callbacks = [early_stop_callback],
    epochs = EPOCHS,
    verbose = 1
).history

# NOTE(review): under Keras 3 (the default Keras for TF >= 2.16) save_weights expects a
# path ending in `.weights.h5` and has no save_format argument — confirm which Keras
# implementation VLSP2018MultiTask is built on.
model.save_weights(f'./weights/{MODEL_NAME}/{MODEL_NAME}', save_format='tf')
plot_training_history(history)