In [1]:
!pip install -q tensorflow
!pip install -q datasets
!pip install -q accelerate
!pip install -q transformers
!pip install -q emoji
!pip install -q tf-keras

In [2]:
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
# Add the parent directory to the module search path. Later cells import `processors.*`,
# `acsa_model` and `helper`, which presumably live there (TODO confirm the repo layout).
sys.path.append('..')
# Last expression of the cell: display the installed TensorFlow version.
tf.__version__

'2.19.0'

In [None]:
# Disabled GPU check: running this code results in an error, so it is left commented out and can be ignored.

# if tf.config.list_physical_devices('GPU'):
#     physical_devices = tf.config.list_physical_devices('GPU')
#     tf.config.experimental.set_memory_growth(physical_devices[0], True)
#     print('Using GPU:', tf.test.gpu_device_name())
#     !nvcc -V
# else: raise ValueError('Running on CPU is not recommended.')

In [None]:
# Skip this step if run locally

# !git clone https://github.com/khamdd/absa-2025
# %cd ./absa-2025
# !mkdir predictions
# !ls

# Constants Setup

In [5]:
# Raw XML dataset; converted to CSV by Eval2014Loader.xmlToCSV in a later cell.
RAW_DATASET_PATH = r'../datasets/eval_2014/Restaurants_Train_v2.xml'
# CSV files for the train / validation / test splits written by the splitting cell.
# Note: TRAIN_PATH is also the output path of the initial XML -> CSV conversion.
TRAIN_PATH = r'../datasets/eval_2014/Restaurants_Train_v2_train.csv'
VAL_PATH = r'../datasets/eval_2014/Restaurants_Train_v2_dev.csv'
TEST_PATH = r'../datasets/eval_2014/Restaurants_Train_v2_test.csv'
# Checkpoint name passed to AutoTokenizer.from_pretrained and to ABSA2025MultiTask.
PRETRAINED_MODEL = 'bert-base-uncased'
# Name given to the model; also used to build the path the weights are saved under.
MODEL_NAME = "Restaurant-v1"
# Passed to Eval2014Loader.preprocess_and_tokenize (presumably the tokenizer's max sequence length - TODO confirm).
MAX_LENGTH = 256
# Batch size of the tf.data datasets; BATCH_SIZE * 2 is passed to the preprocessing step.
BATCH_SIZE = 21
# Maximum number of training epochs (early stopping may end training sooner).
EPOCHS = 20

In [7]:
from processors.eval2014_processor import Eval2014Loader

# Convert the raw XML dataset to CSV, written to TRAIN_PATH.
# NOTE: at this point TRAIN_PATH holds the *full* dataset; the splitting cell below
# overwrites this same file with only the training split.
Eval2014Loader.xmlToCSV(RAW_DATASET_PATH, TRAIN_PATH)

CSV file generated:../datasets/eval_2014/Restaurants_Train_v2_train.csv


In [9]:
import os

from sklearn.model_selection import train_test_split
import pandas as pd

# Split from a freshly converted copy of the *full* dataset instead of reading TRAIN_PATH.
# TRAIN_PATH is overwritten with the 70% training split at the end of this cell, so using it
# as the input made the cell non-idempotent: re-running it would split the already reduced
# training split again and silently shrink every split.
FULL_CSV_PATH = os.path.splitext(RAW_DATASET_PATH)[0] + '_full.csv'
Eval2014Loader.xmlToCSV(RAW_DATASET_PATH, FULL_CSV_PATH)
dataset = pd.read_csv(FULL_CSV_PATH)

train_size = 0.7  # 70% for training
test_val_size = 0.3  # Remaining 30% for testing and validation
val_size = 0.5  # The held-out 30% is halved: one half validation, one half test

# First split: train vs. held-out (validation + test). Fixed seed for reproducibility.
train_data, test_val_data = train_test_split(dataset, test_size=test_val_size, random_state=42)

# Second split: held-out rows into validation and test.
val_data, test_data = train_test_split(test_val_data, test_size=val_size, random_state=42)

# Sanity check: every row of the full dataset ends up in exactly one split.
assert len(train_data) + len(val_data) + len(test_data) == len(dataset)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

train_data.to_csv(TRAIN_PATH, index=False)
val_data.to_csv(VAL_PATH, index=False)
test_data.to_csv(TEST_PATH, index=False)

Train size: 2128
Validation size: 456
Test size: 457


In [11]:
# Load the three CSV splits; the displayed result is a DatasetDict with
# 'train', 'val' and 'test' splits (see the cell output).
raw_datasets = Eval2014Loader.load(TRAIN_PATH, VAL_PATH, TEST_PATH)
raw_datasets

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Review', 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'],
        num_rows: 2128
    })
    val: Dataset({
        features: ['Review', 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'],
        num_rows: 456
    })
    test: Dataset({
        features: ['Review', 'ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'],
        num_rows: 457
    })
})

# Preprocess and Tokenize data

In [14]:
from processors.english_processor import EnglishTextPreprocessor

# Text preprocessor handed to Eval2014Loader.preprocess_and_tokenize further down.
# Constructing it triggers NLTK resource downloads (punkt, wordnet, omw-1.4 per the cell output).
eng_preprocessor = EnglishTextPreprocessor()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dangd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dangd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dangd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
from transformers import AutoTokenizer
# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# Smoke test: encode a sample sentence and decode it back to inspect the special tokens.
tokens = tokenizer.encode("This is a auto tokenizer test string")
print('Encode:', tokens, '\nDecode:', tokenizer.decode(tokens))
# Names of the model inputs the tokenizer produces; reused later as the tf.data feature columns.
tokenizer.model_input_names

Encode: [101, 2023, 2003, 1037, 8285, 19204, 17629, 3231, 5164, 102] 
Decode: [CLS] this is a auto tokenizer test string [SEP]


['input_ids', 'token_type_ids', 'attention_mask']

In [18]:
# Preprocess and tokenize every split. BATCH_SIZE * 2 is passed as the fourth argument
# (presumably the batch size of the underlying map call - TODO confirm against Eval2014Loader).
preprocessed_datasets = Eval2014Loader.preprocess_and_tokenize(raw_datasets, eng_preprocessor, tokenizer, BATCH_SIZE * 2, MAX_LENGTH)
# Persist the tokenized splits; the next cell reloads them from this path.
preprocessed_datasets.save_to_disk('../datasets/preprocessed_restaurant')
display(preprocessed_datasets)
# Side-by-side preview of the first 10 training examples:
# raw review text, encoded token ids, and the ids decoded back to text.
pd.DataFrame({
    'raw_datasets': raw_datasets['train']['Review'][:10],
    'encoded_input_ids': preprocessed_datasets['train']['input_ids'][:10],
    'decoded_input_ids': [tokenizer.decode(preprocessed_datasets['train'][i]['input_ids']) for i in range(10)]
})

[INFO] Tokenizing text data...


Map:   0%|          | 0/2128 [00:00<?, ? examples/s]

Map:   0%|          | 0/456 [00:00<?, ? examples/s]

Map:   0%|          | 0/457 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2128 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/456 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/457 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2128
    })
    val: Dataset({
        features: ['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 456
    })
    test: Dataset({
        features: ['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 457
    })
})

Unnamed: 0,raw_datasets,encoded_input_ids,decoded_input_ids
0,I loved this place!!,"[101, 1045, 3866, 2023, 2173, 102, 0, 0, 0, 0,...",[CLS] i loved this place [SEP] [PAD] [PAD] [PA...
1,The rice was poor quality and was cooked so ba...,"[101, 1996, 5785, 11333, 3532, 3737, 1998, 113...",[CLS] the rice wa poor quality and wa cooked s...
2,The wine list is interesting and has many good...,"[101, 1996, 4511, 2862, 2003, 5875, 1998, 5292...",[CLS] the wine list is interesting and ha many...
3,Plus they made a perfect martini.,"[101, 4606, 2027, 2081, 1037, 3819, 24480, 102...",[CLS] plus they made a perfect martini [SEP] [...
4,I've lived in NYC all my life and had never be...,"[101, 4921, 2063, 2973, 1999, 16392, 2035, 202...",[CLS] ive lived in nyc all my life and had nev...
5,THE FOOD PORTIONS ARE REALLY LARGE.,"[101, 1996, 2833, 4664, 2024, 2428, 2312, 102,...",[CLS] the food portion are really large [SEP] ...
6,"As we were leaving, the couple standing by the...","[101, 1037, 2057, 2020, 2975, 1996, 3232, 3061...",[CLS] a we were leaving the couple standing by...
7,They came out over cooked and the cheese was a...,"[101, 2027, 2234, 2041, 2058, 12984, 1998, 199...",[CLS] they came out over cooked and the cheese...
8,Brava La Pizza Fresca!,"[101, 11655, 3567, 2474, 10733, 10424, 2229, 3...",[CLS] brava la pizza fresca [SEP] [PAD] [PAD] ...
9,"If you go for the pre-theatre menu, it's an ev...","[101, 2065, 2017, 2175, 2005, 1996, 3653, 1076...",[CLS] if you go for the pretheatre menu it an ...


In [20]:
from datasets import load_from_disk
# Reload the tokenized splits saved by the previous cell.
preprocessed_datasets = load_from_disk('../datasets/preprocessed_restaurant')
# Replace the per-aspect label columns with a single 'FlattenOneHotLabels' column
# (see the feature list in the cell output).
preprocessed_datasets = Eval2014Loader.labels_to_flatten_onehot(preprocessed_datasets)
preprocessed_datasets

[INFO] Transforming "Aspect#Categoy,Polarity" labels to flattened one-hot encoding...


Map (num_proc=8):   0%|          | 0/2128 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/456 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/457 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['FlattenOneHotLabels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2128
    })
    val: Dataset({
        features: ['FlattenOneHotLabels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 456
    })
    test: Dataset({
        features: ['FlattenOneHotLabels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 457
    })
})

# Prepare for TensorFlow Training

In [23]:
import math

# Aspect categories are all columns of the raw dataset after the first one ('Review').
ASPECT_CATEGORY_NAMES = raw_datasets['train'].column_names[1:]
# Number of batches per epoch. The training tf.data pipeline keeps the last, partially filled
# batch, so the count must be rounded UP: 2128 examples / 21 per batch -> 102 steps, which is
# what the Keras progress bar reports. Floor division gave 101 and therefore underestimated
# total_steps and the warmup/decay lengths derived from it.
steps_per_epoch = math.ceil(len(preprocessed_datasets['train']) / BATCH_SIZE)
# Upper bound on optimizer steps (early stopping may end training sooner).
total_steps = EPOCHS * steps_per_epoch
print(ASPECT_CATEGORY_NAMES)

['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service']


In [25]:
def _split_to_tf_dataset(split_name, shuffle):
    """Convert one split of `preprocessed_datasets` into a batched tf.data.Dataset.

    Features are the tokenizer's model inputs; labels come from 'FlattenOneHotLabels'.
    """
    return preprocessed_datasets[split_name].to_tf_dataset(
        columns=tokenizer.model_input_names,
        label_cols='FlattenOneHotLabels',
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=8,
    )


# Only the training split is shuffled; validation and test keep their original order.
train_tf_dataset = _split_to_tf_dataset('train', shuffle=True)
val_tf_dataset = _split_to_tf_dataset('val', shuffle=False)
test_tf_dataset = _split_to_tf_dataset('test', shuffle=False)

In [27]:
from tf_keras.optimizers import Adam
from tf_keras.optimizers.schedules import CosineDecay
from tf_keras.callbacks import EarlyStopping

# Adam optimizer driven by a CosineDecay learning-rate schedule with a warmup phase.
# Warmup and decay lengths are expressed as fractions of `total_steps`.
optimizer = Adam(learning_rate=CosineDecay(
    initial_learning_rate = 1e-4,
    warmup_target = 2e-4,
    warmup_steps = int(total_steps * 0.15), # 15% of total_steps
    decay_steps = int(total_steps * 0.3), # Next 30% of total_steps
    alpha = 0.1, # Minimum lr for decay as a fraction of initial_learning_rate
))

# Stop training when the validation loss stops improving.
early_stop_callback = EarlyStopping(
    monitor = 'val_loss',
    patience = 3, # Stop if there is no improvement for 3 consecutive epochs
    restore_best_weights = True, # Setting this to False and training longer can yield HIGHER metrics on the test set.
    # Author's note: in some experiments, weights with a higher val_loss still produced better metric reports on the test set.
    # This may be because the training set and the test set are more similar to each other than to the validation data.
    # However, exploiting that would not be fair: it uses prior knowledge of the test set to tune the training procedure.
    # In a real-world setting, training decisions should be based only on performance on the validation data.
    verbose = 1
)




# Fine Tuning with TensorFlow

In [30]:
%%time
# Cell magic above reports the wall-clock time of the whole cell (it must stay on the first line).
from acsa_model import ABSA2025MultiTask
from helper import plot_training_history
# Build the model from the pretrained checkpoint, the aspect category names and the optimizer.
model = ABSA2025MultiTask(PRETRAINED_MODEL, ASPECT_CATEGORY_NAMES, optimizer, name=MODEL_NAME)

# Train for at most EPOCHS epochs, validating on the validation split each epoch;
# the early-stopping callback may end training sooner. Only the `.history` dict is kept.
history = model.fit(
    train_tf_dataset,
    validation_data = val_tf_dataset,
    callbacks = [early_stop_callback],
    epochs = EPOCHS,
    verbose = 1
).history

# Save the weights under ./weights/<MODEL_NAME>/ using the 'tf' save format, then plot the curves.
model.save_weights(f'./weights/{MODEL_NAME}/{MODEL_NAME}', save_format='tf')
plot_training_history(history)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/20

  2/102 [..............................] - ETA: 58:03 - loss: 0.5360  

KeyboardInterrupt: 