In [1]:
!pip install -q tensorflow
!pip install -q datasets
!pip install -q accelerate
!pip install -q transformers
!pip install -q emoji
!pip install -q tf-keras

In [3]:
import sys

# Put the parent directory on the import path; the project-local modules
# imported in later cells (e.g. `processors`) are presumably located there.
sys.path.append('..')

import numpy as np
import pandas as pd
import tensorflow as tf

# Show the installed TensorFlow version as the cell output.
tf.__version__

'2.19.0'

In [None]:
# Will result in error, ignore it

# if tf.config.list_physical_devices('GPU'):
#     physical_devices = tf.config.list_physical_devices('GPU')
#     tf.config.experimental.set_memory_growth(physical_devices[0], True)
#     print('Using GPU:', tf.test.gpu_device_name())
#     !nvcc -V
# else: raise ValueError('Running on CPU is not recommended.')

In [None]:
# Skip this step if run locally

# !git clone https://github.com/khamdd/absa-2025
# %cd ./absa-2025
# !mkdir predictions
# !ls

# Constants Setup

In [13]:
# --- Dataset files -----------------------------------------------------------
# All dataset files share one stem; only the suffix differs.
_DATASET_STEM = '../datasets/eval_2014/Restaurants_Train_v2'
RAW_DATASET_PATH = _DATASET_STEM + '.xml'      # raw XML input
TRAIN_PATH = _DATASET_STEM + '_train.csv'      # training CSV
VAL_PATH = _DATASET_STEM + '_dev.csv'          # validation CSV
TEST_PATH = _DATASET_STEM + '_test.csv'        # test CSV

# --- Model -------------------------------------------------------------------
PRETRAINED_MODEL = 'bert-base-uncased'  # checkpoint name for tokenizer and model
MODEL_NAME = 'Restaurant-v1'            # used for the model name and weights path

# --- Training hyper-parameters -----------------------------------------------
MAX_LENGTH = 256  # passed to the tokenization step
BATCH_SIZE = 21   # batch size of the training tf.data pipeline
EPOCHS = 20       # maximum number of epochs passed to fit()

In [15]:
from processors.eval2014_processor import Eval2014Loader

# Convert the raw XML dataset into a CSV file written to TRAIN_PATH.
# NOTE: the train/val/test split cell below later overwrites TRAIN_PATH with
# the training portion only, so after that cell has run this file no longer
# holds the full dataset.
Eval2014Loader.xmlToCSV(RAW_DATASET_PATH, TRAIN_PATH)

CSV file generated:../datasets/eval_2014/Restaurants_Train_v2_train.csv


In [19]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Re-create the *full* CSV from the raw XML under its own file name and split
# from that. Reading the full data from TRAIN_PATH (and then overwriting
# TRAIN_PATH with the 70% split) made this cell non-idempotent: every re-run
# split an already-shrunken training file again.
full_csv_path = RAW_DATASET_PATH.replace('.xml', '_full.csv')
Eval2014Loader.xmlToCSV(RAW_DATASET_PATH, full_csv_path)
dataset = pd.read_csv(full_csv_path)

# Split ratios
train_size = 0.7     # 70% for training
test_val_size = 0.3  # remaining 30% is shared by validation and test
val_size = 0.5       # fraction of the held-out 30% that goes to the test set;
                     # the other half becomes the validation set

# Step 1: split into train and held-out (validation + test)
train_data, test_val_data = train_test_split(dataset, test_size=test_val_size, random_state=42)

# Step 2: split the held-out part into validation and test
val_data, test_data = train_test_split(test_val_data, test_size=val_size, random_state=42)

# Sanity check: the three splits must partition the full dataset.
assert len(train_data) + len(val_data) + len(test_data) == len(dataset), \
    "train/val/test splits do not add up to the full dataset"

# Report the sizes
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

# Persist the splits to the paths the loader reads from
train_data.to_csv(TRAIN_PATH, index=False)
val_data.to_csv(VAL_PATH, index=False)
test_data.to_csv(TEST_PATH, index=False)

Train size: 2128
Validation size: 456
Test size: 457


In [None]:
# Load the train / validation / test CSV files through the project loader.
# The result is indexed by split name later on (e.g. raw_datasets['train']).
raw_datasets = Eval2014Loader.load(TRAIN_PATH, VAL_PATH, TEST_PATH)
# Display the loaded object as the cell output.
raw_datasets

# Preprocess and Tokenize data

In [None]:
from processors.english_processor import EnglishTextPreprocessor

# Text preprocessor instance; it is handed to
# Eval2014Loader.preprocess_and_tokenize in a later cell.
eng_preprocessor = EnglishTextPreprocessor()

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# Smoke test: encode a sample sentence to token ids and decode it back.
tokens = tokenizer.encode("This is a auto tokenizer test string")
print('Encode:', tokens, '\nDecode:', tokenizer.decode(tokens))
# Names of the model inputs the tokenizer produces; reused below as the
# feature columns of the tf.data pipeline.
tokenizer.model_input_names

In [None]:
# Preprocess and tokenize every split, then cache the result on disk so the
# next cell can reload it. (`BATCH_SIZE * 2` is presumably the batch size used
# while mapping over the dataset — confirm in
# Eval2014Loader.preprocess_and_tokenize.)
preprocessed_datasets = Eval2014Loader.preprocess_and_tokenize(raw_datasets, eng_preprocessor, tokenizer, BATCH_SIZE * 2, MAX_LENGTH)
preprocessed_datasets.save_to_disk('../datasets/preprocessed_restaurant')
display(preprocessed_datasets)

# Side-by-side preview of the first few training rows: raw text, token ids,
# and the ids decoded back to text.
# Slice the rows first and reuse the slice, so only the previewed rows are
# read (instead of whole columns), and a split with fewer than `preview_rows`
# rows does not raise an IndexError.
preview_rows = 10
raw_preview = raw_datasets['train'][:preview_rows]['Review']
encoded_preview = preprocessed_datasets['train'][:preview_rows]['input_ids']
pd.DataFrame({
    'raw_datasets': raw_preview,
    'encoded_input_ids': encoded_preview,
    'decoded_input_ids': [tokenizer.decode(ids) for ids in encoded_preview]
})

In [None]:
from datasets import load_from_disk
# Reload the tokenized datasets from the path the preprocessing cell saved
# them to, so this cell always starts from the on-disk copy.
preprocessed_datasets = load_from_disk('../datasets/preprocessed_restaurant')
# Convert the labels with the project loader. Presumably this adds the
# 'FlattenOneHotLabels' column used as the training label below — confirm in
# Eval2014Loader.labels_to_flatten_onehot.
preprocessed_datasets = Eval2014Loader.labels_to_flatten_onehot(preprocessed_datasets)
# Display the resulting object as the cell output.
preprocessed_datasets

# Prepare for TensorFlow Training

In [None]:
# Aspect-category names: every column of the raw training split except the
# first one (presumably the review text column — confirm against the CSV).
ASPECT_CATEGORY_NAMES = raw_datasets['train'].column_names[1:]

# Number of optimizer steps per epoch. The training tf.data pipeline keeps the
# final partial batch (no drop_remainder), so round *up*: floor division
# would undercount by one step per epoch whenever the dataset size is not a
# multiple of BATCH_SIZE, and the LR schedule derived from total_steps would
# be slightly too short.
num_train_examples = len(preprocessed_datasets['train'])
steps_per_epoch = -(-num_train_examples // BATCH_SIZE)  # ceiling division
total_steps = EPOCHS * steps_per_epoch
print(ASPECT_CATEGORY_NAMES)

In [None]:
# Wrap the training split in a tf.data pipeline: the tokenizer's model inputs
# are the features, the flattened one-hot labels are the targets.
train_tf_dataset = preprocessed_datasets['train'].to_tf_dataset(
    columns=tokenizer.model_input_names,
    label_cols='FlattenOneHotLabels',
    batch_size=BATCH_SIZE,
    shuffle=True,      # reshuffle the training examples
    num_workers=8,     # number of worker processes used to load batches
)

In [None]:
from tf_keras.callbacks import EarlyStopping
from tf_keras.optimizers import Adam
from tf_keras.optimizers.schedules import CosineDecay

# Learning-rate schedule: warm up from 1e-4 to 2e-4, then cosine-decay.
lr_schedule = CosineDecay(
    initial_learning_rate=1e-4,
    warmup_target=2e-4,
    warmup_steps=int(total_steps * 0.15),  # 15% of total_steps
    decay_steps=int(total_steps * 0.3),    # next 30% of total_steps
    alpha=0.1,  # minimum lr for decay as a fraction of initial_learning_rate
)
optimizer = Adam(learning_rate=lr_schedule)

# Early stopping on the validation loss.
# NOTE: 'val_loss' only exists when `validation_data` is passed to fit();
# without it the callback cannot trigger.
#
# On restore_best_weights: setting it to False (i.e. training longer) can
# yield HIGHER metrics on the test set. In experiments, a higher val_loss
# still produced better metric reports on the test set, possibly because the
# training and test sets are more similar to each other than to the
# validation data. That would not be a fair comparison, though: it uses prior
# knowledge about the test set to tune training. In a real-world setting,
# training decisions should be based on validation performance only.
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,                 # stop if no improvement after 3 epochs
    restore_best_weights=True,
    verbose=1,
)

# Fine Tuning with TensorFlow

In [None]:
%%time
from acsa_model import ABSA2025MultiTask
from helper import plot_training_history
model = ABSA2025MultiTask(PRETRAINED_MODEL, ASPECT_CATEGORY_NAMES, optimizer, name=MODEL_NAME)

history = model.fit(
    train_tf_dataset,
    # validation_data = val_tf_dataset,
    callbacks = [early_stop_callback],
    epochs = EPOCHS,
    verbose = 1
).history

model.save_weights(f'./weights/{MODEL_NAME}/{MODEL_NAME}', save_format='tf')
plot_training_history(history)