## Instruction fine-tuning of Flan-T5: Aspect-Sentiment Pair Extraction (ASE)

In [2]:
# For google colab
# Installs the notebook's dependencies into the running environment.
# -q keeps pip's output quiet; -U (last line only) upgrades accelerate if a copy is already installed.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones that produced the outputs saved in this notebook.
!pip install transformers -q
!pip install datasets -q
!pip install evaluate -q
!pip install sentencepiece -q
!pip install accelerate -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import ast
import os
import random
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import torch
from transformers import AutoTokenizer
from datasets import DatasetDict
# import wandb

# Project root; the relative paths used later ('./Datasets', './Models', './Output')
# are resolved against it. Assumes Google Drive is mounted under /content/drive (Colab).
root_path = '/content/drive/MyDrive/instruction_tuning'
# `torch.has_mps` is deprecated; ask the MPS backend directly whether it can be used.
use_mps = torch.backends.mps.is_available()
# Switch the working directory so the relative paths above (and, presumably, the
# project-local imports that follow) resolve from the project root.
os.chdir(root_path)

from data_preprocessing import promptBuilder, create_hf_dataset
from utils import instructionModelHandler

In [4]:
# Define task + base model
task_name = 'ase'
base_model = 'google/flan-t5-base'
# Checkpoints are written to ./Models/<task>/<base model id with the '/' removed>
model_out_path = os.path.join('./Models', task_name, base_model.replace('/', ''))
print('Model output path: ', model_out_path)

# Load the data - Multi-Aspect-Multi-Sentiment (MAMS)
m_train_file_path = './Datasets/MAMS/train.csv'
m_test_file_path = './Datasets/MAMS/test.csv'
m_val_file_path = './Datasets/MAMS/val.csv'
m_tr_df = pd.read_csv(m_train_file_path)
m_te_df = pd.read_csv(m_test_file_path)
m_val_df = pd.read_csv(m_val_file_path)

# Convert strings to lists.
# ast.literal_eval only accepts Python literals, so - unlike eval() - it cannot
# execute code that happens to be embedded in a CSV cell.
for df in [m_tr_df, m_te_df, m_val_df]:
    df['aspects'] = [ast.literal_eval(cell) for cell in df['aspects']]
    df['sentiments'] = [ast.literal_eval(cell) for cell in df['sentiments']]

Model output path:  ./Models/ase/googleflan-t5-base


In [5]:
# Pre-process data - create input prompts and output label sequences.
# The task comes from `task_name` (set with the base model) instead of a second
# hard-coded 'ase', so the task is configured in exactly one place.
prompt_builder = promptBuilder(task=task_name)

# One prompt per review text, for each split (train, test, validation)
tr_input, te_input, val_input = (
    prompt_builder.generate_inputs(split_df['raw_text'])
    for split_df in (m_tr_df, m_te_df, m_val_df)
)

# Matching target sequences built from the aspect and sentiment columns
tr_output, te_output, val_output = (
    prompt_builder.generate_target_outputs(aspects=split_df['aspects'],
                                           sentiments=split_df['sentiments'])
    for split_df in (m_tr_df, m_te_df, m_val_df)
)

# Create HuggingFace dataset object
hf_dataset = create_hf_dataset(
    tr_input=tr_input, tr_output=tr_output,
    te_input=te_input, te_output=te_output,
    val_input=val_input, val_output=val_output,
)

In [6]:
def tokenize_function(sample, max_input_length=512, max_target_length=64):
    """Tokenize a batch of examples from the HF dataset.

    Uses the notebook-level `tokenizer`, which must be defined before this
    function is called.

    Args:
        sample: mapping with a 'text' entry (input prompts) and a 'labels'
            entry (target strings).
        max_input_length: truncation length for the inputs. Defaults to 512,
            the value that used to be hard-coded.
        max_target_length: truncation length for the targets. Defaults to 64,
            the value that used to be hard-coded.

    Returns:
        The tokenizer output for the inputs, with the token ids of the targets
        stored under the "labels" key.
    """
    model_inputs = tokenizer(sample['text'], max_length=max_input_length, truncation=True)
    target_encoding = tokenizer(sample["labels"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = target_encoding.input_ids
    return model_inputs

# Load the tokenizer that belongs to the base checkpoint, then tokenize every split
tokenizer = AutoTokenizer.from_pretrained(base_model)
hf_tr_tokenized, hf_te_tokenized, hf_val_tokenized = (
    hf_dataset[split].map(tokenize_function, batched=True)
    for split in ('train', 'test', 'validation')
)

# Bundle the tokenized splits back into a single DatasetDict
hf_tokenized_dataset = DatasetDict({
    'train': hf_tr_tokenized,
    'test': hf_te_tokenized,
    'validation': hf_val_tokenized,
})

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/4297 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
# Initialise model.
# The task is taken from `task_name` (defined alongside `base_model`) rather than
# repeating the literal 'ase', so the task is configured in a single place.
experiment = instructionModelHandler(base_model=base_model, task=task_name)

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
# Training arguments, collected as keyword arguments for experiment.train(...)
# NOTE(review): key names look like Hugging Face Seq2Seq training arguments -
# confirm how utils.instructionModelHandler.train consumes them.
training_args = dict(
    output_dir=model_out_path,
    lr_scheduler_type='cosine',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    push_to_hub=False,
    eval_accumulation_steps=1,
    predict_with_generate=True,

    # wandb tracking (currently disabled)
    # report_to='wandb',
    # run_name='flan-t5_ft_mams',

    # model save strategy: save/evaluate per epoch, select on lowest eval_loss
    save_total_limit=1,
    save_strategy='epoch',
    evaluation_strategy="epoch",
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,

    # hyperparams
    learning_rate=5e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
)

In [None]:
# # wandb
# wandb.login()
# %env WANDB_PROJECT=ft_all_data

In [9]:
# Fine tuning
# Runs training on the tokenized splits; every entry of training_args is passed
# through as a keyword argument.
# NOTE(review): the return value is kept in model_trainer - presumably the
# underlying trainer object; confirm in utils.instructionModelHandler.train.
model_trainer = experiment.train(hf_tokenized_dataset, **training_args)

Trainer device: cuda:0


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Eval precision,Eval recall,Eval f1
1,0.9503,0.384875,0.606987,0.625656,0.61618
2,0.4244,0.352917,0.652141,0.651163,0.651652
3,0.3518,0.344764,0.659124,0.677419,0.668147
4,0.3202,0.348987,0.664956,0.68042,0.672599


In [None]:
# wandb.finish()

In [10]:
# MAMS prediction labels - generate model output for each split in turn
tr_pred_labels, te_pred_labels, val_pred_labels = (
    experiment.generate_output(tokenized_dataset=hf_tokenized_dataset, sample_set=split)
    for split in ('train', 'test', 'validation')
)

Model loaded to:  cuda


100%|██████████| 1075/1075 [10:42<00:00,  1.67it/s]


Model loaded to:  cuda


100%|██████████| 125/125 [01:14<00:00,  1.67it/s]


Model loaded to:  cuda


100%|██████████| 125/125 [01:17<00:00,  1.62it/s]


In [11]:
# Target output text for each split, with surrounding whitespace removed
tr_labels, te_labels, val_labels = (
    [target.strip() for target in hf_dataset[split]['labels']]
    for split in ('train', 'test', 'validation')
)

In [18]:
# Calculate and print metrics
def print_metrics(experiment_obj, true, predicted, dataset, task='ase'):
    """Compute and print precision, recall and F1 for one dataset split.

    Args:
        experiment_obj: object exposing
            get_sentiment_analysis_metrics(true, predicted, task=...) and
            returning a (precision, recall, f1) triple.
        true: reference label strings.
        predicted: label strings produced by the model.
        dataset: name of the split; used as the prefix of each printed line.
        task: task identifier forwarded to the metric computation. Defaults to
            'ase', the value that used to be hard-coded.
    """
    p, r, f1 = experiment_obj.get_sentiment_analysis_metrics(true, predicted, task=task)
    print(f'{dataset} Precision: ', p)
    print(f'{dataset} Recall: ', r)
    # The trailing '\n' argument leaves a blank line between splits
    print(f'{dataset} F1: ', f1, '\n')

In [21]:
# MAMS - report precision / recall / F1 for every split
print(' --- Flan-T5 + FT: MAMS Dataset ---\n')
for split_name, gold, predicted in (
    ('Train', tr_labels, tr_pred_labels),
    ('Test', te_labels, te_pred_labels),
    ('Validation', val_labels, val_pred_labels),
):
    print_metrics(experiment, gold, predicted, split_name)

 --- Flan-T5 + FT: MAMS Dataset ---

Train Precision:  0.6810052784999606
Train Recall:  0.7719235577781747
Train F1:  0.7236197731363275 

Test Precision:  0.6323822163238222
Test Recall:  0.712789827973074
Test F1:  0.670182841068917 

Validation Precision:  0.6449226630800269
Validation Recall:  0.719429857464366
Validation F1:  0.6801418439716311 



In [24]:
# Print a sample of the output
SAMPLE_SEED = 42  # fixed so a re-run shows the same examples; change to browse others
# random.sample raises ValueError when asked for more items than exist, so cap the size
n_examples = min(10, len(tr_labels))
for i in random.Random(SAMPLE_SEED).sample(range(len(tr_labels)), n_examples):
    print(f'Predicted: {tr_pred_labels[i]}')
    print(f'True:      {tr_labels[i]}')

Predicted: Appetizers:positive, soup:negative
True:      Appetizers:positive, soup:negative
Predicted: Chef:positive, kitchen:neutral
True:      Chef:positive, table:neutral
Predicted: soy sauce:neutral, pish sauce:neutral, garLIC:neutral, ginger:neutral, chicken:neutral, lechon:positive
True:      soy sauce:neutral, pish sauce:positive, garLIC, ginger, chicken:positive
Predicted: dining beacon:positive, beet salad starters:neutral, scallops:neutral, steak tartare:neutral, melon soup:positive, flan-esque orange-and-caramel custard:positive
True:      dining:neutral, tomato soup:positive, beet salad:neutral, scallops:positive, steak tartare:positive, melon soup:positive, orange-and-caramel:positive
Predicted: waitress:positive, food:negative
True:      waitress:positive, food:negative
Predicted: red Borsht:positive, meal:neutral
True:      red Borsht:neutral, meal:positive
Predicted: reservation:neutral, hostess:positive
True:      reservation:neutral, hostess:positive
Predicted: food:n

In [27]:
# Save predicted and true labels (one label string per line)
model_name = 'flan-t5_ft_MAMS_'
output_dir = './Output'
# open(..., 'w') does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

# MAMS
filenames = ['tr_pred_labels.txt', 'tr_labels.txt',
             'te_pred_labels.txt', 'te_labels.txt',
             'val_pred_labels.txt', 'val_labels.txt']
output = [tr_pred_labels, tr_labels,
          te_pred_labels, te_labels,
          val_pred_labels, val_labels]

for fn, op in zip(filenames, output):
    path = os.path.join(output_dir, model_name + fn)
    # Explicit UTF-8 so non-ASCII text is written identically on every platform
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in op)