<a href="https://colab.research.google.com/github/kevinscaria/InstructABSA/blob/main/InstructABSA%20-%20ATE%20-%20Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [2]:
# Detect whether the notebook runs on Google Colab. On Colab, mount Google
# Drive so the project folder is reachable under /content/drive.
try:
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported here, so treat this as a local run.
    IN_COLAB = False
else:
    # Kept outside the `try` so a failed mount surfaces as an error instead of
    # being silently misreported as "not running on Colab".
    drive.mount('/content/drive', force_remount = True)
    IN_COLAB = True

Mounted at /content/drive


In [None]:
  if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install sentencepiece

In [4]:
import os
import torch

# Pick the project root depending on where the notebook is running.
if IN_COLAB:
    root_path = '/content/drive/MyDrive/Knowledge/MSIT/Research/Completed Work/InstructABSA'
else:
    root_path = '/Users/kscaria/Library/CloudStorage/GoogleDrive-scariakevin1@gmail.com/My Drive/Knowledge/MSIT/Research/Completed Work/InstructABSA'

# `torch.has_mps` is deprecated and only reports whether PyTorch was built with
# MPS support. Ask the backend whether an MPS device is actually usable, and
# tolerate PyTorch builds that do not ship `torch.backends.mps` at all.
_mps_backend = getattr(torch.backends, 'mps', None)
use_mps = bool(_mps_backend is not None and _mps_backend.is_available())

# Relative paths used later (e.g. './Dataset/...') are resolved from the project root.
os.chdir(root_path)

In [5]:
import warnings
# Silences every warning category for the rest of the session (keeps the
# training logs short, but also hides deprecation notices).
warnings.filterwarnings('ignore')
import pandas as pd

# Project-local modules.
# NOTE(review): presumably resolved from the working directory set by the
# earlier os.chdir(root_path) - confirm.
from InstructABSA.data_prep import DatasetLoader
from InstructABSA.utils import T5Generator, T5Classifier, Evaluator
from instructions import InstructionsHandler

In [6]:
# Experiment configuration: task, run identifier and the checkpoint to fine-tune.
task_name = 'ate'
experiment_name = 'rest2015_iabsa1'
model_checkpoint = 'allenai/tk-instruct-base-def-pos'
print('Experiment Name: ', experiment_name)

# Store model outputs under the project root chosen earlier, so the path is
# valid both on Colab and locally (on Colab this yields the same
# '/content/drive/.../InstructABSA/Models/...' location as before).
# The '/' of the checkpoint id is dropped so it forms a single directory name.
model_out_path = os.path.join(root_path, 'Models', task_name, f"{model_checkpoint.replace('/', '')}-{experiment_name}")
print('Model output path: ', model_out_path)

Experiment Name:  rest2015_iabsa1
Model output path:  /content/drive/MyDrive/Knowledge/MSIT/Research/Completed Work/InstructABSA/Models/ate/allenaitk-instruct-base-def-pos-rest2015_iabsa1


In [5]:
# Read the in-domain train / test splits from the Dataset folder
id_train_file_path = './Dataset/Restaurants_Train.csv'
id_test_file_path = './Dataset/Restaurants_Test.csv'
id_tr_df, id_te_df = pd.read_csv(id_train_file_path), pd.read_csv(id_test_file_path)

# Instruction prompts that are wrapped around every input sentence
instruct_handler = InstructionsHandler()
instruct_handler.load_instruction_set2()

# Convert each available split to the ATE format, replacing it on the loader
loader = DatasetLoader(id_tr_df, id_te_df)
for split_attr in ('train_df_id', 'test_df_id'):
    if getattr(loader, split_attr) is None:
        continue
    formatted_split = loader.create_data_in_ate_format(
        getattr(loader, split_attr),
        'term',
        'raw_text',
        'aspectTerms',
        instruct_handler.ate['bos_instruct1'],
        instruct_handler.ate['eos_instruct'],
    )
    setattr(loader, split_attr, formatted_split)

In [6]:
# Show the instruction-wrapped input and its label for the second training row
sample_texts = loader.train_df_id['text']
sample_labels = loader.train_df_id['labels']
print(sample_texts.iloc[1] + ' ' + sample_labels.iloc[1])

Definition: The output will be the aspects (both implicit and explicit) which have an associated opinion that are extracted from the input text. In cases where there are no aspects the output should be noaspectterm.
                            Positive example 1-
                            input: I charge it at night and skip taking the cord with me because of the good battery life.
                            output: battery life
                            Positive example 2-
                            input: I even got my teenage son one, because of the features that it offers, like, iChat, Photobooth, garage band and more!.
                            output: features, iChat, Photobooth, garage band
                            Negative example 1-
                            input: Speaking of the browser, it too has problems.
                            output: browser
                            Negative example 2-
                            input: The keyboard is too slick.
  

In [10]:
# Wrap the chosen checkpoint in the project's T5 helper
t5_exp = T5Generator(model_checkpoint)

# Tokenize the splits with the helper's input tokenizer
(
    id_ds,
    id_tokenized_ds,
    ood_ds,
    ood_tokenzed_ds,
) = loader.set_data_for_training_semeval(t5_exp.tokenize_function_inputs)

# Keyword arguments that are forwarded to t5_exp.train(...)
training_args = dict(
    output_dir=model_out_path,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_strategy='no',
    load_best_model_at_end=False,
    push_to_hub=False,
    eval_accumulation_steps=1,
    predict_with_generate=True,
    use_mps_device=use_mps,
)

Experiment Name:  rest2015_iabsa1
Model output path:  /content/drive/MyDrive/Knowledge/MSIT/Research/Completed Work/InstructABSA/Models/ate/allenaitk-instruct-base-def-pos-rest2015_iabsa1


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/495M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/1028 [00:00<?, ? examples/s]

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

In [31]:
# Train model: fine-tune on the tokenized in-domain dataset using the keyword
# arguments collected in `training_args`. The returned object is kept as
# `model_trainer` and passed as the `predictor` to the inference cell below.
model_trainer = t5_exp.train(id_tokenized_ds, **training_args)

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, __index_level_0__, aspectTerms, raw_text. If text, __index_level_0__, aspectTerms, raw_text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1028
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 516
  Number of trainable parameters = 247534848
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Trainer device: cuda:0

Model training started ....


Epoch,Training Loss,Validation Loss
1,No log,0.632365
2,No log,0.525082
3,No log,0.52159
4,0.525700,0.494442


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, aspectTerms, raw_text. If text, aspectTerms, raw_text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 504
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, aspectTerms, raw_text. If text, aspectTerms, raw_text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 504
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: text, aspectTerms, raw_text. If text, aspectTerms, raw_text are not expected by `T5ForConditionalGeneration.forward`,  you 

In [None]:
# Model inference
best_model = 'checkpoints'
print('Getting model from path: ', model_out_path)

# Reuse the trainer from the training cell when it exists in this session.
# If the training cell was skipped the name is undefined, so fall back to None.
# NOTE(review): with predictor=None, get_labels presumably loads the model from
# `trained_model_path` - confirm against T5Generator.get_labels.
try:
    model_trainer
except NameError:
    model_trainer = None

# Get prediction labels - Training set
id_tr_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = id_tokenized_ds, sample_set = 'train', trained_model_path = model_out_path)

# Get prediction labels - Testing set
id_te_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = id_tokenized_ds, sample_set = 'test', trained_model_path = model_out_path)

# # Get prediction labels - OOD Testing set
# ood_te_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = ood_tokenized_dataset, sample_set = 'validation', trained_model_path = model_out_path)

In [12]:
# Rebuild the train / test frames from the dataset splits and attach the
# predicted labels as a new `pred_labels` column
id_tr_df = pd.DataFrame(id_ds['train']).assign(pred_labels=id_tr_pred_labels)
id_te_df = pd.DataFrame(id_ds['test']).assign(pred_labels=id_te_pred_labels)

# Metrics
def get_f1_acc(dataframe, true_col, pred_col):
    """Compute micro-averaged precision, recall and F1 over aspect terms.

    Every cell of ``true_col`` / ``pred_col`` is a string of terms separated by
    ``', '``. Terms are compared case-insensitively after stripping whitespace.
    Matching is done per row on multisets, so each gold term can be matched by
    at most one predicted term (a repeated prediction is not counted twice).

    Args:
        dataframe: table holding both columns.
        true_col: name of the column with the gold term strings.
        pred_col: name of the column with the predicted term strings.

    Returns:
        Tuple ``(precision, recall, f1)``. A score whose denominator is zero
        (e.g. empty table or no matches at all) is reported as ``0.0``.
    """
    from collections import Counter  # local import keeps the cell self-contained

    def _term_counts(cell):
        # Normalise every term and keep multiplicities.
        return Counter(term.lower().strip() for term in cell.split(', '))

    total_pred = 0
    total_gt = 0
    tp = 0
    for gt, pred in zip(dataframe[true_col], dataframe[pred_col]):
        gt_counts = _term_counts(gt)
        pred_counts = _term_counts(pred)
        total_gt += sum(gt_counts.values())
        total_pred += sum(pred_counts.values())
        # Multiset intersection: min(count in gold, count in prediction) per term.
        tp += sum((gt_counts & pred_counts).values())

    p = tp / total_pred if total_pred else 0.0
    r = tp / total_gt if total_gt else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f1

# Score each split once (precision, recall, F1) instead of recomputing the
# metric for every printed line, then report the values metric by metric.
train_scores = get_f1_acc(id_tr_df, 'labels', 'pred_labels')
test_scores = get_f1_acc(id_te_df, 'labels', 'pred_labels')

print('Train precision:', train_scores[0])
print('Test precision:', test_scores[0])

print('Train recall:', train_scores[1])
print('Test recall:', test_scores[1])

print('Train F1:', train_scores[2])
print('Test F1:', test_scores[2])

# Dump outputs: write both frames (including the predictions) next to the model.
# The directory is created first because training runs with save_strategy='no',
# so it is not guaranteed to exist when the CSVs are written.
os.makedirs(model_out_path, exist_ok=True)
id_tr_df.to_csv(os.path.join(model_out_path, 'train.csv'), index=False)
id_te_df.to_csv(os.path.join(model_out_path, 'test.csv'), index=False)

Train precision: 0.9429945054945055
Test precision: 0.7931547619047619
Train recall: 0.9302168021680217
Test recall: 0.760342368045649
Train F1: 0.93656207366985
Test F1: 0.7764020393299343
