## Libraries

In [1]:
try:
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    IN_COLAB = True
except:
    IN_COLAB = False

Mounted at /content/drive


In [None]:
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 3.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 4.1 MB/s

In [2]:
import os
import torch

if IN_COLAB:
    root_path = '/content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA'
else:
    root_path = '/Users/kscaria/Library/CloudStorage/GoogleDrive-scariakevin1@gmail.com/My Drive/Knowledge/MSIT/Research/InstructABSA'
    
use_mps = True if torch.has_mps else False
os.chdir(root_path)

In [3]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

from InstructABSA.utils import T5AteAtsc
from InstructABSA.data_prep import ModelReadyData

In [None]:
rest_train_file_path = './Data/semeval14/ABSA_TrainData/Restaurants_Train_v2.csv'
laptops_train_file_path = './Data/semeval14/ABSA_TrainData/Laptop_Train_v2.csv'
rest_test_file_path = './Data/semeval14/ABSA_Gold_TestData/Restaurants_Test_Gold.csv'
laptops_test_file_path = './Data/semeval14/ABSA_Gold_TestData/Laptops_Test_Gold.csv'

# Load the data
res_tr_df = pd.read_csv(rest_train_file_path)
lap_tr_df = pd.read_csv(laptops_train_file_path)
res_te_df = pd.read_csv(rest_test_file_path)
lap_te_df = pd.read_csv(laptops_test_file_path)

In [None]:
model_data = ModelReadyData()

 # Get the input text into the required format
bos_instruction = 'Review: '
eos_instruction = ' Aspect Sentiments: '
res_tr_df = model_data.create_data_in_ate_atsc_format(res_tr_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction, eos_instruction)
lap_tr_df = model_data.create_data_in_ate_atsc_format(lap_tr_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction, eos_instruction)
res_te_df = model_data.create_data_in_ate_atsc_format(res_te_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction, eos_instruction)
lap_te_df = model_data.create_data_in_ate_atsc_format(lap_te_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction, eos_instruction)

In [None]:
# Experimentation
experiment_id = 'restaurants'
experiment_name = 'restaurants_vanilla'
model_checkpoint = 't5-base'

# Create T5 ATE ATSC Utils object
t5_exp = T5AteAtsc(model_checkpoint, experiment_id, res_tr_df, res_te_df, lap_tr_df, lap_te_df)

if IN_COLAB:
    model_out_path = os.path.join(root_path, 'T5', 'ATE_ATSC')
else:
    model_out_path = os.getcwd()

model_out_path = os.path.join(model_out_path, f"{model_checkpoint}-{experiment_name}", "checkpoints")
print('Model output path: ', model_out_path)

# Tokenize Datasets
id_dataset, ood_dataset, id_tokenized_dataset, ood_tokenized_dataset = t5_exp.set_data_for_training_semeval(experiment_id)

# Training arguments
training_args = {
    'output_dir':model_out_path,
    'evaluation_strategy':"epoch",
    'learning_rate':5e-5,
    'per_device_train_batch_size':16,
    'per_device_eval_batch_size':16,
    'num_train_epochs':4,
    'weight_decay':0.01,
    'warmup_ratio':0.1,
    'save_strategy':'no',
    'load_best_model_at_end':False,
    'push_to_hub':False,
    'eval_accumulation_steps':1,
    'predict_with_generate':True,
    'use_mps_device':use_mps
}

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Model output path:  /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/t5-base-restaurants_vanilla/checkpoints


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Train Model
model_trainer = t5_exp.train(id_tokenized_dataset, **training_args)

***** Running training *****
  Num examples = 3041
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 764
  Number of trainable parameters = 222903552
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Trainer device: cuda:0

Model training started ....


Epoch,Training Loss,Validation Loss
1,No log,0.279215
2,No log,0.218735
3,0.918000,0.204715
4,0.918000,0.202582


***** Running Evaluation *****
  Num examples = 800
  Batch size = 16
***** Running Evaluation *****
  Num examples = 800
  Batch size = 16
***** Running Evaluation *****
  Num examples = 800
  Batch size = 16
***** Running Evaluation *****
  Num examples = 800
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/t5-base-restaurants_vanilla/checkpoints
Configuration saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/t5-base-restaurants_vanilla/checkpoints/config.json
Model weights saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/t5-base-restaurants_vanilla/checkpoints/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/t5-base-restaurants_vanilla/checkpoints/tokenizer_config.json
Special tokens file saved in /

In [None]:
# Model inference
best_model = 'checkpoints'
print('Getting model from path: ', model_out_path)

# Get prediction labels - Training set (Restaurants Domain)
id_tr_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = id_tokenized_dataset, sample_set = 'train')

# Get prediction labels - Testing set (Restaurants Domain)
id_te_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = id_tokenized_dataset, sample_set = 'validation')

# Get prediction labels - Training set (Laptops Domain)
ood_tr_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = ood_tokenized_dataset, sample_set = 'train')

# Get prediction labels - Testing set (Laptops Domain)
ood_te_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = ood_tokenized_dataset, sample_set = 'validation')

***** Running Prediction *****
  Num examples = 3041
  Batch size = 16


Getting model from path:  /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/t5-base-restaurants_vanilla/checkpoints


***** Running Prediction *****
  Num examples = 800
  Batch size = 16


***** Running Prediction *****
  Num examples = 3045
  Batch size = 16


***** Running Prediction *****
  Num examples = 800
  Batch size = 16


In [None]:
# Add new column in the respective dataframes
id_tr_df = pd.DataFrame(id_dataset['train'])
id_te_df = pd.DataFrame(id_dataset['validation'])
ood_tr_df = pd.DataFrame(ood_dataset['train'])
ood_te_df = pd.DataFrame(ood_dataset['validation'])

id_tr_df['pred_labels'] =  id_tr_pred_labels
id_te_df['pred_labels'] =  id_te_pred_labels
ood_tr_df['pred_labels'] = ood_tr_pred_labels
ood_te_df['pred_labels'] =  ood_te_pred_labels

# Metrics
print('In domain train accuracy: ', id_tr_df[['pred_labels', 'labels']].apply(lambda x: len([i for i in x[1].split(', ') if i in x[0].split(', ')]) == len(x[1].split(', ')), axis=1).sum()/len(id_tr_df))
print('In domain test accuracy: ', id_te_df[['pred_labels', 'labels']].apply(lambda x: len([i for i in x[1].split(', ') if i in x[0].split(', ')]) == len(x[1].split(', ')), axis=1).sum()/len(id_te_df))

ood_df = pd.concat([ood_tr_df, ood_te_df])
print('Out of domain train accuracy: ', ood_tr_df[['pred_labels', 'labels']].apply(lambda x: len([i for i in x[1].split(', ') if i in x[0].split(', ')]) == len(x[1].split(', ')), axis=1).sum()/len(ood_tr_df))
print('Out of domain test accuracy: ', ood_te_df[['pred_labels', 'labels']].apply(lambda x: len([i for i in x[1].split(', ') if i in x[0].split(', ')]) == len(x[1].split(', ')), axis=1).sum()/len(ood_te_df))
print('Out of domain accuracy: ', ood_df[['pred_labels', 'labels']].apply(lambda x: len([i for i in x[1].split(', ') if i in x[0].split(', ')]) == len(x[1].split(', ')), axis=1).sum()/len(ood_df))

#Dump outputs
dump_path = '/'.join(model_out_path.split('/')[:-1])
id_tr_filename = t5_exp.get_csv_filename(experiment_id)[0]
id_tr_df.to_csv(os.path.join(dump_path, id_tr_filename), index = False)

id_te_filename = t5_exp.get_csv_filename(experiment_id)[1]
id_te_df.to_csv(os.path.join(dump_path, id_te_filename), index = False)

ood_tr_filename = t5_exp.get_csv_filename(experiment_id)[2]
ood_tr_df.to_csv(os.path.join(dump_path, ood_tr_filename), index = False)

ood_te_filename = t5_exp.get_csv_filename(experiment_id)[3]
ood_te_df.to_csv(os.path.join(dump_path, ood_te_filename), index = False)

In domain train accuracy:  0.7211443604077606
In domain test accuracy:  0.67625
Out of domain train accuracy:  0.5842364532019705
Out of domain test accuracy:  0.5775
Out of domain accuracy:  0.5828348504551365


In [4]:
id_trdf = pd.read_csv('./T5/ATE_ATSC/t5-base-restaurants_vanilla/restaurants_train.csv')
id_tedf = pd.read_csv('./T5/ATE_ATSC/t5-base-restaurants_vanilla/restaurants_test.csv')
ood_tedf = pd.read_csv('./T5/ATE_ATSC/t5-base-restaurants_vanilla/laptops_test.csv')

def get_f1_p_r(dataframe, true_col, pred_col):
    total_pred = 0
    total_gt = 0
    tp = 0
    for gt, pred in zip(dataframe[true_col][:], dataframe[pred_col][:]):
        gt_list = gt.split(', ')
        pred_list = pred.split(', ')
        total_pred+=len(pred_list)
        total_gt+=len(gt_list)
        for gt_val in gt_list:
            for pred_val in pred_list:
                if pred_val in gt_val:
                    tp+=1
    p = tp/total_pred
    r = tp/total_gt
    return p, r, 2*p*r/(p+r)

print('Train precision:', get_f1_p_r(id_trdf, 'labels', 'pred_labels')[0])
print('Test precision:', get_f1_p_r(id_tedf, 'labels', 'pred_labels')[0])
print('OOD Test precision:',get_f1_p_r(ood_tedf, 'labels', 'pred_labels')[0])

print('Train Recall:', get_f1_p_r(id_trdf, 'labels', 'pred_labels')[1])
print('Test Recall:', get_f1_p_r(id_tedf, 'labels', 'pred_labels')[1])
print('OOD Test Recall:',get_f1_p_r(ood_tedf, 'labels', 'pred_labels')[1])

print('Train f1:', get_f1_p_r(id_trdf, 'labels', 'pred_labels')[2])
print('Test f1:', get_f1_p_r(id_tedf, 'labels', 'pred_labels')[2])
print('OOD Test f1:', get_f1_p_r(ood_tedf, 'labels', 'pred_labels')[2])

Train precision: 0.8082191780821918
Test precision: 0.7971246006389776
OOD Test precision: 0.6572052401746725
Train Recall: 0.7754928980284079
Test Recall: 0.7498121712997746
OOD Test Recall: 0.5833333333333334
Train f1: 0.791517905441956
Test f1: 0.7727448703058459
OOD Test f1: 0.618069815195072


In [6]:
# id_trdf = pd.read_csv('./T5/ATE_ATSC/t5-base-restaurants_vanilla/restaurants_train.csv')
# id_tedf = pd.read_csv('./T5/ATE_ATSC/t5-base-restaurants_vanilla/restaurants_test.csv')
# ood_tedf = pd.read_csv('./T5/ATE_ATSC/t5-base-restaurants_vanilla/laptops_test.csv')

# def get_f1_p_r(dataframe, true_col, pred_col):
#     total_pred = 0
#     total_gt = 0
#     tp = 0
#     for gt, pred in zip(dataframe[true_col][:], dataframe[pred_col][:]):
#         gt_list = gt.split(', ')
#         pred_list = pred.split(', ')
#         total_pred+=len(pred_list)
#         total_gt+=len(gt_list)
#         for gt_val in gt_list:
#             gt_asp = gt_val.split(':')[0]
#             try:
#                 gt_sent = gt_val.split(':')[1]
#             except:
#                 continue
#             for pred_val in pred_list:
#                 pr_asp = pred_val.split(':')[0]
#                 try:
#                     pr_sent = pred_val.split(':')[1]
#                 except:
#                     continue
#                 if pr_asp in gt_asp and gt_sent == pr_sent:
#                     tp+=1
#     p = tp/total_pred
#     r = tp/total_gt
#     return p, r, 2*p*r/(p+r)
    
# print('Train precision:', get_f1_p_r(id_trdf, 'labels', 'pred_labels')[0])
# print('Test precision:', get_f1_p_r(id_tedf, 'labels', 'pred_labels')[0])
# print('OOD Test precision:',get_f1_p_r(ood_tedf, 'labels', 'pred_labels')[0])

# print('Train Recall:', get_f1_p_r(id_trdf, 'labels', 'pred_labels')[1])
# print('Test Recall:', get_f1_p_r(id_tedf, 'labels', 'pred_labels')[1])
# print('OOD Test Recall:',get_f1_p_r(ood_tedf, 'labels', 'pred_labels')[1])

# print('Train f1:', get_f1_p_r(id_trdf, 'labels', 'pred_labels')[2])
# print('Test f1:', get_f1_p_r(id_tedf, 'labels', 'pred_labels')[2])
# print('OOD Test f1:', get_f1_p_r(ood_tedf, 'labels', 'pred_labels')[2])