## Libraries

In [1]:
try:
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    IN_COLAB = True
except:
    IN_COLAB = False

Mounted at /content/drive


In [3]:
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m111.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, https://u

In [2]:
import os
import torch

if IN_COLAB:
    root_path = '/content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA'
else:
    root_path = '/Users/kscaria/Library/CloudStorage/GoogleDrive-scariakevin1@gmail.com/My Drive/Knowledge/MSIT/Research/InstructABSA'
    
use_mps = True if torch.has_mps else False
os.chdir(root_path)

In [3]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

from InstructABSA.utils import T5AteAtsc
from InstructABSA.data_prep import ModelReadyData

In [4]:
rest_train_file_path = './Data/semeval14/ABSA_TrainData/Restaurants_Train_v2.csv'
laptops_train_file_path = './Data/semeval14/ABSA_TrainData/Laptop_Train_v2.csv'
rest_test_file_path = './Data/semeval14/ABSA_Gold_TestData/Restaurants_Test_Gold.csv'
laptops_test_file_path = './Data/semeval14/ABSA_Gold_TestData/Laptops_Test_Gold.csv'

# Load the data
res_tr_df = pd.read_csv(rest_train_file_path)
lap_tr_df = pd.read_csv(laptops_train_file_path)
res_te_df = pd.read_csv(rest_test_file_path)
lap_te_df = pd.read_csv(laptops_test_file_path)

In [5]:
# Extract the aspect term and polarity for each review
model_data = ModelReadyData()

# Get the input text into the required format
bos_instruction_lap = """Definition: The output will be the aspects (both implicit and explicit) and the aspects sentiment polarity. In cases where there are no aspects the output should be noaspectterm:none.
Positive example 1-
input: I charge it at night and skip taking the cord with me because of the good battery life.
output: battery life:positive, 
Positive example 2-
input: I even got my teenage son one, because of the features that it offers, like, iChat, Photobooth, garage band and more!.
output: features:positive, iChat:positive, Photobooth:positive, garage band:positive
Negative example 1-
input: Speaking of the browser, it too has problems.
output: browser:negative
Negative example 2-
input: The keyboard is too slick.
output: keyboard:negative
Neutral example 1-
input: I took it back for an Asus and same thing- blue screen which required me to remove the battery to reset.
output: battery:neutral
Neutral example 2-
input: Nightly my computer defrags itself and runs a virus scan.
output: virus scan:neutral
Now complete the following example-
input: """

bos_instruction_res = """Definition: The output will be the aspects (both implicit and explicit) and the aspects sentiment polarity. In cases where there are no aspects the output should be noaspectterm:none.
Positive example 1-
input: With the great variety on the menu , I eat here often and never get bored.
output: menu:positive
Positive example 2- 
input: Great food, good size menu, great service and an unpretensious setting.
output: food:positive, menu:positive, service:positive, setting:positive
Negative example 1-
input: They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it.
output: toast:negative, mayonnaise:negative, bacon:negative, ingredients:negative, plate:negative
Negative example 2-
input: The seats are uncomfortable if you are sitting against the wall on wooden benches.
output: seats:negative
Neutral example 1-
input: I asked for seltzer with lime, no ice.
output: seltzer with lime:neutral
Neutral example 2-
input: They wouldnt even let me finish my glass of wine before offering another.
output: glass of wine:neutral
Now complete the following example-
input: """
eos_instruction = ' \noutput:'
res_tr_df = model_data.create_data_in_ate_atsc_format(res_tr_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction_res, eos_instruction)
lap_tr_df = model_data.create_data_in_ate_atsc_format(lap_tr_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction_lap, eos_instruction)
res_te_df = model_data.create_data_in_ate_atsc_format(res_te_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction_res, eos_instruction)
lap_te_df = model_data.create_data_in_ate_atsc_format(lap_te_df, 'term', 'raw_text', 'aspectTerms', 'labels', bos_instruction_lap, eos_instruction)

In [6]:
print(res_tr_df['text'][0])

Definition: The output will be the aspects (both implicit and explicit) and the aspects sentiment polarity. In cases where there are no aspects the output should be noaspectterm:none.
Positive example 1-
input: With the great variety on the menu , I eat here often and never get bored.
output: menu:positive
Positive example 2- 
input: Great food, good size menu, great service and an unpretensious setting.
output: food:positive, menu:positive, service:positive, setting:positive
Negative example 1-
input: They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it.
output: toast:negative, mayonnaise:negative, bacon:negative, ingredients:negative, plate:negative
Negative example 2-
input: The seats are uncomfortable if you are sitting against the wall on wooden benches.
output: seats:negative
Neutral example 1-
input: I asked for seltzer with lime, no ice.

In [7]:
# Experimentation
experiment_id = 'combined100'
experiment_name = 'combined100_instruct_pos_neg_neut'
model_checkpoint = 'allenai/tk-instruct-base-def-pos'

# Create T5 utils object
t5_exp = T5AteAtsc(model_checkpoint, experiment_id, res_tr_df, res_te_df, lap_tr_df, lap_te_df)

if IN_COLAB:
    model_out_path = os.path.join(root_path, 'T5', 'ATE_ATSC')
else:
    model_out_path = os.getcwd()

model_out_path = os.path.join(model_out_path, f"{model_checkpoint}-{experiment_name}", "checkpoints")
print('Model output path: ', model_out_path)


# Tokenize Datasets
dataset, tokenized_dataset = t5_exp.set_data_for_training_semeval(experiment_id)

# Training arguments
training_args = {
    'output_dir':model_out_path,
    'evaluation_strategy':"epoch",
    'learning_rate':5e-5,
    'per_device_train_batch_size':8,
    'per_device_eval_batch_size':8,
    'num_train_epochs':4,
    'weight_decay':0.01,
    'warmup_ratio':0.1,
    'save_strategy':'no',
    'load_best_model_at_end':False,
    'push_to_hub':False,
    'eval_accumulation_steps':1,
    'predict_with_generate':True,
    'use_mps_device':use_mps
}

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model output path:  /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/allenai/tk-instruct-base-def-pos-combined100_instruct/checkpoints


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [8]:
# Train Model
model_trainer = t5_exp.train(tokenized_dataset, **training_args)

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6086
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3044
  Number of trainable parameters = 247534848
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Trainer device: cuda:0

Model training started ....


Epoch,Training Loss,Validation Loss
1,0.5589,0.194247
2,0.2604,0.187404
3,0.2169,0.177466
4,0.176,0.181641


***** Running Evaluation *****
  Num examples = 1600
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1600
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1600
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1600
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/allenai/tk-instruct-base-def-pos-combined100_instruct/checkpoints
Configuration saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/allenai/tk-instruct-base-def-pos-combined100_instruct/checkpoints/config.json
Configuration saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/allenai/tk-instruct-base-def-pos-combined100_instruct/checkpoints/generation_config.json
Model weights saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATE_ATSC/allenai/tk-instru

In [None]:
# Model inference
best_model = 'checkpoints'
print('Getting model from path: ', model_out_path)

# Get prediction labels - Training set
tr_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = tokenized_dataset, sample_set = 'train')

# Get prediction labels - Testing set
te_pred_labels = t5_exp.get_labels(predictor = model_trainer, tokenized_dataset = tokenized_dataset, sample_set = 'validation')

In [10]:
# Metrics
def get_f1_acc(dataframe, true_col, pred_col):
    total_pred = 0
    total_gt = 0
    tp = 0
    for gt, pred in zip(dataframe[true_col][:], dataframe[pred_col][:]):
        gt_list = gt.split(', ')
        pred_list = pred.split(', ')
        total_pred+=len(pred_list)
        total_gt+=len(gt_list)
        for gt_val in gt_list:
            for pred_val in pred_list:
                if pred_val in gt_val:
                    tp+=1
    p = tp/total_pred
    r = tp/total_gt
    return p, r, 2*p*r/(p+r)

# Add new column in the respective dataframes
tr_df = pd.DataFrame(dataset['train'])
tr_df['pred_labels'] =  tr_pred_labels

res_te_df['pred_labels'] = te_pred_labels[:800]
lap_te_df['pred_labels'] = te_pred_labels[800:]
res_te_df = res_te_df[['labels', 'text', 'pred_labels']]
lap_te_df = lap_te_df[['labels', 'text', 'pred_labels']]


print('Train precision (Lap + Res):', get_f1_acc(tr_df, 'labels', 'pred_labels')[0])
print('Restaurants test precision:', get_f1_acc(res_te_df, 'labels', 'pred_labels')[0])
print('Laptops test precision:', get_f1_acc(lap_te_df, 'labels', 'pred_labels')[0])
print()
print('Train recall (Lap + Res):', get_f1_acc(tr_df, 'labels', 'pred_labels')[1])
print('Restaurants test recall:', get_f1_acc(res_te_df, 'labels', 'pred_labels')[1])
print('Laptops test  recall:', get_f1_acc(lap_te_df, 'labels', 'pred_labels')[1])
print()
print('Train F1 (Lap + Res):', get_f1_acc(tr_df, 'labels', 'pred_labels')[2])
print('Restaurants test F1:', get_f1_acc(res_te_df, 'labels', 'pred_labels')[2])
print('Laptops test  F1:', get_f1_acc(lap_te_df, 'labels', 'pred_labels')[2])


#Dump outputs
dump_path = '/'.join(model_out_path.split('/')[:-1])

tr_df.to_csv(os.path.join(dump_path, 'combined_train.csv'), index = False)
res_te_df.to_csv(os.path.join(dump_path, 'combined_restaurants_test.csv'), index = False)
lap_te_df.to_csv(os.path.join(dump_path, 'combined_laptops_test.csv'), index = False)

Train precision (Lap + Res): 0.851575456053068
Restaurants test precision: 0.8165354330708662
Laptops test precision: 0.8171828171828172

Train recall (Lap + Res): 0.8328313253012049
Restaurants test recall: 0.7791134485349361
Laptops test  recall: 0.7926356589147286

Train F1 (Lap + Res): 0.8420990980438092
Restaurants test F1: 0.7973856209150326
Laptops test  F1: 0.8047220855878012
