## Libraries

In [None]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not Colab"; the Drive mount is kept out of the
# try-body so that a mount failure inside Colab surfaces as an error instead of
# being mistaken for a local run (a bare `except:` would also have swallowed
# KeyboardInterrupt).
try:
    import google.colab
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    # Mount Google Drive so the project folder is reachable under /content/drive.
    drive.mount('/content/drive', force_remount=True)
    IN_COLAB = True

Mounted at /content/drive


In [None]:
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 24.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 89.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 75.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 31.3 MB

In [None]:
import os
import torch

# Default project root: the mounted Drive folder on Colab, otherwise the
# author's local Google Drive sync folder.
if IN_COLAB:
    _default_root = '/content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA'
else:
    _default_root = '/Users/kscaria/Library/CloudStorage/GoogleDrive-scariakevin1@gmail.com/My Drive/Knowledge/MSIT/Research/InstructABSA'

# Allow other machines to point at their own checkout without editing the
# notebook; when INSTRUCTABSA_ROOT is unset the original defaults apply.
root_path = os.environ.get('INSTRUCTABSA_ROOT', _default_root)

# torch.has_mps is deprecated and only reflects build-time support; ask the MPS
# backend whether the device is actually usable. getattr guards torch builds
# that ship without the torch.backends.mps module.
_mps_backend = getattr(torch.backends, 'mps', None)
use_mps = bool(_mps_backend is not None and _mps_backend.is_available())

# All relative paths used later in the notebook are resolved against the project root.
os.chdir(root_path)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from transformers import TrainingArguments

from InstructATE.utils import T5Utils
from InstructATE.data_prep import ModelReadyData

In [None]:
# Locations of the semeval14 CSVs, relative to the current working directory.
# Training splits:
rest_train_file_path = './Data/semeval14/ABSA_TrainData/Restaurants_Train_v2.csv'
laptops_train_file_path = './Data/semeval14/ABSA_TrainData/Laptop_Train_v2.csv'
# Gold test splits:
rest_test_file_path = './Data/semeval14/ABSA_Gold_TestData/Restaurants_Test_Gold.csv'
laptops_test_file_path = './Data/semeval14/ABSA_Gold_TestData/Laptops_Test_Gold.csv'

# Read the four files in order: restaurant train, laptop train,
# restaurant test, laptop test.
res_tr_df, lap_tr_df, res_te_df, lap_te_df = map(
    pd.read_csv,
    (
        rest_train_file_path,
        laptops_train_file_path,
        rest_test_file_path,
        laptops_test_file_path,
    ),
)

In [None]:
# Helper object that reshapes the raw frames into model-ready rows.
model_data = ModelReadyData()

# Work on the four frames as one ordered batch:
# restaurant train, laptop train, restaurant test, laptop test.
_absa_frames = [res_tr_df, lap_tr_df, res_te_df, lap_te_df]

# Pass 1: extract the aspect term and polarity for each review.
_absa_frames = [
    model_data.extract_rowwise_aspect_polarity(frame, on='aspectTerms', by="term", min_val=1)
    for frame in _absa_frames
]

# Pass 2: wrap each row in the prompt pieces below.
# NOTE(review): the exact layout is decided by create_data_in_format; presumably
# "Review: <text> Aspect: <aspect> Sentiment:" - confirm in InstructATE.data_prep.
bos_instruction = 'Review: '
delim_instruction = ' Aspect: '
eos_insrtuction = ' Sentiment:'  # (sic) misspelled name kept: other cells may reference it
_absa_frames = [
    model_data.create_data_in_format(
        frame, 'raw_text', 'aspect', 'labels',
        bos_instruction, delim_instruction, eos_insrtuction,
    )
    for frame in _absa_frames
]

# Rebind the original names to the processed frames.
res_tr_df, lap_tr_df, res_te_df, lap_te_df = _absa_frames

In [None]:
# Experiment identifiers and the base checkpoint to fine-tune.
model_checkpoint = 't5-base'
experiment_id = 'combined100'
experiment_name = 'combined100_vanilla'

# Create T5 utils object from the four prepared frames.
t5_exp = T5Utils(model_checkpoint, experiment_id, res_tr_df, res_te_df, lap_tr_df, lap_te_df)

# Output directory: <base>/<checkpoint>-<experiment_name>/checkpoints, where
# <base> is the T5/ATSC folder under the project root on Colab and the current
# working directory otherwise.
model_out_path = os.path.join(
    os.path.join(root_path, 'T5', 'ATSC') if IN_COLAB else os.getcwd(),
    f"{model_checkpoint}-{experiment_name}",
    "checkpoints",
)
print('Model output path: ', model_out_path)

# Build the raw and tokenized datasets for this experiment.
dataset, tokenized_dataset = t5_exp.set_data_for_training_semeval(experiment_id)

# Keyword arguments that are later expanded into t5_exp.train(...).
training_args = dict(
    output_dir=model_out_path,
    # Evaluate once per epoch; no intermediate checkpoints are written.
    evaluation_strategy="epoch",
    save_strategy='no',
    load_best_model_at_end=False,
    # Optimisation settings.
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    num_train_epochs=4,
    # Batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_accumulation_steps=1,
    # Misc.
    push_to_hub=False,
    use_mps_device=use_mps,
)

Model output path:  /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATSC/t5-base-combined_vanilla/checkpoints


  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
# Train Model
# The tokenized dataset is passed positionally and every entry of
# `training_args` is expanded into a keyword argument. The returned object is
# kept as `model_trainer` because the inference cell hands it to
# t5_exp.get_labels as `predictor`.
# NOTE(review): the captured log shows a checkpoint being written to the output
# path after training - presumably done inside t5_exp.train; confirm.
model_trainer = t5_exp.train(tokenized_dataset, **training_args)

***** Running training *****
  Num examples = 8628
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2160
  Number of trainable parameters = 222903552


Trainer device: cuda:0

Model training started ....


Epoch,Training Loss,Validation Loss
1,1.7822,0.206807
2,0.2148,0.18037
3,0.1637,0.186007
4,0.1386,0.207353


***** Running Evaluation *****
  Num examples = 2360
  Batch size = 16
***** Running Evaluation *****
  Num examples = 2360
  Batch size = 16
***** Running Evaluation *****
  Num examples = 2360
  Batch size = 16
***** Running Evaluation *****
  Num examples = 2360
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATSC/t5-base-combined_vanilla/checkpoints
Configuration saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATSC/t5-base-combined_vanilla/checkpoints/config.json
Model weights saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATSC/t5-base-combined_vanilla/checkpoints/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATSC/t5-base-combined_vanilla/checkpoints/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Kn

In [None]:
# Model inference
best_model = 'checkpoints'
print('Getting model from path: ', model_out_path)

# NOTE(review): despite the message above, nothing in this cell reloads a model
# from disk - predictions come from the in-memory `model_trainer`.
# Predict on the training split first, then on the validation (test) split.
tr_pred_labels, te_pred_labels = (
    t5_exp.get_labels(
        predictor=model_trainer,
        tokenized_dataset=tokenized_dataset,
        sample_set=split_name,
    )
    for split_name in ('train', 'validation')
)

***** Running Prediction *****
  Num examples = 8628
  Batch size = 16


Getting model from path:  /content/drive/MyDrive/Knowledge/MSIT/Research/InstructABSA/T5/ATSC/t5-base-combined_vanilla/checkpoints


***** Running Prediction *****
  Num examples = 2360
  Batch size = 16


In [None]:
# Build one DataFrame per split and attach the predicted labels as a new column.
tr_df = pd.DataFrame(dataset['train'])
te_df = pd.DataFrame(dataset['validation'])

tr_df['pred_labels'] = tr_pred_labels
te_df['pred_labels'] = te_pred_labels


def exact_match_accuracy(frame):
    """Return the percentage of rows whose 'pred_labels' equals 'labels'.

    The columns are compared by name. The previous row-wise lambda indexed each
    row with x[0] / x[1], which relies on the positional fallback of
    Series.__getitem__ that pandas 2.x deprecates.
    """
    return (frame['labels'] == frame['pred_labels']).sum() * 100 / len(frame)


# Metrics
print('Train accuracy: ', exact_match_accuracy(tr_df))
print('Test accuracy: ', exact_match_accuracy(te_df))

# Dump outputs next to (one level above) the checkpoints directory.
dump_path = os.path.dirname(model_out_path)
if dump_path:
    # Make sure the target folder exists before writing the CSVs.
    os.makedirs(dump_path, exist_ok=True)

# get_csv_filename is indexed as [0] for the train file and [1] for the test file.
_csv_filenames = t5_exp.get_csv_filename(experiment_id)
id_tr_filename = _csv_filenames[0]
tr_df.to_csv(os.path.join(dump_path, id_tr_filename), index = False)

id_te_filename = _csv_filenames[1]
te_df.to_csv(os.path.join(dump_path, id_te_filename), index = False)

Train accuracy:  93.15020862308762
Test accuracy:  87.33050847457628
