## Libraries

In [1]:
# Mount Google Drive at /content/drive when running on Google Colab.
# The google.colab package is only importable on Colab; elsewhere the mount
# is skipped so this first cell does not crash before the notebook's own
# Colab detection runs.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: nothing to mount
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Detect whether the notebook is running on Google Colab by probing for the
# google.colab package. IN_COLAB is read by later cells to decide whether to
# install packages and where to write model outputs.
try:
    import google.colab  # imported only to test availability
except ImportError:
    # Only a missing package means "not Colab"; any other exception
    # (e.g. KeyboardInterrupt) is allowed to propagate.
    IN_COLAB = False
else:
    IN_COLAB = True

In [3]:
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 49.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 69.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 4.1 MB/

In [4]:
import os
import pandas as pd

# Custom Packages
# Project root on the mounted Google Drive. The InstructATSC imports below and
# all relative './InstructATSC/...' paths are resolved against the working directory.
root_path = '/content/drive/MyDrive/Knowledge/MSIT/Research/'
if IN_COLAB:
    # The Drive path only exists on Colab. Outside Colab the notebook is
    # expected to be started from the project root already (later cells fall
    # back to os.getcwd() in that case), so the working directory is left as is.
    os.chdir(root_path)

from InstructATSC.utils import T5Utils
from InstructATSC.Data.data_prep import ModelReadyData

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Locations of the SemEval-2014 ABSA files (relative to the working directory).
train_out_path = './InstructATSC/Data/semeval14/ABSA_TrainData/'
test_out_path = './InstructATSC/Data/semeval14/ABSA_Gold_TestData/'
rest_train_file_path = './InstructATSC/Data/semeval14/ABSA_TrainData/Restaurants_Train_v2.xml'
laptops_train_file_path = './InstructATSC/Data/semeval14/ABSA_TrainData/Laptop_Train_v2.xml'
rest_test_file_path = './InstructATSC/Data/semeval14/ABSA_Gold_TestData/Restaurants_Test_Gold.xml'
laptops_test_file_path = './InstructATSC/Data/semeval14/ABSA_Gold_TestData/Laptops_Test_Gold.xml'

# Extract data from XML to CSV.
# Every stage below processes the four files in the same fixed order:
# restaurants train, laptops train, restaurants test, laptops test.
model_data = ModelReadyData()
(
    rest_train_file_path_csv,
    laptops_train_file_path_csv,
    rest_test_file_path_csv,
    laptops_test_file_path_csv,
) = [
    model_data.parse_xml(xml_file, save_csv=True, output_path=out_dir, overwrite=False)
    for xml_file, out_dir in (
        (rest_train_file_path, train_out_path),
        (laptops_train_file_path, train_out_path),
        (rest_test_file_path, test_out_path),
        (laptops_test_file_path, test_out_path),
    )
]

# Load the data: restaurants frames are the in-domain ("id") set,
# laptops frames are the out-of-domain ("ood") set.
id_tr_df, ood_tr_df, id_te_df, ood_te_df = [
    pd.read_csv(csv_file)
    for csv_file in (
        rest_train_file_path_csv,
        laptops_train_file_path_csv,
        rest_test_file_path_csv,
        laptops_test_file_path_csv,
    )
]

# Extract the aspect term and polarity for each review
id_tr_df, ood_tr_df, id_te_df, ood_te_df = [
    model_data.extract_rowwise_aspect_polarity(frame, on='aspectTerms', by="term", min_val=1)
    for frame in (id_tr_df, ood_tr_df, id_te_df, ood_te_df)
]

# Get the input text into the required format
bos_instruction = 'Review: '
delim_instruction = ' Aspect: '
# The misspelled name is kept on purpose: it is a notebook-level variable.
eos_insrtuction = 'Sentiment:'
id_tr_df, ood_tr_df, id_te_df, ood_te_df = [
    model_data.create_data_in_format(
        frame, 'raw_text', 'aspect', 'labels',
        bos_instruction, delim_instruction, eos_insrtuction,
    )
    for frame in (id_tr_df, ood_tr_df, id_te_df, ood_te_df)
]


# Experimentation
# The checkpoint name and experiment name together form the output folder name.
experiment_id, experiment_name = 'restaurants', 'restaurants_vanilla'
model_checkpoint = 't5-base'

# Create T5 utils object
t5_exp = T5Utils(model_checkpoint, experiment_id, id_tr_df, id_te_df, ood_tr_df, ood_te_df)

# On Colab the outputs go under the Drive project tree; otherwise under the
# current working directory. In both cases the final path is
# <base>/<checkpoint>-<experiment_name>/checkpoints.
model_out_path = os.path.join(
    os.path.join(root_path, 'InstructATSC', 'T5', 'ATSC') if IN_COLAB else os.getcwd(),
    f"{model_checkpoint}-{experiment_name}",
    "checkpoints",
)
print('Model output path: ', model_out_path)


# Tokenize Datasets
(
    id_dataset,
    ood_dataset,
    id_tokenized_dataset,
    ood_tokenized_dataset,
) = t5_exp.set_data_for_training_semeval(experiment_id)

# Train model
# Keyword arguments handed to t5_exp.train alongside the in-domain tokenized data.
# NOTE(review): presumably forwarded to the Hugging Face training arguments -- confirm in T5Utils.train.
training_args = dict(
    output_dir=model_out_path,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_strategy='no',
    load_best_model_at_end=False,
    push_to_hub=False,
    eval_accumulation_steps=1,
)

model_trainer = t5_exp.train(id_tokenized_dataset, **training_args)

File ./InstructATSC/Data/semeval14/ABSA_TrainData/Restaurants_Train_v2.csv has already been extracted!!
File ./InstructATSC/Data/semeval14/ABSA_TrainData/Laptop_Train_v2.csv has already been extracted!!
File ./InstructATSC/Data/semeval14/ABSA_Gold_TestData/Restaurants_Test_Gold.csv has already been extracted!!
File ./InstructATSC/Data/semeval14/ABSA_Gold_TestData/Laptops_Test_Gold.csv has already been extracted!!


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Model output path:  /content/drive/MyDrive/Knowledge/MSIT/Research/InstructATSC/T5/ATSC/t5-base-restaurants_vanilla/checkpoints


  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 5474
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5476
  Number of trainable parameters = 222903552



Model training started ....


Epoch,Training Loss,Validation Loss
1,0.4838,0.381841
2,0.3579,0.354233
3,0.2909,0.331698
4,0.2366,0.382619


***** Running Evaluation *****
  Num examples = 1461
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1461
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1461
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1461
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/Knowledge/MSIT/Research/InstructATSC/T5/ATSC/t5-base-restaurants_vanilla/checkpoints
Configuration saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructATSC/T5/ATSC/t5-base-restaurants_vanilla/checkpoints/config.json
Model weights saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructATSC/T5/ATSC/t5-base-restaurants_vanilla/checkpoints/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Knowledge/MSIT/Research/InstructATSC/T5/ATSC/t5-base-restaurants_vanilla/checkpoints/tokenizer_config.json
Special tokens file saved in /content/drive/My

In [8]:
# Model inference
best_model = 'checkpoints'
print('Getting model from path: ', model_out_path)

# Get prediction labels with the in-memory trainer, in this fixed order:
#   restaurants train, restaurants test (the 'validation' split),
#   laptops train, laptops test (the 'validation' split).
rest_train_labels, rest_test_labels, lap_train_labels, lap_test_labels = [
    t5_exp.get_labels(predictor=model_trainer, tokenized_dataset=tokenized, sample_set=split)
    for tokenized, split in (
        (id_tokenized_dataset, 'train'),
        (id_tokenized_dataset, 'validation'),
        (ood_tokenized_dataset, 'train'),
        (ood_tokenized_dataset, 'validation'),
    )
]

# Add new column in the respective dataframes
for _frame, _preds in zip(
    (id_tr_df, id_te_df, ood_tr_df, ood_te_df),
    (rest_train_labels, rest_test_labels, lap_train_labels, lap_test_labels),
):
    _frame['pred_labels'] = _preds

# Metrics
def _accuracy_pct(df):
    """Return the percentage of rows whose 'pred_labels' equals 'labels'.

    The two columns are compared element-wise by column name. This replaces a
    row-wise apply that indexed each row with integer positions (x[0], x[1]),
    which is deprecated on label-indexed Series in recent pandas.
    Returns NaN for an empty frame instead of dividing by zero.
    """
    if len(df) == 0:
        return float('nan')
    return (df['labels'] == df['pred_labels']).sum() * 100 / len(df)


print('In domain train accuracy: ', _accuracy_pct(id_tr_df))
print('In domain test accuracy: ', _accuracy_pct(id_te_df))

# Out-of-domain accuracy is reported over the laptops train and test rows combined.
ood_df = pd.concat([ood_tr_df, ood_te_df])
print('Out of domain accuracy: ', _accuracy_pct(ood_df))

#Dump outputs
# Write the four frames (now including 'pred_labels') next to the checkpoints
# folder, i.e. into the parent directory of model_out_path.
# os.path.dirname is used instead of splitting on '/', which yields an empty
# path when model_out_path uses a different separator (non-Colab runs build it
# from os.getcwd()).
dump_path = os.path.dirname(model_out_path)

# File names come back in the order: in-domain train, in-domain test,
# out-of-domain train, out-of-domain test.
# NOTE(review): assumes get_csv_filename returns the same sequence on every call -- confirm in T5Utils.
csv_filenames = t5_exp.get_csv_filename(experiment_id)
id_tr_filename = csv_filenames[0]
id_te_filename = csv_filenames[1]
ood_tr_filename = csv_filenames[2]
ood_te_filename = csv_filenames[3]

id_tr_df.to_csv(os.path.join(dump_path, id_tr_filename), index = False)
id_te_df.to_csv(os.path.join(dump_path, id_te_filename), index = False)
ood_tr_df.to_csv(os.path.join(dump_path, ood_tr_filename), index = False)
ood_te_df.to_csv(os.path.join(dump_path, ood_te_filename), index = False)

***** Running Prediction *****
  Num examples = 5474
  Batch size = 4


Getting model from path:  /content/drive/MyDrive/Knowledge/MSIT/Research/InstructATSC/T5/ATSC/t5-base-restaurants_vanilla/checkpoints


***** Running Prediction *****
  Num examples = 1461
  Batch size = 4


***** Running Prediction *****
  Num examples = 4951
  Batch size = 4


***** Running Prediction *****
  Num examples = 1238
  Batch size = 4


In domain train accuracy:  87.77858969674827
In domain test accuracy:  81.38261464750171
Out of domain accuracy:  59.75117143318791
