<a href="https://colab.research.google.com/github/ishathombre/ASBA/blob/main/ASBA_2.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Requirements

In [11]:
!pip install transformers datasets evaluate peft sentencepiece accelerate torch torchvision



In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

##Data download and parsing

Datasets downloaded from https://alt.qcri.org/semeval2014/task4/ Semeval2014 Task 4 (ABSA)

In [5]:
!git clone https://github.com/ishathombre/ASBA.git

Cloning into 'ASBA'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 36 (delta 5), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (36/36), 992.80 KiB | 9.37 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [3]:


def parse_data_2014(xml_path):
    container = []  # Initialize Container (List) for Parse Data
    sentences = ET.parse(xml_path).getroot()  # Get Sentence-Level Nodes

    for sentence in sentences:  # Loop Through Sentences
        sentence_id = sentence.attrib["id"]  # Save ID
        sentence_text = sentence.find('text').text  # Save Text
        aspects = sentence.findall('*')  # Get Aspect-Level Nodes

        found_category = False

        for aspect in aspects:  # Loop Through Aspects
            if aspect.tag == "aspectCategories":
                opinions = aspect.findall('*')  # Get Opinion-Level Nodes
                for opinion in opinions:
                    category = opinion.attrib["category"]
                    polarity = opinion.attrib.get("polarity", np.nan)
                    row = {"sentence_id": sentence_id, "sentence": sentence_text, "category": category, "polarity": polarity}
                    container.append(row)
                found_category = True

        if not found_category:
            row = {"sentence_id": sentence_id, "sentence": sentence_text, "category": np.nan, "polarity": np.nan}
            container.append(row)

    return pd.DataFrame(container)

In [8]:
xml_file = 'ASBA/Datasets/Restaurants_Train.xml'
parsed_data = parse_data_2014(xml_file)
parsed_data=parsed_data[parsed_data['category']=='food']

In [9]:
parsed_data['polarity'].value_counts()

positive    867
negative    209
neutral      90
conflict     67
Name: polarity, dtype: int64

##Tokenization

In [12]:
import torch
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = parsed_data

df = Dataset.from_pandas(df)

train_df = df.select([i for i in range(500)])
eval_df = df.select([i for i in range(500,1000)])

label_map = {'negative': 0, 'neutral': 1, 'positive': 2, 'conflict':3}  # Mapping of string labels to integer values


# Remove any NaN values or missing entries
#df = df.dropna(subset=['category'])

# Ensure each entry represents a single sentiment category
# Convert entries to lowercase to standardize the format
#df['category'] = df['category'].str.lower()

train_texts = train_df['sentence']
train_labels = train_df['polarity']
train_labels =  [label_map[label] for label in train_labels]


eval_texts = eval_df['sentence']
eval_labels = eval_df['polarity']
eval_labels =  [label_map[label] for label in eval_labels]

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=512)
eval_encodings = tokenizer(eval_texts, padding="max_length", truncation=True, max_length=512)


class ABSA_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ABSA_Dataset(train_encodings, train_labels)
val_dataset = ABSA_Dataset(eval_encodings, eval_labels)


In [14]:
train_dataset

<__main__.ABSA_Dataset at 0x7bc3fc4b9690>

In [None]:
train_encodings

##Model and Training

In [17]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [19]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [25]:
from transformers import TrainingArguments, Trainer


from transformers import logging
logging.set_verbosity_debug()


epochs = 5
batch_size = 16
num_steps = len(train_dataset) * epochs // batch_size
warmup_steps = num_steps // 10  # 10% of the training steps
save_steps = num_steps // epochs    # Save a checkpoint at the end of each epoch

training_args = TrainingArguments(output_dir="test_trainer/checkpoint1",num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    warmup_steps = warmup_steps,
    weight_decay = 0.01,
    logging_dir = 'logs',
    logging_steps = 10,
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    save_steps = save_steps)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
trainer = Trainer(model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [27]:
trainer.train()

Currently training with a batch size of: 16
***** Running training *****
  Num examples = 500
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 160
  Number of trainable parameters = 109,485,316


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4972,0.566518,0.814
2,0.3189,0.58666,0.818
3,0.2647,0.611939,0.812
4,0.191,0.6725,0.808


Saving model checkpoint to test_trainer/checkpoint1/tmp-checkpoint-31
Configuration saved in test_trainer/checkpoint1/tmp-checkpoint-31/config.json
Model weights saved in test_trainer/checkpoint1/tmp-checkpoint-31/model.safetensors
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Checkpoint destination directory test_trainer/checkpoint1/checkpoint-62 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Saving model checkpoint to test_trainer/checkpoint1/checkpoint-62
Configuration saved in test_trainer/checkpoint1/checkpoint-62/config.json
Model weights saved in test_trainer/checkpoint1/checkpoint-62/model.safetensors
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Saving model checkpoint to test_trainer/checkpoint1/tmp-checkpoint-93
Configuration saved in test_trainer/checkpoint1/tmp-checkpoint-93/config.json
Model weights saved in test_trainer/checkpoint1/tmp-checkpoint-93/model.safetensors
***** Running Eva

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4972,0.566518,0.814
2,0.3189,0.58666,0.818
3,0.2647,0.611939,0.812
4,0.191,0.6725,0.808
5,0.1106,0.660362,0.81




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=160, training_loss=0.29823418483138087, metrics={'train_runtime': 340.842, 'train_samples_per_second': 7.335, 'train_steps_per_second': 0.469, 'total_flos': 657789450240000.0, 'train_loss': 0.29823418483138087, 'epoch': 5.0})