<a href="https://colab.research.google.com/github/ishathombre/ASBA/blob/main/ASBA_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Requirements

In [None]:
!pip install transformers datasets evaluate peft sentencepiece accelerate torch torchvision

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

##Data download and parsing

Datasets downloaded from https://alt.qcri.org/semeval2014/task4/ Semeval2014 Task 4 (ABSA)

In [3]:
!git clone https://github.com/ishathombre/ASBA.git

Cloning into 'ASBA'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 48 (delta 11), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (48/48), 1009.41 KiB | 3.44 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [4]:
# function for data parsing

def parse_data_2014(xml_path):
    container = []  # Initialize Container (List) for Parse Data
    sentences = ET.parse(xml_path).getroot()  # Get Sentence-Level Nodes

    for sentence in sentences:  # Loop Through Sentences
        sentence_id = sentence.attrib["id"]  # Save ID
        sentence_text = sentence.find('text').text  # Save Text
        aspects = sentence.findall('*')  # Get Aspect-Level Nodes

        found_category = False

        for aspect in aspects:  # Loop Through Aspects
            if aspect.tag == "aspectCategories":
                opinions = aspect.findall('*')  # Get Opinion-Level Nodes
                for opinion in opinions:
                    category = opinion.attrib["category"]
                    polarity = opinion.attrib.get("polarity", np.nan)
                    row = {"sentence_id": sentence_id, "sentence": sentence_text, "category": category, "polarity": polarity}
                    container.append(row)
                found_category = True

        if not found_category:
            row = {"sentence_id": sentence_id, "sentence": sentence_text, "category": np.nan, "polarity": np.nan}
            container.append(row)

    return pd.DataFrame(container)

In [5]:
xml_file = 'ASBA/Datasets/Restaurants_Train.xml'
parsed_data = parse_data_2014(xml_file)
parsed_data=parsed_data[parsed_data['category']=='food']

In [6]:
parsed_data['polarity'].value_counts()

positive    867
negative    209
neutral      90
conflict     67
Name: polarity, dtype: int64

##Tokenization

In [7]:
import torch
from transformers import AutoTokenizer, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from datasets import Dataset

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

df = parsed_data

df = Dataset.from_pandas(df)

train_df = df.select([i for i in range(500)])
eval_df = df.select([i for i in range(500,1000)])

label_map = {'negative': 0, 'neutral': 1, 'positive': 2, 'conflict':3}  # Mapping of string labels to integer values


# Remove any NaN values or missing entries
#df = df.dropna(subset=['category'])

# Ensure each entry represents a single sentiment category
# Convert entries to lowercase to standardize the format
#df['category'] = df['category'].str.lower()

train_texts = train_df['sentence']
train_labels = train_df['polarity']
train_labels =  [label_map[label] for label in train_labels]


eval_texts = eval_df['sentence']
eval_labels = eval_df['polarity']
eval_labels =  [label_map[label] for label in eval_labels]

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=512)
eval_encodings = tokenizer(eval_texts, padding="max_length", truncation=True, max_length=512)


class ABSA_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ABSA_Dataset(train_encodings, train_labels)
val_dataset = ABSA_Dataset(eval_encodings, eval_labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
train_dataset

<__main__.ABSA_Dataset at 0x7e1de4676bc0>

In [None]:
train_encodings

##Model and Training

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
model_1 = "roBERTa"
model_2 =  "deBERTa"

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [39]:
from torch import nn

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute accuracy
    accuracy = (predictions == labels).mean()
    # Compute loss
    loss = nn.CrossEntropyLoss()(torch.tensor(logits), torch.tensor(labels)).item()
    return {'accuracy': accuracy, 'loss': loss}

In [14]:
from transformers import TrainingArguments, Trainer


from transformers import logging
logging.set_verbosity_debug()


epochs = 3
batch_size = 16
num_steps = len(train_dataset) * epochs // batch_size
warmup_steps = num_steps // 10  # 10% of the training steps
save_steps = num_steps // epochs    # Save a checkpoint at the end of each epoch

training_args = TrainingArguments(output_dir="test_trainer/checkpoint1",num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    warmup_steps = warmup_steps,
    weight_decay = 0.01,
    logging_dir = 'logs',
    logging_steps = 10,
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    save_steps = save_steps)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [40]:
trainer = Trainer(model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [41]:
trainer.train()

Currently training with a batch size of: 16
***** Running training *****
  Num examples = 500
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 96
  Number of trainable parameters = 124,648,708


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4537,1.178881,0.622
2,0.357,1.111431,0.676
3,0.4265,1.124339,0.7


Checkpoint destination directory test_trainer/checkpoint1/checkpoint-31 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Saving model checkpoint to test_trainer/checkpoint1/checkpoint-31
Configuration saved in test_trainer/checkpoint1/checkpoint-31/config.json
Model weights saved in test_trainer/checkpoint1/checkpoint-31/model.safetensors
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Checkpoint destination directory test_trainer/checkpoint1/checkpoint-62 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Saving model checkpoint to test_trainer/checkpoint1/checkpoint-62
Configuration saved in test_trainer/checkpoint1/checkpoint-62/config.json
Model weights saved in test_trainer/checkpoint1/checkpoint-62/model.safetensors
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Checkpoint destination directory test_trainer/checkpoint1/checkpoint-93 already exists and is non-empt

TrainOutput(global_step=96, training_loss=0.3741061786810557, metrics={'train_runtime': 195.8023, 'train_samples_per_second': 7.661, 'train_steps_per_second': 0.49, 'total_flos': 394673670144000.0, 'train_loss': 0.3741061786810557, 'epoch': 3.0})

## Evaluation

In [None]:
# evaluate the model
model.eval()

In [55]:
# evaluating the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

eval_loss, accuracy, f1_score = evaluate(model, eval_loader, device)
print(f'Evaluation Loss: {eval_loss}, Accuracy: {accuracy}, F1_score: {f1_score}')

NameError: name 'eval_loader' is not defined

In [43]:
trainer.evaluate(eval_dataset=val_dataset)  # Evaluate on validation dataset
evaluation_metrics = trainer.evaluate()

print(evaluation_metrics)

***** Running Evaluation *****
  Num examples = 500
  Batch size = 16


***** Running Evaluation *****
  Num examples = 500
  Batch size = 16


{'eval_loss': 1.1243388652801514, 'eval_accuracy': 0.7, 'eval_runtime': 16.7263, 'eval_samples_per_second': 29.893, 'eval_steps_per_second': 1.913, 'epoch': 3.0}


In [None]:
import matplotlib.pyplot as plt

# Assuming you have access to the evaluation metrics for each epoch
# Let's say you have lists 'eval_loss' and 'eval_accuracy' containing these metrics for each epoch

def plot_metrics(metrics, metric_name):
    plt.plot(epochs, metrics, label=metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

plot_metrics(evaluation_metrics['eval_accuracy'], 'Evaluation Accuracy')
plot_metrics(evaluation_metrics['eval_loss'], 'Evaluation Loss')
