<a href="https://colab.research.google.com/github/ishathombre/ASBA/blob/main/ASBA_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Requirements

In [None]:
!pip install transformers datasets evaluate peft sentencepiece accelerate torch torchvision

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split

##Data download and parsing

Datasets downloaded from https://alt.qcri.org/semeval2014/task4/ Semeval2014 Task 4 (ABSA)

In [None]:
!git clone https://github.com/ishathombre/ASBA.git

Cloning into 'ASBA'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 80 (delta 31), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (80/80), 1.01 MiB | 3.63 MiB/s, done.
Resolving deltas: 100% (31/31), done.


In [None]:
%cd ASBA
#import preprocessing
#import train
from main import *

/content/ASBA


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
xml_rest = 'Datasets/Restaurants_Train_v2.xml'
xml_lapt = 'Datasets/Laptop_Train_v2.xml'

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET

def xml_to_df(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    data = []
    for sentence in root.findall('sentence'):
        sentence_id = sentence.get('id')
        text = sentence.find('text').text

        aspect_terms = sentence.find('aspectTerms')
        if aspect_terms is not None:
            for aspect_term in aspect_terms.findall('aspectTerm'):
                term = aspect_term.get('term')
                polarity = aspect_term.get('polarity')
                from_index = aspect_term.get('from')
                to_index = aspect_term.get('to')
                data.append([sentence_id, text, term, polarity, from_index, to_index])
        else:
            data.append([sentence_id, text, None, None, None, None])

    df = pd.DataFrame(data, columns=['sentence_id', 'text', 'term', 'polarity', 'from_index', 'to_index'])
    return df


In [None]:
parsed_data = xml_to_df(xml_rest)
parsed_data_ = xml_to_df(xml_lapt)

## Data Inspection

In [None]:
parsed_data['polarity'].value_counts()

polarity
positive    2164
negative     805
neutral      633
conflict      91
Name: count, dtype: int64

In [None]:
parsed_data['term'].value_counts()

term
food               357
service            206
place               64
prices              60
menu                57
                  ... 
Thai cuisine         1
herbs                1
tomatoes             1
root vegetables      1
glass noodles        1
Name: count, Length: 1288, dtype: int64

In [None]:
parsed_data_['term'].value_counts()

term
screen            58
price             55
use               53
battery life      52
battery           45
                  ..
OpenOffice         1
PRODUCT KEY        1
one touch keys     1
Delivery           1
stability          1
Name: count, Length: 1042, dtype: int64

In [None]:
parsed_data_['polarity'].value_counts()

polarity
positive    987
negative    866
neutral     460
conflict     45
Name: count, dtype: int64

In [None]:
print(parsed_data)

In [None]:
df = pd.concat([parsed_data,parsed_data_], ignore_index=True)

In [None]:
todrop=['sentence_id', 'from_index', 'to_index']
df.drop(columns=todrop, inplace=True)
# Remove any NaN values or missing entries
df = df.dropna(subset=['term'])

In [None]:
print(df)

                                                   text  \
0                  But the staff was so horrible to us.   
1     To be completely fair, the only redeeming fact...   
2     The food is uniformly exceptional, with a very...   
3     The food is uniformly exceptional, with a very...   
4     The food is uniformly exceptional, with a very...   
...                                                 ...   
8612  We also use Paralles so we can run virtual mac...   
8613  We also use Paralles so we can run virtual mac...   
8618  How Toshiba handles the repair seems to vary, ...   
8619  How Toshiba handles the repair seems to vary, ...   
8620  I would like to use a different operating syst...   

                                term  polarity  
0                              staff  negative  
1                               food  positive  
2                               food  positive  
3                            kitchen  positive  
4                               menu   neutral

##Tokenization

In [None]:
class ABSA_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
import torch
from transformers import AutoTokenizer, BertTokenizer, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from datasets import Dataset
from sklearn.model_selection import train_test_split

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Splitting the DataFrame into two parts with a proportion of 80% and 20% (Train and Test)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Printing the shapes of the resulting DataFrames
print("Shape of training set:", df_train.shape)
print("Shape of testing set:", df_test.shape)

label_map = {'negative': 0, 'neutral': 1, 'positive': 2, 'conflict':3}  # Mapping of string labels to integer values

df_train['polarity'] = df['polarity'].map(label_map)
df_test['polarity'] = df['polarity'].map(label_map)

train_texts = df_train['text'].tolist()
train_terms = df_train['term'].tolist()
train_labels = df_train['polarity'].tolist()

val_texts = df_train['text'].tolist()
val_terms = df_test['term'].tolist()
val_labels = df_test['polarity'].tolist()

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=512) #removed return_tensors="pt"
val_encodings = tokenizer(val_texts, padding="max_length", truncation=True, max_length=512)

train_dataset = ABSA_Dataset(train_encodings, train_labels)
val_dataset = ABSA_Dataset(val_encodings, val_labels)

In [None]:
train_dataset.encodings

{'input_ids': tensor([[  101,  1045,  2036,  ...,     0,     0,     0],
        [  101,  1045,  2066,  ...,     0,     0,     0],
        [  101,  1996, 15708,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2387,  ...,     0,     0,     0],
        [  101,  2096,  1045,  ...,     0,     0,     0],
        [  101, 10733,  2003,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
train_dataset.labels

[0,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 2,
 0,
 2,
 1,
 0,
 2,
 1,
 2,
 0,
 2,
 1,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 1,
 3,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 1,
 2,
 0,
 2,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 0,
 2,
 2,
 1,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 0,
 1,
 0,
 1,
 2,
 1,
 0,
 2,
 0,
 2,
 1,
 1,
 2,
 0,
 2,
 1,
 0,
 0,
 2,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 1,
 1,
 2,
 0,
 2,
 2,
 0,
 2,
 2,
 0,
 2,
 0,
 2,
 1,
 2,
 2,
 0,
 1,
 0,
 0,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 2,
 2,
 1,
 2,
 0,
 0,
 0,
 1,
 1,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 3,
 2,
 0,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 2,
 1,
 2,
 2,
 0,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,


##Model and Training

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model_1 = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loadi

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


In [None]:
from torch import nn

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute accuracy
    accuracy = (predictions == labels).mean()
    # Compute loss
    loss = nn.CrossEntropyLoss()(torch.tensor(logits), torch.tensor(labels)).item()
    return {'accuracy': accuracy, 'loss': loss}

In [None]:
from transformers import TrainingArguments, Trainer


from transformers import logging
logging.set_verbosity_debug()


epochs = 3
batch_size = 16
num_steps = len(train) * epochs // batch_size
warmup_steps = num_steps // 10  # 10% of the training steps
save_steps = num_steps // epochs    # Save a checkpoint at the end of each epoch

training_args = TrainingArguments(output_dir="test_trainer/checkpoint1",num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    warmup_steps = warmup_steps,
    weight_decay = 0.01,
    logging_dir = 'logs',
    logging_steps = 10,
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    save_steps = save_steps)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer = Trainer(model_1,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

Currently training with a batch size of: 16
***** Running training *****
  Num examples = 4,840
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 909
  Number of trainable parameters = 124,648,708
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
checkpoint_path = 'roberta'
torch.save(model_1, checkpoint_path)

## Evaluation

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import Softmax

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Example input text
input_text = "Your input text goes here."

# Tokenize input text
inputs = tokenizer(input_text, return_tensors='pt')

# Forward pass through BERT model
outputs = model(**inputs)

# Get logits from the last layer of BERT
logits = outputs.logits

# Apply softmax to get probabilities
softmax = Softmax(dim=1)
probabilities = softmax(logits)

# Convert probabilities to numpy array for easier manipulation
probabilities = probabilities.detach().numpy()

# Print probabilities
print(probabilities)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[[0.5170282  0.48297176]]


In [None]:
from evaluate import evaluator

task_evaluator = evaluator("text-classification")
eval_results = task_evaluator.compute(
    model_or_pipeline=model_1,
    data=eval_encodings,
    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    label_mapping={'negative': 0, 'neutral': 1, 'positive': 2, 'conflict':3}
)
print(eval_results)

In [None]:
# evaluate the model
model_1.eval()

In [None]:
trainer.evaluate(eval_dataset=val_dataset)  # Evaluate on validation dataset
evaluation_metrics = trainer.evaluate()

print(evaluation_metrics)

In [None]:
import matplotlib.pyplot as plt

# Assuming you have access to the evaluation metrics for each epoch
# Let's say you have lists 'eval_loss' and 'eval_accuracy' containing these metrics for each epoch

def plot_metrics(metrics, metric_name):
    plt.plot(epochs, metrics, label=metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

plot_metrics(evaluation_metrics['eval_accuracy'], 'Evaluation Accuracy')
plot_metrics(evaluation_metrics['eval_loss'], 'Evaluation Loss')


In [None]:
import torch
import matplotlib.pyplot as plt

# Example function to train the model
def train_model(model, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    accuracy = 100. * correct / total
    average_loss = total_loss / len(train_loader)

    return accuracy, average_loss

# Example function to validate the model
def validate_model(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    accuracy = 100. * correct / total
    average_loss = total_loss / len(val_loader)

    return accuracy, average_loss

# Example training loop
train_accuracy_history = []
val_accuracy_history = []

# Assuming train_loader and val_loader are your data loaders
for epoch in range(epochs):
    train_accuracy, train_loss = train_model(model_1, train_loader, optimizer, criterion)
    val_accuracy, val_loss = validate_model(model_1, val_loader, criterion)

    train_accuracy_history.append(train_accuracy)
    val_accuracy_history.append(val_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')
    print()

# Plotting train and validation accuracy
plt.plot(range(1, num_epochs+1), train_accuracy_history, label='Train Accuracy')
plt.plot(range(1, num_epochs+1), val_accuracy_history, label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Train and Validation Accuracy')
plt.legend()
plt.show()


NameError: name 'train_loader' is not defined