In [1]:
pip install transformers datasets evaluate peft sentencepiece accelerate

Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
     ---------------------------------------- 84.1/84.1 kB 4.6 MB/s eta 0:00:00
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
     ------------------------------------- 199.1/199.1 kB 11.8 MB/s eta 0:00:00
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-win_amd64.whl (991 kB)
     ------------------------------------- 991.5/991.5 kB 15.8 MB/s eta 0:00:00
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
     -------------------------------------- 290.1/290.1 kB 9.0 MB/s eta 0:00:00
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting safetensors
  Downloading safetensors-0.4.2-cp310-none-win_amd64.whl (269 kB)
     -------------------------------------- 269.5/269.5 kB 8.1 MB/s eta 0:00:00
Collecting torch>=1.13.0
  Download



In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split


In [2]:


def parse_data_2014(xml_path):
    container = []  # Initialize Container (List) for Parse Data
    sentences = ET.parse(xml_path).getroot()  # Get Sentence-Level Nodes

    for sentence in sentences:  # Loop Through Sentences
        sentence_id = sentence.attrib["id"]  # Save ID
        sentence_text = sentence.find('text').text  # Save Text
        aspects = sentence.findall('*')  # Get Aspect-Level Nodes

        found_category = False

        for aspect in aspects:  # Loop Through Aspects
            if aspect.tag == "aspectCategories":
                opinions = aspect.findall('*')  # Get Opinion-Level Nodes
                for opinion in opinions:
                    category = opinion.attrib["category"]
                    polarity = opinion.attrib.get("polarity", np.nan)
                    row = {"sentence_id": sentence_id, "sentence": sentence_text, "category": category, "polarity": polarity}
                    container.append(row)
                found_category = True

        if not found_category:
            row = {"sentence_id": sentence_id, "sentence": sentence_text, "category": np.nan, "polarity": np.nan}
            container.append(row)

    return pd.DataFrame(container)

In [40]:
xml_file = 'X:\My Downloads\Restaurants_Train.xml'
parsed_data = parse_data_2014(xml_file)
parsed_data=parsed_data[parsed_data['category']=='food']

In [23]:
parsed_data['polarity'].value_counts()

positive    2178
negative     839
neutral      501
conflict     196
Name: polarity, dtype: int64

In [43]:
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = parsed_data

df = Dataset.from_pandas(df)

train_df = df.select([i for i in range(500)])
eval_df = df.select([i for i in range(500,1000)])

label_map = {'negative': 0, 'neutral': 1, 'positive': 2, 'conflict':3}  # Mapping of string labels to integer values


# Remove any NaN values or missing entries
#df = df.dropna(subset=['category'])

# Ensure each entry represents a single sentiment category
# Convert entries to lowercase to standardize the format
#df['category'] = df['category'].str.lower()

train_texts = train_df['sentence']
train_labels = train_df['polarity']
train_labels =  [label_map[label] for label in train_labels]


eval_texts = eval_df['sentence']
eval_labels = eval_df['polarity']
eval_labels =  [label_map[label] for label in eval_labels]

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=512)
eval_encodings = tokenizer(eval_texts, padding="max_length", truncation=True, max_length=512)


class ABSA_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ABSA_Dataset(train_encodings, train_labels)
val_dataset = ABSA_Dataset(eval_encodings, eval_labels)






loading file vocab.txt from cache at C:\Users\S5689228/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\S5689228/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\S5689228/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "la

In [26]:
n_dataset

<__main__.ABSA_Dataset at 0x2359d1cf0a0>

In [5]:
tokenized_datasets

Dataset({
    features: ['sentence_id', 'sentence', 'category', 'polarity', 'input_ids', 'attention_mask', 'token_type_ids', 'labels'],
    num_rows: 3714
})

In [44]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

loading configuration file config.json from cache at C:\Users\S5689228/.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

load

In [45]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


In [46]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [47]:
from transformers import TrainingArguments


from transformers import logging
logging.set_verbosity_debug()


epochs = 2
batch_size = 8
num_steps = len(train_dataset) * epochs // batch_size
warmup_steps = num_steps // 10  # 10% of the training steps
save_steps = num_steps // epochs    # Save a checkpoint at the end of each epoch

training_args = TrainingArguments(output_dir="test_trainer/checkpoint1",num_train_epochs = epochs,              
    per_device_train_batch_size = batch_size,  
    per_device_eval_batch_size = batch_size,   
    warmup_steps = warmup_steps,   
    weight_decay = 0.01,               
    logging_dir = 'logs',            
    logging_steps = 10,
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    save_steps = save_steps)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [48]:
trainer = Trainer(model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 500
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 126
  Number of trainable parameters = 109485316


Epoch,Training Loss,Validation Loss
