In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score
from peft import get_peft_model, LoraConfig, TaskType

In [2]:
df_train = pd.read_csv("../SA/train.csv", encoding='ISO-8859-1')
df_train.dropna(subset=['text', 'sentiment'], inplace=True)
df_test = pd.read_csv("../SA/test.csv", encoding='ISO-8859-1')
df_test.dropna(subset=['text', 'sentiment'], inplace=True)

# Ensure text data is a list of strings and drop missing values
df_train['text'] = df_train['text'].astype(str)
df_test['text'] = df_test['text'].astype(str)

df_train = df_train.rename(columns={'sentiment': 'label'})
df_test = df_test.rename(columns={'sentiment': 'label'})

df_train['label'] = df_train['label'].apply(lambda x: 2 if x == 'positive' else 1 if x == 'neutral' else 0)
df_test['label'] = df_test['label'].apply(lambda x: 2 if x == 'positive' else 1 if x == 'neutral' else 0)


df_train['label'] = df_train['label'].astype(int)
df_test['label'] = df_test['label'].astype(int)

In [3]:
model_name = 'google-bert/bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(df_train[['text', 'label']])
test_dataset = Dataset.from_pandas(df_test[['text', 'label']])

# Adding short max length to lower training time
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=4096)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/27480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],  # BERT uses query, key, value in attention
    lora_dropout=0.1,
    bias="all",
    task_type=TaskType.SEQ_CLS
)

training_args = TrainingArguments(
    output_dir="./results/" + model_name,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    #gradient_checkpointing=True,
    num_train_epochs=50,
    #dataloader_num_workers=16,
    #logging_steps=100,
    weight_decay=0.01
)

model = get_peft_model(model, lora_config)

for name, param in model.named_parameters():
    if "classifier" in name:
        param.requires_grad = True

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Define a compute metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, preds)
    return {'accuracy': accuracy}

In [7]:
from transformers import TrainerCallback

class LossAccuracyLogger(TrainerCallback):
    def __init__(self):
        self.train_loss = []
        self.eval_accuracy = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if "loss" in logs:
                self.train_loss.append((state.epoch, logs["loss"]))
            if "eval_accuracy" in logs:
                self.eval_accuracy.append((state.epoch, logs["eval_accuracy"]))


In [8]:
logger_callback = LossAccuracyLogger()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[logger_callback]
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
# Train the model
trainer.train()

Step,Training Loss
500,0.8924
1000,0.6225
1500,0.5834
2000,0.5609


KeyboardInterrupt: 

In [None]:
trainer.save_model("./results/" + model_name)

In [None]:
# Evaluate the model
results = trainer.evaluate()
print(results)

import matplotlib.pyplot as plt

# Unpack the epoch and values
train_epochs, train_losses = zip(*logger_callback.train_loss)
eval_epochs, eval_accuracies = zip(*logger_callback.eval_accuracy)

# Plot training loss
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(train_epochs, train_losses, marker='o')
plt.title("Training Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")

# Plot eval accuracy
plt.subplot(1, 2, 2)
plt.plot(eval_epochs, eval_accuracies, marker='o', color='green')
plt.title("Evaluation Accuracy per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")

plt.tight_layout()
plt.show()


{'eval_loss': 1.189807415008545, 'eval_model_preparation_time': 0.005, 'eval_accuracy': 0.312110922467459, 'eval_runtime': 10.2588, 'eval_samples_per_second': 344.484, 'eval_steps_per_second': 2.729}


ValueError: not enough values to unpack (expected 2, got 0)