In [None]:
!pip install -q transformers==4.35.2
!pip install datasets==2.15.0 evaluate==0.4.1 seqeval==1.2.2
!pip install accelerate==0.23.0 peft==0.13.2

In [None]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
from loguru import logger
import warnings

# Remove loguru's registered handlers (no handler id is passed) so loguru
# messages do not show up in the cell outputs.
logger.remove()
# Ignore every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")


In [None]:
import numpy as np
import pandas as pd
import os
import re
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, precision_score, recall_score ,classification_report
from torch.utils.data import Dataset, TensorDataset
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [None]:
# Shared checkpoint id and tokenizer: built once here, reused by every run below
model_path = "lifeweb-ai/shiraz"
tokenizer = AutoTokenizer.from_pretrained(model_path)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Identity label maps over the five class ids (0, 1, 2, 3, 4)
label2id = dict(zip(range(5), range(5)))
id2label = dict(zip(range(5), range(5)))

def compute_metrics(p):
    """Score a batch of predictions for the Trainer.

    Args:
        p: A pair ``(scores, labels)``; the predicted class of each row is the
            argmax of ``scores`` along axis 1.

    Returns:
        A dict with ``accuracy`` plus macro-averaged ``precision``, ``recall``
        and ``f1``.
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)

    # Metrics that share the same macro-averaged call signature.
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    metrics = {"accuracy": accuracy_score(y_true=references, y_pred=predicted)}
    for metric_name, scorer in macro_scorers:
        metrics[metric_name] = scorer(y_true=references, y_pred=predicted, average='macro')
    return metrics

def model_init():
    """Load a fresh 5-label sequence classifier from ``model_path``.

    Passed to the Trainer as ``model_init``. Attention and hidden-state outputs
    are switched off and dict-style model outputs are requested.
    """
    load_options = {
        "num_labels": 5,
        "output_attentions": False,
        "output_hidden_states": False,
        "return_dict": True,
    }
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path, **load_options)
    return classifier

In [None]:
def train_on_dataset(dataset_name, csv_url=None):
    """Fine-tune the checkpoint on one training set and evaluate on the test split.

    The validation and test splits always come from the ``kforghani/sentipers``
    Hugging Face dataset; only the training split varies between runs.

    Args:
        dataset_name: Short run identifier. Used in the printed banners, the
            checkpoint directory (``output_<dataset_name>``) and the saved model
            directory (``shiraz_sentipers_<dataset_name>``).
        csv_url: Optional path or URL of a CSV with ``text`` and ``label``
            columns to train on. When omitted (or empty), the Hugging Face
            ``train`` split is used instead.

    Returns:
        The metrics dict produced by ``Trainer.predict`` on the test split.

    Raises:
        KeyError: If the CSV has no ``text`` or ``label`` column.
    """
    # Import under an explicit alias: the notebook imports a ``Dataset`` from
    # both torch.utils.data and datasets, and only the latter has ``from_dict``.
    from datasets import Dataset as HFDataset, load_dataset

    def _encode(texts, labels):
        """Tokenize ``texts`` and bundle them with ``labels`` as a ``datasets.Dataset``."""
        encoded = tokenizer(list(texts), truncation=True)
        encoded["labels"] = list(labels)
        return HFDataset.from_dict(encoded)

    print(f"\n{'='*60}")
    print(f"Training on: {dataset_name}")
    print(f"{'='*60}\n")

    hf_ds = load_dataset("kforghani/sentipers")

    if csv_url:
        train_df = pd.read_csv(csv_url)
        # Rows without text or label cannot be tokenized or trained on: drop
        # them up front (and report it) instead of failing inside the tokenizer.
        clean_df = train_df.dropna(subset=["text", "label"])
        dropped_rows = len(train_df) - len(clean_df)
        if dropped_rows:
            print(f"Dropped {dropped_rows} training rows with missing text/label")
        train_texts = clean_df["text"].astype(str).tolist()
        train_labels = clean_df["label"].astype(int).tolist()
    else:
        train_texts = hf_ds["train"]["text"]
        train_labels = hf_ds["train"]["label"]

    train_dataset = _encode(train_texts, train_labels)
    val_dataset = _encode(hf_ds["validation"]["text"], hf_ds["validation"]["label"])
    test_dataset = _encode(hf_ds["test"]["text"], hf_ds["test"]["label"])

    training_args = TrainingArguments(
        output_dir=f"output_{dataset_name}",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        learning_rate=2e-5,
        # Fractions below 1 are read as a ratio of the total training steps,
        # i.e. evaluate / save / log ten times per run.
        evaluation_strategy="steps",
        eval_steps=0.1,
        save_strategy="steps",
        save_steps=0.1,
        # Cap the checkpoints kept on disk; the best checkpoint is still
        # retained because load_best_model_at_end is enabled.
        save_total_limit=2,
        logging_strategy="steps",
        logging_steps=0.1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
        seed=13,
        data_seed=13,
        fp16=False
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Evaluate the reloaded best checkpoint on the held-out test split.
    prediction_result = trainer.predict(test_dataset)
    preds = np.argmax(prediction_result.predictions, axis=1)

    print(f"\n{dataset_name} - Test Classification Report:")
    print(classification_report(test_dataset["labels"], preds))

    model_save_path = f"shiraz_sentipers_{dataset_name}"
    trainer.save_model(model_save_path)
    print(f"\nModel saved to: {model_save_path}")

    return prediction_result.metrics


In [None]:
# Training sources for the three runs, in execution order:
#   1. base           - Hugging Face train split (no CSV)
#   2. sampled_4000   - sampled dataset (4000 records)
#   3. augmented_4000 - augmented dataset (4000 base + 3822 augmented)
run_sources = [
    ("base", None),
    ("sampled_4000", "https://raw.githubusercontent.com/k-forghani/teaug/refs/heads/main/data/base/sentipers_train.csv"),
    ("augmented_4000", "https://raw.githubusercontent.com/k-forghani/teaug/refs/heads/main/data/output/augmented_sentipers_train.csv"),
]

# Filled run by run, so metrics of finished runs survive a later failure.
results = {}
for run_name, run_csv in run_sources:
    results[run_name] = train_on_dataset(dataset_name=run_name, csv_url=run_csv)


In [None]:
# Side-by-side test metrics, one row per trained variant
import pandas as pd

banner = "=" * 60
reported_metrics = ['test_accuracy', 'test_f1', 'test_precision', 'test_recall']

summary = pd.DataFrame(results).T
print("\n" + banner)
print("FINAL COMPARISON - All Datasets")
print(banner)
print(summary[reported_metrics])
print("\n")