In [1]:
!pip install -q transformers==4.35.2
!pip install datasets==2.15.0 evaluate==0.4.1 seqeval==1.2.2
!pip install accelerate==0.23.0 peft==0.13.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m113.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.2.0 requires transformers<6.0.0,>=4.41.0, but you have transformers 4.35.2 which is incompatible.[0m[31m
[0mCollecting datasets==2.15.0
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate==0.4.1
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting seqeval==1.2.2
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43

In [4]:
# Set CUDA_DEVICE_ORDER for this kernel process before any CUDA work starts.
# PCI_BUS_ID is the CUDA setting that numbers GPUs by PCI bus position.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
import warnings

# NOTE(review): this silences *every* warning for the rest of the session, including
# deprecation notices and any metric warnings raised during evaluation. Consider
# narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

env: CUDA_DEVICE_ORDER=PCI_BUS_ID


In [5]:
import numpy as np
import pandas as pd
import os
import re
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, precision_score, recall_score ,classification_report
from torch.utils.data import Dataset, TensorDataset
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [6]:
# Checkpoint identifier; the tokenizer below and model_init() both load from it
model_path = "lifeweb-ai/shiraz"

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by the other cells shown in this notebook.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer is created once and reused for every split and every training run
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Identity label maps over the five class ids 0..4
id2label = dict(zip(range(5), range(5)))
label2id = dict(id2label)

def compute_metrics(p):
    """Score one evaluation pass with accuracy and macro-averaged precision/recall/F1.

    Args:
        p: A (predictions, labels) pair. The predictions are reduced to class ids
            with an argmax over axis 1 (presumably per-class scores of shape
            (n_samples, n_classes) -- confirm against the caller).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1", in that order.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)

    metrics = {"accuracy": accuracy_score(y_true=gold, y_pred=predicted)}
    # The three class-wise metrics share the same call shape; macro averaging gives
    # every class equal weight regardless of its support.
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in macro_scorers:
        metrics[key] = scorer(y_true=gold, y_pred=predicted, average='macro')
    return metrics

def model_init():
    """Build a fresh 5-class sequence classifier from ``model_path``.

    Takes no arguments so it can be handed to ``Trainer(model_init=...)``, which
    then creates a new model for each training run.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    load_options = dict(
        num_labels=5,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )
    classifier = AutoModelForSequenceClassification.from_pretrained(model_path, **load_options)
    return classifier

tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/208 [00:00<?, ?B/s]

In [7]:
def _encode_split(texts, labels):
    """Tokenize one split and pair it with its labels as a Hugging Face ``Dataset``.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of class ids aligned with ``texts``.

    Returns:
        A ``datasets.Dataset`` holding the tokenizer's output columns plus a
        ``labels`` column. Sequences are truncated but not padded here; padding is
        left to the batch collator.
    """
    # Imported locally under an alias: the notebook imports both
    # torch.utils.data.Dataset and datasets.Dataset under the name `Dataset`, so the
    # bare global only resolves correctly while the `datasets` import runs last.
    from datasets import Dataset as HFDataset

    encoded = tokenizer(texts, truncation=True)
    encoded["labels"] = labels
    return HFDataset.from_dict(encoded)


def train_on_dataset(dataset_name, csv_url=None):
    """Fine-tune the checkpoint on one training-set variant and score it on the test split.

    Validation and test data always come from the ``kforghani/sentipers`` dataset;
    only the training split is replaced when ``csv_url`` is given.

    Args:
        dataset_name: Label for the run. Used in the printed banner, the checkpoint
            directory ``output_<dataset_name>`` and the saved-model directory
            ``shiraz_sentipers_<dataset_name>``.
        csv_url: Optional path or URL of a CSV with ``text`` and ``label`` columns
            to train on instead of the dataset's own train split.

    Returns:
        The metrics dict produced by ``trainer.predict`` on the test split (the
        summary cell reads ``test_accuracy``, ``test_f1``, ``test_precision`` and
        ``test_recall`` from it).

    Raises:
        KeyError: If the CSV lacks a ``text`` or ``label`` column.
    """
    from datasets import load_dataset

    print(f"\n{'='*60}")
    print(f"Training on: {dataset_name}")
    print(f"{'='*60}\n")

    hf_ds = load_dataset("kforghani/sentipers")

    if csv_url:
        train_df = pd.read_csv(csv_url)
        # Fail early with a message that names the file, instead of a bare
        # KeyError from the column lookup below.
        missing = {"text", "label"} - set(train_df.columns)
        if missing:
            raise KeyError(f"{csv_url} is missing required column(s): {sorted(missing)}")
        train_texts = train_df["text"].tolist()
        train_labels = train_df["label"].tolist()
    else:
        train_texts = hf_ds["train"]["text"]
        train_labels = hf_ds["train"]["label"]

    train_dataset = _encode_split(train_texts, train_labels)
    val_dataset = _encode_split(hf_ds["validation"]["text"], hf_ds["validation"]["label"])
    test_dataset = _encode_split(hf_ds["test"]["text"], hf_ds["test"]["label"])

    training_args = TrainingArguments(
        output_dir=f"output_{dataset_name}",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        learning_rate=2e-5,
        # Step values below 1 are fractions of the total number of training steps,
        # so evaluation, checkpointing and logging each happen about every 10% of
        # the run (e.g. every 406 steps in the recorded base run). The interval is
        # rounded up, so the final step is not always evaluated.
        evaluation_strategy="steps",
        eval_steps=0.1,
        save_strategy="steps",
        save_steps=0.1,
        logging_strategy="steps",
        logging_steps=0.1,
        # Restore the checkpoint with the best validation "f1" (the key returned by
        # compute_metrics) once training finishes.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
        seed=13,
        data_seed=13,
        fp16=False
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        # Pads each batch dynamically; the datasets above are stored unpadded.
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )

    trainer.train()

    prediction_result = trainer.predict(test_dataset)
    preds = np.argmax(prediction_result.predictions, axis=1)

    print(f"\n{dataset_name} - Test Classification Report:")
    print(classification_report(test_dataset["labels"], preds))

    model_save_path = f"shiraz_sentipers_{dataset_name}"
    trainer.save_model(model_save_path)
    print(f"\nModel saved to: {model_save_path}")

    return prediction_result.metrics


In [8]:
# Fine-tune once per training-set variant. Every run is validated and tested on the
# same splits, so the resulting metrics are directly comparable.
results = {}

training_runs = [
    # 1. Base dataset (HuggingFace train split; no CSV override)
    ("base", None),
    # 2. Sampled dataset (4000 records)
    (
        "sampled_4000",
        "https://raw.githubusercontent.com/k-forghani/teaug/refs/heads/main/data/base/sentipers_train.csv",
    ),
    # 3. Augmented dataset (4000 base + 3822 augmented)
    # NOTE(review): the recorded run evaluates every 282 steps, which at batch size 8
    # and 3 epochs implies roughly 7.5k training rows rather than 7,822 -- confirm
    # the row count of this CSV.
    (
        "augmented_4000",
        "https://raw.githubusercontent.com/k-forghani/teaug/refs/heads/main/data/output/augmented_sentipers_train.csv",
    ),
]

# Store each result as soon as its run finishes so that earlier metrics survive a
# failure in a later run.
for run_name, run_csv_url in training_runs:
    results[run_name] = train_on_dataset(dataset_name=run_name, csv_url=run_csv_url)



Training on: base



Downloading readme: 0.00B [00:00, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/247k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/10820 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1352 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1353 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/186M [00:00<?, ?B/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at lifeweb-ai/shiraz and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at lifeweb-ai/shiraz and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
406,1.1134,0.944959,0.611686,0.477116,0.445718,0.45135
812,0.9569,0.870456,0.649408,0.708038,0.517364,0.524731
1218,0.9066,0.848527,0.667899,0.72769,0.522441,0.535816
1624,0.7538,0.877786,0.667899,0.622826,0.538587,0.539823
2030,0.6893,0.877309,0.670118,0.732349,0.532495,0.543194
2436,0.7103,0.814952,0.690089,0.740976,0.556201,0.563257
2842,0.6189,0.873895,0.690828,0.641767,0.576397,0.582648
3248,0.5161,0.891167,0.693047,0.643429,0.55814,0.564522
3654,0.4934,0.917245,0.694527,0.621002,0.582435,0.592733



base - Test Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.21      0.29        19
           1       0.55      0.61      0.58       142
           2       0.76      0.73      0.75       507
           3       0.64      0.64      0.64       432
           4       0.70      0.75      0.72       253

    accuracy                           0.68      1353
   macro avg       0.62      0.59      0.59      1353
weighted avg       0.68      0.68      0.68      1353


Model saved to: shiraz_sentipers_base

Training on: sampled_4000



Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at lifeweb-ai/shiraz and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at lifeweb-ai/shiraz and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
150,1.2579,1.153407,0.531065,0.493875,0.360277,0.369613
300,1.0823,1.011335,0.58358,0.449937,0.42455,0.426691
450,1.0057,0.92541,0.616124,0.486237,0.447534,0.452241
600,0.813,0.980942,0.622781,0.707637,0.475569,0.497766
750,0.7587,0.938776,0.627959,0.577125,0.478133,0.497777
900,0.7315,0.945463,0.64497,0.514101,0.484933,0.488731
1050,0.6627,0.999183,0.650148,0.565774,0.525559,0.524121
1200,0.5192,0.984838,0.653107,0.550621,0.517809,0.525069
1350,0.4995,0.996886,0.658284,0.556081,0.524505,0.531148
1500,0.5304,0.994844,0.659763,0.549757,0.527057,0.532109



sampled_4000 - Test Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.11      0.14        19
           1       0.55      0.52      0.53       142
           2       0.76      0.73      0.74       507
           3       0.62      0.62      0.62       432
           4       0.67      0.77      0.71       253

    accuracy                           0.67      1353
   macro avg       0.56      0.55      0.55      1353
weighted avg       0.67      0.67      0.67      1353


Model saved to: shiraz_sentipers_sampled_4000

Training on: augmented_4000



Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at lifeweb-ai/shiraz and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at lifeweb-ai/shiraz and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
282,1.1289,1.108849,0.516272,0.444031,0.406128,0.407609
564,0.9428,1.045468,0.557692,0.476487,0.455241,0.431604
846,0.8772,1.012493,0.601331,0.479393,0.491658,0.468166
1128,0.7612,1.016836,0.600592,0.484595,0.506604,0.492388
1410,0.726,0.956186,0.627219,0.554539,0.503916,0.50756
1692,0.6539,0.958645,0.650148,0.549554,0.526396,0.521801
1974,0.6433,0.985577,0.632396,0.533065,0.50309,0.512611
2256,0.5033,1.010783,0.64497,0.544256,0.525796,0.532707
2538,0.5121,1.015772,0.629438,0.528686,0.514872,0.51945



augmented_4000 - Test Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.51      0.48      0.49       142
           2       0.74      0.73      0.74       507
           3       0.60      0.59      0.60       432
           4       0.64      0.72      0.68       253

    accuracy                           0.65      1353
   macro avg       0.50      0.50      0.50      1353
weighted avg       0.64      0.65      0.64      1353


Model saved to: shiraz_sentipers_augmented_4000


In [9]:
# Summary comparison
import pandas as pd

# One row per training run, one column per metric returned by train_on_dataset
summary = pd.DataFrame(results).transpose()

# Only the headline test metrics are reported; the remaining columns stay in `summary`
reported_columns = ['test_accuracy', 'test_f1', 'test_precision', 'test_recall']
rule = "="*60
print("\n" + rule, "FINAL COMPARISON - All Datasets", rule, sep="\n")
print(summary.loc[:, reported_columns])
print("\n")


FINAL COMPARISON - All Datasets
                test_accuracy   test_f1  test_precision  test_recall
base                 0.682927  0.593362        0.618521     0.585911
sampled_4000         0.669623  0.549667        0.558937     0.547879
augmented_4000       0.647450  0.501001        0.498662     0.504451


