<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/HyperparametersARABERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U ray[tune]


Collecting ray[tune]
  Downloading ray-2.47.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (20 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Downloading tensorboardx-2.6.4-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ray-2.47.0-cp311-cp311-manylinux2014_x86_64.whl (68.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.9/68.9 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX, ray
Successfully installed ray-2.47.0 tensorboardX-2.6.4


In [None]:
!pip install seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=bf8eb3422d5f7fefb95cf03b4fd71449fcefd8a34c4a11af17a747da1e628627
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
pip install wandb datasets transformers seqeval




In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load data
def load_data(file_path):
    """Read a one-token-per-row table and group the rows into sentences.

    Args:
        file_path: Path of the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[i]`` is
        the list of stripped token strings of sentence ``i`` and ``labels[i]``
        the list of stripped tag strings for those tokens.

    Only the 'Word i' and 'Word i entity tag' columns are used, and rows with
    a missing value in either column are dropped. A token equal to '.' or '؟'
    closes the current sentence and is itself discarded.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    frame = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    sentence_enders = ('.', '؟')
    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in sentence_enders:
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Delimiter row: flush the sentence collected so far (if any).
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # The table may end without a closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

sentences, tags = load_data("/content/IO.xlsx")

# Label vocabulary: every distinct tag, sorted so the id assignment is stable.
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset prep
# One record per sentence, with tags already mapped to integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 3. Tokenizer & model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of label ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For each sentence,
        the first sub-token of every word carries that word's label; special
        tokens (word id ``None``) and continuation sub-tokens get -100, the
        value that ``compute_metrics`` filters out.
    """
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # Walk the sub-tokens, labelling only the first piece of each word.
        aligned, previous = [], None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != previous
            aligned.append(word_labels[wid] if starts_new_word else -100)
            previous = wid
        return aligned

    tokenized["labels"] = [
        _align(tokenized.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized

# Apply the alignment function batch-wise to both splits.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Metric function
def compute_metrics(p):
    """Compute seqeval accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-token class scores; the
            arg-max is taken over axis 2) and ``label_ids`` (gold label ids,
            with -100 marking positions to skip).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids

    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        row_pred, row_true = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Masked position (special token or word continuation).
                continue
            row_pred.append(id2tag[pred_id])
            row_true.append(id2tag[gold_id])
        pred_tags.append(row_pred)
        true_tags.append(row_true)

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: score(true_tags, pred_tags) for name, score in scorers}

# 5. W&B setup
wandb.init(project="arabert-hparam-tuning", name="arabert-run")

# 6. Define training arguments with sweep config
# Each tunable value is read from wandb.config and falls back to the literal
# default when the run's config does not define it.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=wandb.config.get("epochs", 1),
    logging_dir="./logs"
)


# 7. Model & Trainer
# num_labels and the id<->label maps come from the tag vocabulary (tag2id / id2tag).
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement (requires transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train and evaluate
trainer.train()
eval_result = trainer.evaluate()
# NOTE(review): the captured run summary lists both eval/* and eval_* keys, so
# this call appears to duplicate what the Trainer already reports — confirm
# before removing.
wandb.log(eval_result)

# 9. Final detailed report
# Run inference on the test split and take the arg-max class for each token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping positions whose gold label is -100.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
print(classification_report(true_labels, predicted_labels))

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📄 Classification Report:
              precision    recall  f1-score   support

           _       0.94      0.97      0.95       265

   micro avg       0.94      0.97      0.95       265
   macro avg       0.94      0.97      0.95       265
weighted avg       0.94      0.97      0.95       265



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.9966
eval/f1,0.95167
eval/loss,0.01294
eval/precision,0.93773
eval/recall,0.96604
eval/runtime,148.2927
eval/samples_per_second,5.071
eval/steps_per_second,0.634
eval_accuracy,0.9966


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IO-tagged data
def load_data(file_path):
    """Read a one-token-per-row table and group the rows into sentences.

    Args:
        file_path: Path of the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[i]`` is
        the list of stripped token strings of sentence ``i`` and ``labels[i]``
        the list of stripped tag strings for those tokens.

    Only the 'Word i' and 'Word i entity tag' columns are used, and rows with
    a missing value in either column are dropped. A token equal to '.' or '؟'
    closes the current sentence and is itself discarded.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    frame = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    sentence_enders = ('.', '؟')  # End of sentence markers
    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in sentence_enders:
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Delimiter row: flush the sentence collected so far (if any).
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # The table may end without a closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

sentences, tags = load_data("/content/IO.xlsx")

# Label vocabulary: every distinct tag, sorted so the id assignment is stable.
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset preparation
# One record per sentence, with tags already mapped to integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 3. Tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of label ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For each sentence,
        the first sub-token of every word carries that word's label; special
        tokens (word id ``None``) and continuation sub-tokens get -100, the
        value that ``compute_metrics`` filters out.
    """
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # Walk the sub-tokens, labelling only the first piece of each word.
        aligned, previous = [], None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != previous
            aligned.append(word_labels[wid] if starts_new_word else -100)
            previous = wid
        return aligned

    tokenized["labels"] = [
        _align(tokenized.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized

# Apply the alignment function batch-wise to both splits.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Compute seqeval accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-token class scores; the
            arg-max is taken over axis 2) and ``label_ids`` (gold label ids,
            with -100 marking positions to skip).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids

    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        row_pred, row_true = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Masked position (special token or word continuation).
                continue
            row_pred.append(id2tag[pred_id])
            row_true.append(id2tag[gold_id])
        pred_tags.append(row_pred)
        true_tags.append(row_true)

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: score(true_tags, pred_tags) for name, score in scorers}

# 5. Initialize WandB
wandb.init(project="arabert-hparam-tuning", name="arabert-IO-1epoch")

# 6. Training arguments (1 epoch)
# Batch size, learning rate and weight decay are read from wandb.config and
# fall back to the literal defaults; the epoch count is fixed at 1.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer
# num_labels and the id<->label maps come from the tag vocabulary (tag2id / id2tag).
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement (requires transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Training and evaluation
trainer.train()
eval_result = trainer.evaluate()
# NOTE(review): the captured run summary lists both eval/* and eval_* keys, so
# this call appears to duplicate what the Trainer already reports — confirm
# before removing.
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Detailed classification report
# Run inference on the test split and take the arg-max class for each token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping positions whose gold label is -100.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save report to file (same content as printed above)
with open("classification_report_IO.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0141
eval_accuracy: 0.9961
eval_precision: 0.9375
eval_recall: 0.9623
eval_f1: 0.9497
eval_runtime: 136.8416
eval_samples_per_second: 5.4950
eval_steps_per_second: 0.6870
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.94      0.96      0.95       265

   micro avg       0.94      0.96      0.95       265
   macro avg       0.94      0.96      0.95       265
weighted avg       0.94      0.96      0.95       265



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99609
eval/f1,0.94972
eval/loss,0.01406
eval/precision,0.9375
eval/recall,0.96226
eval/runtime,136.8416
eval/samples_per_second,5.495
eval/steps_per_second,0.687
eval_accuracy,0.99609


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load BI-tagged data
def load_data(file_path):
    """Read a one-token-per-row table and group the rows into sentences.

    Args:
        file_path: Path of the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[i]`` is
        the list of stripped token strings of sentence ``i`` and ``labels[i]``
        the list of stripped tag strings for those tokens.

    Only the 'Word i' and 'Word i entity tag' columns are used, and rows with
    a missing value in either column are dropped. A token equal to '.' or '؟'
    closes the current sentence and is itself discarded.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    frame = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    sentence_enders = ('.', '؟')  # Sentence delimiters
    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in sentence_enders:
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Delimiter row: flush the sentence collected so far (if any).
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # The table may end without a closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

sentences, tags = load_data("/content/BI.xlsx")

# Label vocabulary: every distinct tag, sorted so the id assignment is stable.
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset preparation
# One record per sentence, with tags already mapped to integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 3. Tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of label ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For each sentence,
        the first sub-token of every word carries that word's label; special
        tokens (word id ``None``) and continuation sub-tokens get -100, the
        value that ``compute_metrics`` filters out.
    """
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # Walk the sub-tokens, labelling only the first piece of each word.
        aligned, previous = [], None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != previous
            aligned.append(word_labels[wid] if starts_new_word else -100)
            previous = wid
        return aligned

    tokenized["labels"] = [
        _align(tokenized.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized

# Apply the alignment function batch-wise to both splits.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Compute seqeval accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-token class scores; the
            arg-max is taken over axis 2) and ``label_ids`` (gold label ids,
            with -100 marking positions to skip).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids

    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        row_pred, row_true = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Masked position (special token or word continuation).
                continue
            row_pred.append(id2tag[pred_id])
            row_true.append(id2tag[gold_id])
        pred_tags.append(row_pred)
        true_tags.append(row_true)

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: score(true_tags, pred_tags) for name, score in scorers}

# 5. Initialize WandB
wandb.init(project="arabert-hparam-tuning", name="arabert-BI-1epoch")

# 6. Training arguments (1 epoch)
# Batch size, learning rate and weight decay are read from wandb.config and
# fall back to the literal defaults; the epoch count is fixed at 1.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer
# num_labels and the id<->label maps come from the tag vocabulary (tag2id / id2tag).
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement (requires transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Training and evaluation
trainer.train()
eval_result = trainer.evaluate()
# NOTE(review): the captured run summary lists both eval/* and eval_* keys, so
# this call appears to duplicate what the Trainer already reports — confirm
# before removing.
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Detailed classification report
# Run inference on the test split and take the arg-max class for each token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping positions whose gold label is -100.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save to file (same content as printed above)
with open("classification_report_BI.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss





📊 Evaluation Results:
eval_loss: 0.0298
eval_accuracy: 0.9929
eval_precision: 0.9386
eval_recall: 0.9611
eval_f1: 0.9497
eval_runtime: 136.4505
eval_samples_per_second: 5.5110
eval_steps_per_second: 0.6890
epoch: 1.0000





📄 Classification Report:
              precision    recall  f1-score   support

           O       0.95      0.97      0.96       945
           _       0.91      0.94      0.92       264

   micro avg       0.94      0.96      0.95      1209
   macro avg       0.93      0.95      0.94      1209
weighted avg       0.94      0.96      0.95      1209



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99286
eval/f1,0.94973
eval/loss,0.02984
eval/precision,0.93861
eval/recall,0.96112
eval/runtime,136.4505
eval/samples_per_second,5.511
eval/steps_per_second,0.689
eval_accuracy,0.99286


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load BIES-tagged data
def load_data(file_path):
    """Read a one-token-per-row table and group the rows into sentences.

    Args:
        file_path: Path of the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[i]`` is
        the list of stripped token strings of sentence ``i`` and ``labels[i]``
        the list of stripped tag strings for those tokens.

    Only the 'Word i' and 'Word i entity tag' columns are used, and rows with
    a missing value in either column are dropped. A token equal to '.' or '؟'
    closes the current sentence and is itself discarded.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    frame = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    sentence_enders = ('.', '؟')  # End of sentence
    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in sentence_enders:
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Delimiter row: flush the sentence collected so far (if any).
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # The table may end without a closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

sentences, tags = load_data("/content/BIES.xlsx")  # <- Replace with your path

# Label vocabulary: every distinct tag, sorted so the id assignment is stable.
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset formatting
# One record per sentence, with tags already mapped to integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 3. Tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of label ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For each sentence,
        the first sub-token of every word carries that word's label; special
        tokens (word id ``None``) and continuation sub-tokens get -100, the
        value that ``compute_metrics`` filters out.
    """
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # Walk the sub-tokens, labelling only the first piece of each word.
        aligned, previous = [], None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != previous
            aligned.append(word_labels[wid] if starts_new_word else -100)
            previous = wid
        return aligned

    tokenized["labels"] = [
        _align(tokenized.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized

# Apply the alignment function batch-wise to both splits.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Compute seqeval accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-token class scores; the
            arg-max is taken over axis 2) and ``label_ids`` (gold label ids,
            with -100 marking positions to skip).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids

    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        row_pred, row_true = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Masked position (special token or word continuation).
                continue
            row_pred.append(id2tag[pred_id])
            row_true.append(id2tag[gold_id])
        pred_tags.append(row_pred)
        true_tags.append(row_true)

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: score(true_tags, pred_tags) for name, score in scorers}

# 5. WandB init
wandb.init(project="arabert-hparam-tuning", name="arabert-BIES-1epoch")

# 6. Training args
# Batch size, learning rate and weight decay are read from wandb.config and
# fall back to the literal defaults; the epoch count is fixed at 1.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Trainer setup
# num_labels and the id<->label maps come from the tag vocabulary (tag2id / id2tag).
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement (requires transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train & Evaluate
trainer.train()
eval_result = trainer.evaluate()
# NOTE(review): the captured run summary lists both eval/* and eval_* keys, so
# this call appears to duplicate what the Trainer already reports — confirm
# before removing.
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Final detailed classification report
# Run inference on the test split and take the arg-max class for each token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping positions whose gold label is -100.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save to file (same content as printed above)
with open("classification_report_BIES.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss





📊 Evaluation Results:
eval_loss: 0.0496
eval_accuracy: 0.9892
eval_precision: 0.9348
eval_recall: 0.9603
eval_f1: 0.9474
eval_runtime: 135.4640
eval_samples_per_second: 5.5510
eval_steps_per_second: 0.6940
epoch: 1.0000





📄 Classification Report:
              precision    recall  f1-score   support

           O       0.95      0.97      0.96       945
           _       0.88      0.94      0.91       264

   micro avg       0.93      0.96      0.95      1209
   macro avg       0.92      0.95      0.93      1209
weighted avg       0.94      0.96      0.95      1209



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.9892
eval/f1,0.94737
eval/loss,0.04964
eval/precision,0.93478
eval/recall,0.9603
eval/runtime,135.464
eval/samples_per_second,5.551
eval/steps_per_second,0.694
eval_accuracy,0.9892


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IE-tagged data
def load_data(file_path):
    """Read a one-token-per-row table and group the rows into sentences.

    Args:
        file_path: Path of the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists: ``sentences[i]`` is
        the list of stripped token strings of sentence ``i`` and ``labels[i]``
        the list of stripped tag strings for those tokens.

    Only the 'Word i' and 'Word i entity tag' columns are used, and rows with
    a missing value in either column are dropped. A token equal to '.' or '؟'
    closes the current sentence and is itself discarded.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    frame = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    sentence_enders = ('.', '؟')  # End of sentence
    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in sentence_enders:
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # Delimiter row: flush the sentence collected so far (if any).
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # The table may end without a closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

sentences, tags = load_data("/content/IE.xlsx")  # <- Update the path if needed

# Label vocabulary: every distinct tag, sorted so the id assignment is stable.
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset preparation
# One record per sentence, with tags already mapped to integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 3. Tokenization & alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of label ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For each sentence,
        the first sub-token of every word carries that word's label; special
        tokens (word id ``None``) and continuation sub-tokens get -100, the
        value that ``compute_metrics`` filters out.
    """
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # Walk the sub-tokens, labelling only the first piece of each word.
        aligned, previous = [], None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != previous
            aligned.append(word_labels[wid] if starts_new_word else -100)
            previous = wid
        return aligned

    tokenized["labels"] = [
        _align(tokenized.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized

# Apply the alignment function batch-wise to both splits.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Evaluation metric function
def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 ranges over
            label ids) and ``label_ids`` (gold ids, -100 where a position
            must be ignored).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, scored with ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score`` on the tag strings.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)

    true_tags, pred_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, p.label_ids):
        # Drop every position whose gold label is the ignore marker.
        kept = [(gold, guess) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        true_tags.append([id2tag[gold] for gold, _ in kept])
        pred_tags.append([id2tag[guess] for _, guess in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {metric: scorer(true_tags, pred_tags) for metric, scorer in scorers}

# 5. Initialize W&B
wandb.init(project="arabert-hparam-tuning", name="arabert-IE-1epoch")

# 6. Training arguments
# Train batch size, learning rate and weight decay are looked up in the W&B
# run config via .get(); the literals are the fallbacks.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer setup
# The classification head gets one output per tag; both label maps are
# stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` is the current keyword for what used to be passed as
    # `tokenizer=`; the old name is deprecated and scheduled for removal.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train and Evaluate
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for metric_name, metric_value in eval_result.items():
    print(f"{metric_name}: {metric_value:.4f}")

# 9. Classification Report
# Run prediction on the test split to get raw scores and gold label ids.
test_output = trainer.predict(tokenized_dataset["test"])
labels = test_output.label_ids
predictions = np.argmax(test_output.predictions, axis=2)

# Keep only positions with a real gold label (-100 marks ignored positions).
kept_pairs = [
    [(id2tag[l], id2tag[p]) for l, p in zip(label, pred) if l != -100]
    for label, pred in zip(labels, predictions)
]
true_labels = [[gold for gold, _ in row] for row in kept_pairs]
predicted_labels = [[guess for _, guess in row] for row in kept_pairs]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save report to file
with open("classification_report_IE.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    f.writelines(
        f"{metric_name}: {metric_value:.4f}\n"
        for metric_name, metric_value in eval_result.items()
    )
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss





📊 Evaluation Results:
eval_loss: 0.0295
eval_accuracy: 0.9924
eval_precision: 0.9371
eval_recall: 0.9620
eval_f1: 0.9494
eval_runtime: 139.5207
eval_samples_per_second: 5.3900
eval_steps_per_second: 0.6740
epoch: 1.0000





📄 Classification Report:
              precision    recall  f1-score   support

           O       0.95      0.97      0.96       945
           _       0.90      0.94      0.92       264

   micro avg       0.94      0.96      0.95      1209
   macro avg       0.92      0.96      0.94      1209
weighted avg       0.94      0.96      0.95      1209



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99243
eval/f1,0.94939
eval/loss,0.02948
eval/precision,0.93715
eval/recall,0.96195
eval/runtime,139.5207
eval/samples_per_second,5.39
eval/steps_per_second,0.674
eval_accuracy,0.99243


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOB-tagged data
def load_data(file_path, sentence_end=('.', '؟')):
    """Read a one-token-per-row annotation file and group it into sentences.

    The file must provide the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label); rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: ``str`` or path-like. A name ending in ``.csv`` (matched
            case-insensitively) is read with ``pandas.read_csv``; anything
            else is passed to ``pandas.read_excel``.
        sentence_end: Token, or iterable of tokens, that closes a sentence.
            Defaults to the full stop and the Arabic question mark. The
            delimiter row and its tag are not kept in the output.

    Returns:
        tuple: ``(sentences, labels)`` - parallel lists in which
        ``sentences[k]`` holds the tokens of sentence ``k`` and
        ``labels[k]`` the tag of each of those tokens.
    """
    # str() makes pathlib.Path work; lower() accepts ".CSV" as well as ".csv".
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    # A bare string is one delimiter token, not a collection of characters.
    if isinstance(sentence_end, str):
        sentence_end = (sentence_end,)
    delimiters = frozenset(sentence_end)

    sentences, labels = [], []
    current_words, current_tags = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if current_words:
                sentences.append(current_words)
                labels.append(current_tags)
                current_words, current_tags = [], []
        else:
            current_words.append(word)
            current_tags.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

# Group the annotation rows into sentences with their tag sequences.
sentences, tags = load_data("/content/IOB.xlsx")  # adjust path as needed

# Label vocabulary: every distinct tag, sorted before ids are assigned.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 2. Dataset formatting
# One record per sentence: its tokens plus the integer-encoded tags.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
# Hold out 20% of the sentences; random_state fixes the shuffle.
train_data, test_data = train_test_split(
    examples,
    test_size=0.2,
    random_state=42,
)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenizer and alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and attach per-sub-token labels.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of label ids per
            sentence, indexed by word position).

    Returns:
        The output of the module-level ``tokenizer`` with an added
        ``"labels"`` entry. For each word only its first sub-token carries
        the word's label id; positions without a word id and the remaining
        sub-tokens of a word are set to -100.
    """
    encoding = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

    def _align(word_ids, word_labels):
        # Label a position only when it starts a new word.
        row, last_word = [], None
        for word_idx in word_ids:
            starts_word = word_idx is not None and word_idx != last_word
            row.append(word_labels[word_idx] if starts_word else -100)
            last_word = word_idx
        return row

    encoding["labels"] = [
        _align(encoding.word_ids(batch_index=idx), word_labels)
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

# Run the alignment function over both splits in batched mode.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)
# Collator for token-classification batches, built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Compute metrics
def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 ranges over
            label ids) and ``label_ids`` (gold ids, -100 where a position
            must be ignored).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, scored with ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score`` on the tag strings.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)

    true_tags, pred_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, p.label_ids):
        # Drop every position whose gold label is the ignore marker.
        kept = [(gold, guess) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        true_tags.append([id2tag[gold] for gold, _ in kept])
        pred_tags.append([id2tag[guess] for _, guess in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {metric: scorer(true_tags, pred_tags) for metric, scorer in scorers}

# 5. Initialize Weights & Biases
wandb.init(project="arabert-hparam-tuning", name="arabert-IOB-1epoch")

# 6. Training Arguments
# Train batch size, learning rate and weight decay are looked up in the W&B
# run config via .get(); the literals are the fallbacks.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer
# The classification head gets one output per tag; both label maps are
# stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` is the current keyword for what used to be passed as
    # `tokenizer=`; the old name is deprecated and scheduled for removal.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Training and evaluation
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for metric_name, metric_value in eval_result.items():
    print(f"{metric_name}: {metric_value:.4f}")

# 9. Detailed classification report
# Run prediction on the test split to get raw scores and gold label ids.
test_output = trainer.predict(tokenized_dataset["test"])
labels = test_output.label_ids
predictions = np.argmax(test_output.predictions, axis=2)

# Keep only positions with a real gold label (-100 marks ignored positions).
kept_pairs = [
    [(id2tag[l], id2tag[p]) for l, p in zip(label, pred) if l != -100]
    for label, pred in zip(labels, predictions)
]
true_labels = [[gold for gold, _ in row] for row in kept_pairs]
predicted_labels = [[guess for _, guess in row] for row in kept_pairs]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save to file
with open("classification_report_IOB.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    f.writelines(
        f"{metric_name}: {metric_value:.4f}\n"
        for metric_name, metric_value in eval_result.items()
    )
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0131
eval_accuracy: 0.9967
eval_precision: 0.9377
eval_recall: 0.9624
eval_f1: 0.9499
eval_runtime: 133.3900
eval_samples_per_second: 5.6380
eval_steps_per_second: 0.7050
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.94      0.96      0.95       266

   micro avg       0.94      0.96      0.95       266
   macro avg       0.94      0.96      0.95       266
weighted avg       0.94      0.96      0.95       266



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99668
eval/f1,0.94991
eval/loss,0.01315
eval/precision,0.93773
eval/recall,0.96241
eval/runtime,133.39
eval/samples_per_second,5.638
eval/steps_per_second,0.705
eval_accuracy,0.99668


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOBES-tagged data
def load_data(file_path, sentence_end=('.', '؟')):
    """Read a one-token-per-row annotation file and group it into sentences.

    The file must provide the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label); rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: ``str`` or path-like. A name ending in ``.csv`` (matched
            case-insensitively) is read with ``pandas.read_csv``; anything
            else is passed to ``pandas.read_excel``.
        sentence_end: Token, or iterable of tokens, that closes a sentence.
            Defaults to the full stop and the Arabic question mark. The
            delimiter row and its tag are not kept in the output.

    Returns:
        tuple: ``(sentences, labels)`` - parallel lists in which
        ``sentences[k]`` holds the tokens of sentence ``k`` and
        ``labels[k]`` the tag of each of those tokens.
    """
    # str() makes pathlib.Path work; lower() accepts ".CSV" as well as ".csv".
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    # A bare string is one delimiter token, not a collection of characters.
    if isinstance(sentence_end, str):
        sentence_end = (sentence_end,)
    delimiters = frozenset(sentence_end)

    sentences, labels = [], []
    current_words, current_tags = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if current_words:
                sentences.append(current_words)
                labels.append(current_tags)
                current_words, current_tags = [], []
        else:
            current_words.append(word)
            current_tags.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

# Group the annotation rows into sentences with their tag sequences.
sentences, tags = load_data("/content/IOBES.xlsx")  # update path as needed

# Label vocabulary: every distinct tag, sorted before ids are assigned.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 2. Prepare HuggingFace datasets
# One record per sentence: its tokens plus the integer-encoded tags.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
# Hold out 20% of the sentences; random_state fixes the shuffle.
train_data, test_data = train_test_split(
    examples,
    test_size=0.2,
    random_state=42,
)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenization and label alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and attach per-sub-token labels.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of label ids per
            sentence, indexed by word position).

    Returns:
        The output of the module-level ``tokenizer`` with an added
        ``"labels"`` entry. For each word only its first sub-token carries
        the word's label id; positions without a word id and the remaining
        sub-tokens of a word are set to -100.
    """
    encoding = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

    def _align(word_ids, word_labels):
        # Label a position only when it starts a new word.
        row, last_word = [], None
        for word_idx in word_ids:
            starts_word = word_idx is not None and word_idx != last_word
            row.append(word_labels[word_idx] if starts_word else -100)
            last_word = word_idx
        return row

    encoding["labels"] = [
        _align(encoding.word_ids(batch_index=idx), word_labels)
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

# Run the alignment function over both splits in batched mode.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)
# Collator for token-classification batches, built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 ranges over
            label ids) and ``label_ids`` (gold ids, -100 where a position
            must be ignored).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, scored with ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score`` on the tag strings.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)

    true_tags, pred_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, p.label_ids):
        # Drop every position whose gold label is the ignore marker.
        kept = [(gold, guess) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        true_tags.append([id2tag[gold] for gold, _ in kept])
        pred_tags.append([id2tag[guess] for _, guess in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {metric: scorer(true_tags, pred_tags) for metric, scorer in scorers}

# 5. Initialize Weights & Biases
wandb.init(project="arabert-hparam-tuning", name="arabert-IOBES-1epoch")

# 6. Training configuration
# Train batch size, learning rate and weight decay are looked up in the W&B
# run config via .get(); the literals are the fallbacks.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and trainer
# The classification head gets one output per tag; both label maps are
# stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` is the current keyword for what used to be passed as
    # `tokenizer=`; the old name is deprecated and scheduled for removal.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train and evaluate
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for metric_name, metric_value in eval_result.items():
    print(f"{metric_name}: {metric_value:.4f}")

# 9. Final classification report
# Run prediction on the test split to get raw scores and gold label ids.
test_output = trainer.predict(tokenized_dataset["test"])
labels = test_output.label_ids
predictions = np.argmax(test_output.predictions, axis=2)

# Keep only positions with a real gold label (-100 marks ignored positions).
kept_pairs = [
    [(id2tag[l], id2tag[p]) for l, p in zip(label, pred) if l != -100]
    for label, pred in zip(labels, predictions)
]
true_labels = [[gold for gold, _ in row] for row in kept_pairs]
predicted_labels = [[guess for _, guess in row] for row in kept_pairs]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save results
with open("classification_report_IOBES.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    f.writelines(
        f"{metric_name}: {metric_value:.4f}\n"
        for metric_name, metric_value in eval_result.items()
    )
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0238
eval_accuracy: 0.9946
eval_precision: 0.8857
eval_recall: 0.9394
eval_f1: 0.9118
eval_runtime: 137.1569
eval_samples_per_second: 5.4830
eval_steps_per_second: 0.6850
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.89      0.94      0.91       264

   micro avg       0.89      0.94      0.91       264
   macro avg       0.89      0.94      0.91       264
weighted avg       0.89      0.94      0.91       264



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99464
eval/f1,0.91176
eval/loss,0.02375
eval/precision,0.88571
eval/recall,0.93939
eval/runtime,137.1569
eval/samples_per_second,5.483
eval/steps_per_second,0.685
eval_accuracy,0.99464


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOE-tagged data
def load_data(file_path, sentence_end=('.', '؟')):
    """Read a one-token-per-row annotation file and group it into sentences.

    The file must provide the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label); rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: ``str`` or path-like. A name ending in ``.csv`` (matched
            case-insensitively) is read with ``pandas.read_csv``; anything
            else is passed to ``pandas.read_excel``.
        sentence_end: Token, or iterable of tokens, that closes a sentence.
            Defaults to the full stop and the Arabic question mark. The
            delimiter row and its tag are not kept in the output.

    Returns:
        tuple: ``(sentences, labels)`` - parallel lists in which
        ``sentences[k]`` holds the tokens of sentence ``k`` and
        ``labels[k]`` the tag of each of those tokens.
    """
    # str() makes pathlib.Path work; lower() accepts ".CSV" as well as ".csv".
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    # A bare string is one delimiter token, not a collection of characters.
    if isinstance(sentence_end, str):
        sentence_end = (sentence_end,)
    delimiters = frozenset(sentence_end)

    sentences, labels = [], []
    current_words, current_tags = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if current_words:
                sentences.append(current_words)
                labels.append(current_tags)
                current_words, current_tags = [], []
        else:
            current_words.append(word)
            current_tags.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

# Group the annotation rows into sentences with their tag sequences.
sentences, tags = load_data("/content/IOE.xlsx")  # update path as needed

# Label vocabulary: every distinct tag, sorted before ids are assigned.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 2. Dataset prep
# One record per sentence: its tokens plus the integer-encoded tags.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
# Hold out 20% of the sentences; random_state fixes the shuffle.
train_data, test_data = train_test_split(
    examples,
    test_size=0.2,
    random_state=42,
)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenizer & alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and attach per-sub-token labels.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of label ids per
            sentence, indexed by word position).

    Returns:
        The output of the module-level ``tokenizer`` with an added
        ``"labels"`` entry. For each word only its first sub-token carries
        the word's label id; positions without a word id and the remaining
        sub-tokens of a word are set to -100.
    """
    encoding = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

    def _align(word_ids, word_labels):
        # Label a position only when it starts a new word.
        row, last_word = [], None
        for word_idx in word_ids:
            starts_word = word_idx is not None and word_idx != last_word
            row.append(word_labels[word_idx] if starts_word else -100)
            last_word = word_idx
        return row

    encoding["labels"] = [
        _align(encoding.word_ids(batch_index=idx), word_labels)
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

# Run the alignment function over both splits in batched mode.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)
# Collator for token-classification batches, built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Metrics
def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 ranges over
            label ids) and ``label_ids`` (gold ids, -100 where a position
            must be ignored).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, scored with ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score`` on the tag strings.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)

    true_tags, pred_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, p.label_ids):
        # Drop every position whose gold label is the ignore marker.
        kept = [(gold, guess) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        true_tags.append([id2tag[gold] for gold, _ in kept])
        pred_tags.append([id2tag[guess] for _, guess in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {metric: scorer(true_tags, pred_tags) for metric, scorer in scorers}

# 5. Initialize W&B
wandb.init(project="arabert-hparam-tuning", name="arabert-IOE-1epoch")

# 6. Training arguments
# Train batch size, learning rate and weight decay are looked up in the W&B
# run config via .get(); the literals are the fallbacks.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Model and trainer
# The classification head gets one output per tag; both label maps are
# stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` is the current keyword for what used to be passed as
    # `tokenizer=`; the old name is deprecated and scheduled for removal.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train and evaluate
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for metric_name, metric_value in eval_result.items():
    print(f"{metric_name}: {metric_value:.4f}")

# 9. Classification report
# Run prediction on the test split to get raw scores and gold label ids.
test_output = trainer.predict(tokenized_dataset["test"])
labels = test_output.label_ids
predictions = np.argmax(test_output.predictions, axis=2)

# Keep only positions with a real gold label (-100 marks ignored positions).
kept_pairs = [
    [(id2tag[l], id2tag[p]) for l, p in zip(label, pred) if l != -100]
    for label, pred in zip(labels, predictions)
]
true_labels = [[gold for gold, _ in row] for row in kept_pairs]
predicted_labels = [[guess for _, guess in row] for row in kept_pairs]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Persist the metrics and the report next to the notebook.
with open("classification_report_IOE.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    f.writelines(
        f"{metric_name}: {metric_value:.4f}\n"
        for metric_name, metric_value in eval_result.items()
    )
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0179
eval_accuracy: 0.9954
eval_precision: 0.9061
eval_recall: 0.9508
eval_f1: 0.9279
eval_runtime: 138.6270
eval_samples_per_second: 5.4250
eval_steps_per_second: 0.6780
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.91      0.95      0.93       264

   micro avg       0.91      0.95      0.93       264
   macro avg       0.91      0.95      0.93       264
weighted avg       0.91      0.95      0.93       264



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99541
eval/f1,0.92791
eval/loss,0.01787
eval/precision,0.90614
eval/recall,0.95076
eval/runtime,138.627
eval/samples_per_second,5.425
eval/steps_per_second,0.678
eval_accuracy,0.99541


## SECOND

In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IO-tagged data
def load_data(file_path, sentence_end=('.', '؟')):
    """Read a one-token-per-row annotation file and group it into sentences.

    The file must provide the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label); rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: ``str`` or path-like. A name ending in ``.csv`` (matched
            case-insensitively) is read with ``pandas.read_csv``; anything
            else is passed to ``pandas.read_excel``.
        sentence_end: Token, or iterable of tokens, that closes a sentence.
            Defaults to the full stop and the Arabic question mark. The
            delimiter row and its tag are not kept in the output.

    Returns:
        tuple: ``(sentences, labels)`` - parallel lists in which
        ``sentences[k]`` holds the tokens of sentence ``k`` and
        ``labels[k]`` the tag of each of those tokens.
    """
    # str() makes pathlib.Path work; lower() accepts ".CSV" as well as ".csv".
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    # A bare string is one delimiter token, not a collection of characters.
    if isinstance(sentence_end, str):
        sentence_end = (sentence_end,)
    delimiters = frozenset(sentence_end)

    sentences, labels = [], []
    current_words, current_tags = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if current_words:
                sentences.append(current_words)
                labels.append(current_tags)
                current_words, current_tags = [], []
        else:
            current_words.append(word)
            current_tags.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

# Group the annotation rows into sentences with their tag sequences.
sentences, tags = load_data("/content/IO2.xlsx")

# Label vocabulary: every distinct tag, sorted before ids are assigned.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 2. Dataset preparation
# One record per sentence: its tokens plus the integer-encoded tags.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
# Hold out 20% of the sentences; random_state fixes the shuffle.
train_data, test_data = train_test_split(
    examples,
    test_size=0.2,
    random_state=42,
)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and attach per-sub-token labels.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of label ids per
            sentence, indexed by word position).

    Returns:
        The output of the module-level ``tokenizer`` with an added
        ``"labels"`` entry. For each word only its first sub-token carries
        the word's label id; positions without a word id and the remaining
        sub-tokens of a word are set to -100.
    """
    encoding = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

    def _align(word_ids, word_labels):
        # Label a position only when it starts a new word.
        row, last_word = [], None
        for word_idx in word_ids:
            starts_word = word_idx is not None and word_idx != last_word
            row.append(word_labels[word_idx] if starts_word else -100)
            last_word = word_idx
        return row

    encoding["labels"] = [
        _align(encoding.word_ids(batch_index=idx), word_labels)
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

# Run the alignment function over both splits in batched mode.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)
# Collator for token-classification batches, built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 ranges over
            label ids) and ``label_ids`` (gold ids, -100 where a position
            must be ignored).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, scored with ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score`` on the tag strings.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)

    true_tags, pred_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, p.label_ids):
        # Drop every position whose gold label is the ignore marker.
        kept = [(gold, guess) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        true_tags.append([id2tag[gold] for gold, _ in kept])
        pred_tags.append([id2tag[guess] for _, guess in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {metric: scorer(true_tags, pred_tags) for metric, scorer in scorers}

# 5. Initialize WandB
wandb.init(project="arabert-hparam-tuning", name="arabert-IO-1epoch")

# 6. Training arguments (1 epoch)
# Train batch size, learning rate and weight decay are looked up in the W&B
# run config via .get(); the literals are the fallbacks.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer
# The classification head gets one output per tag; both label maps are
# stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` is the current keyword for what used to be passed as
    # `tokenizer=`; the old name is deprecated and scheduled for removal.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Training and evaluation
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for metric_name, metric_value in eval_result.items():
    print(f"{metric_name}: {metric_value:.4f}")

# 9. Detailed classification report
# Run prediction on the test split to get raw scores and gold label ids.
test_output = trainer.predict(tokenized_dataset["test"])
labels = test_output.label_ids
predictions = np.argmax(test_output.predictions, axis=2)

# Keep only positions with a real gold label (-100 marks ignored positions).
kept_pairs = [
    [(id2tag[l], id2tag[p]) for l, p in zip(label, pred) if l != -100]
    for label, pred in zip(labels, predictions)
]
true_labels = [[gold for gold, _ in row] for row in kept_pairs]
predicted_labels = [[guess for _, guess in row] for row in kept_pairs]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save report to file
with open("classification_report_IO.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    f.writelines(
        f"{metric_name}: {metric_value:.4f}\n"
        for metric_name, metric_value in eval_result.items()
    )
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0121
eval_accuracy: 0.9971
eval_precision: 0.9496
eval_recall: 0.9362
eval_f1: 0.9429
eval_runtime: 137.8065
eval_samples_per_second: 5.4570
eval_steps_per_second: 0.6820
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.95      0.94      0.94       282

   micro avg       0.95      0.94      0.94       282
   macro avg       0.95      0.94      0.94       282
weighted avg       0.95      0.94      0.94       282



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99711
eval/f1,0.94286
eval/loss,0.01207
eval/precision,0.94964
eval/recall,0.93617
eval/runtime,137.8065
eval/samples_per_second,5.457
eval/steps_per_second,0.682
eval_accuracy,0.99711


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load BI-tagged data
def load_data(file_path, sentence_end=('.', '؟')):
    """Read a one-token-per-row annotation file and group it into sentences.

    The file must provide the columns ``'Word i'`` (token) and
    ``'Word i entity tag'`` (label); rows with a missing value in either
    column are dropped. Tokens and tags are converted to ``str`` and
    stripped of surrounding whitespace.

    Args:
        file_path: ``str`` or path-like. A name ending in ``.csv`` (matched
            case-insensitively) is read with ``pandas.read_csv``; anything
            else is passed to ``pandas.read_excel``.
        sentence_end: Token, or iterable of tokens, that closes a sentence.
            Defaults to the full stop and the Arabic question mark. The
            delimiter row and its tag are not kept in the output.

    Returns:
        tuple: ``(sentences, labels)`` - parallel lists in which
        ``sentences[k]`` holds the tokens of sentence ``k`` and
        ``labels[k]`` the tag of each of those tokens.
    """
    # str() makes pathlib.Path work; lower() accepts ".CSV" as well as ".csv".
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    # A bare string is one delimiter token, not a collection of characters.
    if isinstance(sentence_end, str):
        sentence_end = (sentence_end,)
    delimiters = frozenset(sentence_end)

    sentences, labels = [], []
    current_words, current_tags = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if current_words:
                sentences.append(current_words)
                labels.append(current_tags)
                current_words, current_tags = [], []
        else:
            current_words.append(word)
            current_tags.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)
    return sentences, labels

# Group the annotation rows into sentences with their tag sequences.
sentences, tags = load_data("/content/BI2.xlsx")

# Label vocabulary: every distinct tag, sorted before ids are assigned.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 2. Dataset preparation
# One record per sentence: its tokens plus the integer-encoded tags.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
# Hold out 20% of the sentences; random_state fixes the shuffle.
train_data, test_data = train_test_split(
    examples,
    test_size=0.2,
    random_state=42,
)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Only the first sub-token of each word keeps the word's tag id. Every
    other position (positions with no word id, and continuation pieces of
    the same word) is set to -100, the value the metric code filters out.

    Args:
        examples: Batch with "tokens" (lists of words) and "ner_tags"
            (lists of tag ids, one per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word_idx in encoding.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        all_labels.append(row_labels)
    encoding["labels"] = all_labels
    return encoding

# Apply tokenization and label alignment to the dataset in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Turn raw predictions into seqeval accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 indexes the
            tag ids) and ``label_ids`` (gold ids, with -100 on positions to
            ignore).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids
    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [(pr, gd) for pr, gd in zip(pred_row, gold_row) if gd != -100]
        pred_tags.append([id2tag[pr] for pr, _ in kept])
        true_tags.append([id2tag[gd] for _, gd in kept])
    scores = {}
    scores["accuracy"] = accuracy_score(true_tags, pred_tags)
    scores["precision"] = precision_score(true_tags, pred_tags)
    scores["recall"] = recall_score(true_tags, pred_tags)
    scores["f1"] = f1_score(true_tags, pred_tags)
    return scores

# 5. Initialize WandB
wandb.init(project="arabert-hparam-tuning", name="arabert-BI-1epoch")

# 6. Training arguments (1 epoch)
# Batch size, learning rate and weight decay are read from the W&B run config
# when present; otherwise the fallbacks below apply.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer
# from_pretrained creates a new, randomly initialised classifier head, and it
# runs before the Trainer is constructed. Seed the RNGs first so the head is
# initialised identically on every run and scores stay comparable.
from transformers import set_seed
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=`; kept as-is for compatibility — confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Training and evaluation
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Detailed classification report
# Run inference on the test split and take the highest-scoring tag id per token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Drop positions labelled -100 and map the remaining ids back to tag strings.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save the metrics and the report to a text file
with open("classification_report_BI.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss





📊 Evaluation Results:
eval_loss: 0.0390
eval_accuracy: 0.9913
eval_precision: 0.9286
eval_recall: 0.9391
eval_f1: 0.9338
eval_runtime: 143.5620
eval_samples_per_second: 5.2380
eval_steps_per_second: 0.6550
epoch: 1.0000





📄 Classification Report:
              precision    recall  f1-score   support

           O       0.94      0.96      0.95       964
           _       0.90      0.88      0.89       283

   micro avg       0.93      0.94      0.93      1247
   macro avg       0.92      0.92      0.92      1247
weighted avg       0.93      0.94      0.93      1247



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99133
eval/f1,0.93381
eval/loss,0.03897
eval/precision,0.92863
eval/recall,0.93905
eval/runtime,143.562
eval/samples_per_second,5.238
eval/steps_per_second,0.655
eval_accuracy,0.99133


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load BIES-tagged data
def load_data(file_path, delimiters=('.', '؟')):
    """Read a word-per-row tagged corpus and split it into sentences.

    Args:
        file_path: Path to the corpus. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.
        delimiters: Words that mark the end of a sentence. Delimiter rows are
            dropped together with their tags.

    Returns:
        ``(sentences, labels)``: two parallel lists of lists holding the
        stripped words and stripped tags of every non-empty sentence.
    """
    # Case-insensitive check so "corpus.CSV" is not sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    # Rows missing either the word or its tag are discarded.
    df = df[['Word i', 'Word i entity tag']].dropna()
    delimiters = set(delimiters)
    sentences, labels = [], []
    s, l = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if s:
                sentences.append(s)
                labels.append(l)
                s, l = [], []
        else:
            s.append(word)
            l.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if s:
        sentences.append(s)
        labels.append(l)
    return sentences, labels

# Read the BIES-tagged corpus (adjust the path if needed) and derive the tag
# vocabulary from it.
sentences, tags = load_data("/content/BIES2.xlsx")
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset formatting
# Encode every sentence's tags as integer ids, then hold out 20% for testing.
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Only the first sub-token of each word keeps the word's tag id. Every
    other position (positions with no word id, and continuation pieces of
    the same word) is set to -100, the value the metric code filters out.

    Args:
        examples: Batch with "tokens" (lists of words) and "ner_tags"
            (lists of tag ids, one per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word_idx in encoding.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        all_labels.append(row_labels)
    encoding["labels"] = all_labels
    return encoding

# Apply tokenization and label alignment to the dataset in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Turn raw predictions into seqeval accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 indexes the
            tag ids) and ``label_ids`` (gold ids, with -100 on positions to
            ignore).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids
    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [(pr, gd) for pr, gd in zip(pred_row, gold_row) if gd != -100]
        pred_tags.append([id2tag[pr] for pr, _ in kept])
        true_tags.append([id2tag[gd] for _, gd in kept])
    scores = {}
    scores["accuracy"] = accuracy_score(true_tags, pred_tags)
    scores["precision"] = precision_score(true_tags, pred_tags)
    scores["recall"] = recall_score(true_tags, pred_tags)
    scores["f1"] = f1_score(true_tags, pred_tags)
    return scores

# 5. WandB init
wandb.init(project="arabert-hparam-tuning", name="arabert-BIES-1epoch")

# 6. Training args
# Batch size, learning rate and weight decay are read from the W&B run config
# when present; otherwise the fallbacks below apply.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Trainer setup
# from_pretrained creates a new, randomly initialised classifier head, and it
# runs before the Trainer is constructed. Seed the RNGs first so the head is
# initialised identically on every run and scores stay comparable.
from transformers import set_seed
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=`; kept as-is for compatibility — confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train & Evaluate
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Final detailed classification report
# Run inference on the test split and take the highest-scoring tag id per token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Drop positions labelled -100 and map the remaining ids back to tag strings.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save the metrics and the report to a text file
with open("classification_report_BIES.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss





📊 Evaluation Results:
eval_loss: 0.0503
eval_accuracy: 0.9908
eval_precision: 0.9535
eval_recall: 0.9543
eval_f1: 0.9539
eval_runtime: 136.7214
eval_samples_per_second: 5.5000
eval_steps_per_second: 0.6880
epoch: 1.0000





📄 Classification Report:
              precision    recall  f1-score   support

           O       0.96      0.96      0.96       964
           _       0.92      0.92      0.92       283

   micro avg       0.95      0.95      0.95      1247
   macro avg       0.94      0.94      0.94      1247
weighted avg       0.95      0.95      0.95      1247



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99082
eval/f1,0.95391
eval/loss,0.05032
eval/precision,0.95353
eval/recall,0.95429
eval/runtime,136.7214
eval/samples_per_second,5.5
eval/steps_per_second,0.688
eval_accuracy,0.99082


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IE-tagged data
def load_data(file_path, delimiters=('.', '؟')):
    """Read a word-per-row tagged corpus and split it into sentences.

    Args:
        file_path: Path to the corpus. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.
        delimiters: Words that mark the end of a sentence. Delimiter rows are
            dropped together with their tags.

    Returns:
        ``(sentences, labels)``: two parallel lists of lists holding the
        stripped words and stripped tags of every non-empty sentence.
    """
    # Case-insensitive check so "corpus.CSV" is not sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    # Rows missing either the word or its tag are discarded.
    df = df[['Word i', 'Word i entity tag']].dropna()
    delimiters = set(delimiters)
    sentences, labels = [], []
    s, l = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if s:
                sentences.append(s)
                labels.append(l)
                s, l = [], []
        else:
            s.append(word)
            l.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if s:
        sentences.append(s)
        labels.append(l)
    return sentences, labels

# Read the IE-tagged corpus (adjust the path if needed) and derive the tag
# vocabulary from it.
sentences, tags = load_data("/content/IE2.xlsx")
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset preparation
# Encode every sentence's tags as integer ids, then hold out 20% for testing.
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenization & alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Only the first sub-token of each word keeps the word's tag id. Every
    other position (positions with no word id, and continuation pieces of
    the same word) is set to -100, the value the metric code filters out.

    Args:
        examples: Batch with "tokens" (lists of words) and "ner_tags"
            (lists of tag ids, one per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word_idx in encoding.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        all_labels.append(row_labels)
    encoding["labels"] = all_labels
    return encoding

# Apply tokenization and label alignment to the dataset in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 4. Evaluation metric function
def compute_metrics(p):
    """Turn raw predictions into seqeval accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 indexes the
            tag ids) and ``label_ids`` (gold ids, with -100 on positions to
            ignore).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids
    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [(pr, gd) for pr, gd in zip(pred_row, gold_row) if gd != -100]
        pred_tags.append([id2tag[pr] for pr, _ in kept])
        true_tags.append([id2tag[gd] for _, gd in kept])
    scores = {}
    scores["accuracy"] = accuracy_score(true_tags, pred_tags)
    scores["precision"] = precision_score(true_tags, pred_tags)
    scores["recall"] = recall_score(true_tags, pred_tags)
    scores["f1"] = f1_score(true_tags, pred_tags)
    return scores

# 5. Initialize W&B
wandb.init(project="arabert-hparam-tuning", name="arabert-IE-1epoch")

# 6. Training arguments
# Batch size, learning rate and weight decay are read from the W&B run config
# when present; otherwise the fallbacks below apply.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer setup
# from_pretrained creates a new, randomly initialised classifier head, and it
# runs before the Trainer is constructed. Seed the RNGs first so the head is
# initialised identically on every run and scores stay comparable.
from transformers import set_seed
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=`; kept as-is for compatibility — confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train and Evaluate
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Classification Report
# Run inference on the test split and take the highest-scoring tag id per token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Drop positions labelled -100 and map the remaining ids back to tag strings.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save the metrics and the report to a text file
with open("classification_report_IE.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss





📊 Evaluation Results:
eval_loss: 0.0430
eval_accuracy: 0.9900
eval_precision: 0.9262
eval_recall: 0.9254
eval_f1: 0.9258
eval_runtime: 140.4369
eval_samples_per_second: 5.3550
eval_steps_per_second: 0.6690
epoch: 1.0000





📄 Classification Report:
              precision    recall  f1-score   support

           O       0.94      0.95      0.94       964
           _       0.87      0.86      0.86       283

   micro avg       0.93      0.93      0.93      1247
   macro avg       0.91      0.90      0.90      1247
weighted avg       0.93      0.93      0.93      1247



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.98997
eval/f1,0.92579
eval/loss,0.043
eval/precision,0.92616
eval/recall,0.92542
eval/runtime,140.4369
eval/samples_per_second,5.355
eval/steps_per_second,0.669
eval_accuracy,0.98997


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOB-tagged data
def load_data(file_path, delimiters=('.', '؟')):
    """Read a word-per-row tagged corpus and split it into sentences.

    Args:
        file_path: Path to the corpus. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.
        delimiters: Words that mark the end of a sentence. Delimiter rows are
            dropped together with their tags.

    Returns:
        ``(sentences, labels)``: two parallel lists of lists holding the
        stripped words and stripped tags of every non-empty sentence.
    """
    # Case-insensitive check so "corpus.CSV" is not sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    # Rows missing either the word or its tag are discarded.
    df = df[['Word i', 'Word i entity tag']].dropna()
    delimiters = set(delimiters)
    sentences, labels = [], []
    s, l = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if s:
                sentences.append(s)
                labels.append(l)
                s, l = [], []
        else:
            s.append(word)
            l.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if s:
        sentences.append(s)
        labels.append(l)
    return sentences, labels

# Read the IOB-tagged corpus (adjust the path if needed) and derive the tag
# vocabulary from it.
sentences, tags = load_data("/content/IOB2.xlsx")
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset formatting
# Encode every sentence's tags as integer ids, then hold out 20% for testing.
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenizer and alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Only the first sub-token of each word keeps the word's tag id. Every
    other position (positions with no word id, and continuation pieces of
    the same word) is set to -100, the value the metric code filters out.

    Args:
        examples: Batch with "tokens" (lists of words) and "ner_tags"
            (lists of tag ids, one per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word_idx in encoding.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        all_labels.append(row_labels)
    encoding["labels"] = all_labels
    return encoding

# Apply tokenization and label alignment to the dataset in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 4. Compute metrics
def compute_metrics(p):
    """Turn raw predictions into seqeval accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 indexes the
            tag ids) and ``label_ids`` (gold ids, with -100 on positions to
            ignore).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids
    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [(pr, gd) for pr, gd in zip(pred_row, gold_row) if gd != -100]
        pred_tags.append([id2tag[pr] for pr, _ in kept])
        true_tags.append([id2tag[gd] for _, gd in kept])
    scores = {}
    scores["accuracy"] = accuracy_score(true_tags, pred_tags)
    scores["precision"] = precision_score(true_tags, pred_tags)
    scores["recall"] = recall_score(true_tags, pred_tags)
    scores["f1"] = f1_score(true_tags, pred_tags)
    return scores

# 5. Initialize Weights & Biases
wandb.init(project="arabert-hparam-tuning", name="arabert-IOB-1epoch")

# 6. Training Arguments
# Batch size, learning rate and weight decay are read from the W&B run config
# when present; otherwise the fallbacks below apply.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and Trainer
# from_pretrained creates a new, randomly initialised classifier head, and it
# runs before the Trainer is constructed. Seed the RNGs first so the head is
# initialised identically on every run and scores stay comparable.
from transformers import set_seed
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=`; kept as-is for compatibility — confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Training and evaluation
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Detailed classification report
# Run inference on the test split and take the highest-scoring tag id per token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Drop positions labelled -100 and map the remaining ids back to tag strings.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save the metrics and the report to a text file
with open("classification_report_IOB.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0161
eval_accuracy: 0.9963
eval_precision: 0.9534
eval_recall: 0.9399
eval_f1: 0.9466
eval_runtime: 139.6513
eval_samples_per_second: 5.3850
eval_steps_per_second: 0.6730
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.95      0.94      0.95       283

   micro avg       0.95      0.94      0.95       283
   macro avg       0.95      0.94      0.95       283
weighted avg       0.95      0.94      0.95       283



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99634
eval/f1,0.94662
eval/loss,0.01614
eval/precision,0.95341
eval/recall,0.93993
eval/runtime,139.6513
eval/samples_per_second,5.385
eval/steps_per_second,0.673
eval_accuracy,0.99634


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOBES-tagged data
def load_data(file_path, delimiters=('.', '؟')):
    """Read a word-per-row tagged corpus and split it into sentences.

    Args:
        file_path: Path to the corpus. A ``.csv`` extension (matched
            case-insensitively) is read with ``pd.read_csv``; anything else
            is handed to ``pd.read_excel``.
        delimiters: Words that mark the end of a sentence. Delimiter rows are
            dropped together with their tags.

    Returns:
        ``(sentences, labels)``: two parallel lists of lists holding the
        stripped words and stripped tags of every non-empty sentence.
    """
    # Case-insensitive check so "corpus.CSV" is not sent to the Excel reader.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    # Rows missing either the word or its tag are discarded.
    df = df[['Word i', 'Word i entity tag']].dropna()
    delimiters = set(delimiters)
    sentences, labels = [], []
    s, l = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Consecutive delimiters must not produce empty sentences.
            if s:
                sentences.append(s)
                labels.append(l)
                s, l = [], []
        else:
            s.append(word)
            l.append(tag)
    # Flush a trailing sentence that has no closing delimiter.
    if s:
        sentences.append(s)
        labels.append(l)
    return sentences, labels

# Read the IOBES-tagged corpus (adjust the path if needed) and derive the tag
# vocabulary from it.
sentences, tags = load_data("/content/IOBES2.xlsx")
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Prepare HuggingFace datasets
# Encode every sentence's tags as integer ids, then hold out 20% for testing.
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 3. Tokenization and label alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Only the first sub-token of each word keeps the word's tag id. Every
    other position (positions with no word id, and continuation pieces of
    the same word) is set to -100, the value the metric code filters out.

    Args:
        examples: Batch with "tokens" (lists of words) and "ner_tags"
            (lists of tag ids, one per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word_idx in encoding.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        all_labels.append(row_labels)
    encoding["labels"] = all_labels
    return encoding

# Apply tokenization and label alignment to the dataset in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 4. Evaluation metrics
def compute_metrics(p):
    """Turn raw predictions into seqeval accuracy / precision / recall / F1.

    Args:
        p: Object exposing ``predictions`` (scores whose axis 2 indexes the
            tag ids) and ``label_ids`` (gold ids, with -100 on positions to
            ignore).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids
    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [(pr, gd) for pr, gd in zip(pred_row, gold_row) if gd != -100]
        pred_tags.append([id2tag[pr] for pr, _ in kept])
        true_tags.append([id2tag[gd] for _, gd in kept])
    scores = {}
    scores["accuracy"] = accuracy_score(true_tags, pred_tags)
    scores["precision"] = precision_score(true_tags, pred_tags)
    scores["recall"] = recall_score(true_tags, pred_tags)
    scores["f1"] = f1_score(true_tags, pred_tags)
    return scores

# 5. Initialize Weights & Biases
wandb.init(project="arabert-hparam-tuning", name="arabert-IOBES-1epoch")

# 6. Training configuration
# Batch size, learning rate and weight decay are read from the W&B run config
# when present; otherwise the fallbacks below apply.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Load model and trainer
# from_pretrained creates a new, randomly initialised classifier head, and it
# runs before the Trainer is constructed. Seed the RNGs first so the head is
# initialised identically on every run and scores stay comparable.
from transformers import set_seed
set_seed(training_args.seed)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=`; kept as-is for compatibility — confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train and evaluate
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

print("\n📊 Evaluation Results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")

# 9. Final classification report
# Run inference on the test split and take the highest-scoring tag id per token.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Drop positions labelled -100 and map the remaining ids back to tag strings.
true_labels = [[id2tag[l] for l, p in zip(label, pred) if l != -100]
               for label, pred in zip(labels, predictions)]
predicted_labels = [[id2tag[p] for l, p in zip(label, pred) if l != -100]
                    for label, pred in zip(labels, predictions)]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Save the metrics and the report to a text file
with open("classification_report_IOBES.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    for key, value in eval_result.items():
        f.write(f"{key}: {value:.4f}\n")
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mis-it-ali03[0m ([33mis-it-ali03-german-university-in-cairo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0279
eval_accuracy: 0.9942
eval_precision: 0.9056
eval_recall: 0.9152
eval_f1: 0.9104
eval_runtime: 167.2833
eval_samples_per_second: 4.4950
eval_steps_per_second: 0.5620
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.91      0.92      0.91       283

   micro avg       0.91      0.92      0.91       283
   macro avg       0.91      0.92      0.91       283
weighted avg       0.91      0.92      0.91       283



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99422
eval/f1,0.91037
eval/loss,0.02792
eval/precision,0.90559
eval/recall,0.91519
eval/runtime,167.2833
eval/samples_per_second,4.495
eval/steps_per_second,0.562
eval_accuracy,0.99422


In [None]:
import pandas as pd
import numpy as np
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load IOE-tagged data
def load_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    The file is parsed with ``pd.read_csv`` when ``file_path`` ends in
    ``.csv`` and with ``pd.read_excel`` otherwise.  Only the columns
    ``'Word i'`` and ``'Word i entity tag'`` are used; rows with a missing
    value in either column are dropped.

    Args:
        file_path: Path of the CSV or Excel file to load.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists.  ``sentences[k]``
        is the list of whitespace-stripped words of the k-th sentence and
        ``labels[k]`` the matching list of whitespace-stripped tags.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    # A row whose word is one of these closes the current sentence; the
    # delimiter row itself (word and tag) is not kept.
    sentence_breaks = {'.', '؟'}

    sentences, labels = [], []
    words, word_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in sentence_breaks:
            words.append(token)
            word_tags.append(str(raw_tag).strip())
        elif words:
            # Consecutive delimiters never produce an empty sentence.
            sentences.append(words)
            labels.append(word_tags)
            words, word_tags = [], []

    # Keep a trailing sentence that was not closed by a delimiter.
    if words:
        sentences.append(words)
        labels.append(word_tags)
    return sentences, labels

sentences, tags = load_data("/content/IOE2.xlsx")  # Update path as needed

# Label vocabulary: ids are assigned in the sorted order of the distinct tags.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# 2. Dataset prep
# One record per sentence: the words plus their tags encoded as integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
# 80/20 sentence-level split with a fixed random_state.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
splits = {"train": train_data, "test": test_data}
dataset = DatasetDict(
    {split_name: Dataset.from_list(rows) for split_name, rows in splits.items()}
)

# 3. Tokenizer & alignment
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of label ids per sentence).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
        In each sentence the first token of every word carries that word's
        label; special tokens (word id ``None``) and the remaining tokens of
        a word are set to ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    batch_labels = []
    for row_index, word_labels in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        for word_id in encoded.word_ids(batch_index=row_index):
            # Only a token that starts a new word receives a real label.
            starts_word = word_id is not None and word_id != last_word
            token_labels.append(word_labels[word_id] if starts_word else -100)
            last_word = word_id
        batch_labels.append(token_labels)

    encoded["labels"] = batch_labels
    return encoded

# Collator for batching the token-classification examples.
data_collator = DataCollatorForTokenClassification(tokenizer)
# Tokenize and label-align every split, processing examples in batches.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

# 4. Metrics
def compute_metrics(p):
    """Score token-level predictions with the seqeval metric functions.

    Args:
        p: Object exposing ``predictions`` (scores; argmax is taken over
            axis 2) and ``label_ids`` (label ids, where ``-100`` marks
            positions that are skipped).

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    gold_ids = p.label_ids

    pred_tags, true_tags = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(guess, gold) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        pred_tags.append([id2tag[guess] for guess, _gold in kept])
        true_tags.append([id2tag[gold] for _guess, gold in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_tags, pred_tags) for name, scorer in scorers}

# 5. Initialize W&B
wandb.init(project="arabert-hparam-tuning", name="arabert-IOE-1epoch")

# 6. Training arguments
# Batch size, learning rate and weight decay are read from the W&B run config
# when present and otherwise fall back to the defaults given here.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=wandb.config.get("batch_size", 8),
    per_device_eval_batch_size=8,
    learning_rate=wandb.config.get("lr", 3e-5),
    weight_decay=wandb.config.get("weight_decay", 0.01),
    num_train_epochs=1,
    logging_dir="./logs"
)

# 7. Model and trainer
# The classifier head is not part of the checkpoint and is randomly
# initialised when the model is built.  Trainer applies `training_args.seed`
# only when it is constructed, i.e. after the model already exists, so seed
# torch first to make the initial head weights reproducible across runs.
torch.manual_seed(training_args.seed)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=`; kept as-is for compatibility with older
# versions -- confirm the installed version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8. Train and evaluate
trainer.train()
eval_result = trainer.evaluate()
wandb.log(eval_result)

# One "name: value" line per returned metric, rounded to four decimals,
# emitted under the header in a single print call.
metric_summary = [f"{key}: {value:.4f}" for key, value in eval_result.items()]
print("\n".join(["\n📊 Evaluation Results:", *metric_summary]))

# 9. Classification report
logits, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(logits, axis=2)

# Pair every gold id with its predicted id, skipping positions labelled -100.
kept_pairs = [
    [(gold, guess) for gold, guess in zip(gold_row, guess_row) if gold != -100]
    for gold_row, guess_row in zip(labels, predictions)
]
true_labels = [[id2tag[gold] for gold, _guess in row] for row in kept_pairs]
predicted_labels = [[id2tag[guess] for _gold, guess in row] for row in kept_pairs]

print("\n📄 Classification Report:")
report = classification_report(true_labels, predicted_labels)
print(report)

# Persist the evaluation metrics followed by the report in one text file.
metric_lines = [f"{key}: {value:.4f}\n" for key, value in eval_result.items()]
with open("classification_report_IOE.txt", "w", encoding="utf-8") as f:
    f.write("📊 Evaluation Results:\n")
    f.writelines(metric_lines)
    f.write("\n📄 Classification Report:\n")
    f.write(report)

wandb.finish()


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



📊 Evaluation Results:
eval_loss: 0.0181
eval_accuracy: 0.9961
eval_precision: 0.9458
eval_recall: 0.9258
eval_f1: 0.9357
eval_runtime: 165.5585
eval_samples_per_second: 4.5420
eval_steps_per_second: 0.5680
epoch: 1.0000

📄 Classification Report:
              precision    recall  f1-score   support

           _       0.95      0.93      0.94       283

   micro avg       0.95      0.93      0.94       283
   macro avg       0.95      0.93      0.94       283
weighted avg       0.95      0.93      0.94       283



0,1
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_accuracy,▁

0,1
epoch,1.0
eval/accuracy,0.99609
eval/f1,0.93571
eval/loss,0.01814
eval/precision,0.94585
eval/recall,0.9258
eval/runtime,165.5585
eval/samples_per_second,4.542
eval/steps_per_second,0.568
eval_accuracy,0.99609
