<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/Arabert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## FIRST

In [None]:
pip install transformers datasets seqeval scikit-learn pandas openpyxl




In [None]:
import transformers
print(transformers.__version__)


4.52.4


In [None]:
import transformers
print("Transformers version:", transformers.__version__)


Transformers version: 4.52.4


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, ClassLabel
import numpy as np
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in ``.csv`` are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The table must contain the columns ``'Word i'`` and
            ``'Word i entity tag'``.

    Returns:
        A ``(sentences, labels)`` pair of parallel lists. Each sentence is a
        list of stripped word strings and each label list holds the stripped
        tag strings for the same positions.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    # Rows missing either the word or the tag are discarded up front.
    table = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    terminators = ['.', '؟']
    sentences, labels = [], []
    current_words, current_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in terminators:
            current_words.append(token)
            current_tags.append(str(raw_tag).strip())
            continue
        # A terminator closes the running sentence; the terminator itself is
        # not kept, and consecutive terminators do not create empty sentences.
        if current_words:
            sentences.append(current_words)
            labels.append(current_tags)
            current_words, current_tags = [], []

    # Flush a trailing sentence that was not closed by a terminator.
    if current_words:
        sentences.append(current_words)
        labels.append(current_tags)

    return sentences, labels

# Parse the IO-tagged spreadsheet into parallel per-sentence word/tag lists.
sentences, tags = load_excel_data("/content/IO.xlsx")

# 2. Label mapping: the sorted distinct tag strings get consecutive integer ids.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 3. Prepare data for Hugging Face: one record per sentence with tags as ids,
#    then a seeded 80/20 split made at sentence level.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(test_data),
    }
)

# 4. Load tokenizer and model; the classification head is sized to the tag set.
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
)

# 5. Tokenization function
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and attach per-token label ids.

    Args:
        examples: A batch with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of label ids, one per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Within each
        sequence only the first token of every word carries that word's
        label id; positions without a word id and later pieces of the same
        word are set to -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_labels):
        # Walk the token -> word mapping, labelling a token only when it
        # starts a new word.
        aligned, last_word = [], None
        for wid in word_ids:
            starts_word = wid is not None and wid != last_word
            aligned.append(word_labels[wid] if starts_word else -100)
            last_word = wid
        return aligned

    encoded["labels"] = [
        align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run tokenize_and_align_labels over both splits in batches; the function adds
# a "labels" entry next to the tokenizer output.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 6. Data collator (built from the same tokenizer; handed to the Trainer below)
data_collator = DataCollatorForTokenClassification(tokenizer)

# 7. Metrics
def compute_metrics(p):
    """Turn raw model outputs into seqeval accuracy/precision/recall/F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is arg-maxed
            over axis 2 (presumably batch x tokens x classes - confirm);
            ``labels`` holds label ids with -100 at positions to ignore.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    gold_tags, pred_tags = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        # Keep only positions that carry a real label (not -100).
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        gold_tags.append([id2tag[gold] for gold, _ in kept])
        pred_tags.append([id2tag[pred] for _, pred in kept])

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(gold_tags, pred_tags) for name, scorer in scorers.items()}

# 8. Training arguments
# Fixed hyper-parameters: 5 epochs, batch size 8 per device, weight decay 0.01.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs"
)

# 9. Trainer
# `processing_class` is the current keyword for passing the tokenizer; the old
# `tokenizer=` keyword is deprecated on the transformers version this notebook
# prints (4.52.4) and emits a warning at this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 11. Detailed classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping positions labelled -100.
true_labels = [[id2tag[gold] for (gold, pred) in zip(label, prediction) if gold != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for (gold, pred) in zip(label, prediction) if gold != -100]
                    for label, prediction in zip(labels, predictions)]

# NOTE(review): the recorded run of this cell printed a single entity type
# `_` in this report; seqeval presumably derives entity types from the tag
# text itself - confirm that the IO tag strings are in a form it expects.
print("Classification Report:")
print(classification_report(true_labels, true_predictions))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mis-it-ali03[0m ([33mis-it-ali03-german-university-in-cairo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.0244
1000,0.0204
1500,0.0116


Classification Report:
              precision    recall  f1-score   support

           _       0.96      0.98      0.97       265

   micro avg       0.96      0.98      0.97       265
   macro avg       0.96      0.98      0.97       265
weighted avg       0.96      0.98      0.97       265



In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Load a word/tag table and split it into sentences.

    Args:
        file_path: Path to a ``.csv`` file (read with ``pd.read_csv``) or any
            other path (read with ``pd.read_excel``). The columns
            ``'Word i'`` and ``'Word i entity tag'`` are required.

    Returns:
        ``(sentences, labels)``: parallel lists where each sentence is a list
        of stripped words and each label list holds the matching stripped
        tags.
    """
    if file_path.endswith('.csv'):
        frame = pd.read_csv(file_path)
    else:
        frame = pd.read_excel(file_path)
    # Only complete (word, tag) rows are used.
    frame = frame[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    pending = []  # (word, tag) pairs of the sentence being assembled
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token, tag_text = str(raw_word).strip(), str(raw_tag).strip()
        if token in ['.', '؟']:
            # Terminators close the sentence and are dropped themselves.
            if pending:
                sentences.append([w for w, _ in pending])
                labels.append([t for _, t in pending])
                pending = []
        else:
            pending.append((token, tag_text))

    if pending:  # Add the last sentence if any
        sentences.append([w for w, _ in pending])
        labels.append([t for _, t in pending])

    return sentences, labels

# 2. Load BI-tagged dataset
sentences, tags = load_excel_data("/content/BI.xlsx")

# 3. Label mapping: ids follow the sorted order of the distinct tag strings.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 4. Prepare data for Hugging Face: encode tags as ids per sentence, then make
#    a seeded 80/20 sentence-level split.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(test_data),
    }
)

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(tag2id), id2label=id2tag, label2id=tag2id
)

# 6. Tokenization and alignment
def tokenize_and_align_labels(examples):
    """Tokenize word lists and align word-level label ids to tokens.

    Args:
        examples: Batch dict with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of label ids, one per word).

    Returns:
        The tokenizer output plus a ``"labels"`` list per sequence: the
        word's label id on the first token of each word, -100 elsewhere.
    """
    batch = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    batch_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        word_ids = batch.word_ids(batch_index=row)
        # Pair every token's word id with the word id of the token before it.
        previous_ids = [None] + word_ids[:-1]
        batch_labels.append([
            word_tags[current] if current is not None and current != previous else -100
            for current, previous in zip(word_ids, previous_ids)
        ])
    batch["labels"] = batch_labels
    return batch

# Apply tokenize_and_align_labels to both splits in batches, which adds the
# aligned "labels" entry to every example.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator (constructed from the tokenizer; used by the Trainer)
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metrics function
def compute_metrics(p):
    """Score predictions against gold labels with seqeval.

    Args:
        p: ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` uses -100 for positions that
            must be ignored.

    Returns:
        Dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    raw_scores, gold = p
    guesses = np.argmax(raw_scores, axis=2)

    def decode(use_prediction):
        # Build tag-string sequences, skipping positions whose gold id is -100.
        return [
            [id2tag[guess if use_prediction else truth]
             for truth, guess in zip(gold_row, guess_row) if truth != -100]
            for gold_row, guess_row in zip(gold, guesses)
        ]

    reference = decode(False)
    hypothesis = decode(True)

    metrics = {}
    metrics["accuracy"] = accuracy_score(reference, hypothesis)
    metrics["precision"] = precision_score(reference, hypothesis)
    metrics["recall"] = recall_score(reference, hypothesis)
    metrics["f1"] = f1_score(reference, hypothesis)
    return metrics

# 9. Training arguments
# Fixed hyper-parameters: 5 epochs, batch size 8 per device, weight decay 0.01.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs"
)

# 10. Trainer setup
# The tokenizer is passed as `processing_class`; the former `tokenizer=`
# keyword is deprecated on the transformers version printed by this notebook
# (4.52.4) and triggers a warning at this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 12. Generate detailed classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Convert ids to tag strings, skipping positions labelled -100.
true_labels = [[id2tag[gold] for (gold, pred) in zip(label, prediction) if gold != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for (gold, pred) in zip(label, prediction) if gold != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report (token-level, one row per tag)
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
print("\nClassification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# The summary below comes from seqeval on the nested (per-sentence) lists, so
# its precision/recall/F1 are computed differently from the flat sklearn table.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,0.0563
1000,0.0128
1500,0.0068





Classification Report:
              precision    recall  f1-score   support

           B     0.9811    0.9848    0.9830       264
          BO     0.9653    0.9701    0.9677       201
           I     0.9845    0.9725    0.9785       327
          IO     0.9985    0.9987    0.9986     10970

    accuracy                         0.9972     11762
   macro avg     0.9824    0.9815    0.9820     11762
weighted avg     0.9972    0.9972    0.9972     11762


Evaluation Results:
Accuracy:  0.9972
Precision: 0.9786
Recall:    0.9835
F1 Score:  0.9810




In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report

# Force CUDA to give specific error locations
# NOTE(review): torch is already imported above; presumably this variable is
# only honoured if CUDA has not been initialised yet in this runtime - confirm,
# or restart the runtime before running this cell.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in ``.csv`` are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The columns ``'Word i'`` and ``'Word i entity tag'`` must exist.

    Returns:
        ``(sentences, labels)``: parallel lists. Each sentence is a list of
        stripped words; each label list holds the matching stripped tags.
        Words whose tag is blank or the text "nan" are left out.
    """
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in ('.', '؟'):  # End of sentence punctuation
            # Check the boundary before the tag filter: the terminator is never
            # emitted as a token, so its own tag must not decide whether the
            # sentence ends. Previously a terminator with a blank/"nan" tag
            # was skipped and glued two sentences together.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
            continue
        if tag.lower() == 'nan' or tag == '':
            # Unusable tag on an ordinary word: drop the row.
            continue
        sentence.append(word)
        label.append(tag)

    # Flush a trailing sentence that was not closed by a terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Load BIES-tagged dataset
sentences, tags = load_excel_data("/content/BIES.xlsx")

# Strip all tags
# (load_excel_data in this cell already strips each tag, so this is a second,
# defensive pass over the same strings.)
tags = [[str(t).strip() for t in tag_seq] for tag_seq in tags]

# 3. Label mapping
# Blank and "nan" tags are excluded from the label set; ids follow the sorted
# order of the remaining tag strings.
unique_tags = sorted(set(tag for doc in tags for tag in doc if tag != "" and tag.lower() != "nan"))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

print("Unique tags:", unique_tags)
print("Tag to ID mapping:", tag2id)

# 4. Prepare data for Hugging Face
# One record per sentence; a tag that is missing from tag2id raises KeyError here.
data = [{"tokens": s, "ner_tags": [tag2id[t] for t in t_list]} for s, t_list in zip(sentences, tags)]

# Validate tag IDs
# Sanity check that every id lies in [0, len(tag2id)), i.e. is a valid class
# index for a head created with num_labels=len(tag2id).
for ex in data:
    for tid in ex["ner_tags"]:
        if tid < 0 or tid >= len(tag2id):
            raise ValueError(f"Invalid tag_id found: {tid}")

# Seeded 80/20 split at sentence level.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The classification head is sized to the tag set and carries both mappings.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenization and alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and build token-level label ids.

    Args:
        examples: Batch dict with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of label ids, one per word).

    Returns:
        The tokenizer output with a ``"labels"`` entry added. A token gets
        its word's label id only when it is the first token of that word;
        every other position is -100.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    label_rows = []

    for row_index, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for wid in encoding.word_ids(batch_index=row_index):
            if wid is None or wid == last_word:
                # No word id, or a continuation piece of the same word.
                row.append(-100)
            else:
                row.append(word_tags[wid])
            last_word = wid
        label_rows.append(row)

    encoding["labels"] = label_rows
    return encoding

# 7. Apply tokenization
# Both splits are processed in batches; each example gains the aligned
# "labels" entry produced by tokenize_and_align_labels.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 8. Data collator (constructed from the tokenizer; used by the Trainer)
data_collator = DataCollatorForTokenClassification(tokenizer)

# 9. Metric function
def compute_metrics(p):
    """Compute seqeval accuracy, precision, recall and F1 for one evaluation.

    Args:
        p: ``(predictions, labels)``; ``predictions`` is arg-maxed over axis 2
            and ``labels`` marks ignored positions with -100.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"f1"``.
    """
    outputs, gold_ids = p
    pred_ids = np.argmax(outputs, axis=2)

    gold_tags = []
    pred_tags = []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        gold_seq, pred_seq = [], []
        for gold, pred in zip(gold_row, pred_row):
            if gold == -100:
                continue  # position without a real label
            gold_seq.append(id2tag[gold])
            pred_seq.append(id2tag[pred])
        gold_tags.append(gold_seq)
        pred_tags.append(pred_seq)

    return dict(
        accuracy=accuracy_score(gold_tags, pred_tags),
        precision=precision_score(gold_tags, pred_tags),
        recall=recall_score(gold_tags, pred_tags),
        f1=f1_score(gold_tags, pred_tags),
    )

# 10. Training arguments
# Fixed hyper-parameters: 5 epochs, batch size 8 per device, weight decay 0.01,
# a log entry every 10 steps and an evaluation pass after every epoch.
# `eval_strategy` is the current name of the per-epoch evaluation switch; the
# older `evaluation_strategy` spelling is rejected with a TypeError by the
# transformers version this notebook runs on.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch"
)

# 11. Trainer setup
# `processing_class` replaces the deprecated `tokenizer=` keyword.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 12. Train the model
trainer.train()

# 13. Evaluate the model
results = trainer.evaluate()

# 14. Generate detailed classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Convert ids to tag strings, skipping positions labelled -100.
true_labels = [[id2tag[gold] for (gold, pred) in zip(label, prediction) if gold != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for (gold, pred) in zip(label, prediction) if gold != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn report (token-level, one row per tag)
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 15. Print classification report
print("\nClassification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# seqeval scores on the nested (per-sentence) lists.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Unique tags: ['B', 'BO', 'E', 'EO', 'I', 'IO', 'S', 'SO']
Tag to ID mapping: {'B': 0, 'BO': 1, 'E': 2, 'EO': 3, 'I': 4, 'IO': 5, 'S': 6, 'SO': 7}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
pip install wandb




In [None]:
import os
import pandas as pd
import torch
import numpy as np
import wandb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report

# 0. Initialize wandb
# Starts the run "arabertv2-BIES" in the "arabic-medical-ner" project; it is
# closed by wandb.finish() at the end of this cell.
wandb.init(project="arabic-medical-ner", name="arabertv2-BIES")

# Optional: force CUDA error trace
# NOTE(review): presumably only effective if CUDA has not been initialised yet
# in this runtime - confirm, or restart the runtime first.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in ``.csv`` are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The columns ``'Word i'`` and ``'Word i entity tag'`` must exist.

    Returns:
        ``(sentences, labels)``: parallel lists. Each sentence is a list of
        stripped words; each label list holds the matching stripped tags.
        Words whose tag is blank or the text "nan" are left out.
    """
    df = pd.read_csv(file_path) if file_path.endswith('.csv') else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in ('.', '؟'):  # End of sentence punctuation
            # Handle the boundary before filtering on the tag: the terminator
            # is never emitted as a token, so a blank/"nan" tag on it must not
            # stop the sentence from ending (the old order merged the two
            # neighbouring sentences in that case).
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
            continue
        if tag.lower() == 'nan' or tag == '':
            # Unusable tag on an ordinary word: drop the row.
            continue
        sentence.append(word)
        label.append(tag)

    # Flush a trailing sentence that was not closed by a terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Load BIES-tagged dataset
sentences, tags = load_excel_data("/content/BIES.xlsx")
# Second, defensive strip of every tag string.
tags = [[str(raw).strip() for raw in seq] for seq in tags]

# 3. Label mapping: blank and "nan" tags are kept out of the label set; ids
#    follow the sorted order of the remaining tag strings.
unique_tags = sorted({
    name
    for seq in tags
    for name in seq
    if not (name == "" or name.lower() == "nan")
})
tag2id = {name: idx for idx, name in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 4. Format data: one record per sentence with tags encoded as ids.
data = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]

# Seeded 80/20 split at sentence level.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
dataset = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(test_data),
    }
)

# 5. Tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(tag2id), id2label=id2tag, label2id=tag2id
)

# 6. Tokenization
def tokenize_and_align_labels(examples):
    """Tokenize word lists and spread word-level label ids over the tokens.

    Args:
        examples: Batch dict with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of label ids, one per word).

    Returns:
        The tokenizer output with ``"labels"`` added: the word's label id on
        the first token of each word and -100 on every other position.
    """
    features = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def labels_for(sequence_index, word_tags):
        # Emit the label at word starts only; repeat pieces and positions
        # without a word id are masked with -100.
        result = []
        seen = None
        for wid in features.word_ids(batch_index=sequence_index):
            keep = (wid is not None) and (wid != seen)
            result.append(word_tags[wid] if keep else -100)
            seen = wid
        return result

    features["labels"] = [
        labels_for(index, word_tags)
        for index, word_tags in enumerate(examples["ner_tags"])
    ]
    return features

# Tokenize both splits in batches (adds the aligned "labels" entry), then build
# the collator from the same tokenizer for use by the Trainer.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

# 7. Metrics
def compute_metrics(p):
    """Convert model outputs to tag sequences and score them with seqeval.

    Args:
        p: ``(predictions, labels)``. ``predictions`` is arg-maxed over axis
            2; positions whose label is -100 are excluded.

    Returns:
        Dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    scores, gold_ids = p
    best_ids = np.argmax(scores, axis=2)

    # Per sequence, keep only (gold, predicted) pairs with a real gold label.
    paired = [
        [(gold, best) for gold, best in zip(gold_row, best_row) if gold != -100]
        for gold_row, best_row in zip(gold_ids, best_ids)
    ]
    reference = [[id2tag[gold] for gold, _ in row] for row in paired]
    hypothesis = [[id2tag[best] for _, best in row] for row in paired]

    accuracy = accuracy_score(reference, hypothesis)
    precision = precision_score(reference, hypothesis)
    recall = recall_score(reference, hypothesis)
    f1 = f1_score(reference, hypothesis)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# 8. Training arguments with wandb enabled
# Fixed hyper-parameters: 5 epochs, batch size 8 per device, weight decay 0.01,
# a log entry every 10 steps, evaluation and checkpointing after every epoch.
# `eval_strategy` is the current name of the evaluation switch; this cell's
# recorded output shows TrainingArguments rejecting the old
# `evaluation_strategy` keyword with a TypeError.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb"  # <--- Enable W&B reporting
)

# 9. Trainer
# `processing_class` replaces the deprecated `tokenizer=` keyword.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train
trainer.train()

# 11. Evaluate and report
results = trainer.evaluate()

# 12. Predictions and detailed classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Convert ids to tag strings, skipping positions labelled -100.
true_labels = [[id2tag[gold] for (gold, pred) in zip(label, prediction) if gold != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for (gold, pred) in zip(label, prediction) if gold != -100]
                    for label, prediction in zip(labels, predictions)]

# Token-level view for sklearn: one flat list of tags per side.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Log classification report to W&B
report = sk_classification_report(flat_true, flat_pred, digits=4, output_dict=True)
wandb.log({"classification_report": report})

# 14. Print to console as well
print("\nClassification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# seqeval scores on the nested (per-sentence) lists.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")

# 15. Finish wandb run
wandb.finish()


0,1
eval/accuracy,█▁
eval/f1,▁█
eval/loss,▁█
eval/precision,▁█
eval/recall,▁█
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
test/accuracy,█▁
test/f1,▁█

0,1
eval/accuracy,0.99719
eval/f1,0.98102
eval/loss,0.01655
eval/precision,0.9786
eval/recall,0.98346
eval/runtime,2.6144
eval/samples_per_second,287.641
eval/steps_per_second,35.955
test/accuracy,0.99719
test/f1,0.98102


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
!pip install transformers datasets seqeval pandas openpyxl


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=05e555708a81f391742fecf6c5e4ee4d84c725ad00d9535919cb7a0de1aca903
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and return it as per-sentence lists.

    Args:
        file_path: ``.csv`` paths go through ``pd.read_csv``; every other path
            goes through ``pd.read_excel``. The table needs the columns
            ``'Word i'`` and ``'Word i entity tag'``.

    Returns:
        ``(sentences, labels)``: two parallel lists of lists holding the
        stripped words and the stripped tags of each sentence.
    """
    load = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    rows = load(file_path)
    # Keep the two relevant columns and drop incomplete rows.
    rows = rows[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    open_words, open_tags = [], []

    def close_sentence():
        # Store the running sentence (if any) and start a fresh one.
        nonlocal open_words, open_tags
        if open_words:
            sentences.append(open_words)
            labels.append(open_tags)
            open_words, open_tags = [], []

    for raw_word, raw_tag in zip(rows['Word i'], rows['Word i entity tag']):
        token = str(raw_word).strip()
        tag_text = str(raw_tag).strip()
        if token in ['.', '؟']:
            # Sentence terminators are boundaries only; they are not kept.
            close_sentence()
        else:
            open_words.append(token)
            open_tags.append(tag_text)

    close_sentence()  # trailing sentence without a terminator
    return sentences, labels

# 2. Load IE-tagged dataset
sentences, tags = load_excel_data("/content/IE.xlsx")

# 3. Label mapping: ids follow the sorted order of the distinct tag strings.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# 4. Prepare data for Hugging Face: tags become ids, then a seeded 80/20
#    sentence-level split.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in names]}
    for words, names in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict(
    {
        "train": Dataset.from_list(train_data),
        "test": Dataset.from_list(test_data),
    }
)

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(tag2id), id2label=id2tag, label2id=tag2id
)

# 6. Tokenization and alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align label ids to the tokens.

    Args:
        examples: Batch dict with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of label ids, one per word).

    Returns:
        The tokenizer output with a ``"labels"`` entry: per sequence, the
        label id of a word on its first token and -100 everywhere else.
    """
    tokenized = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    aligned_batch = []
    for seq_no, word_tags in enumerate(examples["ner_tags"]):
        ids = tokenized.word_ids(batch_index=seq_no)
        # Each token is compared with the word id of its left neighbour.
        shifted = [None] + ids[:-1]
        aligned_batch.append([
            -100 if (here is None or here == before) else word_tags[here]
            for here, before in zip(ids, shifted)
        ])
    tokenized["labels"] = aligned_batch
    return tokenized

# Batched pass over both splits; every example gains the aligned "labels"
# entry built by tokenize_and_align_labels.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator (constructed from the tokenizer; used by the Trainer)
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metrics function
def compute_metrics(p):
    """Turn raw evaluation output into accuracy, precision, recall and F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds label ids in which
            ``-100`` marks positions to skip.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    # NOTE(review): the scorers are presumably seqeval's, which is built
    # around prefixed tags such as ``B-PER`` -- confirm that the bare tags of
    # this dataset are interpreted as intended.
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        # Drop every position whose gold label is the -100 placeholder
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

# 9. Training arguments (with smaller epoch for speed)
training_args = TrainingArguments(
    output_dir="./results_ie",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Reduced epoch for faster execution
    weight_decay=0.01,
    logging_dir="./logs_ie"
)

# 10. Trainer setup
# `processing_class` replaces the `tokenizer` keyword, which the installed
# transformers release (4.52.4) reports as deprecated for Trainer.__init__.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Fine-tune, then score the held-out split through compute_metrics
trainer.train()
results = trainer.evaluate()

# 12. Predict on the test split again to get the raw model outputs
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not the -100 placeholder and map
# the ids back to tag strings (one list per sentence).
true_labels = [[id2tag[gold] for gold, pred in zip(gold_row, pred_row) if gold != -100]
               for gold_row, pred_row in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for gold, pred in zip(gold_row, pred_row) if gold != -100]
                    for gold_row, pred_row in zip(labels, predictions)]

# Flatten for sklearn classification report, which scores individual tokens
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
print("\nIE Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mis-it-ali03[0m ([33mis-it-ali03-german-university-in-cairo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss





IE Classification Report:
              precision    recall  f1-score   support

           E     0.9343    0.9697    0.9517       264
          EO     0.9432    0.9727    0.9577       256
           I     0.9429    0.9602    0.9515       327
          IO     0.9982    0.9960    0.9971     10915

    accuracy                         0.9939     11762
   macro avg     0.9546    0.9746    0.9645     11762
weighted avg     0.9940    0.9939    0.9939     11762


Evaluation Results:
Accuracy:  0.9939
Precision: 0.9506
Recall:    0.9711
F1 Score:  0.9607




In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The table must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``. Rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` ends the current sentence
    and is itself discarded.

    Args:
        file_path: Location of the table as a string or path-like object.
            Names ending in ``.csv`` (matched case-insensitively) are parsed
            with ``pandas.read_csv``; anything else goes to
            ``pandas.read_excel``.

    Returns:
        tuple: ``(sentences, labels)``, two parallel lists in which
        ``sentences[k]`` holds the stripped words of sentence ``k`` and
        ``labels[k]`` the stripped tag of each of those words.
    """
    # Normalize before testing the extension so "DATA.CSV" and pathlib.Path
    # inputs reach the CSV reader instead of failing.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentence_breaks = {'.', '؟'}
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in sentence_breaks:
            if sentence:  # consecutive terminators must not emit empty sentences
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Keep a trailing sentence that has no closing terminator
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Read the IOB-tagged corpus as parallel lists of word and tag sequences
sentences, tags = load_excel_data("/content/IOB.xlsx")

# 3. Label vocabulary; sorting makes the tag <-> id assignment reproducible
unique_tags = sorted({name for sequence in tags for name in sequence})
id2tag = dict(enumerate(unique_tags))
tag2id = {name: idx for idx, name in id2tag.items()}

# 4. Encode tags as ids and hold out 20% of the sentences for testing
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 5. Tokenizer and token-classification model sized to the label vocabulary
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 6. Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread word-level tags over sub-tokens.

    Args:
        examples: Batch with a ``"tokens"`` list of word lists and a parallel
            ``"ner_tags"`` list of label-id lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Within each
        sequence, a position whose word id differs from the previous
        position's word id receives that word's label id; positions without
        a word id and repeated word ids receive ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # -100 is the placeholder that compute_metrics filters out again
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_word else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the alignment over every split of the dataset, batch by batch
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metric computation
def compute_metrics(p):
    """Turn raw evaluation output into seqeval accuracy, precision, recall, F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds label ids in which
            ``-100`` marks positions to skip.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    # NOTE(review): seqeval is built around prefixed tags such as ``B-PER``
    # -- confirm that the bare tags of this dataset are interpreted as
    # intended.
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        # Drop every position whose gold label is the -100 placeholder
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

# 9. Training arguments (fast)
training_args = TrainingArguments(
    output_dir="./results_iob",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iob"
)

# 10. Trainer
# `processing_class` replaces the `tokenizer` keyword, which the installed
# transformers release (4.52.4) reports as deprecated for Trainer.__init__.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Fine-tune, then score the held-out split through compute_metrics
trainer.train()
results = trainer.evaluate()

# 12. Predict on the test split again to get the raw model outputs
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not the -100 placeholder and map
# the ids back to tag strings (one list per sentence).
true_labels = [[id2tag[gold] for gold, pred in zip(gold_row, pred_row) if gold != -100]
               for gold_row, pred_row in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for gold, pred in zip(gold_row, pred_row) if gold != -100]
                    for gold_row, pred_row in zip(labels, predictions)]

# Flatten for sklearn report, which scores individual tokens
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Output
print("\nIOB Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOB Classification Report:
              precision    recall  f1-score   support

           B     0.9457    0.9812    0.9631       266
           I     0.9362    0.9818    0.9585       329
           O     0.9990    0.9967    0.9978     11167

    accuracy                         0.9959     11762
   macro avg     0.9603    0.9866    0.9731     11762
weighted avg     0.9960    0.9959    0.9960     11762


Evaluation Results:
Accuracy:  0.9959
Precision: 0.9179
Recall:    0.9662
F1 Score:  0.9414


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The table must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``. Rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` ends the current sentence
    and is itself discarded.

    Args:
        file_path: Location of the table as a string or path-like object.
            Names ending in ``.csv`` (matched case-insensitively) are parsed
            with ``pandas.read_csv``; anything else goes to
            ``pandas.read_excel``.

    Returns:
        tuple: ``(sentences, labels)``, two parallel lists in which
        ``sentences[k]`` holds the stripped words of sentence ``k`` and
        ``labels[k]`` the stripped tag of each of those words.
    """
    # Normalize before testing the extension so "DATA.CSV" and pathlib.Path
    # inputs reach the CSV reader instead of failing.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentence_breaks = {'.', '؟'}
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in sentence_breaks:
            if sentence:  # consecutive terminators must not emit empty sentences
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Keep a trailing sentence that has no closing terminator
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Read the IOBES-tagged corpus as parallel lists of word and tag sequences
sentences, tags = load_excel_data("/content/IOBES.xlsx")

# 3. Label vocabulary; sorting makes the tag <-> id assignment reproducible
unique_tags = sorted({name for sequence in tags for name in sequence})
id2tag = dict(enumerate(unique_tags))
tag2id = {name: idx for idx, name in id2tag.items()}

# 4. Encode tags as ids and hold out 20% of the sentences for testing
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 5. Tokenizer and token-classification model sized to the label vocabulary
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 6. Tokenization and alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread word-level tags over sub-tokens.

    Args:
        examples: Batch with a ``"tokens"`` list of word lists and a parallel
            ``"ner_tags"`` list of label-id lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Within each
        sequence, a position whose word id differs from the previous
        position's word id receives that word's label id; positions without
        a word id and repeated word ids receive ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # -100 is the placeholder that compute_metrics filters out again
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_word else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the alignment over every split of the dataset, batch by batch
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metric computation
def compute_metrics(p):
    """Turn raw evaluation output into seqeval accuracy, precision, recall, F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds label ids in which
            ``-100`` marks positions to skip.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    # NOTE(review): seqeval is built around prefixed tags such as ``B-PER``
    # -- confirm that the bare tags of this dataset are interpreted as
    # intended.
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        # Drop every position whose gold label is the -100 placeholder
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

# 9. Training arguments (1 epoch)
training_args = TrainingArguments(
    output_dir="./results_iobes",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iobes"
)

# 10. Trainer
# `processing_class` replaces the `tokenizer` keyword, which the installed
# transformers release (4.52.4) reports as deprecated for Trainer.__init__.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Fine-tune, then score the held-out split through compute_metrics
trainer.train()
results = trainer.evaluate()

# 12. Predict on the test split again to get the raw model outputs
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not the -100 placeholder and map
# the ids back to tag strings (one list per sentence).
true_labels = [[id2tag[gold] for gold, pred in zip(gold_row, pred_row) if gold != -100]
               for gold_row, pred_row in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for gold, pred in zip(gold_row, pred_row) if gold != -100]
                    for gold_row, pred_row in zip(labels, predictions)]

# Flatten for sklearn classification report, which scores individual tokens
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
print("\nIOBES Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOBES Classification Report:
              precision    recall  f1-score   support

           B     0.9483    0.9847    0.9662       261
           E     0.9101    0.9693    0.9388       261
           I     0.9032    0.8485    0.8750        66
           O     0.9983    0.9965    0.9974     11171
           S     0.0000    0.0000    0.0000         3

    accuracy                         0.9946     11762
   macro avg     0.7520    0.7598    0.7555     11762
weighted avg     0.9944    0.9946    0.9945     11762


Evaluation Results:
Accuracy:  0.9946
Precision: 0.8929
Recall:    0.9470
F1 Score:  0.9191


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The table must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``. Rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` ends the current sentence
    and is itself discarded.

    Args:
        file_path: Location of the table as a string or path-like object.
            Names ending in ``.csv`` (matched case-insensitively) are parsed
            with ``pandas.read_csv``; anything else goes to
            ``pandas.read_excel``.

    Returns:
        tuple: ``(sentences, labels)``, two parallel lists in which
        ``sentences[k]`` holds the stripped words of sentence ``k`` and
        ``labels[k]`` the stripped tag of each of those words.
    """
    # Normalize before testing the extension so "DATA.CSV" and pathlib.Path
    # inputs reach the CSV reader instead of failing.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentence_breaks = {'.', '؟'}
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in sentence_breaks:
            if sentence:  # consecutive terminators must not emit empty sentences
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Keep a trailing sentence that has no closing terminator
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Read the IOE-tagged corpus as parallel lists of word and tag sequences
sentences, tags = load_excel_data("/content/IOE.xlsx")

# 3. Label vocabulary; sorting makes the tag <-> id assignment reproducible
unique_tags = sorted({name for sequence in tags for name in sequence})
id2tag = dict(enumerate(unique_tags))
tag2id = {name: idx for idx, name in id2tag.items()}

# 4. Encode tags as ids and hold out 20% of the sentences for testing
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 5. Tokenizer and token-classification model sized to the label vocabulary
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread word-level tags over sub-tokens.

    Args:
        examples: Batch with a ``"tokens"`` list of word lists and a parallel
            ``"ner_tags"`` list of label-id lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Within each
        sequence, a position whose word id differs from the previous
        position's word id receives that word's label id; positions without
        a word id and repeated word ids receive ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # -100 is the placeholder that compute_metrics filters out again
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_word else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the alignment over every split of the dataset, batch by batch
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Turn raw evaluation output into seqeval accuracy, precision, recall, F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds label ids in which
            ``-100`` marks positions to skip.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    # NOTE(review): seqeval is built around prefixed tags such as ``B-PER``
    # -- confirm that the bare tags of this dataset are interpreted as
    # intended.
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        # Drop every position whose gold label is the -100 placeholder
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

# 9. Training arguments (1 epoch for speed)
training_args = TrainingArguments(
    output_dir="./results_ioe",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_ioe"
)

# 10. Trainer
# `processing_class` replaces the `tokenizer` keyword, which the installed
# transformers release (4.52.4) reports as deprecated for Trainer.__init__.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Fine-tune, then score the held-out split through compute_metrics
trainer.train()
results = trainer.evaluate()

# 12. Predict on the test split again to get the raw model outputs
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not the -100 placeholder and map
# the ids back to tag strings (one list per sentence).
true_labels = [[id2tag[gold] for gold, pred in zip(gold_row, pred_row) if gold != -100]
               for gold_row, pred_row in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for gold, pred in zip(gold_row, pred_row) if gold != -100]
                    for gold_row, pred_row in zip(labels, predictions)]

# Flatten for sklearn classification report, which scores individual tokens
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
print("\nIOE Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOE Classification Report:
              precision    recall  f1-score   support

           E     0.9247    0.9773    0.9503       264
           I     0.9461    0.9664    0.9561       327
           O     0.9988    0.9969    0.9978     11171

    accuracy                         0.9956     11762
   macro avg     0.9566    0.9802    0.9681     11762
weighted avg     0.9957    0.9956    0.9956     11762


Evaluation Results:
Accuracy:  0.9956
Precision: 0.8968
Recall:    0.9545
F1 Score:  0.9248


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a token/tag table and group its rows into sentences.

    The table must provide the columns ``'Word i'`` and
    ``'Word i entity tag'``. Rows with a missing value in either column are
    dropped. A row whose word is ``'.'`` or ``'؟'`` ends the current sentence
    and is itself discarded.

    Args:
        file_path: Location of the table as a string or path-like object.
            Names ending in ``.csv`` (matched case-insensitively) are parsed
            with ``pandas.read_csv``; anything else goes to
            ``pandas.read_excel``.

    Returns:
        tuple: ``(sentences, labels)``, two parallel lists in which
        ``sentences[k]`` holds the stripped words of sentence ``k`` and
        ``labels[k]`` the stripped tag of each of those words.
    """
    # Normalize before testing the extension so "DATA.CSV" and pathlib.Path
    # inputs reach the CSV reader instead of failing.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    sentence_breaks = {'.', '؟'}
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in sentence_breaks:
            if sentence:  # consecutive terminators must not emit empty sentences
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Keep a trailing sentence that has no closing terminator
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Read the BIES-tagged corpus as parallel lists of word and tag sequences
sentences, tags = load_excel_data("/content/BIES.xlsx")

# 3. Label vocabulary; sorting makes the tag <-> id assignment reproducible
unique_tags = sorted({name for sequence in tags for name in sequence})
id2tag = dict(enumerate(unique_tags))
tag2id = {name: idx for idx, name in id2tag.items()}

# 4. Encode tags as ids and hold out 20% of the sentences for testing
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
})

# 5. Tokenizer and token-classification model sized to the label vocabulary
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and spread word-level tags over sub-tokens.

    Args:
        examples: Batch with a ``"tokens"`` list of word lists and a parallel
            ``"ner_tags"`` list of label-id lists.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Within each
        sequence, a position whose word id differs from the previous
        position's word id receives that word's label id; positions without
        a word id and repeated word ids receive ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        # -100 is the placeholder that compute_metrics filters out again
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_word else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the alignment over every split of the dataset, batch by batch
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Turn raw evaluation output into seqeval accuracy, precision, recall, F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds label ids in which
            ``-100`` marks positions to skip.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    # NOTE(review): seqeval is built around prefixed tags such as ``B-PER``
    # -- confirm that the bare tags of this dataset are interpreted as
    # intended.
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        # Drop every position whose gold label is the -100 placeholder
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

# 9. Training arguments (1 epoch for speed)
training_args = TrainingArguments(
    output_dir="./results_bies",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_bies"
)

# 10. Trainer
# `processing_class` replaces the `tokenizer` keyword, which the installed
# transformers release (4.52.4) reports as deprecated for Trainer.__init__.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Fine-tune, then score the held-out split through compute_metrics
trainer.train()
results = trainer.evaluate()

# 12. Predict on the test split again to get the raw model outputs
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not the -100 placeholder and map
# the ids back to tag strings (one list per sentence).
true_labels = [[id2tag[gold] for gold, pred in zip(gold_row, pred_row) if gold != -100]
               for gold_row, pred_row in zip(labels, predictions)]
true_predictions = [[id2tag[pred] for gold, pred in zip(gold_row, pred_row) if gold != -100]
                    for gold_row, pred_row in zip(labels, predictions)]

# Flatten for sklearn classification report, which scores individual tokens
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
print("\nBIES Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





BIES Classification Report:
              precision    recall  f1-score   support

           B     0.9446    0.9808    0.9624       261
          BO     0.9095    0.9526    0.9306       190
           E     0.9194    0.9617    0.9401       261
          EO     0.9482    0.9754    0.9616       244
           I     0.8429    0.8939    0.8676        66
          IO     0.9977    0.9951    0.9964     10726
           S     0.0000    0.0000    0.0000         3
          SO     0.0000    0.0000    0.0000        11

    accuracy                         0.9912     11762
   macro avg     0.6953    0.7199    0.7073     11762
weighted avg     0.9902    0.9912    0.9907     11762


Evaluation Results:
Accuracy:  0.9912
Precision: 0.9442
Recall:    0.9661
F1 Score:  0.9550


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##SECOND

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The table must contain the columns 'Word i' and
            'Word i entity tag'.

    Returns:
        A pair ``(sentences, labels)`` of parallel lists: one list of
        stripped word strings and one list of stripped tag strings per
        sentence.
    """
    if file_path.endswith('.csv'):
        frame = pd.read_csv(file_path)
    else:
        frame = pd.read_excel(file_path)
    # Rows missing either the word or the tag are discarded up front.
    frame = frame[['Word i', 'Word i entity tag']].dropna()

    all_words, all_tags = [], []
    cur_words, cur_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            cur_words.append(token)
            cur_tags.append(str(raw_tag).strip())
            continue
        # A terminator closes the running sentence; the terminator itself is
        # not kept, and consecutive terminators do not emit empty sentences.
        if cur_words:
            all_words.append(cur_words)
            all_tags.append(cur_tags)
            cur_words, cur_tags = [], []

    # Keep a trailing sentence that was not closed by a terminator.
    if cur_words:
        all_words.append(cur_words)
        all_tags.append(cur_tags)

    return all_words, all_tags

# 2. Load IO-tagged dataset
# Yields two parallel lists: one token list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IO.xlsx")

# 3. Create label mappings
# Tags are sorted before numbering, so the tag -> id assignment is
# deterministic for a given set of tags.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 4. Create Hugging Face dataset
# Each example pairs a sentence's tokens with the integer ids of its tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# Hold out 20% of the sentences for testing; random_state fixes the split.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The number of output labels equals the number of distinct tags in the data,
# and both id <-> tag mappings are handed to the model.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one word list per sentence)
            and "ner_tags" (one tag-id list per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each word carries that word's tag id; positions with no
        word (word id ``None``) and later sub-tokens of a word get -100, the
        value the evaluation code in this file filters out.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_tags):
        # Walk the sub-tokens, labelling only the position where a new word starts.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_tags[current] if starts_word else -100)
            previous = current
        return aligned

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding

# Apply tokenize_and_align_labels to every split, passing examples in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Built from the same tokenizer that produced the encodings above.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Turn raw model scores into accuracy / precision / recall / F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with an
            argmax over axis 2 (presumably per-label scores of shape
            (batch, seq, num_labels) -- confirm); ``labels`` holds tag ids
            with -100 at positions to skip.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1", all
        computed with the seqeval functions imported by this cell.
    """
    # NOTE(review): seqeval derives spans from the tag strings themselves --
    # confirm it interprets this cell's tag set the way the experiment intends.
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags, predicted_tags = [], []
    for gold_row, pred_row in zip(gold_ids, predicted_ids):
        # Drop positions whose gold label is the -100 marker.
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        gold_tags.append([id2tag[g] for g, _ in kept])
        predicted_tags.append([id2tag[q] for _, q in kept])

    return {
        "accuracy": accuracy_score(gold_tags, predicted_tags),
        "precision": precision_score(gold_tags, predicted_tags),
        "recall": recall_score(gold_tags, predicted_tags),
        "f1": f1_score(gold_tags, predicted_tags),
    }

# 9. Training arguments (1 epoch for speed)
# Output and log directories are specific to the IO run so it does not
# overwrite the artefacts of the other tagging schemes.
training_args = TrainingArguments(
    output_dir="./results_io",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_io"
)

# 10. Trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in the installed transformers release and raises a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 12. Detailed classification report
# predict() returns raw scores; the argmax over axis 2 picks one tag id per position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not -100 and map ids back to tag strings.
true_labels = [[id2tag[gold_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report (it expects one flat tag sequence).
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an UndefinedMetricWarning; the printed numbers are unchanged.
print("\nIO Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IO Classification Report:
              precision    recall  f1-score   support

           I     0.9851    0.9519    0.9682       624
           O     0.9973    0.9992    0.9983     11138

    accuracy                         0.9967     11762
   macro avg     0.9912    0.9756    0.9832     11762
weighted avg     0.9967    0.9967    0.9967     11762


Evaluation Results:
Accuracy:  0.9967
Precision: 0.9527
Recall:    0.9291
F1 Score:  0.9408


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The table must contain the columns 'Word i' and
            'Word i entity tag'.

    Returns:
        A pair ``(sentences, labels)`` of parallel lists: one list of
        stripped word strings and one list of stripped tag strings per
        sentence.
    """
    if file_path.endswith('.csv'):
        frame = pd.read_csv(file_path)
    else:
        frame = pd.read_excel(file_path)
    # Rows missing either the word or the tag are discarded up front.
    frame = frame[['Word i', 'Word i entity tag']].dropna()

    all_words, all_tags = [], []
    cur_words, cur_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            cur_words.append(token)
            cur_tags.append(str(raw_tag).strip())
            continue
        # A terminator closes the running sentence; the terminator itself is
        # not kept, and consecutive terminators do not emit empty sentences.
        if cur_words:
            all_words.append(cur_words)
            all_tags.append(cur_tags)
            cur_words, cur_tags = [], []

    # Keep a trailing sentence that was not closed by a terminator.
    if cur_words:
        all_words.append(cur_words)
        all_tags.append(cur_tags)

    return all_words, all_tags

# 2. Load IE-tagged dataset
# Yields two parallel lists: one token list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IE.xlsx")

# 3. Label mapping
# Tags are sorted before numbering, so the tag -> id assignment is
# deterministic for a given set of tags.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 4. Prepare data for Hugging Face
# Each example pairs a sentence's tokens with the integer ids of its tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# Hold out 20% of the sentences for testing; random_state fixes the split.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The number of output labels equals the number of distinct tags in the data,
# and both id <-> tag mappings are handed to the model.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenization and alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one word list per sentence)
            and "ner_tags" (one tag-id list per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each word carries that word's tag id; positions with no
        word (word id ``None``) and later sub-tokens of a word get -100, the
        value the evaluation code in this file filters out.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_tags):
        # Walk the sub-tokens, labelling only the position where a new word starts.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_tags[current] if starts_word else -100)
            previous = current
        return aligned

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding

# Apply tokenize_and_align_labels to every split, passing examples in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Built from the same tokenizer that produced the encodings above.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metrics function
def compute_metrics(p):
    """Turn raw model scores into accuracy / precision / recall / F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with an
            argmax over axis 2 (presumably per-label scores of shape
            (batch, seq, num_labels) -- confirm); ``labels`` holds tag ids
            with -100 at positions to skip.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1", all
        computed with the seqeval functions imported by this cell.
    """
    # NOTE(review): seqeval derives spans from the tag strings themselves --
    # confirm it interprets this cell's tag set the way the experiment intends.
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags, predicted_tags = [], []
    for gold_row, pred_row in zip(gold_ids, predicted_ids):
        # Drop positions whose gold label is the -100 marker.
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        gold_tags.append([id2tag[g] for g, _ in kept])
        predicted_tags.append([id2tag[q] for _, q in kept])

    return {
        "accuracy": accuracy_score(gold_tags, predicted_tags),
        "precision": precision_score(gold_tags, predicted_tags),
        "recall": recall_score(gold_tags, predicted_tags),
        "f1": f1_score(gold_tags, predicted_tags),
    }

# 9. Training arguments (with smaller epoch for speed)
# Output and log directories are specific to the IE run so it does not
# overwrite the artefacts of the other tagging schemes.
training_args = TrainingArguments(
    output_dir="./results_ie",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Reduced epoch for faster execution
    weight_decay=0.01,
    logging_dir="./logs_ie"
)

# 10. Trainer setup
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in the installed transformers release and raises a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 12. Generate detailed classification report
# predict() returns raw scores; the argmax over axis 2 picks one tag id per position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not -100 and map ids back to tag strings.
true_labels = [[id2tag[gold_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report (it expects one flat tag sequence).
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an UndefinedMetricWarning; the printed numbers are unchanged.
print("\nIE Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





IE Classification Report:
              precision    recall  f1-score   support

           E     0.9474    0.9541    0.9507       283
          EO     0.9851    0.9600    0.9724       275
           I     0.9907    0.9384    0.9639       341
          IO     0.9966    0.9987    0.9977     10863

    accuracy                         0.9950     11762
   macro avg     0.9799    0.9628    0.9711     11762
weighted avg     0.9950    0.9950    0.9950     11762


Evaluation Results:
Accuracy:  0.9950
Precision: 0.9577
Recall:    0.9615
F1 Score:  0.9596




In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The table must contain the columns 'Word i' and
            'Word i entity tag'.

    Returns:
        A pair ``(sentences, labels)`` of parallel lists: one list of
        stripped word strings and one list of stripped tag strings per
        sentence.
    """
    if file_path.endswith('.csv'):
        frame = pd.read_csv(file_path)
    else:
        frame = pd.read_excel(file_path)
    # Rows missing either the word or the tag are discarded up front.
    frame = frame[['Word i', 'Word i entity tag']].dropna()

    all_words, all_tags = [], []
    cur_words, cur_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            cur_words.append(token)
            cur_tags.append(str(raw_tag).strip())
            continue
        # A terminator closes the running sentence; the terminator itself is
        # not kept, and consecutive terminators do not emit empty sentences.
        if cur_words:
            all_words.append(cur_words)
            all_tags.append(cur_tags)
            cur_words, cur_tags = [], []

    # Keep a trailing sentence that was not closed by a terminator.
    if cur_words:
        all_words.append(cur_words)
        all_tags.append(cur_tags)

    return all_words, all_tags

# 2. Load IOB-tagged dataset
# Yields two parallel lists: one token list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IOB.xlsx")

# 3. Label mapping
# Tags are sorted before numbering, so the tag -> id assignment is
# deterministic for a given set of tags.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 4. Prepare dataset
# Each example pairs a sentence's tokens with the integer ids of its tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# Hold out 20% of the sentences for testing; random_state fixes the split.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The number of output labels equals the number of distinct tags in the data,
# and both id <-> tag mappings are handed to the model.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one word list per sentence)
            and "ner_tags" (one tag-id list per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each word carries that word's tag id; positions with no
        word (word id ``None``) and later sub-tokens of a word get -100, the
        value the evaluation code in this file filters out.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_tags):
        # Walk the sub-tokens, labelling only the position where a new word starts.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_tags[current] if starts_word else -100)
            previous = current
        return aligned

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding

# Apply tokenize_and_align_labels to every split, passing examples in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Built from the same tokenizer that produced the encodings above.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metric computation
def compute_metrics(p):
    """Turn raw model scores into accuracy / precision / recall / F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with an
            argmax over axis 2 (presumably per-label scores of shape
            (batch, seq, num_labels) -- confirm); ``labels`` holds tag ids
            with -100 at positions to skip.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1", all
        computed with the seqeval functions imported by this cell.
    """
    # NOTE(review): seqeval derives spans from the tag strings themselves --
    # confirm it interprets this cell's tag set the way the experiment intends.
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags, predicted_tags = [], []
    for gold_row, pred_row in zip(gold_ids, predicted_ids):
        # Drop positions whose gold label is the -100 marker.
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        gold_tags.append([id2tag[g] for g, _ in kept])
        predicted_tags.append([id2tag[q] for _, q in kept])

    return {
        "accuracy": accuracy_score(gold_tags, predicted_tags),
        "precision": precision_score(gold_tags, predicted_tags),
        "recall": recall_score(gold_tags, predicted_tags),
        "f1": f1_score(gold_tags, predicted_tags),
    }

# 9. Training arguments (fast)
# Output and log directories are specific to the IOB run so it does not
# overwrite the artefacts of the other tagging schemes.
training_args = TrainingArguments(
    output_dir="./results_iob",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iob"
)

# 10. Trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in the installed transformers release and raises a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 12. Classification report
# predict() returns raw scores; the argmax over axis 2 picks one tag id per position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not -100 and map ids back to tag strings.
true_labels = [[id2tag[gold_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn report (it expects one flat tag sequence).
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Output
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an UndefinedMetricWarning; the printed numbers are unchanged.
print("\nIOB Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOB Classification Report:
              precision    recall  f1-score   support

           B     0.9855    0.9576    0.9713       283
           I     0.9679    0.9736    0.9708       341
           O     0.9984    0.9989    0.9987     11138

    accuracy                         0.9972     11762
   macro avg     0.9839    0.9767    0.9802     11762
weighted avg     0.9972    0.9972    0.9972     11762


Evaluation Results:
Accuracy:  0.9972
Precision: 0.9604
Recall:    0.9435
F1 Score:  0.9519


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The table must contain the columns 'Word i' and
            'Word i entity tag'.

    Returns:
        A pair ``(sentences, labels)`` of parallel lists: one list of
        stripped word strings and one list of stripped tag strings per
        sentence.
    """
    if file_path.endswith('.csv'):
        frame = pd.read_csv(file_path)
    else:
        frame = pd.read_excel(file_path)
    # Rows missing either the word or the tag are discarded up front.
    frame = frame[['Word i', 'Word i entity tag']].dropna()

    all_words, all_tags = [], []
    cur_words, cur_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            cur_words.append(token)
            cur_tags.append(str(raw_tag).strip())
            continue
        # A terminator closes the running sentence; the terminator itself is
        # not kept, and consecutive terminators do not emit empty sentences.
        if cur_words:
            all_words.append(cur_words)
            all_tags.append(cur_tags)
            cur_words, cur_tags = [], []

    # Keep a trailing sentence that was not closed by a terminator.
    if cur_words:
        all_words.append(cur_words)
        all_tags.append(cur_tags)

    return all_words, all_tags

# 2. Load IOBES-tagged dataset
# Yields two parallel lists: one token list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IOBES.xlsx")

# 3. Create label mappings
# Tags are sorted before numbering, so the tag -> id assignment is
# deterministic for a given set of tags.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 4. Create Hugging Face dataset
# Each example pairs a sentence's tokens with the integer ids of its tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# Hold out 20% of the sentences for testing; random_state fixes the split.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The number of output labels equals the number of distinct tags in the data,
# and both id <-> tag mappings are handed to the model.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenization and alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one word list per sentence)
            and "ner_tags" (one tag-id list per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each word carries that word's tag id; positions with no
        word (word id ``None``) and later sub-tokens of a word get -100, the
        value the evaluation code in this file filters out.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_tags):
        # Walk the sub-tokens, labelling only the position where a new word starts.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_tags[current] if starts_word else -100)
            previous = current
        return aligned

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding

# Apply tokenize_and_align_labels to every split, passing examples in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Built from the same tokenizer that produced the encodings above.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Metric computation
def compute_metrics(p):
    """Turn raw model scores into accuracy / precision / recall / F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with an
            argmax over axis 2 (presumably per-label scores of shape
            (batch, seq, num_labels) -- confirm); ``labels`` holds tag ids
            with -100 at positions to skip.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1", all
        computed with the seqeval functions imported by this cell.
    """
    # NOTE(review): seqeval derives spans from the tag strings themselves --
    # confirm it interprets this cell's tag set the way the experiment intends.
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags, predicted_tags = [], []
    for gold_row, pred_row in zip(gold_ids, predicted_ids):
        # Drop positions whose gold label is the -100 marker.
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        gold_tags.append([id2tag[g] for g, _ in kept])
        predicted_tags.append([id2tag[q] for _, q in kept])

    return {
        "accuracy": accuracy_score(gold_tags, predicted_tags),
        "precision": precision_score(gold_tags, predicted_tags),
        "recall": recall_score(gold_tags, predicted_tags),
        "f1": f1_score(gold_tags, predicted_tags),
    }

# 9. Training arguments (1 epoch)
# Output and log directories are specific to the IOBES run so it does not
# overwrite the artefacts of the other tagging schemes.
training_args = TrainingArguments(
    output_dir="./results_iobes",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iobes"
)

# 10. Trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in the installed transformers release and raises a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 12. Detailed classification report
# predict() returns raw scores; the argmax over axis 2 picks one tag id per position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not -100 and map ids back to tag strings.
true_labels = [[id2tag[gold_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report (it expects one flat tag sequence).
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
# zero_division=0 reports 0.0 for a class that is never predicted (as happened
# for the rare "S" tag in the recorded run) without an UndefinedMetricWarning.
print("\nIOBES Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOBES Classification Report:
              precision    recall  f1-score   support

           B     0.9818    0.9676    0.9746       278
           E     0.9601    0.9532    0.9567       278
           I     0.9808    0.8095    0.8870        63
           O     0.9970    0.9989    0.9979     11138
           S     0.0000    0.0000    0.0000         5

    accuracy                         0.9957     11762
   macro avg     0.7839    0.7459    0.7632     11762
weighted avg     0.9952    0.9957    0.9954     11762


Evaluation Results:
Accuracy:  0.9957
Precision: 0.9424
Recall:    0.9258
F1 Score:  0.9340


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and group its rows into sentences.

    Args:
        file_path: Path to the table. Paths ending in '.csv' are read with
            ``pd.read_csv``; anything else is read with ``pd.read_excel``.
            The table must contain the columns 'Word i' and
            'Word i entity tag'.

    Returns:
        A pair ``(sentences, labels)`` of parallel lists: one list of
        stripped word strings and one list of stripped tag strings per
        sentence.
    """
    if file_path.endswith('.csv'):
        frame = pd.read_csv(file_path)
    else:
        frame = pd.read_excel(file_path)
    # Rows missing either the word or the tag are discarded up front.
    frame = frame[['Word i', 'Word i entity tag']].dropna()

    all_words, all_tags = [], []
    cur_words, cur_tags = [], []
    for raw_word, raw_tag in zip(frame['Word i'], frame['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            cur_words.append(token)
            cur_tags.append(str(raw_tag).strip())
            continue
        # A terminator closes the running sentence; the terminator itself is
        # not kept, and consecutive terminators do not emit empty sentences.
        if cur_words:
            all_words.append(cur_words)
            all_tags.append(cur_tags)
            cur_words, cur_tags = [], []

    # Keep a trailing sentence that was not closed by a terminator.
    if cur_words:
        all_words.append(cur_words)
        all_tags.append(cur_tags)

    return all_words, all_tags

# 2. Load IOE-tagged dataset
# Yields two parallel lists: one token list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IOE.xlsx")

# 3. Create label mappings
# Tags are sorted before numbering, so the tag -> id assignment is
# deterministic for a given set of tags.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 4. Create Hugging Face dataset
# Each example pairs a sentence's tokens with the integer ids of its tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# Hold out 20% of the sentences for testing; random_state fixes the split.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The number of output labels equals the number of distinct tags in the data,
# and both id <-> tag mappings are handed to the model.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one word list per sentence)
            and "ner_tags" (one tag-id list per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each word carries that word's tag id; positions with no
        word (word id ``None``) and later sub-tokens of a word get -100, the
        value the evaluation code in this file filters out.
    """
    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_tags):
        # Walk the sub-tokens, labelling only the position where a new word starts.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_tags[current] if starts_word else -100)
            previous = current
        return aligned

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding

# Apply tokenize_and_align_labels to every split, passing examples in batches.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Built from the same tokenizer that produced the encodings above.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Turn raw model scores into accuracy / precision / recall / F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with an
            argmax over axis 2 (presumably per-label scores of shape
            (batch, seq, num_labels) -- confirm); ``labels`` holds tag ids
            with -100 at positions to skip.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1", all
        computed with the seqeval functions imported by this cell.
    """
    # NOTE(review): seqeval derives spans from the tag strings themselves --
    # confirm it interprets this cell's tag set the way the experiment intends.
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags, predicted_tags = [], []
    for gold_row, pred_row in zip(gold_ids, predicted_ids):
        # Drop positions whose gold label is the -100 marker.
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        gold_tags.append([id2tag[g] for g, _ in kept])
        predicted_tags.append([id2tag[q] for _, q in kept])

    return {
        "accuracy": accuracy_score(gold_tags, predicted_tags),
        "precision": precision_score(gold_tags, predicted_tags),
        "recall": recall_score(gold_tags, predicted_tags),
        "f1": f1_score(gold_tags, predicted_tags),
    }

# 9. Training arguments (1 epoch for speed)
# Output and log directories are specific to the IOE run so it does not
# overwrite the artefacts of the other tagging schemes.
training_args = TrainingArguments(
    output_dir="./results_ioe",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_ioe"
)

# 10. Trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in the installed transformers release and raises a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 12. Detailed classification report
# predict() returns raw scores; the argmax over axis 2 picks one tag id per position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not -100 and map ids back to tag strings.
true_labels = [[id2tag[gold_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[pred_id] for (gold_id, pred_id) in zip(label, prediction) if gold_id != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report (it expects one flat tag sequence).
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an UndefinedMetricWarning; the printed numbers are unchanged.
print("\nIOE Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOE Classification Report:
              precision    recall  f1-score   support

           E     0.9474    0.9541    0.9507       283
           I     0.9730    0.9501    0.9614       341
           O     0.9978    0.9984    0.9981     11138

    accuracy                         0.9959     11762
   macro avg     0.9727    0.9675    0.9701     11762
weighted avg     0.9959    0.9959    0.9959     11762


Evaluation Results:
Accuracy:  0.9959
Precision: 0.9138
Recall:    0.9364
F1 Score:  0.9250


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and split it into sentences.

    Paths ending in '.csv' are read with pandas.read_csv; any other path is
    handed to pandas.read_excel. Only the 'Word i' and 'Word i entity tag'
    columns are used, and rows missing either value are dropped.

    A row whose stripped word is '.' or '؟' closes the current sentence and
    is itself discarded (word and tag). Consecutive terminators never produce
    an empty sentence, and words left over after the last terminator form a
    final sentence.

    Args:
        file_path: Path of the CSV or Excel file to read.

    Returns:
        A (sentences, labels) tuple of parallel lists: sentences[k] holds the
        stripped words of sentence k and labels[k] the stripped tag strings.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    pending_words, pending_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            pending_words.append(token)
            pending_tags.append(str(raw_tag).strip())
            continue
        # Terminator row: close the sentence unless nothing was collected.
        if pending_words:
            sentences.append(pending_words)
            labels.append(pending_tags)
            pending_words, pending_tags = [], []

    # Flush words that were not followed by a terminator.
    if pending_words:
        sentences.append(pending_words)
        labels.append(pending_tags)

    return sentences, labels

# 2. Load BIES-tagged dataset
sentences, tags = load_excel_data("/content/BIES.xlsx")

# 3. Create label mappings
# Sorting the distinct tag strings makes the tag <-> id assignment
# deterministic for a given dataset.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 4. Create Hugging Face dataset
# One record per sentence: the words plus their tags encoded as integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
# 80/20 sentence-level split with a fixed seed so the split is repeatable.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({"train": Dataset.from_list(train_data), "test": Dataset.from_list(test_data)})

# 5. Load tokenizer and model
# The classification head is sized to the tag set and carries both label maps.
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(tag2id), id2label=id2tag, label2id=tag2id
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to tokens.

    Each sentence in examples["tokens"] is tokenized as a list of words
    (with truncation). For every produced token the label is:
      * -100 when the token has no source word (word id is None),
      * -100 when the token continues the same word as the previous token,
      * the word's tag id from examples["ner_tags"] otherwise, i.e. on the
        first token of each word.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of tag-id lists, parallel to "tokens").

    Returns:
        The tokenizer output with an added "labels" entry holding one aligned
        label list per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_labels):
        # Only the first token of each word keeps that word's label.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_word else -100)
            previous = current
        return aligned

    tokenized_inputs["labels"] = [
        align(tokenized_inputs.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

# Apply the alignment function to both splits; batched=True hands it batches
# of examples, which is what its examples["tokens"] / word_ids(batch_index=i)
# indexing expects.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Collator built from the same tokenizer; passed to the Trainer below to
# assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        p: A (predictions, labels) pair. The predictions are reduced with
            argmax over axis 2 (presumably (batch, seq_len, num_labels)
            scores - confirm); labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on
        the per-sentence tag-string sequences after dropping every position
        whose gold label is -100.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions that carry a real label, then map ids to tags.
        kept = [(gold, guess) for gold, guess in zip(gold_row, predicted_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[guess] for _, guess in kept])

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers.items()}

# 9. Training arguments (1 epoch for speed)
# Checkpoints are written under ./results_bies and logs under ./logs_bies;
# every option not listed here keeps its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results_bies",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_bies"
)

# 10. Trainer
# `processing_class` is the current name for what used to be passed as
# `tokenizer=`; the old keyword is deprecated in recent transformers releases
# and emits a FutureWarning on construction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
# Metrics dict from compute_metrics on the test split; kept in `results` for
# inspection, the printed report below is built from trainer.predict().
results = trainer.evaluate()

# 12. Detailed classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping positions labelled -100 (ignored
# tokens), so both nested lists stay aligned position by position.
true_labels = [[id2tag[gold] for (gold, guess) in zip(label, prediction) if gold != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[guess] for (gold, guess) in zip(label, prediction) if gold != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
# zero_division=0 reports 0.0 for tags that are never predicted (rare tags
# after a single epoch) without raising an UndefinedMetricWarning per metric;
# the printed numbers are the same as with the default setting.
print("\nBIES Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

# NOTE(review): these scores are span-level and derived from the tag strings.
# With tags such as 'BO'/'IO'/'EO'/'SO' the scorer presumably reads the first
# letter as the chunk prefix and 'O' as an entity type, so "outside" spans are
# scored as entities too - confirm this is the intended evaluation.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





BIES Classification Report:
              precision    recall  f1-score   support

           B     0.9746    0.9676    0.9711       278
          BO     0.9070    0.9512    0.9286       205
           E     0.9531    0.9496    0.9514       278
          EO     0.9617    0.9691    0.9654       259
           I     0.9623    0.8095    0.8793        63
          IO     0.9959    0.9978    0.9969     10659
           S     0.0000    0.0000    0.0000         5
          SO     0.0000    0.0000    0.0000        15

    accuracy                         0.9918     11762
   macro avg     0.7193    0.7056    0.7116     11762
weighted avg     0.9902    0.9918    0.9910     11762


Evaluation Results:
Accuracy:  0.9918
Precision: 0.9599
Recall:    0.9591
F1 Score:  0.9595


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path):
    """Read a word/tag table and split it into sentences.

    Paths ending in '.csv' are read with pandas.read_csv; any other path is
    handed to pandas.read_excel. Only the 'Word i' and 'Word i entity tag'
    columns are used, and rows missing either value are dropped.

    A row whose stripped word is '.' or '؟' closes the current sentence and
    is itself discarded (word and tag). Consecutive terminators never produce
    an empty sentence, and words left over after the last terminator form a
    final sentence.

    Args:
        file_path: Path of the CSV or Excel file to read.

    Returns:
        A (sentences, labels) tuple of parallel lists: sentences[k] holds the
        stripped words of sentence k and labels[k] the stripped tag strings.
    """
    reader = pd.read_csv if file_path.endswith('.csv') else pd.read_excel
    table = reader(file_path)[['Word i', 'Word i entity tag']].dropna()

    sentences, labels = [], []
    pending_words, pending_tags = [], []
    for raw_word, raw_tag in zip(table['Word i'], table['Word i entity tag']):
        token = str(raw_word).strip()
        if token not in ('.', '؟'):
            pending_words.append(token)
            pending_tags.append(str(raw_tag).strip())
            continue
        # Terminator row: close the sentence unless nothing was collected.
        if pending_words:
            sentences.append(pending_words)
            labels.append(pending_tags)
            pending_words, pending_tags = [], []

    # Flush words that were not followed by a terminator.
    if pending_words:
        sentences.append(pending_words)
        labels.append(pending_tags)

    return sentences, labels

# 2. Load BI-tagged dataset
sentences, tags = load_excel_data("/content/BI.xlsx")

# 3. Create label mappings
# Sorting the distinct tag strings makes the tag <-> id assignment
# deterministic for a given dataset.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 4. Create Hugging Face dataset
# One record per sentence: the words plus their tags encoded as integer ids.
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
# 80/20 sentence-level split with a fixed seed so the split is repeatable.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({"train": Dataset.from_list(train_data), "test": Dataset.from_list(test_data)})

# 5. Load tokenizer and model
# The classification head is sized to the tag set and carries both label maps.
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(tag2id), id2label=id2tag, label2id=tag2id
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to tokens.

    Each sentence in examples["tokens"] is tokenized as a list of words
    (with truncation). For every produced token the label is:
      * -100 when the token has no source word (word id is None),
      * -100 when the token continues the same word as the previous token,
      * the word's tag id from examples["ner_tags"] otherwise, i.e. on the
        first token of each word.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of tag-id lists, parallel to "tokens").

    Returns:
        The tokenizer output with an added "labels" entry holding one aligned
        label list per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_labels):
        # Only the first token of each word keeps that word's label.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_word else -100)
            previous = current
        return aligned

    tokenized_inputs["labels"] = [
        align(tokenized_inputs.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

# Apply the alignment function to both splits; batched=True hands it batches
# of examples, which is what its examples["tokens"] / word_ids(batch_index=i)
# indexing expects.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Collator built from the same tokenizer; passed to the Trainer below to
# assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        p: A (predictions, labels) pair. The predictions are reduced with
            argmax over axis 2 (presumably (batch, seq_len, num_labels)
            scores - confirm); labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on
        the per-sentence tag-string sequences after dropping every position
        whose gold label is -100.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions that carry a real label, then map ids to tags.
        kept = [(gold, guess) for gold, guess in zip(gold_row, predicted_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[guess] for _, guess in kept])

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers.items()}

# 9. Training arguments (1 epoch for quick training)
# Checkpoints are written under ./results_bi and logs under ./logs_bi;
# every option not listed here keeps its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results_bi",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_bi"
)

# 10. Trainer
# `processing_class` is the current name for what used to be passed as
# `tokenizer=`; the old keyword is deprecated in recent transformers releases
# and emits a FutureWarning on construction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
# Metrics dict from compute_metrics on the test split; kept in `results` for
# inspection, the printed report below is built from trainer.predict().
results = trainer.evaluate()

# 12. Detailed classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping positions labelled -100 (ignored
# tokens), so both nested lists stay aligned position by position.
true_labels = [[id2tag[gold] for (gold, guess) in zip(label, prediction) if gold != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[guess] for (gold, guess) in zip(label, prediction) if gold != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
# zero_division=0 makes the report print 0.0, without an
# UndefinedMetricWarning, should some tag never be predicted; the printed
# numbers are otherwise unchanged.
print("\nBI Classification Report:")
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

# NOTE(review): these scores are span-level and derived from the tag strings.
# With tags such as 'BO'/'IO' the scorer presumably reads the first letter as
# the chunk prefix and 'O' as an entity type, so "outside" spans are scored as
# entities too - confirm this is the intended evaluation.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





BI Classification Report:
              precision    recall  f1-score   support

           B     0.9784    0.9611    0.9697       283
          BO     0.9486    0.9227    0.9355       220
           I     0.9752    0.9238    0.9488       341
          IO     0.9958    0.9984    0.9971     10918

    accuracy                         0.9940     11762
   macro avg     0.9745    0.9515    0.9628     11762
weighted avg     0.9939    0.9940    0.9939     11762


Evaluation Results:
Accuracy:  0.9940
Precision: 0.9552
Recall:    0.9575
F1 Score:  0.9563


