<a href="https://colab.research.google.com/github/its3alih/Thesis/blob/main/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets seqeval pandas openpyxl


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=3a905d81ba529422ca16caa0fe145d6d6e73cad4d3372519fdedfd3c5454b63c
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
pip install wandb




In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a word-per-row tagging sheet and group its rows into sentences.

    The sheet must contain the columns 'Word i' and 'Word i entity tag'.
    Rows with a missing value in either column are dropped. Every word and
    tag is converted to ``str`` and stripped of surrounding whitespace.

    Args:
        file_path: Location of the sheet (``str`` or path-like). Paths ending
            in '.csv' (any letter case) are read with ``pd.read_csv``;
            everything else is read with ``pd.read_excel``.
        sentence_delimiters: Words that close the current sentence. A
            delimiter row is never added to a sentence and its tag is
            discarded. Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[k]`` is
        the list of words of sentence ``k`` and ``labels[k]`` the list of
        their tags. Empty sentences are never emitted.

    Raises:
        KeyError: If one of the two required columns is absent.
    """
    columns = ['Word i', 'Word i entity tag']

    # str() lets pathlib.Path objects through; lower() makes '.CSV' count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s) {missing}")
    df = df[columns].dropna()

    delimiters = frozenset(sentence_delimiters)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df[columns[0]], df[columns[1]]):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Close the running sentence; consecutive delimiters produce nothing.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that was not terminated by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Read the IO-tagged sheet into parallel lists of sentences and tag sequences
sentences, tags = load_excel_data("/content/IO.xlsx")

# 3. Label vocabulary: ids follow the sorted order of the distinct tag strings
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: i for i, tag in id2tag.items()}

# 4. Encode the tags as ids, split 80/20 at sentence level, wrap as a DatasetDict
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 5. Tokenizer and token-classification model, one output class per tag
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to the tokens.

    Uses the notebook-level ``tokenizer``. ``examples["tokens"]`` holds the
    word lists and ``examples["ner_tags"]`` the matching per-word tag ids.

    For every produced token, the label is the tag id of its word when the
    token is the first one of that word, and -100 otherwise (tokens without a
    word id, and continuation tokens of an already-labelled word). The metric
    code in this notebook skips positions labelled -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Label only the first token of each word; everything else gets -100.
        aligned = []
        last_seen = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_seen
            aligned.append(word_tags[wid] if starts_new_word else -100)
            last_seen = wid
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize both splits batch-wise, attaching aligned labels
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

# 7. Data collator for token classification, built from the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Score a ``(predictions, labels)`` pair with the seqeval metrics.

    The predictions are arg-maxed over axis 2 to obtain one class id per
    position (presumably (batch, seq_len, num_labels) scores — TODO confirm).
    Positions whose gold label is -100 are skipped; the remaining gold and
    predicted ids are mapped to tag strings through the notebook-level
    ``id2tag``.

    NOTE(review): precision/recall/F1 come from seqeval, which scores tag
    sequences as chunks — confirm it interprets this notebook's tag names as
    intended.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores_per_class, gold_ids = p
    pred_ids = np.argmax(scores_per_class, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    metrics = {}
    metrics["accuracy"] = accuracy_score(true_labels, true_predictions)
    metrics["precision"] = precision_score(true_labels, true_predictions)
    metrics["recall"] = recall_score(true_labels, true_predictions)
    metrics["f1"] = f1_score(true_labels, true_predictions)
    return metrics

# 9. Training arguments (1 epoch for speed)
training_args = TrainingArguments(
    output_dir="./results_io_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_io_bert"
)

# 10. Trainer
# NOTE(review): the saved output shows a warning pointing at this call —
# presumably the deprecation of `tokenizer=` in newer transformers releases.
# Confirm against the installed version before renaming the argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
# NOTE(review): `results` is not used below; the predict() call in step 12 runs
# over the same test split again.
trainer.train()
results = trainer.evaluate()

# 12. Detailed classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions that carry a real label (id != -100) and map ids to tag strings.
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten the per-sentence lists: the sklearn report works on flat token-level sequences.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
print("\nIO Classification Report (BERT):")
# zero_division=0 reports 0.0 for a class with no predicted (or no true) samples
# instead of emitting UndefinedMetricWarning; the printed numbers are unchanged.
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mis-it-ali03[0m ([33mis-it-ali03-german-university-in-cairo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss



IO Classification Report (BERT):
              precision    recall  f1-score   support

           I     0.9747    0.9714    0.9731       595
           O     0.9985    0.9987    0.9986     11167

    accuracy                         0.9973     11762
   macro avg     0.9866    0.9850    0.9858     11762
weighted avg     0.9973    0.9973    0.9973     11762


Evaluation Results:
Accuracy:  0.9973
Precision: 0.9474
Recall:    0.9509
F1 Score:  0.9492


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a word-per-row tagging sheet and group its rows into sentences.

    The sheet must contain the columns 'Word i' and 'Word i entity tag'.
    Rows with a missing value in either column are dropped. Every word and
    tag is converted to ``str`` and stripped of surrounding whitespace.

    Args:
        file_path: Location of the sheet (``str`` or path-like). Paths ending
            in '.csv' (any letter case) are read with ``pd.read_csv``;
            everything else is read with ``pd.read_excel``.
        sentence_delimiters: Words that close the current sentence. A
            delimiter row is never added to a sentence and its tag is
            discarded. Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[k]`` is
        the list of words of sentence ``k`` and ``labels[k]`` the list of
        their tags. Empty sentences are never emitted.

    Raises:
        KeyError: If one of the two required columns is absent.
    """
    columns = ['Word i', 'Word i entity tag']

    # str() lets pathlib.Path objects through; lower() makes '.CSV' count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s) {missing}")
    df = df[columns].dropna()

    delimiters = frozenset(sentence_delimiters)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df[columns[0]], df[columns[1]]):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Close the running sentence; consecutive delimiters produce nothing.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that was not terminated by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Read the BI-tagged sheet into parallel lists of sentences and tag sequences
sentences, tags = load_excel_data("/content/BI.xlsx")

# 3. Label vocabulary: ids follow the sorted order of the distinct tag strings
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: i for i, tag in id2tag.items()}

# 4. Encode the tags as ids, split 80/20 at sentence level, wrap as a DatasetDict
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 5. Tokenizer and token-classification model, one output class per tag
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to the tokens.

    Uses the notebook-level ``tokenizer``. ``examples["tokens"]`` holds the
    word lists and ``examples["ner_tags"]`` the matching per-word tag ids.

    For every produced token, the label is the tag id of its word when the
    token is the first one of that word, and -100 otherwise (tokens without a
    word id, and continuation tokens of an already-labelled word). The metric
    code in this notebook skips positions labelled -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Label only the first token of each word; everything else gets -100.
        aligned = []
        last_seen = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_seen
            aligned.append(word_tags[wid] if starts_new_word else -100)
            last_seen = wid
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize both splits batch-wise, attaching aligned labels
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

# 7. Data collator for token classification, built from the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Score a ``(predictions, labels)`` pair with the seqeval metrics.

    The predictions are arg-maxed over axis 2 to obtain one class id per
    position (presumably (batch, seq_len, num_labels) scores — TODO confirm).
    Positions whose gold label is -100 are skipped; the remaining gold and
    predicted ids are mapped to tag strings through the notebook-level
    ``id2tag``.

    NOTE(review): precision/recall/F1 come from seqeval, which scores tag
    sequences as chunks — confirm it interprets this notebook's tag names as
    intended.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores_per_class, gold_ids = p
    pred_ids = np.argmax(scores_per_class, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    metrics = {}
    metrics["accuracy"] = accuracy_score(true_labels, true_predictions)
    metrics["precision"] = precision_score(true_labels, true_predictions)
    metrics["recall"] = recall_score(true_labels, true_predictions)
    metrics["f1"] = f1_score(true_labels, true_predictions)
    return metrics

# 9. Training arguments (1 epoch)
training_args = TrainingArguments(
    output_dir="./results_bi_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_bi_bert"
)

# 10. Trainer setup
# NOTE(review): the saved output shows a warning pointing at this call —
# presumably the deprecation of `tokenizer=` in newer transformers releases.
# Confirm against the installed version before renaming the argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
# NOTE(review): `results` is not used below; the predict() call in step 12 runs
# over the same test split again.
trainer.train()
results = trainer.evaluate()

# 12. Predict and generate classification report
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions that carry a real label (id != -100) and map ids to tag strings.
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten the per-sentence lists: the sklearn report works on flat token-level sequences.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print report
print("\nBI Classification Report (BERT):")
# zero_division=0 reports 0.0 for a class with no predicted (or no true) samples
# instead of emitting UndefinedMetricWarning; the printed numbers are unchanged.
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





BI Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9848    0.9811    0.9829       264
          BO     0.9703    0.9751    0.9727       201
           I     0.9843    0.9572    0.9705       327
          IO     0.9982    0.9990    0.9986     10970

    accuracy                         0.9970     11762
   macro avg     0.9844    0.9781    0.9812     11762
weighted avg     0.9970    0.9970    0.9970     11762


Evaluation Results:
Accuracy:  0.9970
Precision: 0.9746
Recall:    0.9826
F1 Score:  0.9786




In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess BIES-tagged data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a word-per-row tagging sheet and group its rows into sentences.

    The sheet must contain the columns 'Word i' and 'Word i entity tag'.
    Rows with a missing value in either column are dropped. Every word and
    tag is converted to ``str`` and stripped of surrounding whitespace.

    Args:
        file_path: Location of the sheet (``str`` or path-like). Paths ending
            in '.csv' (any letter case) are read with ``pd.read_csv``;
            everything else is read with ``pd.read_excel``.
        sentence_delimiters: Words that close the current sentence. A
            delimiter row is never added to a sentence and its tag is
            discarded. Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[k]`` is
        the list of words of sentence ``k`` and ``labels[k]`` the list of
        their tags. Empty sentences are never emitted.

    Raises:
        KeyError: If one of the two required columns is absent.
    """
    columns = ['Word i', 'Word i entity tag']

    # str() lets pathlib.Path objects through; lower() makes '.CSV' count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s) {missing}")
    df = df[columns].dropna()

    delimiters = frozenset(sentence_delimiters)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df[columns[0]], df[columns[1]]):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Close the running sentence; consecutive delimiters produce nothing.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that was not terminated by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Read the BIES-tagged sheet into parallel lists of sentences and tag sequences
sentences, tags = load_excel_data("/content/BIES.xlsx")

# 2. Label vocabulary: ids follow the sorted order of the distinct tag strings
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: i for i, tag in id2tag.items()}

# 3. Encode the tags as ids, split 80/20 at sentence level, wrap as a DatasetDict
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 4. Tokenizer and token-classification model, one output class per tag
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to the tokens.

    Uses the notebook-level ``tokenizer``. ``examples["tokens"]`` holds the
    word lists and ``examples["ner_tags"]`` the matching per-word tag ids.

    For every produced token, the label is the tag id of its word when the
    token is the first one of that word, and -100 otherwise (tokens without a
    word id, and continuation tokens of an already-labelled word). The metric
    code in this notebook skips positions labelled -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Label only the first token of each word; everything else gets -100.
        aligned = []
        last_seen = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_seen
            aligned.append(word_tags[wid] if starts_new_word else -100)
            last_seen = wid
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize both splits batch-wise, attaching aligned labels
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

# 6. Data collator for token classification, built from the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a ``(predictions, labels)`` pair with the seqeval metrics.

    The predictions are arg-maxed over axis 2 to obtain one class id per
    position (presumably (batch, seq_len, num_labels) scores — TODO confirm).
    Positions whose gold label is -100 are skipped; the remaining gold and
    predicted ids are mapped to tag strings through the notebook-level
    ``id2tag``.

    NOTE(review): precision/recall/F1 come from seqeval, which scores tag
    sequences as chunks — confirm it interprets this notebook's tag names as
    intended.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores_per_class, gold_ids = p
    pred_ids = np.argmax(scores_per_class, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    metrics = {}
    metrics["accuracy"] = accuracy_score(true_labels, true_predictions)
    metrics["precision"] = precision_score(true_labels, true_predictions)
    metrics["recall"] = recall_score(true_labels, true_predictions)
    metrics["f1"] = f1_score(true_labels, true_predictions)
    return metrics

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results_bies_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_bies_bert"
)

# 9. Trainer
# NOTE(review): the saved output shows a warning pointing at this call —
# presumably the deprecation of `tokenizer=` in newer transformers releases.
# Confirm against the installed version before renaming the argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
# NOTE(review): `results` is not used below; the predict() call in step 11 runs
# over the same test split again.
trainer.train()
results = trainer.evaluate()

# 11. Detailed predictions
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions that carry a real label (id != -100) and map ids to tag strings.
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten the per-sentence lists: the sklearn report works on flat token-level sequences.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nBIES Classification Report (BERT):")
# zero_division=0 reports 0.0 for a class with no predicted (or no true) samples
# instead of emitting UndefinedMetricWarning (the rare `S` tag hits this case in
# the saved run); the printed numbers are unchanged.
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





BIES Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9585    0.9732    0.9658       261
          BO     0.9632    0.9632    0.9632       190
           E     0.9401    0.9617    0.9508       261
          EO     0.9714    0.9754    0.9734       244
           I     0.9787    0.6970    0.8142        66
          IO     0.9973    0.9984    0.9979     10726
           S     0.0000    0.0000    0.0000         3
          SO     0.7000    0.6364    0.6667        11

    accuracy                         0.9937     11762
   macro avg     0.8136    0.7756    0.7915     11762
weighted avg     0.9934    0.9937    0.9934     11762


Evaluation Results:
Accuracy:  0.9937
Precision: 0.9544


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall:    0.9702
F1 Score:  0.9623


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IE-tagged data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a word-per-row tagging sheet and group its rows into sentences.

    The sheet must contain the columns 'Word i' and 'Word i entity tag'.
    Rows with a missing value in either column are dropped. Every word and
    tag is converted to ``str`` and stripped of surrounding whitespace.

    Args:
        file_path: Location of the sheet (``str`` or path-like). Paths ending
            in '.csv' (any letter case) are read with ``pd.read_csv``;
            everything else is read with ``pd.read_excel``.
        sentence_delimiters: Words that close the current sentence. A
            delimiter row is never added to a sentence and its tag is
            discarded. Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[k]`` is
        the list of words of sentence ``k`` and ``labels[k]`` the list of
        their tags. Empty sentences are never emitted.

    Raises:
        KeyError: If one of the two required columns is absent.
    """
    columns = ['Word i', 'Word i entity tag']

    # str() lets pathlib.Path objects through; lower() makes '.CSV' count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s) {missing}")
    df = df[columns].dropna()

    delimiters = frozenset(sentence_delimiters)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df[columns[0]], df[columns[1]]):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Close the running sentence; consecutive delimiters produce nothing.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that was not terminated by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Read the IE-tagged sheet into parallel lists of sentences and tag sequences
sentences, tags = load_excel_data("/content/IE.xlsx")

# 2. Label vocabulary: ids follow the sorted order of the distinct tag strings
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: i for i, tag in id2tag.items()}

# 3. Encode the tags as ids, split 80/20 at sentence level, wrap as a DatasetDict
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 4. Tokenizer and token-classification model, one output class per tag
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to the tokens.

    Uses the notebook-level ``tokenizer``. ``examples["tokens"]`` holds the
    word lists and ``examples["ner_tags"]`` the matching per-word tag ids.

    For every produced token, the label is the tag id of its word when the
    token is the first one of that word, and -100 otherwise (tokens without a
    word id, and continuation tokens of an already-labelled word). The metric
    code in this notebook skips positions labelled -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Label only the first token of each word; everything else gets -100.
        aligned = []
        last_seen = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_seen
            aligned.append(word_tags[wid] if starts_new_word else -100)
            last_seen = wid
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize both splits batch-wise, attaching aligned labels
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

# 6. Data collator for token classification, built from the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a ``(predictions, labels)`` pair with the seqeval metrics.

    The predictions are arg-maxed over axis 2 to obtain one class id per
    position (presumably (batch, seq_len, num_labels) scores — TODO confirm).
    Positions whose gold label is -100 are skipped; the remaining gold and
    predicted ids are mapped to tag strings through the notebook-level
    ``id2tag``.

    NOTE(review): precision/recall/F1 come from seqeval, which scores tag
    sequences as chunks — confirm it interprets this notebook's tag names as
    intended.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores_per_class, gold_ids = p
    pred_ids = np.argmax(scores_per_class, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(gold_ids, pred_ids):
        kept = [(g, q) for g, q in zip(gold_row, pred_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    metrics = {}
    metrics["accuracy"] = accuracy_score(true_labels, true_predictions)
    metrics["precision"] = precision_score(true_labels, true_predictions)
    metrics["recall"] = recall_score(true_labels, true_predictions)
    metrics["f1"] = f1_score(true_labels, true_predictions)
    return metrics

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results_ie_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_ie_bert"
)

# 9. Trainer
# NOTE(review): the saved output shows a warning pointing at this call —
# presumably the deprecation of `tokenizer=` in newer transformers releases.
# Confirm against the installed version before renaming the argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
# NOTE(review): `results` is not used below; the predict() call in step 11 runs
# over the same test split again.
trainer.train()
results = trainer.evaluate()

# 11. Detailed predictions
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Keep only positions that carry a real label (id != -100) and map ids to tag strings.
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten the per-sentence lists: the sklearn report works on flat token-level sequences.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIE Classification Report (BERT):")
# zero_division=0 reports 0.0 for a class with no predicted (or no true) samples
# instead of emitting UndefinedMetricWarning; the printed numbers are unchanged.
print(sk_classification_report(flat_true, flat_pred, digits=4, zero_division=0))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





IE Classification Report (BERT):
              precision    recall  f1-score   support

           E     0.9559    0.9848    0.9701       264
          EO     0.9655    0.9844    0.9749       256
           I     0.9694    0.9694    0.9694       327
          IO     0.9989    0.9977    0.9983     10915

    accuracy                         0.9963     11762
   macro avg     0.9724    0.9841    0.9782     11762
weighted avg     0.9964    0.9963    0.9964     11762


Evaluation Results:
Accuracy:  0.9963
Precision: 0.9689
Recall:    0.9801
F1 Score:  0.9745




In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IOB-tagged data
def load_excel_data(file_path, sentence_delimiters=('.', '؟')):
    """Read a word-per-row tagging sheet and group its rows into sentences.

    The sheet must contain the columns 'Word i' and 'Word i entity tag'.
    Rows with a missing value in either column are dropped. Every word and
    tag is converted to ``str`` and stripped of surrounding whitespace.

    Args:
        file_path: Location of the sheet (``str`` or path-like). Paths ending
            in '.csv' (any letter case) are read with ``pd.read_csv``;
            everything else is read with ``pd.read_excel``.
        sentence_delimiters: Words that close the current sentence. A
            delimiter row is never added to a sentence and its tag is
            discarded. Defaults to the full stop and the Arabic question mark.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[k]`` is
        the list of words of sentence ``k`` and ``labels[k]`` the list of
        their tags. Empty sentences are never emitted.

    Raises:
        KeyError: If one of the two required columns is absent.
    """
    columns = ['Word i', 'Word i entity tag']

    # str() lets pathlib.Path objects through; lower() makes '.CSV' count as CSV.
    if str(file_path).lower().endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s) {missing}")
    df = df[columns].dropna()

    delimiters = frozenset(sentence_delimiters)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df[columns[0]], df[columns[1]]):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in delimiters:
            # Close the running sentence; consecutive delimiters produce nothing.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that was not terminated by a delimiter.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Read the IOB-tagged sheet into parallel lists of sentences and tag sequences
sentences, tags = load_excel_data("/content/IOB.xlsx")

# 2. Label vocabulary: ids follow the sorted order of the distinct tag strings
unique_tags = sorted({tag for doc in tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: i for i, tag in id2tag.items()}

# 3. Encode the tags as ids, split 80/20 at sentence level, wrap as a DatasetDict
examples = [
    {"tokens": words, "ner_tags": [tag2id[name] for name in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in (("train", train_data), ("test", test_data))
})

# 4. Tokenizer and token-classification model, one output class per tag
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tag ids to the tokens.

    Uses the notebook-level ``tokenizer``. ``examples["tokens"]`` holds the
    word lists and ``examples["ner_tags"]`` the matching per-word tag ids.

    For every produced token, the label is the tag id of its word when the
    token is the first one of that word, and -100 otherwise (tokens without a
    word id, and continuation tokens of an already-labelled word). The metric
    code in this notebook skips positions labelled -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Label only the first token of each word; everything else gets -100.
        aligned = []
        last_seen = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_seen
            aligned.append(word_tags[wid] if starts_new_word else -100)
            last_seen = wid
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize both splits batch-wise, attaching aligned labels
tokenized_dataset = dataset.map(function=tokenize_and_align_labels, batched=True)

# 6. Data collator for token classification, built from the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metrics.

    Args:
        p: A (predictions, labels) pair. Predictions are reduced with an
            arg-max over axis 2; labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions with a real gold label, then decode both sides.
        kept = [(g, q) for g, q in zip(gold_row, predicted_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
# One epoch with batch size 8 for training and evaluation; every option not
# listed here keeps its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results_iob_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iob_bert"
)

# 9. Trainer
# The "test" split is used both as eval_dataset here and for the final report
# below; this cell has no separate validation split.
# NOTE(review): newer transformers releases reportedly deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()  # stored but not printed in this block

# 11. Detailed predictions
# Arg-max over axis 2 turns the model scores into one tag id per token position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag strings, keeping only positions whose gold label is not
# -100 (the same filter compute_metrics applies).
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
# The scikit-learn report is computed on flat token-level lists.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIOB Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# seqeval scores, computed on the nested per-sentence tag lists.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOB Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9962    0.9850    0.9905       266
           I     0.9907    0.9696    0.9800       329
           O     0.9987    0.9996    0.9992     11167

    accuracy                         0.9985     11762
   macro avg     0.9952    0.9847    0.9899     11762
weighted avg     0.9985    0.9985    0.9985     11762


Evaluation Results:
Accuracy:  0.9985
Precision: 0.9660
Recall:    0.9624
F1 Score:  0.9642


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IOBES-tagged data
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a word-per-row tagged corpus and group it into sentences.

    The file must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped. Words and tags are
    converted to strings and stripped of surrounding whitespace.

    Args:
        file_path: Path to the corpus. A name ending in ".csv" (any letter
            case) is read with pandas.read_csv, anything else with
            pandas.read_excel.
        sentence_end_tokens: Words that close the current sentence. The
            terminator row itself (word and tag) is not kept. Defaults to the
            full stop and the Arabic question mark.

    Returns:
        (sentences, labels): two parallel lists; sentences[k] is the word list
        of sentence k and labels[k] the matching tag list.
    """
    # Case-insensitive extension check so "DATA.CSV" is not sent to the Excel
    # reader; str() also lets callers pass a pathlib.Path.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that has no terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# `sentences` and `tags` are parallel lists: one word list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IOBES.xlsx")

# 2. Create tag-id mappings
# Sorting the tag set makes the tag -> id assignment deterministic.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 3. Format as HuggingFace Dataset
# One example per sentence: the word list plus its integer-encoded tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# 80/20 split at sentence level; the fixed random_state keeps it reproducible.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 4. Load tokenizer and model
# The model is created with one output label per tag and is given both
# id <-> tag mappings.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch with "tokens" (one word list per sentence) and
            "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry (one list per
        sentence). Only the first sub-token of each word carries the word's
        tag id; positions without a word id and the remaining sub-tokens of a
        word are set to -100, the value this notebook filters out when scoring.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Walk the sub-tokens once, remembering the previous word id so that
        # only a word's first piece is labelled.
        aligned = []
        previous = None
        for current in word_ids:
            is_first_piece = current is not None and current != previous
            aligned.append(word_tags[current] if is_first_piece else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply the alignment function to every split in batches; this adds the "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 6. Data collator
# Built from the same tokenizer that produced the inputs; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metrics.

    Args:
        p: A (predictions, labels) pair. Predictions are reduced with an
            arg-max over axis 2; labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions with a real gold label, then decode both sides.
        kept = [(g, q) for g, q in zip(gold_row, predicted_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
# One epoch with batch size 8 for training and evaluation; every option not
# listed here keeps its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results_iobes_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iobes_bert"
)

# 9. Trainer
# The "test" split is used both as eval_dataset here and for the final report
# below; this cell has no separate validation split.
# NOTE(review): newer transformers releases reportedly deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()  # stored but not printed in this block

# 11. Detailed predictions
# Arg-max over axis 2 turns the model scores into one tag id per token position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag strings, keeping only positions whose gold label is not
# -100 (the same filter compute_metrics applies).
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
# The scikit-learn report is computed on flat token-level lists.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIOBES Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# seqeval scores, computed on the nested per-sentence tag lists.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOBES Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9734    0.9808    0.9771       261
           E     0.9696    0.9770    0.9733       261
           I     0.9375    0.9091    0.9231        66
           O     0.9989    0.9987    0.9988     11171
           S     1.0000    1.0000    1.0000         3

    accuracy                         0.9974     11762
   macro avg     0.9759    0.9731    0.9745     11762
weighted avg     0.9974    0.9974    0.9974     11762


Evaluation Results:
Accuracy:  0.9974
Precision: 0.9373
Recall:    0.9621
F1 Score:  0.9495


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IOE-tagged data
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a word-per-row tagged corpus and group it into sentences.

    The file must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped. Words and tags are
    converted to strings and stripped of surrounding whitespace.

    Args:
        file_path: Path to the corpus. A name ending in ".csv" (any letter
            case) is read with pandas.read_csv, anything else with
            pandas.read_excel.
        sentence_end_tokens: Words that close the current sentence. The
            terminator row itself (word and tag) is not kept. Defaults to the
            full stop and the Arabic question mark.

    Returns:
        (sentences, labels): two parallel lists; sentences[k] is the word list
        of sentence k and labels[k] the matching tag list.
    """
    # Case-insensitive extension check so "DATA.CSV" is not sent to the Excel
    # reader; str() also lets callers pass a pathlib.Path.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that has no terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# `sentences` and `tags` are parallel lists: one word list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IOE.xlsx")

# 2. Create tag-id mappings
# Sorting the tag set makes the tag -> id assignment deterministic.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 3. Format as HuggingFace Dataset
# One example per sentence: the word list plus its integer-encoded tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# 80/20 split at sentence level; the fixed random_state keeps it reproducible.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 4. Load tokenizer and model
# The model is created with one output label per tag and is given both
# id <-> tag mappings.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch with "tokens" (one word list per sentence) and
            "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry (one list per
        sentence). Only the first sub-token of each word carries the word's
        tag id; positions without a word id and the remaining sub-tokens of a
        word are set to -100, the value this notebook filters out when scoring.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Walk the sub-tokens once, remembering the previous word id so that
        # only a word's first piece is labelled.
        aligned = []
        previous = None
        for current in word_ids:
            is_first_piece = current is not None and current != previous
            aligned.append(word_tags[current] if is_first_piece else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply the alignment function to every split in batches; this adds the "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 6. Data collator
# Built from the same tokenizer that produced the inputs; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metrics.

    Args:
        p: A (predictions, labels) pair. Predictions are reduced with an
            arg-max over axis 2; labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions with a real gold label, then decode both sides.
        kept = [(g, q) for g, q in zip(gold_row, predicted_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
# One epoch with batch size 8 for training and evaluation; every option not
# listed here keeps its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results_ioe_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_ioe_bert"
)

# 9. Trainer
# The "test" split is used both as eval_dataset here and for the final report
# below; this cell has no separate validation split.
# NOTE(review): newer transformers releases reportedly deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()  # stored but not printed in this block

# 11. Detailed predictions
# Arg-max over axis 2 turns the model scores into one tag id per token position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag strings, keeping only positions whose gold label is not
# -100 (the same filter compute_metrics applies).
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
# The scikit-learn report is computed on flat token-level lists.
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIOE Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# seqeval scores, computed on the nested per-sentence tag lists.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOE Classification Report (BERT):
              precision    recall  f1-score   support

           E     0.9438    0.9545    0.9492       264
           I     0.9745    0.9358    0.9548       327
           O     0.9979    0.9988    0.9984     11171

    accuracy                         0.9961     11762
   macro avg     0.9721    0.9631    0.9674     11762
weighted avg     0.9961    0.9961    0.9961     11762


Evaluation Results:
Accuracy:  0.9961
Precision: 0.9114
Recall:    0.9356
F1 Score:  0.9234


## Second set of tagging schemes (IO, BI, BIES)


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a word-per-row tagged corpus and group it into sentences.

    The file must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped. Words and tags are
    converted to strings and stripped of surrounding whitespace.

    Args:
        file_path: Path to the corpus. A name ending in ".csv" (any letter
            case) is read with pandas.read_csv, anything else with
            pandas.read_excel.
        sentence_end_tokens: Words that close the current sentence. The
            terminator row itself (word and tag) is not kept. Defaults to the
            full stop and the Arabic question mark.

    Returns:
        (sentences, labels): two parallel lists; sentences[k] is the word list
        of sentence k and labels[k] the matching tag list.
    """
    # Case-insensitive extension check so "DATA.CSV" is not sent to the Excel
    # reader; str() also lets callers pass a pathlib.Path.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that has no terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Load IO-tagged dataset
# `sentences` and `tags` are parallel lists: one word list and one tag list per sentence.
sentences, tags = load_excel_data("/content/IO2.xlsx")

# 3. Create label mappings
# Sorting the tag set makes the tag -> id assignment deterministic.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 4. Create Hugging Face dataset
# One example per sentence: the word list plus its integer-encoded tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# 80/20 split at sentence level; the fixed random_state keeps it reproducible.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and BERT model
# The model is created with one output label per tag and is given both
# id <-> tag mappings.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch with "tokens" (one word list per sentence) and
            "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry (one list per
        sentence). Only the first sub-token of each word carries the word's
        tag id; positions without a word id and the remaining sub-tokens of a
        word are set to -100, the value this notebook filters out when scoring.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Walk the sub-tokens once, remembering the previous word id so that
        # only a word's first piece is labelled.
        aligned = []
        previous = None
        for current in word_ids:
            is_first_piece = current is not None and current != previous
            aligned.append(word_tags[current] if is_first_piece else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply the alignment function to every split in batches; this adds the "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Built from the same tokenizer that produced the inputs; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metrics.

    Args:
        p: A (predictions, labels) pair. Predictions are reduced with an
            arg-max over axis 2; labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions with a real gold label, then decode both sides.
        kept = [(g, q) for g, q in zip(gold_row, predicted_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 9. Training arguments (1 epoch for speed)
# Batch size 8 for training and evaluation; every option not listed here keeps
# its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results_io_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_io_bert"
)

# 10. Trainer
# The "test" split is used both as eval_dataset here and for the final report
# below; this cell has no separate validation split.
# NOTE(review): newer transformers releases reportedly deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()  # stored but not printed in this block

# 12. Detailed classification report
# Arg-max over axis 2 turns the model scores into one tag id per token position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag strings, keeping only positions whose gold label is not
# -100 (the same filter compute_metrics applies).
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print classification report
print("\nIO Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# seqeval scores, computed on the nested per-sentence tag lists.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IO Classification Report (BERT):
              precision    recall  f1-score   support

           I     0.9788    0.9599    0.9693       624
           O     0.9978    0.9988    0.9983     11138

    accuracy                         0.9968     11762
   macro avg     0.9883    0.9794    0.9838     11762
weighted avg     0.9967    0.9968    0.9968     11762


Evaluation Results:
Accuracy:  0.9968
Precision: 0.9464
Recall:    0.9397
F1 Score:  0.9431


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess data
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a word-per-row tagged corpus and group it into sentences.

    The file must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped. Words and tags are
    converted to strings and stripped of surrounding whitespace.

    Args:
        file_path: Path to the corpus. A name ending in ".csv" (any letter
            case) is read with pandas.read_csv, anything else with
            pandas.read_excel.
        sentence_end_tokens: Words that close the current sentence. The
            terminator row itself (word and tag) is not kept. Defaults to the
            full stop and the Arabic question mark.

    Returns:
        (sentences, labels): two parallel lists; sentences[k] is the word list
        of sentence k and labels[k] the matching tag list.
    """
    # Case-insensitive extension check so "DATA.CSV" is not sent to the Excel
    # reader; str() also lets callers pass a pathlib.Path.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that has no terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# 2. Load BI-tagged dataset
# `sentences` and `tags` are parallel lists: one word list and one tag list per sentence.
sentences, tags = load_excel_data("/content/BI2.xlsx")

# 3. Create label mappings
# Sorting the tag set makes the tag -> id assignment deterministic.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 4. Create Hugging Face dataset
# One example per sentence: the word list plus its integer-encoded tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# 80/20 split at sentence level; the fixed random_state keeps it reproducible.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 5. Load tokenizer and BERT model
# The model is created with one output label per tag and is given both
# id <-> tag mappings.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 6. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch with "tokens" (one word list per sentence) and
            "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry (one list per
        sentence). Only the first sub-token of each word carries the word's
        tag id; positions without a word id and the remaining sub-tokens of a
        word are set to -100, the value this notebook filters out when scoring.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Walk the sub-tokens once, remembering the previous word id so that
        # only a word's first piece is labelled.
        aligned = []
        previous = None
        for current in word_ids:
            is_first_piece = current is not None and current != previous
            aligned.append(word_tags[current] if is_first_piece else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply the alignment function to every split in batches; this adds the "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 7. Data collator
# Built from the same tokenizer that produced the inputs; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 8. Evaluation metrics
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metrics.

    Args:
        p: A (predictions, labels) pair. Predictions are reduced with an
            arg-max over axis 2; labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions with a real gold label, then decode both sides.
        kept = [(g, q) for g, q in zip(gold_row, predicted_row) if g != -100]
        true_labels.append([id2tag[g] for g, _ in kept])
        true_predictions.append([id2tag[q] for _, q in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 9. Training arguments (1 epoch)
# Batch size 8 for training and evaluation; every option not listed here keeps
# its TrainingArguments default.
training_args = TrainingArguments(
    output_dir="./results_bi_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_bi_bert"
)

# 10. Trainer setup
# The "test" split is used both as eval_dataset here and for the final report
# below; this cell has no separate validation split.
# NOTE(review): newer transformers releases reportedly deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 11. Train and evaluate
trainer.train()
results = trainer.evaluate()  # stored but not printed in this block

# 12. Predict and generate classification report
# Arg-max over axis 2 turns the model scores into one tag id per token position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag strings, keeping only positions whose gold label is not
# -100 (the same filter compute_metrics applies).
true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# Flatten for sklearn report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print report
print("\nBI Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

# seqeval scores, computed on the nested per-sentence tag lists.
# NOTE(review): the recorded output of this cell lists the tags B, BO, I and IO.
# seqeval presumably reads the first character as the chunk prefix and the rest
# as the entity type, so BO/IO spans would be scored as entities of type "O".
# Confirm these scores are comparable with the other tagging schemes.
print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





BI Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9963    0.9576    0.9766       283
          BO     0.9718    0.9409    0.9561       220
           I     0.9969    0.9443    0.9699       341
          IO     0.9964    0.9997    0.9981     10918

    accuracy                         0.9960     11762
   macro avg     0.9904    0.9606    0.9752     11762
weighted avg     0.9960    0.9960    0.9960     11762


Evaluation Results:
Accuracy:  0.9960




Precision: 0.9734
Recall:    0.9679
F1 Score:  0.9706


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess BIES-tagged data
def load_excel_data(file_path, sentence_end_tokens=('.', '؟')):
    """Read a word-per-row tagged corpus and group it into sentences.

    The file must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped. Words and tags are
    converted to strings and stripped of surrounding whitespace.

    Args:
        file_path: Path to the corpus. A name ending in ".csv" (any letter
            case) is read with pandas.read_csv, anything else with
            pandas.read_excel.
        sentence_end_tokens: Words that close the current sentence. The
            terminator row itself (word and tag) is not kept. Defaults to the
            full stop and the Arabic question mark.

    Returns:
        (sentences, labels): two parallel lists; sentences[k] is the word list
        of sentence k and labels[k] the matching tag list.
    """
    # Case-insensitive extension check so "DATA.CSV" is not sent to the Excel
    # reader; str() also lets callers pass a pathlib.Path.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = frozenset(sentence_end_tokens)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a trailing sentence that has no terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# `sentences` and `tags` are parallel lists: one word list and one tag list per sentence.
sentences, tags = load_excel_data("/content/BIES2.xlsx")

# 2. Create tag-id mappings
# Sorting the tag set makes the tag -> id assignment deterministic.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# 3. Format as HuggingFace Dataset
# One example per sentence: the word list plus its integer-encoded tags.
examples = [{"tokens": s, "ner_tags": [tag2id[t] for t in l]} for s, l in zip(sentences, tags)]
# 80/20 split at sentence level; the fixed random_state keeps it reproducible.
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 4. Load tokenizer and model
# The model is created with one output label per tag and is given both
# id <-> tag mappings.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch with "tokens" (one word list per sentence) and
            "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an added "labels" entry (one list per
        sentence). Only the first sub-token of each word carries the word's
        tag id; positions without a word id and the remaining sub-tokens of a
        word are set to -100, the value this notebook filters out when scoring.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # Walk the sub-tokens once, remembering the previous word id so that
        # only a word's first piece is labelled.
        aligned = []
        previous = None
        for current in word_ids:
            is_first_piece = current is not None and current != previous
            aligned.append(word_tags[current] if is_first_piece else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply the alignment function to every split in batches; this adds the "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# 6. Data collator
# Built from the same tokenizer that produced the inputs; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric functions.

    Args:
        p: Pair ``(scores, label_ids)``. The arg-max over axis 2 of ``scores``
            is taken as the predicted tag id for each position.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on tag
        names after dropping every position whose label id is -100.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(label_ids, predicted_ids):
        # Positions labelled -100 are excluded from both sequences.
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results_bies_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_bies_bert"
)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 11. Detailed predictions
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nBIES Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





BIES Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9783    0.9712    0.9747       278
          BO     0.9612    0.9659    0.9635       205
           E     0.9747    0.9712    0.9730       278
          EO     0.9729    0.9691    0.9710       259
           I     1.0000    0.8095    0.8947        63
          IO     0.9965    0.9991    0.9978     10659
           S     0.0000    0.0000    0.0000         5
          SO     0.7500    0.4000    0.5217        15

    accuracy                         0.9943     11762
   macro avg     0.8292    0.7607    0.7871     11762
weighted avg     0.9937    0.9943    0.9939     11762


Evaluation Results:
Accuracy:  0.9943
Precision: 0.9704


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall:    0.9711
F1 Score:  0.9707


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IE-tagged data
def load_excel_data(file_path, sentence_end=('.', '؟')):
    """Read a one-word-per-row table and group the rows into tagged sentences.

    The table must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped, and every word and tag
    is converted to a string with surrounding whitespace removed.

    Args:
        file_path: Location of the table (str or path-like). A name ending in
            '.csv' (any letter case) is read with ``pd.read_csv``; anything
            else is read with ``pd.read_excel``.
        sentence_end: Words that close the current sentence. The terminator
            itself is not kept as a token. Defaults to '.' and '؟'.

    Returns:
        ``(sentences, labels)``: two parallel lists holding, per sentence, the
        list of words and the list of their tags. Empty sentences are skipped.
    """
    # Compare on a lower-cased string so 'DATA.CSV' and Path objects also work.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = set(sentence_end)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a final sentence that is not followed by a terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Single seed shared by the train/test split and the model-head initialization.
SEED = 42

sentences, tags = load_excel_data("/content/IE2.xlsx")

# 2. Create tag-id mappings
# Sorting the tag set gives every run the same tag -> id assignment.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 3. Format as HuggingFace Dataset
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=SEED)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 4. Load tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint ships without a token-classification head, so the classifier
# weights are drawn at random during from_pretrained (see the "newly
# initialized" notice in this cell's output). Seed torch first so that the
# starting point, and therefore the reported scores, can be reproduced.
torch.manual_seed(SEED)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and project word tags onto sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of tag ids per sentence, indexed by word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Within
        each sentence, the first sub-token of every word carries that word's
        tag id; special tokens and the remaining sub-tokens of a word carry -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    batch_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        previous = None
        for current in encoded.word_ids(batch_index=row):
            # Only a position that opens a new word receives a real label.
            opens_word = current is not None and current != previous
            row_labels.append(word_tags[current] if opens_word else -100)
            previous = current
        batch_labels.append(row_labels)

    encoded["labels"] = batch_labels
    return encoded

# Apply the alignment function to both splits, one batch of sentences at a time.
tokenized_dataset = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)

# 6. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric functions.

    Args:
        p: Pair ``(scores, label_ids)``. The arg-max over axis 2 of ``scores``
            is taken as the predicted tag id for each position.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on tag
        names after dropping every position whose label id is -100.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(label_ids, predicted_ids):
        # Positions labelled -100 are excluded from both sequences.
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results_ie_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_ie_bert"
)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 11. Detailed predictions
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIE Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss





IE Classification Report (BERT):
              precision    recall  f1-score   support

           E     0.9715    0.9647    0.9681       283
          EO     0.9672    0.9636    0.9654       275
           I     0.9878    0.9531    0.9701       341
          IO     0.9972    0.9985    0.9978     10863

    accuracy                         0.9956     11762
   macro avg     0.9809    0.9700    0.9754     11762
weighted avg     0.9956    0.9956    0.9956     11762


Evaluation Results:
Accuracy:  0.9956
Precision: 0.9672
Recall:    0.9703
F1 Score:  0.9688




In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IOB-tagged data
def load_excel_data(file_path, sentence_end=('.', '؟')):
    """Read a one-word-per-row table and group the rows into tagged sentences.

    The table must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped, and every word and tag
    is converted to a string with surrounding whitespace removed.

    Args:
        file_path: Location of the table (str or path-like). A name ending in
            '.csv' (any letter case) is read with ``pd.read_csv``; anything
            else is read with ``pd.read_excel``.
        sentence_end: Words that close the current sentence. The terminator
            itself is not kept as a token. Defaults to '.' and '؟'.

    Returns:
        ``(sentences, labels)``: two parallel lists holding, per sentence, the
        list of words and the list of their tags. Empty sentences are skipped.
    """
    # Compare on a lower-cased string so 'DATA.CSV' and Path objects also work.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = set(sentence_end)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a final sentence that is not followed by a terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Single seed shared by the train/test split and the model-head initialization.
SEED = 42

sentences, tags = load_excel_data("/content/IOB2.xlsx")

# 2. Create tag-id mappings
# Sorting the tag set gives every run the same tag -> id assignment.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 3. Format as HuggingFace Dataset
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=SEED)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 4. Load tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint ships without a token-classification head, so the classifier
# weights are drawn at random during from_pretrained (see the "newly
# initialized" notice in this cell's output). Seed torch first so that the
# starting point, and therefore the reported scores, can be reproduced.
torch.manual_seed(SEED)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and project word tags onto sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of tag ids per sentence, indexed by word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Within
        each sentence, the first sub-token of every word carries that word's
        tag id; special tokens and the remaining sub-tokens of a word carry -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    batch_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        previous = None
        for current in encoded.word_ids(batch_index=row):
            # Only a position that opens a new word receives a real label.
            opens_word = current is not None and current != previous
            row_labels.append(word_tags[current] if opens_word else -100)
            previous = current
        batch_labels.append(row_labels)

    encoded["labels"] = batch_labels
    return encoded

# Apply the alignment function to both splits, one batch of sentences at a time.
tokenized_dataset = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)

# 6. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric functions.

    Args:
        p: Pair ``(scores, label_ids)``. The arg-max over axis 2 of ``scores``
            is taken as the predicted tag id for each position.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on tag
        names after dropping every position whose label id is -100.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(label_ids, predicted_ids):
        # Positions labelled -100 are excluded from both sequences.
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results_iob_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iob_bert"
)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 11. Detailed predictions
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIOB Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOB Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9892    0.9717    0.9804       283
           I     0.9880    0.9677    0.9778       341
           O     0.9984    0.9995    0.9989     11138

    accuracy                         0.9979     11762
   macro avg     0.9919    0.9796    0.9857     11762
weighted avg     0.9979    0.9979    0.9979     11762


Evaluation Results:
Accuracy:  0.9979
Precision: 0.9610
Recall:    0.9576
F1 Score:  0.9593


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IOBES-tagged data
def load_excel_data(file_path, sentence_end=('.', '؟')):
    """Read a one-word-per-row table and group the rows into tagged sentences.

    The table must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped, and every word and tag
    is converted to a string with surrounding whitespace removed.

    Args:
        file_path: Location of the table (str or path-like). A name ending in
            '.csv' (any letter case) is read with ``pd.read_csv``; anything
            else is read with ``pd.read_excel``.
        sentence_end: Words that close the current sentence. The terminator
            itself is not kept as a token. Defaults to '.' and '؟'.

    Returns:
        ``(sentences, labels)``: two parallel lists holding, per sentence, the
        list of words and the list of their tags. Empty sentences are skipped.
    """
    # Compare on a lower-cased string so 'DATA.CSV' and Path objects also work.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = set(sentence_end)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a final sentence that is not followed by a terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Single seed shared by the train/test split and the model-head initialization.
SEED = 42

sentences, tags = load_excel_data("/content/IOBES2.xlsx")

# 2. Create tag-id mappings
# Sorting the tag set gives every run the same tag -> id assignment.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 3. Format as HuggingFace Dataset
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=SEED)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 4. Load tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint ships without a token-classification head, so the classifier
# weights are drawn at random during from_pretrained (see the "newly
# initialized" notice in this cell's output). Seed torch first so that the
# starting point, and therefore the reported scores, can be reproduced.
torch.manual_seed(SEED)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and project word tags onto sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of tag ids per sentence, indexed by word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Within
        each sentence, the first sub-token of every word carries that word's
        tag id; special tokens and the remaining sub-tokens of a word carry -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    batch_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        previous = None
        for current in encoded.word_ids(batch_index=row):
            # Only a position that opens a new word receives a real label.
            opens_word = current is not None and current != previous
            row_labels.append(word_tags[current] if opens_word else -100)
            previous = current
        batch_labels.append(row_labels)

    encoded["labels"] = batch_labels
    return encoded

# Apply the alignment function to both splits, one batch of sentences at a time.
tokenized_dataset = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)

# 6. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric functions.

    Args:
        p: Pair ``(scores, label_ids)``. The arg-max over axis 2 of ``scores``
            is taken as the predicted tag id for each position.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on tag
        names after dropping every position whose label id is -100.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(label_ids, predicted_ids):
        # Positions labelled -100 are excluded from both sequences.
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results_iobes_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_iobes_bert"
)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 11. Detailed predictions
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIOBES Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOBES Classification Report (BERT):
              precision    recall  f1-score   support

           B     0.9782    0.9676    0.9729       278
           E     0.9673    0.9568    0.9620       278
           I     1.0000    0.8095    0.8947        63
           O     0.9973    0.9991    0.9982     11138
           S     1.0000    0.6000    0.7500         5

    accuracy                         0.9962     11762
   macro avg     0.9886    0.8666    0.9156     11762
weighted avg     0.9962    0.9962    0.9961     11762


Evaluation Results:
Accuracy:  0.9962
Precision: 0.9263
Recall:    0.9329
F1 Score:  0.9296


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report as sk_classification_report
import numpy as np

# 1. Load and preprocess IOE-tagged data
def load_excel_data(file_path, sentence_end=('.', '؟')):
    """Read a one-word-per-row table and group the rows into tagged sentences.

    The table must contain the columns 'Word i' and 'Word i entity tag'. Rows
    with a missing value in either column are dropped, and every word and tag
    is converted to a string with surrounding whitespace removed.

    Args:
        file_path: Location of the table (str or path-like). A name ending in
            '.csv' (any letter case) is read with ``pd.read_csv``; anything
            else is read with ``pd.read_excel``.
        sentence_end: Words that close the current sentence. The terminator
            itself is not kept as a token. Defaults to '.' and '؟'.

    Returns:
        ``(sentences, labels)``: two parallel lists holding, per sentence, the
        list of words and the list of their tags. Empty sentences are skipped.
    """
    # Compare on a lower-cased string so 'DATA.CSV' and Path objects also work.
    is_csv = str(file_path).lower().endswith('.csv')
    df = pd.read_csv(file_path) if is_csv else pd.read_excel(file_path)
    df = df[['Word i', 'Word i entity tag']].dropna()

    terminators = set(sentence_end)
    sentences, labels = [], []
    sentence, label = [], []
    for word, tag in zip(df['Word i'], df['Word i entity tag']):
        word = str(word).strip()
        tag = str(tag).strip()
        if word in terminators:
            # Consecutive terminators must not produce empty sentences.
            if sentence:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            sentence.append(word)
            label.append(tag)

    # Flush a final sentence that is not followed by a terminator.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Single seed shared by the train/test split and the model-head initialization.
SEED = 42

sentences, tags = load_excel_data("/content/IOE2.xlsx")

# 2. Create tag-id mappings
# Sorting the tag set gives every run the same tag -> id assignment.
unique_tags = sorted({tag for sentence_tags in tags for tag in sentence_tags})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# 3. Format as HuggingFace Dataset
examples = [
    {"tokens": words, "ner_tags": [tag2id[tag] for tag in word_tags]}
    for words, word_tags in zip(sentences, tags)
]
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=SEED)
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

# 4. Load tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The checkpoint ships without a token-classification head, so the classifier
# weights are drawn at random during from_pretrained (see the "newly
# initialized" notice in this cell's output). Seed torch first so that the
# starting point, and therefore the reported scores, can be reproduced.
torch.manual_seed(SEED)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

# 5. Tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and project word tags onto sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of tag ids per sentence, indexed by word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Within
        each sentence, the first sub-token of every word carries that word's
        tag id; special tokens and the remaining sub-tokens of a word carry -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    batch_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        previous = None
        for current in encoded.word_ids(batch_index=row):
            # Only a position that opens a new word receives a real label.
            opens_word = current is not None and current != previous
            row_labels.append(word_tags[current] if opens_word else -100)
            previous = current
        batch_labels.append(row_labels)

    encoded["labels"] = batch_labels
    return encoded

# Apply the alignment function to both splits, one batch of sentences at a time.
tokenized_dataset = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)

# 6. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# 7. Metric computation
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric functions.

    Args:
        p: Pair ``(scores, label_ids)``. The arg-max over axis 2 of ``scores``
            is taken as the predicted tag id for each position.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", computed on tag
        names after dropping every position whose label id is -100.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for gold_row, pred_row in zip(label_ids, predicted_ids):
        # Positions labelled -100 are excluded from both sequences.
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2tag[gold] for gold, _ in kept])
        true_predictions.append([id2tag[pred] for _, pred in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results_ioe_bert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs_ioe_bert"
)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train and evaluate
trainer.train()
results = trainer.evaluate()

# 11. Detailed predictions
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

true_labels = [[id2tag[l] for (l, p) in zip(label, prediction) if l != -100]
               for label, prediction in zip(labels, predictions)]
true_predictions = [[id2tag[p] for (l, p) in zip(label, prediction) if l != -100]
                    for label, prediction in zip(labels, predictions)]

# 12. Flatten for classification report
flat_true = [tag for sublist in true_labels for tag in sublist]
flat_pred = [tag for sublist in true_predictions for tag in sublist]

# 13. Print reports
print("\nIOE Classification Report (BERT):")
print(sk_classification_report(flat_true, flat_pred, digits=4))

print("\nEvaluation Results:")
print(f"Accuracy:  {accuracy_score(true_labels, true_predictions):.4f}")
print(f"Precision: {precision_score(true_labels, true_predictions):.4f}")
print(f"Recall:    {recall_score(true_labels, true_predictions):.4f}")
print(f"F1 Score:  {f1_score(true_labels, true_predictions):.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3006 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss



IOE Classification Report (BERT):
              precision    recall  f1-score   support

           E     0.9819    0.9611    0.9714       283
           I     0.9939    0.9560    0.9746       341
           O     0.9979    0.9996    0.9988     11138

    accuracy                         0.9974     11762
   macro avg     0.9913    0.9723    0.9816     11762
weighted avg     0.9974    0.9974    0.9974     11762


Evaluation Results:
Accuracy:  0.9974
Precision: 0.9537
Recall:    0.9470
F1 Score:  0.9504
