# 1. Installing and importing libraries.

In [None]:
pip install seqeval

In [None]:
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, Features, Value, Sequence
from seqeval.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import ast

# 2. Loading and analyzing dataset

In [None]:
# Load dataset
# Fetches the "LocalDoc/azerbaijani-ner-dataset" dataset; the cells below access
# its "train" split via ds["train"].
ds = load_dataset("LocalDoc/azerbaijani-ner-dataset")

In [None]:
ds["train"][0]

## As you can see, 'tokens' and 'ner_tags' are stored as long strings that resemble lists. For token classification, they should be converted to actual lists.

In [None]:
ds["train"].features

In [None]:
print(ds)

In [None]:
# Show the raw 'tokens' and 'ner_tags' fields of the first training example.
sample = ds["train"][0]
for caption, column in (("Tokens:", "tokens"), ("NER Tags:", "ner_tags")):
    print(caption, sample[column])

# 3. Preprocessing

## 3.1 Some rows have null tokens and ner tags

In [None]:
def has_none_fields(example):
    """Return True when the example's 'tokens' or 'ner_tags' field is None."""
    return example["tokens"] is None or example["ner_tags"] is None

# Filter the bad rows only
bad_rows = ds["train"].filter(has_none_fields)

print("Number of null rows: ", len(bad_rows), "\n")

# Display first few problematic rows.
# Cap the preview at the number of rows actually found so the cell does not
# raise IndexError when fewer than 10 null rows exist.
for i in range(min(10, len(bad_rows))):
    row = bad_rows[i]  # fetch the row once instead of once per printed field
    print(f"Index: {row['index']}")
    print(f"Tokens: {row['tokens']}")
    print(f"NER Tags: {row['ner_tags']}")
    print("-" * 40)

In [None]:
# Example bad sample: look up one row by its 'index' value across every split.
bad_index = 'dac55265-38cd-4c4b-9e56-a48a77e108d4'
bad_sample = ds.filter(lambda row: row["index"] == bad_index)
print(bad_sample['train'][0])

## 3.2 In some rows, the lengths of 'tokens' and 'ner_tags' do not match. This inconsistency must also be handled during parsing.

In [None]:
# Parse string fields

# 'index' values of rows whose token and tag lists have different lengths.
# Reset here so that re-running the cell does not accumulate duplicates.
mismatch_ids = []


def safe_parse_strings(example):
    """Parse the stringified 'tokens' and 'ner_tags' fields of one example.

    Args:
        example: a row with 'index', 'tokens' and 'ner_tags' entries, where the
            last two are string representations of Python lists (or None).

    Returns:
        A dict with 'index', 'tokens' (list of str) and 'ner_tags' (list of int),
        or None when the row cannot be used: a field fails to parse, a list is
        empty, or the two lists differ in length (the row's index is then also
        appended to the module-level ``mismatch_ids``).
    """
    try:
        tokens = [str(token) for token in ast.literal_eval(example["tokens"])]
        ner_tags = [int(tag) for tag in ast.literal_eval(example["ner_tags"])]
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Malformed or null field: these are the error types literal_eval is
        # documented to raise, plus int()/iteration failures on bad values.
        return None

    if not tokens or not ner_tags:
        print(f"Empty list in example {example['index']}")
        return None
    if len(tokens) != len(ner_tags):
        mismatch_ids.append(example['index'])
        return None
    return {
        "index": example["index"],
        "tokens": tokens,
        "ner_tags": ner_tags,
    }


def _is_parsable(example):
    """Return True when safe_parse_strings accepts the example."""
    return safe_parse_strings(example) is not None


# `map` cannot drop rows: returning None from the mapped function does not remove
# the example. Unusable rows are therefore removed with `filter` first, and only
# the surviving rows are converted with `map`.
# The filter pass bypasses the on-disk cache because it fills `mismatch_ids` as a
# side effect; a cached result would leave that list empty on a re-run.
parsed_ds = ds.filter(_is_parsable, load_from_cache_file=False).map(safe_parse_strings)

In [None]:
len(mismatch_ids)

In [None]:
parsed_ds.shape

In [None]:
# Define and apply schema: string ids, string token lists, int32 tag lists.
schema = {
    "index": Value("string"),
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(Value("int32")),
}
features = Features(schema)
parsed_ds = parsed_ds.cast(features)

In [None]:
parsed_ds

In [None]:
# Define label mapping: a label's position in label_list is its integer id.
label_list = [
    "O",
    "PERSON", "LOCATION", "ORGANISATION", "DATE", "TIME", "MONEY",
    "PERCENTAGE", "FACILITY", "PRODUCT", "EVENT", "ART", "LAW",
    "LANGUAGE", "GPE", "NORP", "ORDINAL", "CARDINAL", "DISEASE",
    "CONTACT", "ADAGE", "QUANTITY", "MISCELLANEOUS", "POSITION", "PROJECT",
]
num_labels = len(label_list)
id2label = dict(enumerate(label_list))
label2id = {name: position for position, name in id2label.items()}

# Verify unique tags: collect every tag id that occurs in the parsed train split.
seen_tags = set()
for example in parsed_ds["train"]:
    seen_tags.update(example["ner_tags"])
unique_tags = sorted(seen_tags)
print("Unique Tags:", unique_tags)
print("Number of Labels:", num_labels)

## 3.3 Loading the pretrained model, tokenizing, and aligning labels

In [None]:
# Multilingual BERT model is used as a pretrained model

model_checkpoint = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Label configuration handed to the token-classification model.
label_config = {
    "num_labels": num_labels,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, **label_config)

In [None]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word-level tags to sub-tokens.

    The first sub-token of every word receives that word's tag. Every other
    position (special/padding tokens, which have no word id, and the remaining
    sub-tokens of a word) receives -100 so that it is ignored in the loss.

    Args:
        example: a row with 'tokens' (list of words) and 'ner_tags' (list of ids).

    Returns:
        The tokenizer output with an added 'labels' list.
    """
    encoding = tokenizer(
        example["tokens"],
        truncation=True,
        max_length=128,  # change if needed
        padding="max_length",
        is_split_into_words=True,
    )

    word_tags = example["ner_tags"]
    aligned = []
    last_word = None
    for current_word in encoding.word_ids():
        starts_new_word = current_word is not None and current_word != last_word
        aligned.append(word_tags[current_word] if starts_new_word else -100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding

tokenized_dataset = parsed_ds.map(tokenize_and_align_labels, batched=False)

## 3.4 Splitting the dataset in a 7:1:2 ratio

In [None]:
# First split into train (70%) + temp (30%)
split = tokenized_dataset["train"].train_test_split(test_size=0.3, seed=42)
train_ds, temp_ds = split["train"], split["test"]

# Then split temp into val and test (1/3 and 2/3 of 0.3)
val_test_split = temp_ds.train_test_split(test_size=2/3, seed=42)
val_ds, test_ds = val_test_split["train"], val_test_split["test"]

print("Train:", len(train_ds), "Val:", len(val_ds), "Test:", len(test_ds))

# 4. Training

In [None]:
# Batch collator for token classification, built from the same tokenizer.
# NOTE(review): examples are already padded to max_length=128 during tokenization,
# so the collator presumably has no additional padding to do — confirm.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from seqeval.metrics import classification_report

# Define metrics

def _to_iob(tag_sequence):
    """Convert a sequence of plain labels (e.g. "PERSON") into IOB2 tags.

    The first token of every run of identical non-"O" labels becomes
    "B-<label>", the following tokens of the run become "I-<label>", and "O"
    stays "O".
    """
    iob_tags = []
    previous = "O"
    for label in tag_sequence:
        if label == "O":
            iob_tags.append("O")
        elif label == previous:
            iob_tags.append(f"I-{label}")
        else:
            iob_tags.append(f"B-{label}")
        previous = label
    return iob_tags


def compute_metrics(p):
    """Compute entity-level precision, recall and F1 for the Trainer.

    Args:
        p: a (predictions, labels) pair; predictions are per-token scores over
            the label set and labels are label ids, with -100 marking positions
            to ignore.

    Returns:
        A dict with "precision", "recall" and "f1".
    """
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    # Drop ignored positions (-100) and map ids back to label names.
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]

    # seqeval interprets the first character of each tag as the chunk marker
    # (B/I/O/...). With un-prefixed labels, names starting with "O"
    # (ORGANISATION, ORDINAL) are treated as outside and every other name loses
    # its first letter. Converting to IOB2 makes the entities score correctly.
    iob_predictions = [_to_iob(sequence) for sequence in true_predictions]
    iob_labels = [_to_iob(sequence) for sequence in true_labels]

    return {
        "precision": precision_score(iob_labels, iob_predictions),
        "recall": recall_score(iob_labels, iob_predictions),
        "f1": f1_score(iob_labels, iob_predictions),
    }

In [None]:
# Define trainer

from transformers import TrainingArguments, Trainer

# Output locations, optimisation settings, and the evaluation/checkpoint/logging
# schedule for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./ner-az",
    logging_dir="./logs",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=10,
)

# Train on the training split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate(test_ds)

In [None]:
# Save model, tokenizer, and config
# Both artefacts go to the same directory, which later cells load the model,
# tokenizer and inference pipeline from.
model_path = "./ner_model_az"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

print(f"Model saved to: {model_path}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load logs from the trainer's state
logs = trainer.state.log_history

# Convert to DataFrame
df_logs = pd.DataFrame(logs)

# Training and evaluation metrics are logged as separate records, so a record
# carries either "loss" or "eval_loss", not both. Each curve is therefore built
# only from the records that contain its metric; plotting the full DataFrame
# columns would interleave NaN values and break up the lines.
train_points = [(entry["step"], entry["loss"]) for entry in logs if "loss" in entry]
eval_points = [(entry["step"], entry["eval_loss"]) for entry in logs if "eval_loss" in entry]
# NOTE(review): any evaluate() call made after training (e.g. on the test set)
# presumably also adds an "eval_loss" record and would appear on this curve — confirm.

# Plot Losses
plt.figure(figsize=(10, 4))
if train_points:
    train_steps, train_losses = zip(*train_points)
    plt.plot(train_steps, train_losses, label="Train Loss", marker='o')
if eval_points:
    eval_steps, eval_losses = zip(*eval_points)
    plt.plot(eval_steps, eval_losses, label="Validation Loss", marker='x')
plt.xlabel("Step")
plt.ylabel("Loss")
plt.title("Training & Validation Loss")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Get predictions
predictions, labels, _ = trainer.predict(test_ds)
preds = predictions.argmax(-1)

# For every sequence keep only the positions whose gold label is not -100,
# pairing each gold id with the predicted id at the same position.
kept_pairs = [
    [(gold, guess) for guess, gold in zip(pred_row, label_row) if gold != -100]
    for pred_row, label_row in zip(preds, labels)
]

# Map the surviving ids back to label names, one list per sequence.
true_labels = [[id2label[gold] for gold, _guess in row] for row in kept_pairs]
true_preds = [[id2label[guess] for _gold, guess in row] for row in kept_pairs]

In [None]:
def _iob_tagged(sequences):
    """Return copies of the label sequences rewritten as IOB2 tags.

    Within each sequence, the first token of a run of identical non-"O" labels
    becomes "B-<label>", the rest of the run "I-<label>"; "O" is unchanged.
    The input lists are not modified.
    """
    converted = []
    for sequence in sequences:
        iob_sequence = []
        previous = "O"
        for label in sequence:
            if label == "O":
                iob_sequence.append("O")
            elif label == previous:
                iob_sequence.append(f"I-{label}")
            else:
                iob_sequence.append(f"B-{label}")
            previous = label
        converted.append(iob_sequence)
    return converted


# seqeval interprets the first character of each tag as the chunk marker, so the
# un-prefixed label names must be converted to IOB2 first; otherwise labels that
# start with "O" (ORGANISATION, ORDINAL) are treated as outside and the other
# names are reported without their first letter.
print(classification_report(_iob_tagged(true_labels), _iob_tagged(true_preds)))

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Reload the tokenizer and model from the directory they were saved to.
# This rebinds the notebook-level names `tokenizer` and `model`.
model_path = "./ner_model_az"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

# Flatten labels
flat_true = [label for seq in true_labels for label in seq]
flat_pred = [label for seq in true_preds for label in seq]

# Every label that occurs as a gold or predicted label gets a row/column index.
labels_sorted = sorted(set(flat_true + flat_pred))
label_to_index = {label: i for i, label in enumerate(labels_sorted)}

y_true_idx = [label_to_index[t] for t in flat_true]
y_pred_idx = [label_to_index[p] for p in flat_pred]

cm = confusion_matrix(y_true_idx, y_pred_idx)

# Row-normalise so each row shows how one true label was distributed over the
# predictions. A label that occurs only in the predictions has an all-zero row;
# dividing by 1 there keeps the row at 0 instead of producing NaN and a
# divide-by-zero warning.
row_totals = cm.sum(axis=1)[:, np.newaxis]
row_totals[row_totals == 0] = 1
cm_norm = cm.astype('float') / row_totals

# Plot
plt.figure(figsize=(14, 12))
sns.heatmap(cm_norm, annot=True, fmt=".2f", xticklabels=labels_sorted, yticklabels=labels_sorted, cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Normalized Confusion Matrix for NER")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
from transformers import pipeline

# Load model and tokenizer from saved directory
saved_model_dir = "./ner_model_az"
ner_pipeline = pipeline(
    task="ner",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
    aggregation_strategy="simple",  # groups sub-tokens together
)

In [None]:
# Run the pipeline on a short sentence and list each recognised entity.
text = "2025-ci ildə Bakıda F1 keçirilib."
ner_results = ner_pipeline(text)

for entity in ner_results:
    word, group, score = entity['word'], entity['entity_group'], entity['score']
    print(f"Entity: {word}, Label: {group}, Score: {score:.2f}")

In [None]:
# Run the pipeline on a longer sentence and list each recognised entity.
text = "Azərbaycan Respublikasının paytaxtı Bakıda, 2024-cü ilin yayında keçirilən beynəlxalq konfrans zamanı, professor Əhmədov, ABŞ-dan gəlmiş qonaqlar və Avropa Birliyinin nümayəndələri ilə ətraf mühitin qorunması strategiyaları haqqında səmərəli müzakirələr apardıqdan sonra, yeni əməkdaşlıq imkanlarının yaranacağını bildirdi."
ner_results = ner_pipeline(text)

for entity in ner_results:
    word, group, score = entity['word'], entity['entity_group'], entity['score']
    print(f"Entity: {word}, Label: {group}, Score: {score:.2f}")
