# This notebook is run on Google Colab

In [None]:
# Install needed libraries
#!pip install transformers datasets seqeval -q
!pip install -U transformers




In [None]:
# Mount Google Drive at /content/drive. The CoNLL input file and the directory the
# fine-tuned model is saved to both live under this mount point, so the cells that
# read or write those paths depend on this cell having succeeded.
from google.colab import drive
drive.mount('/content/drive')


MessageError: Error: credential propagation was unsuccessful

In [None]:
import pandas as pd

def parse_conll(file_path):
    """Read a two-column CoNLL-style file into a DataFrame, one row per sentence.

    Every non-blank line must contain exactly two whitespace-separated fields,
    ``token label``. Blank lines separate sentences. The final sentence is kept
    even when the file does not end with a blank line.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        pandas.DataFrame with columns 'tokens' and 'ner_tags'; each cell is a
        list (words / tag strings) covering one sentence.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields. The
            message names the file and the 1-based line number.
    """
    tokens, labels = [], []
    sentence_tokens, sentence_labels = [], []

    with open(file_path, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line: close the current sentence (consecutive blanks are ignored).
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens, sentence_labels = [], []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'token label', got {line.strip()!r}"
                )
            word, label = fields
            sentence_tokens.append(word)
            sentence_labels.append(label)

    # Flush the last sentence when the file has no trailing blank line;
    # without this the final sentence would be silently dropped.
    if sentence_tokens:
        tokens.append(sentence_tokens)
        labels.append(sentence_labels)

    return pd.DataFrame({'tokens': tokens, 'ner_tags': labels})

# Labelled CoNLL file on the mounted Drive (needs the drive.mount cell to have run).
file_path = "/content/drive/MyDrive/Colab Notebooks/Untitled folder/CoNLL_data.conll"
# One row per sentence: 'tokens' (list of words) and 'ner_tags' (list of tag strings).
df = parse_conll(file_path)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for validation; the fixed random_state makes the
# split the same on every run.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Choose a pretrained multilingual or Amharic-friendly model.
# `model_name` is reused further down when the token-classification model is
# loaded, so tokenizer and model come from the same checkpoint.
model_name = "xlm-roberta-base"  # or "Davlan/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build label mappings: gather every distinct tag that occurs anywhere in the
# data (train and validation alike), order the tags alphabetically, and number
# them in that order.
label_list = sorted(set(tag for sentence_tags in df['ner_tags'] for tag in sentence_tags))
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: idx for idx, tag in id_to_label.items()}

def tokenize_and_align(example):
    """Tokenize one pre-split sentence and attach a label id to each sub-token position.

    The global `tokenizer` is called on example["tokens"] with
    is_split_into_words=True. For each entry of the resulting word_ids():
    the first position of a word gets `label_to_id` of that word's tag, while
    positions whose word id is None and repeated positions of the same word
    get -100 (the value compute_metrics filters out).

    Args:
        example: Mapping with "tokens" (list of words) and "ner_tags"
            (list of tag strings, one per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    last_seen = None
    for current in encoding.word_ids():
        starts_new_word = current is not None and current != last_seen
        if starts_new_word:
            aligned.append(label_to_id[example["ner_tags"][current]])
        else:
            aligned.append(-100)
        last_seen = current

    encoding["labels"] = aligned
    return encoding

# Convert each pandas split to a Hugging Face Dataset and tokenize it. map() is
# called without batched=True, so tokenize_and_align receives one example at a time.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_and_align)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_and_align)


Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint named by `model_name` with a token-classification head that
# has one output per entry of `label_list`. The id<->label mappings are passed
# along so the model carries the tag names with it.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Training configuration for the XLM-R run.
training_args = TrainingArguments(
    output_dir="./ner_model_output",
    # Evaluate and checkpoint once per epoch (the two strategies are kept identical).
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_dir="./ner_logs",
    # Keep a single checkpoint on disk and reload the best one when training ends.
    save_total_limit=1,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
    # Disable Weights & Biases (and any other) reporting integration.
    report_to="none",
)

In [None]:
from transformers import Trainer, DataCollatorForTokenClassification
import evaluate # Import evaluate library for seqeval metric
import numpy as np # Import numpy for array operations

# `id_to_label` must already be defined before compute_metrics runs: it is read
# there as a global. In this section it comes from the label-mapping cell above,
# which derives it from the tags found in the data.


# Load the seqeval metric through the `evaluate` library; compute_metrics below
# calls metric.compute(...) on lists of tag strings.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert raw predictions into seqeval scores.

    Args:
        p: Pair ``(predictions, label_ids)``. Predictions are reduced with
            argmax over the last axis; positions whose label id is -100 are
            dropped from both predictions and references.

    Returns:
        Dict with "f1", "precision", "recall" and "accuracy" (the seqeval
        ``overall_*`` values) plus "full_seqeval_report" holding the complete
        seqeval result dict.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=-1)

    # References: tag names of every position that is not masked with -100.
    gold_tags = []
    for gold_row in gold_ids:
        gold_tags.append([id_to_label[gold_id] for gold_id in gold_row if gold_id != -100])

    # Predictions: tag names at the same unmasked positions.
    predicted_tags = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        row_tags = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id != -100:
                row_tags.append(id_to_label[pred_id])
        predicted_tags.append(row_tags)

    results = metric.compute(predictions=predicted_tags, references=gold_tags)

    summary = {}
    for short_name, seqeval_key in (
        ("f1", "overall_f1"),
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("accuracy", "overall_accuracy"),
    ):
        summary[short_name] = results[seqeval_key]
    # The nested per-entity report is returned as well, next to the scalar scores.
    summary["full_seqeval_report"] = results
    return summary

# --- Trainer Setup ---
# Uses `model`, `training_args`, `train_dataset`, `val_dataset` and `tokenizer`
# defined in the earlier cells of this section.
#
# The tokenizer is handed to the data collator only. It is deliberately not
# passed as `tokenizer=` to Trainer(...): that argument was removed here because
# it triggered a FutureWarning in Trainer.__init__.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset, # tokenized training split
    eval_dataset=val_dataset,    # tokenized validation split
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer), # collator built from the same tokenizer
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy,Full Seqeval Report
1,No log,0.268624,0.339623,0.391304,0.3,0.915301,"{'LOC': {'precision': 0.47368421052631576, 'recall': 0.6428571428571429, 'f1': 0.5454545454545454, 'number': 14}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11}, 'overall_precision': 0.391304347826087, 'overall_recall': 0.3, 'overall_f1': 0.33962264150943394, 'overall_accuracy': 0.9153005464480874}"
2,No log,0.237818,0.360656,0.354839,0.366667,0.92623,"{'LOC': {'precision': 0.5238095238095238, 'recall': 0.7857142857142857, 'f1': 0.6285714285714286, 'number': 14}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11}, 'overall_precision': 0.3548387096774194, 'overall_recall': 0.36666666666666664, 'overall_f1': 0.36065573770491804, 'overall_accuracy': 0.9262295081967213}"
3,No log,0.203631,0.474576,0.482759,0.466667,0.930328,"{'LOC': {'precision': 0.7647058823529411, 'recall': 0.9285714285714286, 'f1': 0.8387096774193549, 'number': 14}, 'PRICE': {'precision': 0.09090909090909091, 'recall': 0.2, 'f1': 0.12500000000000003, 'number': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11}, 'overall_precision': 0.4827586206896552, 'overall_recall': 0.4666666666666667, 'overall_f1': 0.47457627118644075, 'overall_accuracy': 0.930327868852459}"
4,No log,0.178128,0.526316,0.555556,0.5,0.943989,"{'LOC': {'precision': 0.7058823529411765, 'recall': 0.8571428571428571, 'f1': 0.7741935483870968, 'number': 14}, 'PRICE': {'precision': 0.5, 'recall': 0.6, 'f1': 0.5454545454545454, 'number': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11}, 'overall_precision': 0.5555555555555556, 'overall_recall': 0.5, 'overall_f1': 0.5263157894736842, 'overall_accuracy': 0.9439890710382514}"
5,No log,0.161484,0.666667,0.75,0.6,0.959016,"{'LOC': {'precision': 0.9333333333333333, 'recall': 1.0, 'f1': 0.9655172413793104, 'number': 14}, 'PRICE': {'precision': 0.8, 'recall': 0.8, 'f1': 0.8000000000000002, 'number': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11}, 'overall_precision': 0.75, 'overall_recall': 0.6, 'overall_f1': 0.6666666666666665, 'overall_accuracy': 0.9590163934426229}"
6,No log,0.145229,0.642857,0.692308,0.6,0.963115,"{'LOC': {'precision': 0.9333333333333333, 'recall': 1.0, 'f1': 0.9655172413793104, 'number': 14}, 'PRICE': {'precision': 0.8, 'recall': 0.8, 'f1': 0.8000000000000002, 'number': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11}, 'overall_precision': 0.6923076923076923, 'overall_recall': 0.6, 'overall_f1': 0.6428571428571429, 'overall_accuracy': 0.9631147540983607}"
7,No log,0.1501,0.642857,0.692308,0.6,0.961749,"{'LOC': {'precision': 0.9333333333333333, 'recall': 1.0, 'f1': 0.9655172413793104, 'number': 14}, 'PRICE': {'precision': 0.8, 'recall': 0.8, 'f1': 0.8000000000000002, 'number': 5}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 11}, 'overall_precision': 0.6923076923076923, 'overall_recall': 0.6, 'overall_f1': 0.6428571428571429, 'overall_accuracy': 0.9617486338797814}"


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=70, training_loss=0.20672005244663783, metrics={'train_runtime': 309.909, 'train_samples_per_second': 1.762, 'train_steps_per_second': 0.226, 'total_flos': 58704603109152.0, 'train_loss': 0.20672005244663783, 'epoch': 7.0})

In [None]:
# Write the fine-tuned model and the tokenizer files into the same directory on
# the mounted Drive (needs the drive.mount cell to have succeeded).
model_path = "/content/drive/MyDrive/ner_finetuned_amharic"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('/content/drive/MyDrive/ner_finetuned_amharic/tokenizer_config.json',
 '/content/drive/MyDrive/ner_finetuned_amharic/special_tokens_map.json',
 '/content/drive/MyDrive/ner_finetuned_amharic/sentencepiece.bpe.model',
 '/content/drive/MyDrive/ner_finetuned_amharic/added_tokens.json',
 '/content/drive/MyDrive/ner_finetuned_amharic/tokenizer.json')

## DistilBERT

In [None]:
from transformers import AutoTokenizer

# Load a DistilBERT tokenizer.
# Use 'distilbert-base-uncased' for English, or 'distilbert-base-multilingual-cased' for multilingual.
# NOTE(review): the next cell reassigns `tokenizer` to the multilingual-cased
# checkpoint before anything uses this one, so this load does not affect training.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Or for multilingual:
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:

# --- STEP 2: DEFINE YOUR LABELS (THIS IS CRUCIAL AND MUST COME FIRST) ---
# The tag inventory must match the entity tags used in the CoNLL data. A tag's
# position in this list is the integer id used for it everywhere below.
label_names = [
    "O",
    "B-PRODUCT", "I-PRODUCT",
    "B-PRICE", "I-PRICE",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
]

# Forward (id -> tag) and reverse (tag -> id) lookups, both following list order
id_to_label = dict(enumerate(label_names))
label_to_id = {name: idx for idx, name in id_to_label.items()}

print("Label names defined:", label_names)
print("ID to Label mapping:", id_to_label)
# -----------------------------------------------------------


# --- STEP 3: LOAD TOKENIZER (After labels are defined) ---
from transformers import AutoTokenizer
# Choose your DistilBERT model:
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # For English
# This assignment replaces whatever `tokenizer` an earlier cell created.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased") # For multilingual (Amharic/mixed)
# -----------------------------------------------------------


# --- STEP 4: LOAD MODEL (After labels and tokenizer are defined) ---
from transformers import AutoModelForTokenClassification
# The checkpoint name is repeated here as a literal; it has to stay identical to
# the one given to the tokenizer above.
model = AutoModelForTokenClassification.from_pretrained(
    # Use the same model name as your tokenizer
    "distilbert-base-multilingual-cased", # OR "distilbert-base-uncased"
    num_labels=len(label_names), # one output per entry of label_names
    id2label=id_to_label,
    label2id=label_to_id
)
# -----------------------------------------------------------


# --- STEP 5: PREPARE YOUR DATA (DUMMY DATA FOR EXAMPLE, REPLACE WITH YOUR REAL DATA LOADING) ---
# Two hand-written sentences stand in for the real CoNLL data. Tags are written
# by name and converted to integer ids through `label_to_id`, because
# tokenize_and_align_labels copies the values of "ner_tags" straight into "labels".
from datasets import Dataset

raw_data = {
    "tokens": [
        ["EthioMart", "is", "a", "company", "selling", "coffee", "for", "300", "ETB", "."],
        ["The", "store", "is", "located", "in", "Addis", "Ababa", "."],
    ],
    "ner_tags": [
        [
            label_to_id[tag]
            for tag in ("B-ORG", "O", "O", "O", "O", "B-PRODUCT", "O", "B-PRICE", "I-PRICE", "O")
        ],
        [
            label_to_id[tag]
            for tag in ("O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O")
        ],
    ],
}
dataset = Dataset.from_dict(raw_data)

# For a real project, you'd likely have actual train/val splits.
# Here both names point at the same two-sentence dataset.
train_dataset = val_dataset = dataset


# Define tokenize_and_align_labels function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align label ids to sub-tokens.

    The global `tokenizer` is called on examples["tokens"] with
    is_split_into_words=True. For each sentence, the first position of every
    word receives that word's entry from examples["ner_tags"]; positions whose
    word id is None and repeated positions of the same word receive -100.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of per-word label-id lists).

    Returns:
        The tokenizer output with an added "labels" entry (one list per sentence).
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(batch_index, word_labels):
        # Label ids for one sentence, one entry per tokenizer position.
        ids, prior = [], None
        for wid in tokenized_inputs.word_ids(batch_index=batch_index):
            if wid is None or wid == prior:
                ids.append(-100)
            else:
                ids.append(word_labels[wid])
            prior = wid
        return ids

    tokenized_inputs["labels"] = [
        align(row, word_labels) for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

# Apply tokenization. batched=True matches tokenize_and_align_labels, which
# expects lists of sentences rather than a single example.
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)
# -----------------------------------------------------------


# --- STEP 6: SET UP TRAINING ARGUMENTS ---
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./ner_distilbert_output",
    # Evaluate and checkpoint once per epoch (the two strategies are kept identical).
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_dir="./ner_distilbert_logs",
    # Keep a single checkpoint and reload the best one (by "f1" from compute_metrics) at the end.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none", # Reporting integrations (e.g. WandB) are disabled; remove this line to enable them
)
# -----------------------------------------------------------


# --- STEP 7: DEFINE COMPUTE METRICS ---
import evaluate
import numpy as np

# seqeval metric loaded through `evaluate`; compute_metrics below reads it as a global.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Score predictions with seqeval.

    Args:
        p: Pair ``(predictions, label_ids)``. Predictions are arg-maxed over
            the last axis. Positions labelled -100 are excluded from both the
            predicted and the reference tag sequences.

    Returns:
        Dict with "f1", "precision", "recall", "accuracy" taken from seqeval's
        ``overall_*`` entries, and "full_seqeval_report" with the whole result.
    """
    scores, references = p
    best = np.argmax(scores, axis=-1)

    def is_real(label_id):
        # -100 marks positions that carry no label.
        return label_id != -100

    true_labels = [
        [id_to_label[ref] for ref in ref_row if is_real(ref)]
        for ref_row in references
    ]
    true_preds = [
        [id_to_label[guess] for guess, ref in zip(guess_row, ref_row) if is_real(ref)]
        for guess_row, ref_row in zip(best, references)
    ]

    results = metric.compute(predictions=true_preds, references=true_labels)

    return dict(
        f1=results["overall_f1"],
        precision=results["overall_precision"],
        recall=results["overall_recall"],
        accuracy=results["overall_accuracy"],
        full_seqeval_report=results,  # Keep this for detailed logging if desired
    )
# -----------------------------------------------------------


# --- STEP 8: CREATE AND TRAIN THE TRAINER ---
from transformers import Trainer, DataCollatorForTokenClassification

# The tokenizer is given to the data collator only, not to Trainer itself.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

# Start training!
# NOTE(review): train and eval sets are both the same two dummy sentences built
# earlier in this cell, so the metrics reported here are not a held-out score.
trainer.train()

# --- Save your fine-tuned model ---
# Relative path: the model is written to the runtime's working directory, not to Drive.
trainer.save_model("./my_finetuned_distilbert_ner")
# -----------------------------------------------------------

Label names defined: ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
ID to Label mapping: {0: 'O', 1: 'B-PRODUCT', 2: 'I-PRODUCT', 3: 'B-PRICE', 4: 'I-PRICE', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy,Full Seqeval Report
1,No log,2.018904,0.0,0.0,0.0,0.5,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.5}"
2,No log,1.816963,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
3,No log,1.664711,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
4,No log,1.552024,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
5,No log,1.472933,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
6,No log,1.422557,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
7,No log,1.398064,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## mBERT

In [None]:
# --- STEP 1: INSTALL LIBRARIES (If you haven't already) ---
# Ensure you have these installed. Restart runtime after installation.
# !pip install transformers datasets evaluate seqeval
# -----------------------------------------------------------


# --- STEP 2: DEFINE YOUR LABELS (CRUCIAL: Must match your CoNLL data) ---
# List order fixes the integer id of every tag.
label_names = [
    "O",
    "B-PRODUCT", "I-PRODUCT",
    "B-PRICE", "I-PRICE",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
]
label_to_id = dict(zip(label_names, range(len(label_names))))
id_to_label = {idx: name for name, idx in label_to_id.items()}

print("Label names defined:", label_names)
print("ID to Label mapping:", id_to_label)
# -----------------------------------------------------------


# --- STEP 3: LOAD mBERT TOKENIZER ---
from transformers import AutoTokenizer
# Replaces the `tokenizer` left over from the previous section.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
# -----------------------------------------------------------


# --- STEP 4: LOAD mBERT MODEL FOR TOKEN CLASSIFICATION ---
from transformers import AutoModelForTokenClassification
# Same checkpoint name as the tokenizer above; the classification head gets one
# output per entry of label_names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased", # Using mBERT
    num_labels=len(label_names),
    id2label=id_to_label,
    label2id=label_to_id
)
# -----------------------------------------------------------


# --- STEP 5: PREPARE YOUR DATA (DUMMY DATA FOR EXAMPLE, REPLACE WITH YOUR REAL DATA LOADING) ---
# Placeholder for loading the CoNLL files and splitting them into
# train_dataset and val_dataset.

# Example dummy data structure: two sentences, with tags given by name and
# mapped to their integer ids via label_to_id.
from datasets import Dataset
raw_data = dict(
    tokens=[
        ["EthioMart", "is", "a", "company", "selling", "coffee", "for", "300", "ETB", "."],
        ["The", "store", "is", "located", "in", "Addis", "Ababa", "."],
    ],
    ner_tags=[
        list(map(
            label_to_id.__getitem__,
            ["B-ORG", "O", "O", "O", "O", "B-PRODUCT", "O", "B-PRICE", "I-PRICE", "O"],
        )),
        list(map(
            label_to_id.__getitem__,
            ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O"],
        )),
    ],
)
dataset = Dataset.from_dict(raw_data)
# For a real project, you'd likely have actual train/val splits;
# here training and validation share the same dataset object.
train_dataset, val_dataset = dataset, dataset


# Define tokenize_and_align_labels function (remains the same)
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build aligned "labels".

    Calls the global `tokenizer` on examples["tokens"] (is_split_into_words=True).
    Within each sentence, a position gets the label id of its word only when it
    is the first position of that word; positions with word id None or
    continuing the previous word get -100.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of per-word label-id lists).

    Returns:
        The tokenizer output with a "labels" list added (one list per sentence).
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    all_labels = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        word_ids = list(tokenized_inputs.word_ids(batch_index=row))
        # Pair every position with the word id of the position before it.
        preceding = [None] + word_ids[:-1]
        all_labels.append([
            word_labels[current] if current is not None and current != before else -100
            for current, before in zip(word_ids, preceding)
        ])

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply tokenization. batched=True matches tokenize_and_align_labels, which
# expects lists of sentences rather than a single example.
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)
# -----------------------------------------------------------


# --- STEP 6: SET UP TRAINING ARGUMENTS ---
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./ner_mbert_output", # Changed output directory name for clarity
    # Evaluate and checkpoint once per epoch (the two strategies are kept identical).
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7, # Or whatever number of epochs you prefer
    weight_decay=0.01,
    logging_dir="./ner_mbert_logs", # Changed log directory name
    # Keep a single checkpoint and reload the best one (by "f1" from compute_metrics) at the end.
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none", # Reporting integrations (e.g. WandB) are disabled; remove this line to enable them
)
# -----------------------------------------------------------


# --- STEP 7: DEFINE COMPUTE METRICS ---
import evaluate
import numpy as np

# seqeval metric loaded through `evaluate`; compute_metrics below reads it as a global.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute seqeval scores from predictions and label ids.

    Args:
        p: Pair ``(predictions, label_ids)``. The predicted class is the
            argmax over the last axis. Positions whose label id is -100 are
            left out of both sequences handed to seqeval.

    Returns:
        Dict with "f1", "precision", "recall", "accuracy" (seqeval's
        ``overall_*`` values) and "full_seqeval_report" (the full result dict).
    """
    logits, label_ids = p
    pred_ids = np.argmax(logits, axis=-1)

    true_labels = [
        [id_to_label[tag_id] for tag_id in label_row if tag_id != -100]
        for label_row in label_ids
    ]

    true_preds = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        # pair = (predicted id, reference id) at one position
        true_preds.append(
            [id_to_label[pair[0]] for pair in zip(pred_row, label_row) if pair[1] != -100]
        )

    results = metric.compute(predictions=true_preds, references=true_labels)

    scores = {"f1": results["overall_f1"]}
    scores["precision"] = results["overall_precision"]
    scores["recall"] = results["overall_recall"]
    scores["accuracy"] = results["overall_accuracy"]
    scores["full_seqeval_report"] = results
    return scores
# -----------------------------------------------------------


# --- STEP 8: CREATE AND TRAIN THE TRAINER ---
from transformers import Trainer, DataCollatorForTokenClassification

# The tokenizer is given to the data collator only, not to Trainer itself.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

# Start training!
# NOTE(review): train and eval sets are both the same two dummy sentences built
# earlier in this cell, so the metrics reported here are not a held-out score.
trainer.train()

# --- Save your fine-tuned model ---
# Relative path: the model is written to the runtime's working directory, not to Drive.
trainer.save_model("./my_finetuned_mbert_ner")
# -----------------------------------------------------------

Label names defined: ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
ID to Label mapping: {0: 'O', 1: 'B-PRODUCT', 2: 'I-PRODUCT', 3: 'B-PRICE', 4: 'I-PRICE', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy,Full Seqeval Report
1,No log,1.639042,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
2,No log,1.244776,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
3,No log,1.084761,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
4,No log,1.018766,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
5,No log,0.973018,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
6,No log,0.935486,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"
7,No log,0.915062,0.0,0.0,0.0,0.666667,"{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.6666666666666666}"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
