# Prepare Dataset and Train Model 

## Load Dataset

Load the dataset from a JSON file.

In [29]:
from datasets import load_dataset

# Load the dataset from a JSONL file
dataset = load_dataset("json", data_files="data/ner_dataset.json")

# Check the structure
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'entities'],
        num_rows: 11084
    })
})


## Split Dataset

Reserve 10% for model testing.

In [30]:
# Split the dataset
dataset = dataset["train"].train_test_split(test_size=0.1)  # 10% for testing

train_dataset = dataset["train"]
test_dataset = dataset["test"]


## Tokenize Dataset

In [34]:
# Tokenize the dataset

import token
from transformers import AutoTokenizer

label_map = {
    "O": 0,  # "O" stands for "Outside" (no entity)
    "COMPANY": 1,
    "DOMAIN": 2,
    "IP_ADDR": 3,
    "URL": 4,
    "EMAIL": 5,
}

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=128, return_offsets_mapping=True
    )
    labels = []
    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        entity_positions = examples["entities"][i]
        label_ids = [label_map["O"]] * len(offsets)  # Initialize all tokens as "O"
        for entity in entity_positions:
            for idx, (start, end) in enumerate(offsets):
                if start >= entity["start"] and end <= entity["end"]:
                    label_ids[idx] = label_map[entity["label"]]  # Use numeric ID for the label
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    tokenized_inputs.pop("offset_mapping")  # Remove offset mapping as it’s not needed for training
    return tokenized_inputs

# Apply tokenization in batches
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

print(tokenized_train_dataset[0])
print(tokenized_test_dataset[0])

{'text': 'An error on the checkout page of xhaiuszeds.shop caused some transactions to fail during peak hours.', 'entities': [{'end': 48, 'label': 'DOMAIN', 'start': 33}], 'input_ids': [0, 4688, 5849, 15, 5, 28429, 1842, 9, 3023, 30279, 687, 329, 12080, 4, 22799, 1726, 103, 5538, 7, 5998, 148, 4996, 722, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Train the Model

In [35]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base", num_labels=len(label_map)
)

training_args = TrainingArguments(
    output_dir="models/ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="logs",
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

trainer.train()


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/1872 [00:00<?, ?it/s]

{'loss': 0.0532, 'grad_norm': 0.06708976626396179, 'learning_rate': 1.4658119658119658e-05, 'epoch': 0.8}


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 0.002198802540078759, 'eval_runtime': 4.6455, 'eval_samples_per_second': 238.724, 'eval_steps_per_second': 29.921, 'epoch': 1.0}
{'loss': 0.0016, 'grad_norm': 0.011260145343840122, 'learning_rate': 9.316239316239318e-06, 'epoch': 1.6}


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 0.0002460828691255301, 'eval_runtime': 4.6521, 'eval_samples_per_second': 238.389, 'eval_steps_per_second': 29.879, 'epoch': 2.0}
{'loss': 0.0005, 'grad_norm': 0.0021937601268291473, 'learning_rate': 3.974358974358974e-06, 'epoch': 2.4}


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 9.741134999785572e-05, 'eval_runtime': 4.7021, 'eval_samples_per_second': 235.854, 'eval_steps_per_second': 29.561, 'epoch': 3.0}
{'train_runtime': 431.4505, 'train_samples_per_second': 69.359, 'train_steps_per_second': 4.339, 'train_loss': 0.014817170934098909, 'epoch': 3.0}


TrainOutput(global_step=1872, training_loss=0.014817170934098909, metrics={'train_runtime': 431.4505, 'train_samples_per_second': 69.359, 'train_steps_per_second': 4.339, 'total_flos': 1954897055193600.0, 'train_loss': 0.014817170934098909, 'epoch': 3.0})

## Save the Trained Model

In [36]:
trainer.save_model("models/ner")
tokenizer.save_pretrained("models/ner")

('models/ner/tokenizer_config.json',
 'models/ner/special_tokens_map.json',
 'models/ner/vocab.json',
 'models/ner/merges.txt',
 'models/ner/added_tokens.json',
 'models/ner/tokenizer.json')

## Export Model to ONNX

Use:

```
optimum-cli export onnx --model models/ner --task token-classification models/onnx
```

## Verify Exported Model

In [37]:
import onnx
import torch
import onnxruntime as ort

onnx_model_path = "models/onnx/model.onnx"

# Load and check the ONNX model
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

# Run inference with ONNX Runtime
session = ort.InferenceSession(onnx_model_path)

# Prepare dummy input
input_ids = [[101, 2054, 2003, 1996, 2171, 102]]  # Token IDs (example)
attention_mask = [[1, 1, 1, 1, 1, 1]]  # Attention mask

inputs = {
    "input_ids": torch.tensor(input_ids, dtype=torch.long).numpy(),
    "attention_mask": torch.tensor(attention_mask, dtype=torch.long).numpy(),
}

# Run the ONNX model
outputs = session.run(None, inputs)
print("ONNX Model Outputs:", outputs)


ONNX Model Outputs: [array([[[ 4.4469266 ,  2.2834892 ,  1.2277882 , -3.4402235 ,
         -2.331243  , -3.9127386 ],
        [ 7.983002  , -0.6029357 , -0.89352286, -3.0848005 ,
         -1.571463  , -3.4708927 ],
        [ 4.613029  ,  1.105935  ,  1.8005015 , -3.7764654 ,
         -2.3312867 , -3.7697682 ],
        [ 3.2291615 ,  2.4427762 ,  2.321777  , -3.7389376 ,
         -2.4622188 , -3.6951158 ],
        [ 2.5670388 ,  3.318974  ,  2.627408  , -3.634867  ,
         -2.5895925 , -3.8985283 ],
        [ 3.1018884 ,  3.634954  ,  2.065163  , -3.442695  ,
         -2.5756822 , -4.1144876 ]]], dtype=float32)]


# Test

## Evaluate Model with Test Data

In [40]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import classification_report
import numpy as np

# Tokenized test dataset (ensure it's already tokenized)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Define the label map and ensure keys are plain integers
id2label = {int(v): k for k, v in label_map.items()}
label2id = {k: int(k) for k, v in id2label.items()}

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100) from predictions and labels
    true_predictions = [
        [id2label[int(p)] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[int(l)] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    # Use sklearn's classification report for detailed metrics
    results = classification_report(
        [item for sublist in true_labels for item in sublist],
        [item for sublist in true_predictions for item in sublist],
        output_dict=True,
    )
    return {
        "precision": results["macro avg"]["precision"],
        "recall": results["macro avg"]["recall"],
        "f1": results["macro avg"]["f1-score"],
    }

# Load model
model = AutoModelForTokenClassification.from_pretrained("models/ner")

# Define trainer
trainer = Trainer(
    model=model,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Evaluate
results = trainer.evaluate()
print(results)


  trainer = Trainer(


  0%|          | 0/139 [00:00<?, ?it/s]

{'eval_loss': 9.741134999785572e-05, 'eval_model_preparation_time': 0.0012, 'eval_precision': 0.9999004530536745, 'eval_recall': 0.9999957645783214, 'eval_f1': 0.9999481010928575, 'eval_runtime': 5.9452, 'eval_samples_per_second': 186.536, 'eval_steps_per_second': 23.38}


## Test ONNX Model

In [41]:
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoTokenizer, pipeline

onnx_path = "models/onnx"
trained_model_path = "models/ner"

# Load the ONNX model
onnx_model = ORTModelForTokenClassification.from_pretrained(onnx_path)
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)

# Define the label map
label_map = {0: "O", 1: "COMPANY", 2: "DOMAIN", 3: "IP_ADDR", 4: "URL", 5: "EMAIL"}

# Update the model's config with the label map
onnx_model.config.id2label = label_map
onnx_model.config.label2id = {v: k for k, v in label_map.items()}

# Create a pipeline for NER
onnx_pipeline = pipeline(
    "ner",
    model=onnx_model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy="simple"
)

# Test inference
text = "Dr. Lynn H. Monkres works at ABC Corp."
results = onnx_pipeline(text)
print(results)


[{'entity_group': 'COMPANY', 'score': np.float32(0.731601), 'word': ' Monkres', 'start': 12, 'end': 19}, {'entity_group': 'COMPANY', 'score': np.float32(0.99096173), 'word': ' ABC Corp', 'start': 29, 'end': 37}]
