# Prepare Dataset and Train Model 

## Load Dataset

Load the dataset from a JSON file.

In [15]:
from datasets import load_dataset

# Load the dataset from a JSONL file
dataset = load_dataset("json", data_files="data/ner_dataset.jsonl")

# Check the structure
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'entities'],
        num_rows: 148500
    })
})


## Split Dataset

Reserve 10% for model testing.

In [16]:
# Split the dataset
dataset = dataset["train"].train_test_split(test_size=0.1)  # 10% for testing

train_dataset = dataset["train"]
test_dataset = dataset["test"]


## Tokenize Dataset

In [17]:
# Tokenize the dataset

import token
from transformers import AutoTokenizer

label_map = {
    "O": 0,  # "O" stands for "Outside" (no entity)
    "COMPANY": 1,
    "DOMAIN": 2,
    "IP_ADDR": 3,
    "URL": 4,
    "EMAIL": 5,
}

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=128, return_offsets_mapping=True
    )
    labels = []
    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        entity_positions = examples["entities"][i]
        label_ids = [label_map["O"]] * len(offsets)  # Initialize all tokens as "O"
        for entity in entity_positions:
            for idx, (start, end) in enumerate(offsets):
                if start >= entity["start"] and end <= entity["end"]:
                    label_ids[idx] = label_map[entity["label"]]  # Use numeric ID for the label
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    tokenized_inputs.pop("offset_mapping")  # Remove offset mapping as it’s not needed for training
    return tokenized_inputs

# Apply tokenization in batches
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

print(tokenized_train_dataset[0])
print(tokenized_test_dataset[0])

Map:   0%|          | 0/133650 [00:00<?, ? examples/s]

Map:   0%|          | 0/14850 [00:00<?, ? examples/s]

{'text': '15:13:21 ERROR Invalid API key used for request to dehmel.info.', 'entities': [{'start': 51, 'end': 62, 'label': 'DOMAIN'}], 'input_ids': [0, 996, 35, 1558, 35, 2146, 38586, 38539, 21013, 762, 341, 13, 2069, 7, 263, 298, 17170, 4, 23999, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [0, 0, 0, 

# Train the Model

In [18]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base", num_labels=len(label_map)
)

training_args = TrainingArguments(
    output_dir="models/ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="logs",
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,    
)

trainer.train()


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/25062 [00:00<?, ?it/s]

{'loss': 0.0642, 'grad_norm': 0.01806827262043953, 'learning_rate': 1.9600989545926105e-05, 'epoch': 0.06}
{'loss': 0.0024, 'grad_norm': 0.056169819086790085, 'learning_rate': 1.9201979091852208e-05, 'epoch': 0.12}
{'loss': 0.0053, 'grad_norm': 0.0017644035397097468, 'learning_rate': 1.880296863777831e-05, 'epoch': 0.18}
{'loss': 0.0049, 'grad_norm': 0.007803365122526884, 'learning_rate': 1.8403958183704415e-05, 'epoch': 0.24}
{'loss': 0.0028, 'grad_norm': 0.0019374164985492826, 'learning_rate': 1.8004947729630518e-05, 'epoch': 0.3}
{'loss': 0.0005, 'grad_norm': 4.200196266174316, 'learning_rate': 1.760593727555662e-05, 'epoch': 0.36}
{'loss': 0.0008, 'grad_norm': 0.0004985977429896593, 'learning_rate': 1.7206926821482724e-05, 'epoch': 0.42}
{'loss': 0.0006, 'grad_norm': 0.0005463177803903818, 'learning_rate': 1.6807916367408827e-05, 'epoch': 0.48}
{'loss': 0.0002, 'grad_norm': 0.0005226978682912886, 'learning_rate': 1.640890591333493e-05, 'epoch': 0.54}
{'loss': 0.0009, 'grad_norm': 0

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_loss': 0.002017647260800004, 'eval_runtime': 62.4992, 'eval_samples_per_second': 237.603, 'eval_steps_per_second': 29.712, 'epoch': 1.0}
{'loss': 0.0015, 'grad_norm': 0.000349904817994684, 'learning_rate': 1.3216822280743758e-05, 'epoch': 1.02}
{'loss': 0.0018, 'grad_norm': 0.00033905485179275274, 'learning_rate': 1.281781182666986e-05, 'epoch': 1.08}
{'loss': 0.0002, 'grad_norm': 0.0025566082913428545, 'learning_rate': 1.2418801372595963e-05, 'epoch': 1.14}
{'loss': 0.0009, 'grad_norm': 0.0017402688972651958, 'learning_rate': 1.2019790918522068e-05, 'epoch': 1.2}
{'loss': 0.0009, 'grad_norm': 0.0007227610913105309, 'learning_rate': 1.162078046444817e-05, 'epoch': 1.26}
{'loss': 0.0002, 'grad_norm': 0.0016576049383729696, 'learning_rate': 1.1221770010374273e-05, 'epoch': 1.32}
{'loss': 0.0021, 'grad_norm': 0.0005424692062661052, 'learning_rate': 1.0822759556300376e-05, 'epoch': 1.38}
{'loss': 0.0009, 'grad_norm': 0.000179406299139373, 'learning_rate': 1.0423749102226479e-05, 'ep

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_loss': 0.0010500079952180386, 'eval_runtime': 61.9284, 'eval_samples_per_second': 239.793, 'eval_steps_per_second': 29.986, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': 0.0009540645405650139, 'learning_rate': 6.433644561487512e-06, 'epoch': 2.03}
{'loss': 0.001, 'grad_norm': 7.870363333495334e-05, 'learning_rate': 6.034634107413615e-06, 'epoch': 2.09}
{'loss': 0.0001, 'grad_norm': 5.6935397878987715e-05, 'learning_rate': 5.635623653339718e-06, 'epoch': 2.15}
{'loss': 0.0, 'grad_norm': 7.448661199305207e-05, 'learning_rate': 5.236613199265821e-06, 'epoch': 2.21}
{'loss': 0.0, 'grad_norm': 3.687303978949785e-05, 'learning_rate': 4.837602745191924e-06, 'epoch': 2.27}
{'loss': 0.001, 'grad_norm': 0.0019951483700424433, 'learning_rate': 4.4385922911180275e-06, 'epoch': 2.33}
{'loss': 0.0, 'grad_norm': 5.876285649719648e-05, 'learning_rate': 4.039581837044131e-06, 'epoch': 2.39}
{'loss': 0.0009, 'grad_norm': 0.00011076569353463128, 'learning_rate': 3.6405713829702344e-06, 'epoch': 2.45}
{

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_loss': 0.0004938127822242677, 'eval_runtime': 62.5652, 'eval_samples_per_second': 237.352, 'eval_steps_per_second': 29.681, 'epoch': 3.0}
{'train_runtime': 5793.2871, 'train_samples_per_second': 69.209, 'train_steps_per_second': 4.326, 'train_loss': 0.0022982391786587892, 'epoch': 3.0}


TrainOutput(global_step=25062, training_loss=0.0022982391786587892, metrics={'train_runtime': 5793.2871, 'train_samples_per_second': 69.209, 'train_steps_per_second': 4.326, 'total_flos': 2.61926808447744e+16, 'train_loss': 0.0022982391786587892, 'epoch': 3.0})

## Save the Trained Model

In [19]:
trainer.save_model("models/ner")
tokenizer.save_pretrained("models/ner")

('models/ner/tokenizer_config.json',
 'models/ner/special_tokens_map.json',
 'models/ner/vocab.json',
 'models/ner/merges.txt',
 'models/ner/added_tokens.json',
 'models/ner/tokenizer.json')

## Export Model to ONNX

Use:

```
optimum-cli export onnx --model models/ner --task token-classification models/onnx
```

## Verify Exported Model

In [20]:
import onnx
import torch
import onnxruntime as ort

onnx_model_path = "models/onnx/model.onnx"

# Load and check the ONNX model
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

# Run inference with ONNX Runtime
session = ort.InferenceSession(onnx_model_path)

# Prepare dummy input
input_ids = [[101, 2054, 2003, 1996, 2171, 102]]  # Token IDs (example)
attention_mask = [[1, 1, 1, 1, 1, 1]]  # Attention mask

inputs = {
    "input_ids": torch.tensor(input_ids, dtype=torch.long).numpy(),
    "attention_mask": torch.tensor(attention_mask, dtype=torch.long).numpy(),
}

# Run the ONNX model
outputs = session.run(None, inputs)
print("ONNX Model Outputs:", outputs)


ONNX Model Outputs: [array([[[ 4.4469266 ,  2.2834892 ,  1.2277882 , -3.4402235 ,
         -2.331243  , -3.9127386 ],
        [ 7.983002  , -0.6029357 , -0.89352286, -3.0848005 ,
         -1.571463  , -3.4708927 ],
        [ 4.613029  ,  1.105935  ,  1.8005015 , -3.7764654 ,
         -2.3312867 , -3.7697682 ],
        [ 3.2291615 ,  2.4427762 ,  2.321777  , -3.7389376 ,
         -2.4622188 , -3.6951158 ],
        [ 2.5670388 ,  3.318974  ,  2.627408  , -3.634867  ,
         -2.5895925 , -3.8985283 ],
        [ 3.1018884 ,  3.634954  ,  2.065163  , -3.442695  ,
         -2.5756822 , -4.1144876 ]]], dtype=float32)]


# Test

## Evaluate Model with Test Data

In [21]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import classification_report
import numpy as np

# Tokenized test dataset (ensure it's already tokenized)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Define the label map and ensure keys are plain integers
id2label = {int(v): k for k, v in label_map.items()}
label2id = {k: int(k) for k, v in id2label.items()}

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100) from predictions and labels
    true_predictions = [
        [id2label[int(p)] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[int(l)] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    # Use sklearn's classification report for detailed metrics
    results = classification_report(
        [item for sublist in true_labels for item in sublist],
        [item for sublist in true_predictions for item in sublist],
        output_dict=True,
    )
    return {
        "precision": results["macro avg"]["precision"],
        "recall": results["macro avg"]["recall"],
        "f1": results["macro avg"]["f1-score"],
    }

# Load model
model = AutoModelForTokenClassification.from_pretrained("models/ner")

# Define trainer
trainer = Trainer(
    model=model,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Evaluate
results = trainer.evaluate()
print(results)


Map:   0%|          | 0/14850 [00:00<?, ? examples/s]

  trainer = Trainer(


  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_loss': 0.0004938127822242677, 'eval_model_preparation_time': 0.0014, 'eval_precision': 0.9995323743458274, 'eval_recall': 0.9999247970394518, 'eval_f1': 0.9997282489155647, 'eval_runtime': 83.4981, 'eval_samples_per_second': 177.848, 'eval_steps_per_second': 22.24}


## Test ONNX Model

In [22]:
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoTokenizer, pipeline

onnx_path = "models/onnx"
trained_model_path = "models/ner"

# Load the ONNX model
onnx_model = ORTModelForTokenClassification.from_pretrained(onnx_path)
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)

# Define the label map
label_map = {0: "O", 1: "COMPANY", 2: "DOMAIN", 3: "IP_ADDR", 4: "URL", 5: "EMAIL"}

# Update the model's config with the label map
onnx_model.config.id2label = label_map
onnx_model.config.label2id = {v: k for k, v in label_map.items()}

# Create a pipeline for NER
onnx_pipeline = pipeline(
    "ner",
    model=onnx_model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy="simple"
)

# Test inference
text = "Dr. Lynn H. Monkres works at ABC Corp."
results = onnx_pipeline(text)
print(results)


[{'entity_group': 'COMPANY', 'score': np.float32(0.731601), 'word': ' Monkres', 'start': 12, 'end': 19}, {'entity_group': 'COMPANY', 'score': np.float32(0.99096173), 'word': ' ABC Corp', 'start': 29, 'end': 37}]


In [24]:
from typing import List
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoTokenizer, pipeline

def chunk_text_with_overlap(text: str, tokenizer, max_length: int = 512, overlap: int = 50) -> List[str]:
    """
    Splits `text` into chunks of up to `max_length` tokens (subword tokens),
    with `overlap` tokens overlap between consecutive chunks.

    :param text: The entire text string to split.
    :param tokenizer: The tokenizer instance to tokenize the text.
    :param max_length: Maximum number of tokens in a chunk (e.g., 512).
    :param overlap: Number of tokens to overlap between consecutive chunks.
    :return: A list of text chunks.
    """
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_length
        chunk_tokens = tokens[start:end]
        
        # Convert tokens back to string
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)

        # Move the start pointer (sliding window approach)
        start += max_length - overlap
        if start < 0:
            break

    return chunks

onnx_path = "models/onnx"
trained_model_path = "models/ner"

# Load the ONNX model and tokenizer
onnx_model = ORTModelForTokenClassification.from_pretrained(onnx_path)
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)

# Define the label map
label_map = {
    0: "O",
    1: "COMPANY",
    2: "DOMAIN",
    3: "IP_ADDR",
    4: "URL",
    5: "EMAIL"
}

# Update the model's config with the label map
onnx_model.config.id2label = label_map
onnx_model.config.label2id = {v: k for k, v in label_map.items()}

# Create a pipeline for NER
# Set device to 0 if you have a GPU, or -1 if you're using CPU
ner_pipeline = pipeline(
    "ner",
    model=onnx_model,
    tokenizer=tokenizer,
    device=0,  # use 0 (GPU) or -1 (CPU)
    aggregation_strategy="simple"
)

# Read text from file "testdoc.txt"
input_file = "testdoc.txt"
with open(input_file, 'r', encoding='utf-8') as f:
    text = f.read()

# Split the text into overlapping chunks
# Change max_length and overlap to suit your model/context needs
text_chunks = chunk_text_with_overlap(text, tokenizer, max_length=200, overlap=50)

all_results = []
for idx, chunk in enumerate(text_chunks):
    print(f"Processing chunk {idx+1}/{len(text_chunks)}...")
    results = ner_pipeline(chunk)
    all_results.extend(results)

for res in all_results:
    print(res)


Token indices sequence length is longer than the specified maximum sequence length for this model (881 > 512). Running this sequence through the model will result in indexing errors


Processing chunk 1/6...
Processing chunk 2/6...
Processing chunk 3/6...
Processing chunk 4/6...
Processing chunk 5/6...
Processing chunk 6/6...
{'entity_group': 'COMPANY', 'score': np.float32(0.9177847), 'word': ' Zenith Solutions', 'start': 307, 'end': 323}
{'entity_group': 'COMPANY', 'score': np.float32(0.75225896), 'word': ' NovaSphere Industries', 'start': 325, 'end': 346}
{'entity_group': 'COMPANY', 'score': np.float32(0.7816794), 'word': ' Bluebridge Analytics', 'start': 352, 'end': 372}
{'entity_group': 'COMPANY', 'score': np.float32(0.96439725), 'word': ' Vertex Tech Labs', 'start': 400, 'end': 416}
{'entity_group': 'COMPANY', 'score': np.float32(0.9970113), 'word': ' AstraCore Innovations', 'start': 421, 'end': 442}
{'entity_group': 'COMPANY', 'score': np.float32(0.99282616), 'word': ' Zenith Solutions', 'start': 748, 'end': 764}
{'entity_group': 'COMPANY', 'score': np.float32(0.83525735), 'word': ' NovaSphere Industries', 'start': 858, 'end': 879}
{'entity_group': 'COMPANY', 