In [None]:
!pip install transformers datasets evaluate seqeval accelerate>=0.26.0 transformers[torch] matplotlib seaborn onnx huggingface-hub onnxruntime-transformers

In [None]:
############################################
# Constants
############################################
# Shared configuration for every cell below; run this cell first.

# Dataset Constants
DATASET_NAME = "nlpaueb/finer-139"  # identifier passed to load_dataset
CACHE_DIR = ".cache"  # cache_dir used by load_and_get_labels
NUM_TOP_LABELS = 4  # number of B- labels kept by find_top_b_labels
RANDOM_SEED = 42  # seeds Python's `random` module in main()

# Label Types
LABEL_PREFIX_B = "B-"
LABEL_PREFIX_I = "I-"
LABEL_O = "O"  # name (not id) of the "no entity" label; always index 0 of the new label set

# Dataset Splits
TRAIN_SPLIT = "train"
VALIDATION_SPLIT = "validation"
TEST_SPLIT = "test"

# Output Paths (where the balanced splits are saved to / loaded from disk)
OUTPUT_DIR_TRAIN = "./balanced_train"
OUTPUT_DIR_VALIDATION = "./balanced_validation"
OUTPUT_DIR_TEST = "./balanced_test"

# Feature Names (column names of the dataset examples)
NER_TAGS = "ner_tags"
TOKENS = "tokens"

# Model names
ORIGINAL_MODEL = "distilbert-base-uncased"  # checkpoint the tokenizer and model start from
MODEL_NAME = "finer-selected-4-labels"  # also used as the training output / save directory
HUGGINGFACE_HUB_NAME = f"sojimanatsu/{MODEL_NAME}"

# CUDA config (string values copied into os.environ by the training cell)
CUDA_LAUNCH_BLOCKING = "1"
TOKENIZERS_PARALLELISM = "false"

# TODO: the training hyperparameters could be moved here as constants too.

In [None]:
############################################
# Simple Data Analytics For Distributions
############################################

from collections import Counter
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset and keep a leading slice of the training split for quick,
# exploratory plots (clamped so a smaller split does not raise on select()).
dataset = load_dataset(DATASET_NAME)
sample_size = min(5000, len(dataset[TRAIN_SPLIT]))
dataset_train = dataset[TRAIN_SPLIT].select(range(sample_size))

# Calculate entity distribution: tag id -> number of tokens carrying that tag
entity_distribution = Counter()
for example in dataset_train:
    entity_distribution.update(example[NER_TAGS])

# Remove the 0 class (no entity) and get the 50 most frequent tag ids
entity_distribution.pop(0, None)
top_50_entities = [entity for entity, _ in entity_distribution.most_common(50)]
# Set copy for O(1) membership tests in the per-token filter below
top_50_entity_set = set(top_50_entities)

# Per example, keep only the tags that belong to the top 50 (tokens themselves are not kept)
filtered_dataset = [
    [tag for tag in example[NER_TAGS] if tag in top_50_entity_set]
    for example in dataset_train
]

# Recalculate entity distribution for top 50
entity_distribution_top_50 = Counter()
for tags in filtered_dataset:
    entity_distribution_top_50.update(tags)

# 1. Absolute Entity Distribution (First 50 Labels)
ranked_entities = entity_distribution_top_50.most_common()
plt.figure(figsize=(12, 6))
plt.bar(
    [str(entity) for entity, _ in ranked_entities],
    [count for _, count in ranked_entities],
    edgecolor='black',
)
plt.title('Absolute Entity Distribution (First 50 Labels)')
plt.xlabel('Entity')
plt.ylabel('Count')
plt.xticks(rotation=45, fontsize=10, ha='right')
plt.tight_layout()
plt.show()

# 2. Token Length Distribution
tokens_train = [len(example[TOKENS]) for example in dataset_train]
plt.figure(figsize=(12, 6))
plt.hist(tokens_train, bins=30, edgecolor='black')
plt.title(f'Token Length Distribution in First {sample_size} Samples')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.show()

# 3. Entity Coverage: examples with at least one top-50 tag vs. none
samples_with_entities = sum(1 for tags in filtered_dataset if tags)
samples_without_entities = len(filtered_dataset) - samples_with_entities

plt.figure(figsize=(8, 6))
plt.pie(
    [samples_with_entities, samples_without_entities],
    labels=["With Entities", "Without Entities"],
    autopct='%1.1f%%',
    startangle=90,
    colors=["#ff9999", "#66b3ff"]
)
plt.title('Entity Coverage (Top 50 Entities)')
plt.show()

# 4. Entity Co-occurrence: count ordered pairs of distinct tags seen in the same example
co_occurrence = Counter()
for tags in filtered_dataset:
    unique_tags = set(tags)
    for tag in unique_tags:
        for other_tag in unique_tags:
            if tag != other_tag:
                co_occurrence[(tag, other_tag)] += 1

# Prepare data for the heatmap. For readability only 20 tags are shown: the 20
# smallest tag ids among the top 50 (sorted by id, not by frequency).
entity_ids = sorted(top_50_entities)[:20]
heatmap_data = {
    tag: [co_occurrence.get((tag, other_tag), 0) for other_tag in entity_ids]
    for tag in entity_ids
}
# Explicit figure so the heatmap never draws into a leftover axes
plt.figure(figsize=(10, 8))
sns.heatmap(
    [heatmap_data[tag] for tag in entity_ids],
    xticklabels=entity_ids,
    yticklabels=entity_ids,
    cmap="Blues",
    cbar_kws={"label": "Co-occurrence Count"}
)
plt.title('Entity Co-occurrence Heatmap (20 of the Top 50 Entities)')
plt.xlabel('Entity')
plt.ylabel('Entity')
plt.show()

# 5. Sentence-Level Entity Count (number of top-50 tagged tokens per example)
entities_per_sentence = [len(tags) for tags in filtered_dataset]
plt.figure(figsize=(12, 6))
plt.hist(entities_per_sentence, bins=20, edgecolor='black')
plt.title('Entity Count per Sentence (Top 50 Entities)')
plt.xlabel('Number of Entities')
plt.ylabel('Frequency')
plt.show()

In [None]:
############################################
# Create a new dataset
############################################

import random
from datasets import load_dataset, Dataset, Features, Sequence, ClassLabel, Value

############################################
# Dataset Loading and Top-B Functions
############################################

def load_and_get_labels():
    """Fetch the configured dataset and read the tag names off its train split.

    Returns:
        tuple: ``(dataset, original_labels)`` where ``dataset`` is the object
        returned by ``load_dataset`` for ``DATASET_NAME`` and
        ``original_labels`` is the ``names`` list of the ``NER_TAGS`` feature
        of the training split.
    """
    raw_splits = load_dataset(DATASET_NAME, cache_dir=CACHE_DIR, keep_in_memory=True)
    tag_feature = raw_splits[TRAIN_SPLIT].features[NER_TAGS].feature
    return raw_splits, tag_feature.names


def find_top_b_labels(dataset, original_labels):
    """Pick the most frequent B- labels whose matching I- label never occurs.

    Tag occurrences are counted over ``dataset[TRAIN_SPLIT]``. A ``B-`` label
    qualifies when its ``I-`` counterpart is either absent from
    ``original_labels`` or has a training count of zero.

    Args:
        dataset: mapping of split name to an iterable of examples, each
            exposing a sequence of tag ids under ``NER_TAGS``.
        original_labels: list of label names; the position is the tag id.

    Returns:
        list[tuple[str, int, int]]: up to ``NUM_TOP_LABELS`` tuples
        ``(b_label_name, b_count, i_count)`` sorted by ``b_count`` descending
        (ties keep their order in ``original_labels``).
    """
    train_counts = Counter()
    for example in dataset[TRAIN_SPLIT]:
        train_counts.update(example[NER_TAGS])

    # Resolve names to ids once instead of calling list.index() per label;
    # setdefault keeps the first occurrence, matching list.index() semantics.
    label_to_id = {}
    for label_id, label_name in enumerate(original_labels):
        label_to_id.setdefault(label_name, label_id)

    candidates = []
    for label_id, label_name in enumerate(original_labels):
        if not label_name.startswith(LABEL_PREFIX_B):
            continue

        # Swap only the leading prefix: str.replace() would also rewrite any
        # later "B-" inside the label name and miss the real I- counterpart.
        i_label_name = LABEL_PREFIX_I + label_name[len(LABEL_PREFIX_B):]
        i_label_id = label_to_id.get(i_label_name)
        i_count = train_counts[i_label_id] if i_label_id is not None else 0

        if i_count == 0:
            candidates.append((label_name, train_counts[label_id], i_count))

    # list.sort is stable, so equal counts stay in label order
    candidates.sort(key=lambda item: item[1], reverse=True)
    return candidates[:NUM_TOP_LABELS]


############################################
# Label Mapping and Filtering Functions
############################################

def create_label_map(original_labels, selected_labels):
    """Build the reduced label list and the old-id -> new-id lookup.

    The new label list is ``LABEL_O`` followed by ``selected_labels`` in the
    given order. Every name in it must exist in ``original_labels``
    (``list.index`` raises ``ValueError`` otherwise).

    Returns:
        tuple: ``(label_map, new_labels)`` where ``label_map`` maps an
        original tag id to its index in ``new_labels``.
    """
    new_labels = [LABEL_O, *selected_labels]

    label_map = {}
    for new_id, name in enumerate(new_labels):
        label_map[original_labels.index(name)] = new_id

    return label_map, new_labels


def filter_and_map_labels(dataset, label_map, new_labels):
    """Keep only tokens whose tag is in ``label_map`` and re-index their tags.

    Within each example, tokens tagged with an id that is not a key of
    ``label_map`` are dropped together with their tag; remaining tags are
    replaced by ``label_map[tag]``. Examples left with no token at all are
    removed from the dataset.

    Args:
        dataset: a single split exposing ``filter``, ``map`` and ``cast``
            (a Hugging Face ``Dataset`` — presumably; confirm against caller).
        label_map: dict mapping original tag id -> new tag id.
        new_labels: label names for the new ``ClassLabel`` feature.

    Returns:
        The filtered, re-mapped dataset cast to the new feature schema.
    """

    def has_selected_tag(example):
        """True when at least one token of the example survives the mapping."""
        return any(tag in label_map for tag in example[NER_TAGS])

    def process_example(example):
        """Drop unmapped tokens and translate the remaining tag ids."""
        kept = [
            (token, label_map[tag])
            for token, tag in zip(example[TOKENS], example[NER_TAGS])
            if tag in label_map
        ]
        return {
            'id': example['id'],
            TOKENS: [token for token, _ in kept],
            NER_TAGS: [new_tag for _, new_tag in kept]
        }

    # Create new features
    new_features = Features({
        'id': Value('int32'),
        TOKENS: Sequence(Value("string")),
        NER_TAGS: Sequence(ClassLabel(names=new_labels))
    })

    # Filter BEFORE mapping: a map function must not return None for some rows,
    # and a later `filter(lambda x: x is not None)` can never drop anything
    # because every row handed to the predicate is a dict.
    kept_examples = dataset.filter(has_selected_tag)
    remapped = kept_examples.map(process_example)

    # Set new features
    return remapped.cast(new_features)

############################################
# Count and Balance Functions
############################################

def get_min_counts(dataset, selected_label_ids):
    """Per split, find the occurrence count of the rarest selected label.

    Args:
        dataset: mapping of split name to an iterable of examples that expose
            their tag ids under ``NER_TAGS``.
        selected_label_ids: tag ids to compare.

    Returns:
        dict: split name -> smallest token count among ``selected_label_ids``
        for the train, validation and test splits.
    """
    min_counts = {}
    for split in (TRAIN_SPLIT, VALIDATION_SPLIT, TEST_SPLIT):
        tag_counter = Counter()
        for example in dataset[split]:
            tag_counter.update(example[NER_TAGS])
        # Counter returns 0 for ids that never occur in the split
        min_counts[split] = min(tag_counter[lid] for lid in selected_label_ids)

    return min_counts


def get_examples_with_exactly_n_occurrences(dataset, label_id, target_count):
    """Greedily pick example indices whose ``label_id`` occurrences add up to the target.

    Candidate examples (those containing ``label_id``) are shuffled with the
    global ``random`` state and taken in that order whenever adding them does
    not overshoot ``target_count``. Selection stops as soon as the target is
    hit; if it cannot be hit exactly the total stays below the target.

    Returns:
        list[int]: indices into ``dataset`` of the selected examples.
    """
    candidates = []
    for position, example in enumerate(dataset):
        tags = example[NER_TAGS]
        if label_id in tags:
            occurrences = sum(1 for tag in tags if tag == label_id)
            candidates.append((position, occurrences))

    random.shuffle(candidates)

    chosen = []
    running_total = 0
    for position, occurrences in candidates:
        fits = running_total + occurrences <= target_count
        if fits:
            chosen.append(position)
            running_total += occurrences
        if running_total == target_count:
            break

    return chosen


def create_balanced_dataset(dataset, target_count, split, label_map, new_labels):
    """Build the balanced version of one split.

    The split is first filtered and re-mapped, then for every new label except
    index 0 a set of example indices is picked with ``target_count`` as the
    occurrence target. The union of those index sets is selected.
    """
    remapped_split = filter_and_map_labels(dataset[split], label_map, new_labels)

    # One index list per entity label (index 0 is skipped)
    per_label_indices = [
        get_examples_with_exactly_n_occurrences(remapped_split, new_id, target_count)
        for new_id in range(1, len(new_labels))
    ]

    keep = set().union(*per_label_indices)
    return remapped_split.select(list(keep))


############################################
# Main Execution
############################################

def main():
    """Build the balanced train/validation/test splits and save them to disk.

    Steps: seed ``random``, load the dataset, choose the labels, derive the
    label mapping and the per-split balancing target, build every balanced
    split, then print each split's label distribution and save it.
    """
    random.seed(RANDOM_SEED)

    # Load dataset and get labels
    dataset, original_labels = load_and_get_labels()

    # Find top B labels (first element of each returned tuple is the name)
    selected_labels = [name for name, *_ in find_top_b_labels(dataset, original_labels)]
    print("Selected labels:", selected_labels)

    # Create label mapping
    label_map, new_labels = create_label_map(original_labels, selected_labels)
    print("New label set:", new_labels)

    # Get minimum counts
    min_counts = get_min_counts(
        dataset,
        [original_labels.index(lbl) for lbl in selected_labels],
    )
    print("\nMinimum counts per split:")
    for split, count in min_counts.items():
        print(f"{split}: {count}")

    # Create balanced datasets (all of them before anything is written)
    balanced_datasets = {
        split: create_balanced_dataset(dataset, min_counts[split], split, label_map, new_labels)
        for split in (TRAIN_SPLIT, VALIDATION_SPLIT, TEST_SPLIT)
    }

    # Show distributions and save
    output_dirs = {
        TRAIN_SPLIT: OUTPUT_DIR_TRAIN,
        VALIDATION_SPLIT: OUTPUT_DIR_VALIDATION,
        TEST_SPLIT: OUTPUT_DIR_TEST
    }

    for split, ds in balanced_datasets.items():
        target_dir = output_dirs[split]

        print(f"\nFinal {split} distribution:")
        distribution = Counter()
        for example in ds:
            distribution.update(example[NER_TAGS])
        for i, label in enumerate(new_labels):
            print(f"{label}: {distribution[i]}")

        print(f"Total examples in {split}: {len(ds)}")
        ds.save_to_disk(target_dir)
        print(f"Saved {split} dataset to {target_dir}")


############################################
# Data Verification Function
############################################

def verify_datasets():
    """Reload the saved splits from disk and print their label distributions.

    The label names are re-derived from the source dataset in the same way
    ``main`` derives them, so the printed rows line up with the saved tag ids.
    """
    dataset, original_labels = load_and_get_labels()
    selected_labels = [entry[0] for entry in find_top_b_labels(dataset, original_labels)]
    new_labels = create_label_map(original_labels, selected_labels)[1]

    output_dirs = {
        TRAIN_SPLIT: OUTPUT_DIR_TRAIN,
        VALIDATION_SPLIT: OUTPUT_DIR_VALIDATION,
        TEST_SPLIT: OUTPUT_DIR_TEST
    }

    print("\nVerifying saved datasets:")
    for split in output_dirs:
        ds = Dataset.load_from_disk(output_dirs[split])

        # Count every tag id occurring in the reloaded split
        distribution = Counter()
        for example in ds:
            distribution.update(example[NER_TAGS])

        print(f"\n{split} distribution:")
        for i, label in enumerate(new_labels):
            print(f"{label}: {distribution[i]}")
        print(f"Total examples: {len(ds)}")


# Build and save the balanced splits, then reload them from disk and print
# their label distributions as a sanity check.
# NOTE(review): inside a notebook kernel ``__name__`` is expected to be
# "__main__", so this guard should fire on cell execution — confirm.
if __name__ == "__main__":
    main()
    verify_datasets()

In [None]:
############################################
# Training Block
############################################

import os
import torch
from datasets import load_from_disk, load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback
from collections import Counter

# Export the runtime switches defined in the constants cell
os.environ["CUDA_LAUNCH_BLOCKING"] = CUDA_LAUNCH_BLOCKING
os.environ["TOKENIZERS_PARALLELISM"] = TOKENIZERS_PARALLELISM

# Load balanced datasets from disk. The shuffle is seeded so that a
# Restart & Run All reproduces the same example order.
balanced_train = load_from_disk(OUTPUT_DIR_TRAIN).shuffle(seed=RANDOM_SEED)
balanced_validation = load_from_disk(OUTPUT_DIR_VALIDATION).shuffle(seed=RANDOM_SEED)


# Debugging: Print sizes
print(f"Loaded Balanced Training Dataset Size: {len(balanced_train)}")
print(f"Loaded Balanced Validation Dataset Size: {len(balanced_validation)}")

# Extract label mappings directly from the dataset metadata
labels = balanced_train.features[NER_TAGS].feature.names
print(labels)
selected_label2id = {label: idx for idx, label in enumerate(labels)}
id2selected_label = {idx: label for idx, label in enumerate(labels)}

# Tokenizer and model: the classification head is sized to the reduced label
# set and the id<->label mappings are stored in the model config.
tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL)
model = AutoModelForTokenClassification.from_pretrained(
    ORIGINAL_MODEL,
    num_labels=len(selected_label2id),
    id2label=id2selected_label,
    label2id=selected_label2id
)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align word tags to sub-tokens.

    For every position reported by ``word_ids``: the first sub-token of a word
    receives that word's tag; special tokens (word id ``None``) and follow-up
    sub-tokens of the same word receive ``-100``.

    Args:
        examples: batch with word lists under ``TOKENS`` and per-word tag ids
            under ``NER_TAGS``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized_inputs = tokenizer(
        examples[TOKENS],
        truncation=True,
        padding=True,
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(examples[NER_TAGS]):
        row_labels = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        aligned.append(row_labels)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

# Debugging: count how often each tag id occurs in the loaded training split
train_label_distribution = Counter()
for example in balanced_train:
    train_label_distribution.update(example[NER_TAGS])
print("Training label distribution:", train_label_distribution)

# Same count for the validation split
validation_label_distribution = Counter()
for example in balanced_validation:
    validation_label_distribution.update(example[NER_TAGS])
print("Validation label distribution:", validation_label_distribution)

# Apply preprocessing: tokenize in batches and attach the aligned "labels" column
tokenized_train = balanced_train.map(tokenize_and_align_labels, batched=True)
tokenized_validation = balanced_validation.map(tokenize_and_align_labels, batched=True)

# Data collator used by the Trainer to assemble batches
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load metric (used by compute_metrics)
# NOTE(review): zero_division is passed at load time here — confirm it takes
# effect there rather than needing to be passed to metric.compute().
metric = evaluate.load("seqeval", zero_division=1)

def compute_metrics(p):
    """Turn raw model outputs into overall precision/recall/F1/accuracy.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` the
            gold tag ids, with ``-100`` marking positions to ignore.

    Returns:
        dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries of ``metric.compute``.
    """
    # Local import: numpy is only imported by a LATER cell of this notebook, so
    # on a fresh kernel the first evaluation during training would otherwise
    # fail with NameError on `np`.
    import numpy as np

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Keep only positions with a real label and translate ids to label names
    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        kept = [(int(pred_id), int(gold_id)) for pred_id, gold_id in zip(prediction, label) if gold_id != -100]
        true_predictions.append([id2selected_label[pred_id] for pred_id, _ in kept])
        true_labels.append([id2selected_label[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

import inspect

# The notebook installs an unpinned `transformers`, and two keyword names used
# below differ between releases: `evaluation_strategy` was renamed to
# `eval_strategy`, and Trainer's `tokenizer` to `processing_class`. Pick
# whichever name the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Mixed precision is only requested when a CUDA device is available, so the
# cell also runs on CPU-only machines.
_use_fp16 = torch.cuda.is_available()

# Training arguments
training_args = TrainingArguments(
    output_dir=MODEL_NAME,
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",         # F1 for best model selection
    learning_rate=2e-5,
    warmup_ratio=0.1,                   # Warmup for first 10% of steps
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=8,
    weight_decay=0.01,
    fp16=_use_fp16,
    fp16_full_eval=_use_fp16,
    seed=RANDOM_SEED,
    report_to="none",
    logging_dir="./logs",
    push_to_hub=False,
    save_only_model=True,
    greater_is_better=True,             # For F1 metric
    **{_eval_strategy_key: "epoch"},    # evaluate once per epoch (matches save_strategy)
)

# Add Early Stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,          # Stop if no improvement for 2 epochs
    early_stopping_threshold=0.001,     # Minimum change to qualify as improvement
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    **{_tokenizer_key: tokenizer},
)

# Print features and label information for debugging
print("\nFirst example tokens:", balanced_train[0][TOKENS])
print("First example tags:", balanced_train[0][NER_TAGS])

# Train
trainer.train()

# Save model and tokenizer side by side so later cells can reload both from MODEL_NAME
model.save_pretrained(MODEL_NAME)
tokenizer.save_pretrained(MODEL_NAME)
torch.cuda.empty_cache()
print("Model training and saving completed.")

In [None]:
############################################
# Review Confusion Matrix
############################################

import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import torch

# Step 1: Load the balanced test split and take a subset (up to 15,000 samples)
balanced_test = load_from_disk(OUTPUT_DIR_TEST)
subset_test_set = balanced_test.select(range(min(15000, len(balanced_test))))

# Tokenize and align labels for the subset; the raw columns are dropped so
# only the tokenizer outputs and "labels" remain.
tokenized_subset_test = subset_test_set.map(
    tokenize_and_align_labels, batched=True, remove_columns=subset_test_set.column_names
)

# Step 2: Make predictions on the subset test set
torch.cuda.empty_cache()  # Flush CUDA memory before prediction

with torch.no_grad():  # Disable gradient calculation
    predictions, labels, _ = trainer.predict(tokenized_subset_test)

predicted_labels = np.argmax(predictions, axis=2)

# Step 3: Align predictions and labels
# Positions labelled -100 are ignored. A boolean mask selects the remaining
# positions from both arrays in the same row-major order, so predictions and
# labels stay paired.
valid_mask = labels != -100
aligned_predictions = [id2selected_label[i] for i in predicted_labels[valid_mask].tolist()]
aligned_labels = [id2selected_label[i] for i in labels[valid_mask].tolist()]

# Step 4: Create the confusion matrix
selected_labels = balanced_test.features[NER_TAGS].feature.names

cm = confusion_matrix(aligned_labels, aligned_predictions, labels=selected_labels)

# Flush CUDA memory after processing
torch.cuda.empty_cache()

# Step 5: Plot confusion matrix
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    xticklabels=selected_labels,
    yticklabels=selected_labels,
    cbar_kws={'label': 'Number of Tokens'}
)
# The title reports the number of examples actually evaluated
plt.title(
    f"Confusion Matrix on Test Set ({len(subset_test_set)} Examples) for Selected Labels",
    fontsize=16,
)
plt.xlabel("Predicted Label", fontsize=14)
plt.ylabel("True Label", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
############################################
# Export to ONNX
############################################

from transformers import AutoTokenizer, AutoModelForTokenClassification
from pathlib import Path
import torch

# Define paths
export_dir = Path("./onnx_model")
export_dir.mkdir(parents=True, exist_ok=True)
onnx_model_path = export_dir / "distilbert_ner.onnx"

# Load the fine-tuned PyTorch model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
# Make inference mode explicit so dropout cannot end up in the exported graph
model.eval()

# Export to ONNX. The dummy input only fixes the graph structure; batch and
# sequence dimensions are declared dynamic below.
dummy_input = tokenizer("This is a test input.", return_tensors="pt")
torch.onnx.export(
    model,
    args=(dummy_input["input_ids"], dummy_input["attention_mask"]),
    f=str(onnx_model_path),  # plain string path: accepted by every torch version
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "logits": {0: "batch_size", 1: "sequence_length"},
    },
    opset_version=14,
    do_constant_folding=True,
)

print(f"ONNX model exported to {onnx_model_path}")

In [None]:
############################################
# Utility Functions and Data Preparation
############################################

import gc
import numpy as np
from sklearn.metrics import classification_report
import onnxruntime as ort
from transformers import AutoModelForTokenClassification, AutoTokenizer
import time
import psutil
from datasets import load_from_disk

def get_model_size(model_path):
    """Return the on-disk size of a model in MB (1 MB = 1024 * 1024 bytes).

    ``model_path`` may be a single file or a directory; for a directory the
    sizes of all files below it are summed recursively.
    """
    if not os.path.isdir(model_path):
        size_bytes = os.path.getsize(model_path)
    else:
        size_bytes = 0
        for root, _subdirs, file_names in os.walk(model_path):
            for file_name in file_names:
                size_bytes += os.path.getsize(os.path.join(root, file_name))
    return size_bytes / (1024 * 1024)

def get_process_memory():
    """Return the RSS reported by ``psutil.Process().memory_info()`` in MB."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / (1024 * 1024)

def measure_memory_usage(func, name="Function"):
    """Run ``func()`` and report its wall time and process memory figures.

    Memory is read with ``get_process_memory`` before and after the call. While
    ``func`` runs, a background thread samples it about every 10 ms to track
    the peak, so spikes shorter than the sampling interval can be missed.

    Args:
        func: zero-argument callable to measure.
        name: label used as prefix of the printed lines.

    Returns:
        tuple: ``(result, execution_time, memory_used, peak_mem)`` where
        ``result`` is ``func``'s return value, ``execution_time`` is in
        seconds, ``memory_used`` is end minus start memory in MB and
        ``peak_mem`` is the highest sampled value in MB.

    Raises:
        Whatever ``func`` raises; the sampler thread is stopped first.
    """
    import threading

    _ = gc.collect()
    start_mem = get_process_memory()
    print(f"\n{name} - Starting memory: {start_mem:.2f} MB")

    peak_mem = start_mem
    stop_sampling = threading.Event()

    def memory_monitor():
        """Poll process memory until told to stop, keeping the maximum seen."""
        nonlocal peak_mem
        while not stop_sampling.is_set():
            peak_mem = max(peak_mem, get_process_memory())
            stop_sampling.wait(0.01)

    # Without a running sampler the "peak" would always equal the start value.
    monitor_thread = threading.Thread(target=memory_monitor, daemon=True)
    monitor_thread.start()

    start_time = time.time()
    try:
        result = func()
    finally:
        end_time = time.time()
        stop_sampling.set()
        monitor_thread.join()

    _ = gc.collect()
    end_mem = get_process_memory()
    # The final reading is a valid sample too (covers very short calls)
    peak_mem = max(peak_mem, end_mem)
    execution_time = end_time - start_time
    memory_used = end_mem - start_mem

    print(f"{name} - Peak memory: {peak_mem:.2f} MB")
    print(f"{name} - End memory: {end_mem:.2f} MB")
    print(f"{name} - Memory change: {memory_used:.2f} MB")

    return result, execution_time, memory_used, peak_mem

# Load and prepare data (same location the dataset-building cell saved the test split to)
balanced_test = load_from_disk(OUTPUT_DIR_TEST)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Extract label mappings from the saved dataset's feature metadata
selected_labels = balanced_test.features[NER_TAGS].feature.names
selected_label2id = {label: idx for idx, label in enumerate(selected_labels)}
selected_id2label = {idx: label for label, idx in selected_label2id.items()}

def tokenize_with_labels(batch):
    """Tokenize a batch (NumPy output) and attach word-aligned label ids.

    Only the first sub-token of each word keeps the word's tag; positions
    without a word id and repeated sub-tokens of a word are set to ``-100``.
    """
    tokenized = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=True,
        return_tensors="np",
    )

    def align(row, word_tags):
        """Build the label row for example ``row`` of the batch."""
        row_labels = []
        seen_word = None
        for word_idx in tokenized.word_ids(batch_index=row):
            if word_idx is not None and word_idx != seen_word:
                row_labels.append(word_tags[word_idx])
            else:
                row_labels.append(-100)
            seen_word = word_idx
        return row_labels

    tokenized["labels"] = [align(row, word_tags) for row, word_tags in enumerate(batch["ner_tags"])]
    return tokenized

def group_by_length(tokenized_dataset):
    """Bucket tokenized examples by the length of their ``input_ids``.

    Returns:
        dict: sequence length -> ``{"input_ids": [...], "attention_mask": [...],
        "labels": [...]}``, each list holding the examples of that length in
        dataset order. Buckets appear in order of first occurrence.
    """
    fields = ("input_ids", "attention_mask", "labels")
    buckets = {}
    for example in tokenized_dataset:
        bucket = buckets.setdefault(len(example["input_ids"]), {field: [] for field in fields})
        for field in fields:
            bucket[field].append(example[field])
    return buckets

# Prepare test data: at most the first 500 test examples, tokenized in batches
# and then bucketed by sequence length so each bucket can be stacked into one array.
reduced_test_set = balanced_test.select(range(min(500, len(balanced_test))))
tokenized_test = reduced_test_set.map(tokenize_with_labels, batched=True)
grouped_batches = group_by_length(tokenized_test)

In [None]:
############################################
# PyTorch Model Inference
############################################

# Load the fine-tuned PyTorch model on CPU and switch it to eval mode
original_model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME).to('cpu')
original_model.eval()

# Size of everything stored under the MODEL_NAME directory
pytorch_size = get_model_size(MODEL_NAME)
print(f"\nPyTorch Model Size: {pytorch_size:.2f} MB")

def run_pytorch_inference():
    """Run ``original_model`` over every bucket in ``grouped_batches``.

    Each bucket is fed as a single batch. Positions labelled ``-100`` are
    skipped when collecting results.

    Returns:
        tuple: ``(predictions, true_labels)`` — two flat, equally long lists
        with the predicted class id and the gold label id of every kept position.
    """
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in grouped_batches.values():
            ids_tensor = torch.tensor(np.array(batch["input_ids"]))
            mask_tensor = torch.tensor(np.array(batch["attention_mask"]))

            logits = original_model(ids_tensor, attention_mask=mask_tensor).logits
            predicted_ids = torch.argmax(logits, dim=2).numpy()

            for row, row_labels in enumerate(batch["labels"]):
                row_preds = predicted_ids[row]
                for position, gold in enumerate(row_labels):
                    if gold != -100:
                        true_labels.append(gold)
                        predictions.append(row_preds[position])

    return predictions, true_labels

# Run PyTorch inference: collect garbage first so the baseline memory reading
# is not inflated by objects that are already unreachable.
_ = gc.collect()
initial_memory = get_process_memory()
print(f"Initial process memory: {initial_memory:.2f} MB")

print("\nRunning PyTorch inference...")
# measure_memory_usage returns (result, time, memory change, peak); the result
# itself is the (predictions, true_labels) pair from run_pytorch_inference.
(pytorch_predictions, true_labels), pytorch_time, pytorch_mem_change, pytorch_peak = measure_memory_usage(
    run_pytorch_inference, 
    "PyTorch"
)

# Store metrics in a module-level dict read by the comparison cell
pytorch_metrics = {
    'time': pytorch_time,
    'memory_change': pytorch_mem_change,
    'peak_memory': pytorch_peak,
    'model_size': pytorch_size
}

In [None]:
############################################
# ONNX Model Optimizer
############################################

from onnxruntime.transformers import optimizer

# Input: the ONNX file written by the export cell; output: the optimized copy
# that the ONNX benchmark cell loads.
input_model_path = "./onnx_model/distilbert_ner.onnx"
output_model_path = "./onnx_model/distilbert_ner_optimized.onnx"

# Optimize the model
# NOTE(review): the model is DistilBERT but is optimized with model_type="bert";
# the head count and hidden size below are assumed to match
# distilbert-base-uncased — confirm against the model config.
optimized_model = optimizer.optimize_model(
    input_model_path,
    model_type="bert",  
    num_heads=12,       # number of attention heads (assumed to match DistilBERT)
    hidden_size=768     # hidden size (assumed to match DistilBERT)
)

# Save the optimized model
optimized_model.save_model_to_file(output_model_path)
print(f"Optimized model saved to {output_model_path}")

In [None]:
############################################
# ONNX Model Inference
############################################

# Load the optimized ONNX model on the CPU provider only - GPU was disabled due
# to the environment's CUDA configuration for this task.
print("Available Providers:", ort.get_available_providers())
onnx_model_path = "./onnx_model/distilbert_ner_optimized.onnx"
sess_options = ort.SessionOptions()
sess_options.enable_mem_pattern = True  # Keep memory pattern optimization enabled
sess_options.enable_cpu_mem_arena = False  # CPU memory arena is turned OFF here
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL  # Sequential execution mode
onnx_session = ort.InferenceSession(onnx_model_path, sess_options=sess_options, providers=['CPUExecutionProvider'])

# Size of the single optimized .onnx file
onnx_size = get_model_size(onnx_model_path)
print(f"\nONNX Model Size: {onnx_size:.2f} MB")

# Pre-convert inputs to int64 NumPy arrays so the conversion cost is not part
# of the timed inference. NOTE: this rebinds the global `grouped_batches`;
# "labels" is passed through unchanged.
grouped_batches = {
    seq_length: {
        "input_ids": np.asarray(batch["input_ids"], dtype=np.int64),
        "attention_mask": np.asarray(batch["attention_mask"], dtype=np.int64),
        "labels": batch["labels"]
    }
    for seq_length, batch in grouped_batches.items()
}

def run_onnx_inference(batch_size=1):
    """Run ``onnx_session`` over every bucket in ``grouped_batches``.

    Args:
        batch_size: number of examples sent to the session per call. The
            default of 1 keeps the original one-example-at-a-time behaviour;
            pass a larger value to batch requests.

    Returns:
        list: predicted class ids for every position whose label is not
        ``-100``, in bucket order and then example order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    predictions = []

    for batch in grouped_batches.values():
        total_examples = len(batch["input_ids"])

        for start_idx in range(0, total_examples, batch_size):
            end_idx = min(start_idx + batch_size, total_examples)

            # Slice batch inputs
            input_ids = batch["input_ids"][start_idx:end_idx]
            attention_mask = batch["attention_mask"][start_idx:end_idx]
            current_labels = batch["labels"][start_idx:end_idx]

            # Run inference; the first output is the logits array
            ort_inputs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask
            }
            onnx_logits = onnx_session.run(None, ort_inputs)[0]
            batch_preds = np.argmax(onnx_logits, axis=2)

            # Keep only positions with a real label, via a boolean mask per row
            for row, row_labels in enumerate(current_labels):
                valid_positions = np.array(row_labels) != -100
                predictions.extend(batch_preds[row][valid_positions])

    return predictions

# Run ONNX inference: collect garbage first so the baseline memory reading is
# taken from a clean state.
_ = gc.collect()
initial_memory = get_process_memory()
print(f"Initial process memory: {initial_memory:.2f} MB")

print("\nRunning ONNX inference...")
# measure_memory_usage returns (result, time, memory change, peak)
onnx_predictions, onnx_time, onnx_mem_change, onnx_peak = measure_memory_usage(
    run_onnx_inference, 
    "ONNX"
)

# Store metrics in a module-level dict read by the comparison cell
onnx_metrics = {
    'time': onnx_time,
    'memory_change': onnx_mem_change,
    'peak_memory': onnx_peak,
    'model_size': onnx_size
}

# Print the process memory right after the measured run
current_mem = get_process_memory()
print(f"\nFinal memory after ONNX inference: {current_mem:.2f} MB")

In [None]:
############################################
# Model Comparison
############################################

def _relative_improvement(baseline, candidate):
    """Format how much smaller ``candidate`` is than ``baseline`` as a percentage.

    Returns "n/a" when ``baseline`` is 0 (e.g. no measurable memory change),
    where the ratio is undefined and a plain division would raise.
    """
    if baseline == 0:
        return "n/a"
    return f"{((baseline - candidate) / baseline) * 100:.2f}%"


# Convert predictions to named labels
true_labels_named = [selected_id2label[label] for label in true_labels]
pytorch_predictions_named = [selected_id2label[pred] for pred in pytorch_predictions]
onnx_predictions_named = [selected_id2label[pred] for pred in onnx_predictions]

# Print comparative metrics
print("\nComparative Performance Metrics:")
print("\nPyTorch Model:")
print(f"Inference Time: {pytorch_metrics['time']:.2f} seconds")
print(f"Memory Change: {pytorch_metrics['memory_change']:.2f} MB")
print(f"Peak Memory: {pytorch_metrics['peak_memory']:.2f} MB")
print(f"Model Size: {pytorch_metrics['model_size']:.2f} MB")

print("\nONNX Model:")
print(f"Inference Time: {onnx_metrics['time']:.2f} seconds")
print(f"Memory Change: {onnx_metrics['memory_change']:.2f} MB")
print(f"Peak Memory: {onnx_metrics['peak_memory']:.2f} MB")
print(f"Model Size: {onnx_metrics['model_size']:.2f} MB")

# Print relative improvements (positive = ONNX is smaller / faster)
print("\nRelative Improvements (ONNX vs PyTorch):")
print(f"Speed Improvement: {_relative_improvement(pytorch_metrics['time'], onnx_metrics['time'])}")
print(f"Memory Change Improvement: {_relative_improvement(pytorch_metrics['memory_change'], onnx_metrics['memory_change'])}")
print(f"Peak Memory Improvement: {_relative_improvement(pytorch_metrics['peak_memory'], onnx_metrics['peak_memory'])}")
print(f"Model Size Improvement: {_relative_improvement(pytorch_metrics['model_size'], onnx_metrics['model_size'])}")

# Print classification reports
print("\nPyTorch Model Performance:")
print(classification_report(
    true_labels_named,
    pytorch_predictions_named,
    labels=selected_labels,
    zero_division=0
))

print("\nONNX Model Performance:")
print(classification_report(
    true_labels_named,
    onnx_predictions_named,
    labels=selected_labels,
    zero_division=0
))

# Compare predictions position by position
num_differences = sum(o != p for o, p in zip(onnx_predictions, pytorch_predictions))
print(f"\nPrediction Differences:")
print(f"Number of different predictions: {num_differences}")
if len(onnx_predictions) > 0:
    print(f"Percentage of different predictions: {(num_differences/len(onnx_predictions))*100:.2f}%")
else:
    # Nothing was predicted, so a percentage is undefined
    print("Percentage of different predictions: n/a")

if num_differences > 0:
    print("\nExample differences between ONNX and PyTorch predictions (first 10):")
    differences_shown = 0
    for i, (o, p) in enumerate(zip(onnx_predictions_named, pytorch_predictions_named)):
        if o != p:
            print(f"Index {i}: ONNX predicted {o}, PyTorch predicted {p}")
            differences_shown += 1
            if differences_shown >= 10:
                break  # stop scanning once ten examples were shown


In [None]:
############################################
# Inference Block for ONNX and Original Model
############################################

import torch
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer and PyTorch model are both resolved from MODEL_NAME; .eval()
# returns the module itself after switching it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME).eval()

# ONNX Runtime session over the file at onnx_model_path, CPU provider only
onnx_model_path = "./onnx_model/distilbert_ner.onnx"
onnx_session = ort.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

def _merge_wordpieces(tokens, torch_labels, onnx_labels):
    """Collapse sub-word tokens into one (word, torch_label, onnx_label) row per word.

    '[CLS]' and '[SEP]' are dropped. A token starting with '##' is appended to
    the previous word, which keeps the labels of its first piece. A '##' token
    with no preceding word opens a new word under its own labels.
    """
    rows = []
    for token, torch_label, onnx_label in zip(tokens, torch_labels, onnx_labels):
        if token in ('[CLS]', '[SEP]'):
            continue
        is_continuation = token.startswith("##")
        if is_continuation and rows:
            word, first_torch_label, first_onnx_label = rows[-1]
            rows[-1] = (word + token[2:], first_torch_label, first_onnx_label)
        else:
            word = token[2:] if is_continuation else token
            rows.append((word, torch_label, onnx_label))
    return rows


def predict_ner_both(text):
    """Label `text` with the PyTorch model and the ONNX session and print both results.

    Relies on the module-level `tokenizer`, `model` and `onnx_session`. Labels
    for both backends are looked up in `model.config.id2label`.

    Args:
        text: A single input string.

    Returns:
        list: One ``(word, pytorch_label, onnx_label)`` tuple per printed word,
        in order of appearance.
    """
    # truncation=True clips very long text to the tokenizer's maximum length
    # instead of passing an over-long sequence to the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # PyTorch inference
    with torch.no_grad():
        torch_outputs = model(**inputs)
        torch_predictions = torch.argmax(torch_outputs.logits, dim=2)[0]

    # ONNX inference on the same ids/mask, converted to numpy arrays
    onnx_inputs = {
        'input_ids': inputs['input_ids'].numpy(),
        'attention_mask': inputs['attention_mask'].numpy()
    }
    onnx_outputs = onnx_session.run(None, onnx_inputs)
    onnx_predictions = np.argmax(onnx_outputs[0], axis=2)[0]

    # Map class ids to label names; int() turns numpy integers into plain keys
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    torch_labels = [model.config.id2label[p.item()] for p in torch_predictions]
    onnx_labels = [model.config.id2label[int(p)] for p in onnx_predictions]

    # Print predictions
    print("\nNER Predictions:")
    print("PyTorch vs ONNX comparison:")

    word_rows = _merge_wordpieces(tokens, torch_labels, onnx_labels)
    for word, torch_label, onnx_label in word_rows:
        print(f"{word}: PyTorch: {torch_label}, ONNX: {onnx_label}")
    return word_rows

# Test sentences, grouped under the entity label each group is written for
examples_by_label = {
    "DebtInstrumentInterestRateStatedPercentage": [
        "The loan carries an interest rate of 5.25% per annum.",
        "They secured a mortgage at 3.75% fixed rate.",
        "The bond yields 4.5% annually.",
    ],
    "LineOfCreditFacilityMaximumBorrowingCapacity": [
        "The company has a credit line of $50 million available.",
        "Their revolving credit facility has a maximum borrowing capacity of $100 million.",
        "The bank approved a credit line with $25 million borrowing limit.",
    ],
    "DebtInstrumentBasisSpreadOnVariableRate1": [
        "The loan has a spread of LIBOR + 2.5%.",
        "The variable rate includes a basis spread of 1.75% over prime.",
        "Interest is calculated at SOFR plus 200 basis points.",
    ],
    "AllocatedShareBasedCompensationExpense": [
        "The share-based compensation expense was $2.5 million.",
        "Stock options resulted in compensation expense of $750,000.",
        "They reported RSU compensation costs of $1.2 million.",
    ],
}

# Flatten the groups back into one ordered list of sentences
examples = [
    sentence
    for group_sentences in examples_by_label.values()
    for sentence in group_sentences
]

# Run the side-by-side comparison on every sentence
for text in examples:
    print(f"\nExample: {text}")
    predict_ner_both(text)

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# Run this cell before the push_to_hub cells further down.
import huggingface_hub
huggingface_hub.notebook_login()

In [None]:
import os
import re

output_dir = MODEL_NAME
# Only names of the exact form "checkpoint-<digits>" count, so an entry such as
# "checkpoint-foo" cannot break the int() conversion used for sorting.
checkpoint_pattern = re.compile(r"checkpoint-(\d+)")

# A missing output directory is treated the same as "no checkpoints".
directory_entries = os.listdir(output_dir) if os.path.isdir(output_dir) else []
checkpoints = [
    entry
    for entry in directory_entries
    if checkpoint_pattern.fullmatch(entry) and os.path.isdir(os.path.join(output_dir, entry))
]
if checkpoints:
    # Pick the checkpoint with the highest numeric suffix.
    last_checkpoint = max(
        checkpoints,
        key=lambda entry: int(checkpoint_pattern.fullmatch(entry).group(1)),
    )
    last_checkpoint_dir = os.path.join(output_dir, last_checkpoint)
    print(f"Last checkpoint directory: {last_checkpoint_dir}")
else:
    # Always bind the name so later cells never pick up a stale path left in
    # the kernel by an earlier run.
    last_checkpoint_dir = None
    print("No checkpoints found!")

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer


# Restore the model and tokenizer from the directory in last_checkpoint_dir
model = AutoModelForTokenClassification.from_pretrained(last_checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(last_checkpoint_dir)

# Upload both to the same Hub repository, model first and tokenizer second
for artifact in (model, tokenizer):
    artifact.push_to_hub(HUGGINGFACE_HUB_NAME)

print("Checkpoint pushed to Hugging Face Hub!")

In [None]:
from datasets import load_from_disk, DatasetDict

# Read the three saved splits back from disk, in train / test / validation order
train_dataset, test_dataset, validation_dataset = (
    load_from_disk(split_dir)
    for split_dir in (OUTPUT_DIR_TRAIN, OUTPUT_DIR_TEST, OUTPUT_DIR_VALIDATION)
)

# Bundle the splits under their split names
split_datasets = {
    "train": train_dataset,
    "test": test_dataset,
    "validation": validation_dataset,
}
dataset_dict = DatasetDict(split_datasets)

# Write the combined DatasetDict to the MODEL_NAME directory
dataset_dict.save_to_disk(MODEL_NAME)

In [None]:
# Upload the combined splits to the Hub under HUGGINGFACE_HUB_NAME
dataset_dict.push_to_hub(repo_id=HUGGINGFACE_HUB_NAME)
print("Dataset pushed to Hugging Face Hub!")