In [4]:
import torch
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from pathlib import Path
import numpy as np

In [5]:
# Example passage used as model input. It mixes several kinds of personal
# details (name, dates, e-mail, phone, street address, employer, SSN).
# Adjacent string literals are concatenated by Python into one single-line
# string, so each piece carries its own trailing space.
sample_text = (
    "John Doe was born on January 1, 1990, in New York City. "
    "He has lived in NYC all his life and graduated from Columbia University "
    "in 2012 with a degree in Computer Science. "
    "John's email address is johndoe@example.com, "
    "and his phone number is (555) 123-4567. "
    "Recently, he moved to a new house located at 123 Main St, New York, NY 10001. "
    "John works at TechSolutions, where he is a senior software engineer. "
    "His Social Security number is 123-45-6789."
)

In [6]:
# Paths and configurations
DEBERTA_MODEL_PATH = "models/deberta3base_1024"
INFERENCE_MAX_LENGTH = 2048  # tokenizer truncates inputs longer than this many tokens

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(DEBERTA_MODEL_PATH)

# Load id2label mapping. It is read straight from config.json, so the keys are
# JSON strings ("0", "1", ...), which is why the lookup below goes through str().
# The context manager closes the file handle as soon as parsing finishes.
with open(Path(DEBERTA_MODEL_PATH) / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# Tokenize the sample text (batch of one, returned as PyTorch tensors)
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, max_length=INFERENCE_MAX_LENGTH)

# Retrieve tokens from IDs (unpadded: one entry per real input position)
tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

# Prepare the data collator. Padding to a multiple of 16 means the collated
# batch can be longer than `tokens`; the pad positions are removed again below.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# Wrap the encoded sample in a Dataset/DataLoader so it goes through the collator
dataset = Dataset.from_dict(inputs)
dataset.set_format("torch")
dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collator)

# Prediction setup
# NOTE(review): `trainer` is not used by the prediction loop below; it is kept so
# the names this cell defines stay the same. Constructing a Trainer may place the
# model on an accelerator device, so every batch is moved to `model.device` before
# the forward pass.
args = TrainingArguments(".", per_device_eval_batch_size=1, report_to="none")
trainer = Trainer(model=model, args=args, data_collator=collator, tokenizer=tokenizer)

# Predict and convert logits to labels
model.eval()  # inference mode: disables dropout
for batch in dataloader:
    # Keep inputs on the same device as the model to avoid a device-mismatch error.
    batch = {name: tensor.to(model.device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)

    # The batch size is 1, so row 0 is the whole sample. Drop positions the
    # collator added as padding so that pred_labels lines up 1:1 with `tokens`.
    token_predictions = predictions[0]
    if "attention_mask" in batch:
        token_predictions = token_predictions[batch["attention_mask"][0].bool()]

    # Convert predictions to readable labels using id2label
    pred_labels = [id2label[str(int(index))] for index in token_predictions]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [7]:
def reconstruct_text(tokens, labels=None, special_tokens=("[CLS]", "[SEP]", "[PAD]")):
    """Join word-piece tokens back into a plain text string.

    A token that starts with the word-boundary marker "▁" (U+2581, not an
    ASCII underscore) begins a new word: the marker is stripped and a single
    space is inserted before it unless no text has been produced yet. Any
    other token is a continuation piece and is appended with no separator.

    Args:
        tokens: Iterable of token strings.
        labels: Unused. Accepted only so existing calls of the form
            ``reconstruct_text(tokens, labels)`` keep working. Its length no
            longer affects the result (previously the output was silently cut
            off at ``len(labels)``).
        special_tokens: Token strings to drop from the output entirely.

    Returns:
        The reconstructed text as a ``str`` ("" when there is nothing to join).
    """
    # Collect pieces and join once instead of growing a string with `+=`.
    # Only non-empty strings are stored, so `pieces` being non-empty is
    # equivalent to "some text has already been emitted".
    pieces = []
    for token in tokens:
        # Skip special tokens
        if token in special_tokens:
            continue
        if token.startswith("▁"):
            # Separate from the previous word (but never lead with a space)
            if pieces:
                pieces.append(" ")
            word = token[1:]
            if word:
                pieces.append(word)
        elif token:
            # Continuation piece: glue directly onto the current word
            pieces.append(token)

    return "".join(pieces)

In [8]:
# Rebuild the passage from the tokenizer's word pieces; as the last expression
# in the cell, the returned string is displayed as the cell output.
reconstruct_text(tokens=tokens, labels=pred_labels)

"John Doe was born on January 1, 1990, in New York City. He has lived in NYC all his life and graduated from Columbia University in 2012 with a degree in Computer Science. John's email address is johndoe@example.com, and his phone number is (555) 123-4567. Recently, he moved to a new house located at 123 Main St, New York, NY 10001. John works at TechSolutions, where he is a senior software engineer. His Social Security number is 123-45-6789."