In [29]:
import torch
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification
from pathlib import Path


In [30]:
# Synthetic passage used as model input. It mixes several PII-looking strings
# (name, date, email, phone number, street address, SSN-style number).
# Adjacent string literals are concatenated into one single-line string.
sample_text = (
    "John Doe was born on January 1, 1990, in New York City. "
    "He has lived in NYC all his life and graduated from Columbia University "
    "in 2012 with a degree in Computer Science. "
    "John's email address is johndoe@example.com, and his phone number is "
    "(555) 123-4567. "
    "Recently, he moved to a new house located at 123 Main St, New York, "
    "NY 10001. "
    "John works at TechSolutions, where he is a senior software engineer. "
    "His Social Security number is 123-45-6789."
)

In [31]:
# Paths and configuration
DEBERTA_MODEL_PATH = "models/deberta3base_1024"
# NOTE(review): the checkpoint directory name mentions 1024 while this cap is 2048 —
# confirm which maximum length is intended.
INFERENCE_MAX_LENGTH = 2048

# Load the tokenizer and the token-classification model from the checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(DEBERTA_MODEL_PATH)

# Load the id -> label mapping from the checkpoint's config.json.
# The context manager closes the file handle instead of leaking it.
with open(Path(DEBERTA_MODEL_PATH) / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)
# JSON object keys are strings, which is why lookups below use str(class_id).
id2label = config["id2label"]

# Tokenize the sample text; a single string yields a batch of size 1
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, max_length=INFERENCE_MAX_LENGTH, padding=True)

# Boolean mask of real (non-padded) positions for the single sequence
active_tokens = inputs['attention_mask'].squeeze(0) == 1

# Retrieve token strings from IDs, keeping only non-padded positions so that
# `tokens` stays index-aligned with `pred_labels` even if padding is present.
all_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
tokens = [token for token, keep in zip(all_tokens, active_tokens.tolist()) if keep]

# Predict (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Most likely class id per position, with the batch dimension removed
predictions = logits.argmax(dim=-1).squeeze(0)

# Drop predictions at padded positions. Special tokens such as [CLS]/[SEP]
# are NOT removed here; they keep their predicted label.
filtered_predictions = predictions[active_tokens]

# Convert class ids to label names
pred_labels = [id2label[str(pred.item())] for pred in filtered_predictions]


In [44]:
def reconstruct_text(tokens, labels):
    """Join SentencePiece-style token pieces back into a plain string.

    Pieces that start with the word-boundary marker "▁" (U+2581, which is not
    an ASCII underscore) begin a new word; every other piece is glued onto the
    preceding text without a space.

    Args:
        tokens: Iterable of token strings, e.g. the result of
            ``tokenizer.convert_ids_to_tokens``.
        labels: Iterable of per-token labels. The label values are not used;
            they only bound the walk, because tokens and labels are zipped
            and iteration stops at the shorter of the two.

    Returns:
        The reconstructed text with "[CLS]", "[SEP]" and "[PAD]" removed.
    """
    # "[PAD]" is skipped as well so padded sequences do not leak literal
    # "[PAD]" markers into the reconstructed text.
    special_tokens = ("[CLS]", "[SEP]", "[PAD]")

    pieces = []
    has_text = False  # True once any non-empty piece has been emitted
    for token, _label in zip(tokens, labels):
        if token in special_tokens:
            continue
        if token.startswith("▁"):
            # New word: separate it from earlier text, but never lead with a space.
            if has_text:
                pieces.append(" ")
            piece = token[1:]
        else:
            # Continuation piece (sub-word or punctuation): append directly.
            piece = token
        if piece:
            pieces.append(piece)
            has_text = True

    # Single join instead of repeated string concatenation inside the loop.
    return "".join(pieces)

In [45]:
# Rebuild a plain string from the token pieces; as the cell's last expression,
# the returned string is shown as the cell output.
reconstruct_text(tokens, pred_labels)

reconstructed_text = 
WITH UNDERSCORE
reconstructed_text = John
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was born
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was born on
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was born on January
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was born on January 1
reconstructed_text = John Doe was born on January 1,
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was born on January 1, 1990
reconstructed_text = John Doe was born on January 1, 1990,
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was born on January 1, 1990, in
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe was born on January 1, 1990, in New
WITH UNDERSCORE
Adding space!!!!!!!
reconstructed_text = John Doe w

"John Doe was born on January 1, 1990, in New York City. He has lived in NYC all his life and graduated from Columbia University in 2012 with a degree in Computer Science. John's email address is johndoe@example.com, and his phone number is (555) 123-4567. Recently, he moved to a new house located at 123 Main St, New York, NY 10001. John works at TechSolutions, where he is a senior software engineer. His Social Security number is 123-45-6789."

In [34]:
# Show every token beside its predicted label, one "token: label" pair per line.
for token, label in zip(tokens, pred_labels):
    print(token, label, sep=": ")

[CLS]: O
▁John: B-NAME_STUDENT
▁Doe: I-NAME_STUDENT
▁was: O
▁born: O
▁on: O
▁January: O
▁1: O
,: O
▁1990: O
,: O
▁in: O
▁New: O
▁York: O
▁City: O
.: O
▁He: O
▁has: O
▁lived: O
▁in: O
▁NYC: O
▁all: O
▁his: O
▁life: O
▁and: O
▁graduated: O
▁from: O
▁Columbia: O
▁University: O
▁in: O
▁2012: O
▁with: O
▁a: O
▁degree: O
▁in: O
▁Computer: O
▁Science: O
.: O
▁John: B-NAME_STUDENT
': O
s: O
▁email: O
▁address: O
▁is: O
▁john: B-EMAIL
do: B-EMAIL
e: B-EMAIL
@: B-EMAIL
example: B-EMAIL
.: B-EMAIL
com: B-EMAIL
,: O
▁and: O
▁his: O
▁phone: O
▁number: O
▁is: O
▁(: O
555: O
): O
▁123: O
-: O
45: O
67: O
.: O
▁Recently: O
,: O
▁he: O
▁moved: O
▁to: O
▁a: O
▁new: O
▁house: O
▁located: O
▁at: O
▁123: B-STREET_ADDRESS
▁Main: I-STREET_ADDRESS
▁St: I-STREET_ADDRESS
,: O
▁New: O
▁York: O
,: O
▁NY: O
▁1000: O
1: O
.: O
▁John: O
▁works: O
▁at: O
▁Tech: O
Solutions: O
,: O
▁where: O
▁he: O
▁is: O
▁a: O
▁senior: O
▁software: O
▁engineer: O
.: O
▁His: O
▁Social: O
▁Security: O
▁number: O
▁is: O
▁123: O
-: O
45:

In [35]:
# Display the token list (a bare last expression, so the notebook renders its repr).
tokens

['[CLS]',
 '▁John',
 '▁Doe',
 '▁was',
 '▁born',
 '▁on',
 '▁January',
 '▁1',
 ',',
 '▁1990',
 ',',
 '▁in',
 '▁New',
 '▁York',
 '▁City',
 '.',
 '▁He',
 '▁has',
 '▁lived',
 '▁in',
 '▁NYC',
 '▁all',
 '▁his',
 '▁life',
 '▁and',
 '▁graduated',
 '▁from',
 '▁Columbia',
 '▁University',
 '▁in',
 '▁2012',
 '▁with',
 '▁a',
 '▁degree',
 '▁in',
 '▁Computer',
 '▁Science',
 '.',
 '▁John',
 "'",
 's',
 '▁email',
 '▁address',
 '▁is',
 '▁john',
 'do',
 'e',
 '@',
 'example',
 '.',
 'com',
 ',',
 '▁and',
 '▁his',
 '▁phone',
 '▁number',
 '▁is',
 '▁(',
 '555',
 ')',
 '▁123',
 '-',
 '45',
 '67',
 '.',
 '▁Recently',
 ',',
 '▁he',
 '▁moved',
 '▁to',
 '▁a',
 '▁new',
 '▁house',
 '▁located',
 '▁at',
 '▁123',
 '▁Main',
 '▁St',
 ',',
 '▁New',
 '▁York',
 ',',
 '▁NY',
 '▁1000',
 '1',
 '.',
 '▁John',
 '▁works',
 '▁at',
 '▁Tech',
 'Solutions',
 ',',
 '▁where',
 '▁he',
 '▁is',
 '▁a',
 '▁senior',
 '▁software',
 '▁engineer',
 '.',
 '▁His',
 '▁Social',
 '▁Security',
 '▁number',
 '▁is',
 '▁123',
 '-',
 '45',
 '-',
 '6',
 '7

In [36]:
def clean_tokens_deberta(tokens):
    """Strip special tokens and leading "▁" markers from a list of token strings.

    Special tokens are removed by value rather than by position, so the input
    does not have to start with "[CLS]" and end with "[SEP]": real tokens are
    never dropped when those markers are absent, and "[SEP]"/"[PAD]" are still
    removed when padding follows the separator.

    Args:
        tokens: List of token strings.

    Returns:
        A new list with "[CLS]", "[SEP]" and "[PAD]" removed and the leading
        "▁" (U+2581) stripped from each remaining token. A token consisting
        only of "▁" is kept unchanged so no empty string is produced.

    Side effects:
        Prints the cleaned list.
    """
    special_tokens = {"[CLS]", "[SEP]", "[PAD]"}
    content_tokens = [token for token in tokens if token not in special_tokens]
    removed_underscores = [
        token[1:] if (token.startswith("▁") and len(token) > 1) else token
        for token in content_tokens
    ]
    print(removed_underscores)
    return removed_underscores

In [37]:
# Hard-coded token list (it starts with '[CLS]' and ends with '[SEP]') fed to
# clean_tokens_deberta.
# NOTE(review): this rebinds the name `tokens`, shadowing any value produced by an
# earlier cell; it looks like a pasted copy of the tokenizer output — confirm.
tokens = ['[CLS]', '▁John', '▁Doe', '▁was', '▁born', '▁on', '▁January', '▁1', ',', '▁1990', ',', '▁in', '▁New', '▁York', '▁City', '.', '▁He', '▁has', '▁lived', '▁in', '▁NYC', '▁all', '▁his', '▁life', '▁and', '▁graduated', '▁from', '▁Columbia', '▁University', '▁in', '▁2012', '▁with', '▁a', '▁degree', '▁in', '▁Computer', '▁Science', '.', '▁John', "'", 's', '▁email', '▁address', '▁is', '▁john', 'do', 'e', '@', 'example', '.', 'com', ',', '▁and', '▁his', '▁phone', '▁number', '▁is', '▁(', '555', ')', '▁123', '-', '45', '67', '.', '▁Recently', ',', '▁he', '▁moved', '▁to', '▁a', '▁new', '▁house', '▁located', '▁at', '▁123', '▁Main', '▁St', ',', '▁New', '▁York', ',', '▁NY', '▁1000', '1', '.', '▁John', '▁works', '▁at', '▁Tech', 'Solutions', ',', '▁where', '▁he', '▁is', '▁a', '▁senior', '▁software', '▁engineer', '.', '▁His', '▁Social', '▁Security', '▁number', '▁is', '▁123', '-', '45', '-', '6', '789', '.', '[SEP]']
# The function prints the cleaned list and also returns it.
cleaned_tokens = clean_tokens_deberta(tokens)

['John', 'Doe', 'was', 'born', 'on', 'January', '1', ',', '1990', ',', 'in', 'New', 'York', 'City', '.', 'He', 'has', 'lived', 'in', 'NYC', 'all', 'his', 'life', 'and', 'graduated', 'from', 'Columbia', 'University', 'in', '2012', 'with', 'a', 'degree', 'in', 'Computer', 'Science', '.', 'John', "'", 's', 'email', 'address', 'is', 'john', 'do', 'e', '@', 'example', '.', 'com', ',', 'and', 'his', 'phone', 'number', 'is', '(', '555', ')', '123', '-', '45', '67', '.', 'Recently', ',', 'he', 'moved', 'to', 'a', 'new', 'house', 'located', 'at', '123', 'Main', 'St', ',', 'New', 'York', ',', 'NY', '1000', '1', '.', 'John', 'works', 'at', 'Tech', 'Solutions', ',', 'where', 'he', 'is', 'a', 'senior', 'software', 'engineer', '.', 'His', 'Social', 'Security', 'number', 'is', '123', '-', '45', '-', '6', '789', '.']
