In [2]:

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [3]:
# Checkpoint identifier shared by the tokenizer and the model below, so both are
# loaded from the same source.
model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Token-classification model used for NER. The load message recorded below this
# cell reports that the checkpoint's pooler weights are not used by this head.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Build the NER pipeline from the model and tokenizer loaded above.
# With aggregation_strategy="simple" the results are reported per entity group
# (the `entity_group` key read by later cells) rather than per token; the
# recorded output of the next cell shows "Elon Musk" returned as a single entry.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cpu


In [5]:

# Sample sentence containing a person, an organization, a location and two
# date-like strings.
text = "Elon Musk founded Tesla in California on March 12, 2024, and 2003."


# Run the NER pipeline on the sample sentence.
entities = ner_pipeline(text)

# Print one line per detected entity: its text, its label and the confidence
# rounded to two decimals. In the recorded output only the person,
# organization and location are listed; neither date is reported.
for entity in entities:
    print(f"Word: {entity['word']}, Entity: {entity['entity_group']}, Score: {entity['score']:.2f}")


Word: Elon Musk, Entity: PER, Score: 1.00
Word: Tesla, Entity: ORG, Score: 1.00
Word: California, Entity: LOC, Score: 1.00


In [6]:
import re


# Entity labels (as reported in the pipeline's `entity_group` field) that must
# be masked: persons, organizations and locations.
anonymize_types = ["PER", "ORG", "LOC"]


# Regular expressions for date-like strings, applied one after another in list
# order. Order matters: the bare 4-digit-year pattern must stay LAST, otherwise
# it would consume the year of a longer date and leave the day/month behind
# (e.g. "March 12, XXX").
date_patterns = [
    r"\b\d{1,2}/\d{1,2}/\d{2,4}\b",  # MM/DD/YYYY or DD/MM/YYYY
    r"\b\d{4}-\d{1,2}-\d{1,2}\b",    # YYYY-MM-DD
    r"\b\d{1,2} (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b",  # "12 March 2024", "12 Mar 2024"
    # Month-first dates. Accepts the same abbreviated/full month names as the
    # day-first pattern above (plus an optional trailing dot), so that
    # "Mar 12, 2024" and "Sept. 5, 2024" are masked as a whole instead of only
    # losing their year to the last pattern.
    r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.? \d{1,2},? \d{4}\b",  # "March 12, 2024", "Mar 12 2024"
    r"\b\d{4}\b"  # Any standalone 4-digit number, treated as a year (e.g., "2003")
]


def anonymize_text(text, ner=None, entity_types=None, patterns=None, placeholder="XXX"):
    """Mask named entities and date-like strings in ``text``.

    Entities are masked by the character offsets (``start``/``end``) reported
    by the tagger rather than by searching for the entity's ``word`` string.
    This avoids corrupting longer words that merely contain the entity text
    (e.g. "Ann" inside "Annabel") and does not depend on ``word`` being an
    exact copy of the source text. Other whole-word occurrences of a tagged
    entity's text are masked as well, so a repeated mention that the tagger
    missed is still hidden.

    Args:
        text: The string to anonymize.
        ner: Callable taking the text and returning an iterable of dicts with
            an ``entity_group`` key and either ``start``/``end`` offsets or a
            ``word`` string. Defaults to the notebook's ``ner_pipeline``.
        entity_types: Labels to mask. Defaults to ``anonymize_types``.
        patterns: Regular expressions masked after the entities, applied in
            order. Defaults to ``date_patterns``.
        placeholder: Replacement text inserted for every masked span.

    Returns:
        The anonymized string.
    """
    # Resolve defaults at call time so edits to the notebook globals take
    # effect without redefining this function.
    if ner is None:
        ner = ner_pipeline
    if entity_types is None:
        entity_types = anonymize_types
    if patterns is None:
        patterns = date_patterns

    spans = []        # (start, end) offsets into the ORIGINAL text
    surfaces = set()  # literal entity strings, used to catch repeated mentions
    for entity in ner(text):
        if entity["entity_group"] not in entity_types:
            continue
        start, end = entity.get("start"), entity.get("end")
        if start is not None and end is not None:
            spans.append((start, end))
            surfaces.add(text[start:end])
        elif entity.get("word"):
            # No offsets available: fall back to the reported word.
            surfaces.add(entity["word"])

    # Splice from right to left so earlier offsets remain valid after each
    # replacement changes the string length.
    for start, end in sorted(spans, reverse=True):
        text = text[:start] + placeholder + text[end:]

    # Mask remaining whole-word occurrences of the tagged strings. Longest
    # first, so a short entity cannot break up a longer one; the secondary
    # sort key keeps the order deterministic.
    for surface in sorted(surfaces, key=lambda s: (-len(s), s)):
        if not surface.strip():
            continue  # an empty needle would match between every character
        whole_word = r"(?<!\w)" + re.escape(surface) + r"(?!\w)"
        text = re.sub(whole_word, lambda _match: placeholder, text)

    # A function replacement keeps the placeholder literal (no backslash or
    # group-reference interpretation by re.sub).
    for pattern in patterns:
        text = re.sub(pattern, lambda _match: placeholder, text)

    return text


# Sample passage containing a year, several person names, a facility name and
# a country, used to exercise anonymize_text.
input_text = """In 2021, Dr. Jonathan Reed and his assistant, Emily Carter, conducted an unauthorized experiment at the Blackwood Research Facility in Germany. 
The experiment, named Project Genesis, involved exposing ten subjects—such as Michael Turner and Sarah Williams—to an untested neurochemical compound.
"""
# Mask the entities and date-like strings in the sample passage.
anonymized_text = anonymize_text(input_text)

# Show the masked passage. In the recorded output "Project Genesis" is left
# unmasked.
print("Anonymized Text:\n", anonymized_text)


Anonymized Text:
 In XXX, Dr. XXX and his assistant, XXX, conducted an unauthorized experiment at the XXX in XXX. 
The experiment, named Project Genesis, involved exposing ten subjects—such as XXX and XXX—to an untested neurochemical compound.

