In [1]:
# %%
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# %%
# Directory of the trained French NER model, relative to the working directory.
# Both the tokenizer and the model are loaded from this path.
model_path = "./french_ner_model"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the tokenizer and the token-classification model, both from `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

In [3]:

# %%
# Wrap the trained model and its tokenizer in a Hugging Face NER pipeline.
# Later cells read `entity_group`, `word`, `score`, `start` and `end` from
# each result produced with this aggregation strategy.
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

In [7]:

# %%
# French sample text used as input for the NER pipeline. It mentions several
# person names, a research centre, a project name and a country.
# Note: the triple-quoted literal starts and ends with a newline character.
text = """
En 2021, le Dr Jonathan Reed et son assistante, Emily Carter, ont mené une expérience non autorisée au centre de recherche Blackwood en Allemagne.
L'expérience, appelée Projet Genesis, consistait à exposer dix sujets — tels que Michael Turner et Sarah Williams — à un composé neurochimique non testé.
"""

In [8]:

# %%
# Run the NER pipeline on the sample text. Later cells iterate over the result
# and read the `word`, `entity_group`, `score`, `start` and `end` keys of each item.
entities = ner_pipeline(text)

In [6]:

# %%
# Report each detected entity on its own line: surface text, predicted group,
# and confidence score rounded to two decimals.
for entity in entities:
    print(
        f"Word: {entity['word']}, "
        f"Entity: {entity['entity_group']}, "
        f"Score: {entity['score']:.2f}"
    )


Word: Emmanuel Macron, Entity: PER, Score: 0.89
Word: président, Entity: ORG, Score: 0.58
Word: de la République, Entity: PER, Score: 0.51
Word: française, Entity: ORG, Score: 0.50


In [11]:
# %%
# Anonymize text by replacing PER and ORG entities with 'XXX'
def anonymize_text(text, entities, labels=("PER", "ORG"), placeholder="XXX"):
    """Mask selected named entities in ``text`` with a placeholder.

    Args:
        text: The string that the entity offsets refer to.
        entities: Iterable of dicts, each with an ``"entity_group"`` label and
            ``"start"`` / ``"end"`` character offsets into ``text`` (used as
            slice bounds, so ``end`` is exclusive).
        labels: Collection of entity groups to mask. Defaults to
            ``("PER", "ORG")``.
        placeholder: String written in place of each masked span. Defaults to
            ``"XXX"``.

    Returns:
        A new string in which every selected span is replaced by
        ``placeholder``. Spans that overlap (or are exact duplicates) are
        merged and replaced once, so the result never contains leftovers of a
        partially masked entity.
    """
    # Collect the (start, end) offsets of the entities to mask, left to right.
    spans = sorted(
        (e["start"], e["end"])
        for e in entities
        if e["entity_group"] in labels
    )

    # Merge overlapping spans. Replacing them one by one would shift the
    # offsets of the spans they overlap with and garble the output.
    merged = []
    for start, end in spans:
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Rebuild the text in a single pass: untouched text between spans is
    # copied verbatim, each span contributes one placeholder.
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)

# Mask the entities detected by the pipeline in the sample text.
anonymized_text = anonymize_text(text, entities)

# Show the anonymized text.
print(anonymized_text)



En 2021, le Dr XXX et son assistante, XXX, ont mené une expérience non autorisée au XXX en Allemagne.
L'expérience, appelée XXX, consistait à exposer dix sujets — tels que XXX et XXX — à un composé XXX non testé.

