In [4]:
# %%
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# %%
# Directory containing the trained French NER model (tokenizer + weights).
# The path is relative, so it is resolved against the notebook's current
# working directory when the model is loaded.
model_path = "../deploy/french_ner_model/"

In [5]:

# Load the tokenizer and the token-classification model.
# Both are read from the same `model_path` directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

In [9]:

# %%
# Build the NER inference pipeline around the model and tokenizer loaded above.
# With aggregation_strategy="simple" the pipeline returns one dict per entity
# span (keys: entity_group, score, word, start, end) rather than one per token.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cpu


In [10]:

# %%
# French sample text used as input for the NER pipeline.
# The triple-quoted literal opens with a line break, so the string starts with
# "\n"; the start/end character offsets reported for entities count that
# leading newline.
text = """
En 2021, le Dr Jonathan Reed et son assistante, Emily Carter, ont mené une expérience non autorisée au centre de recherche Blackwood en Allemagne.
L'expérience, appelée Projet Genesis, consistait à exposer dix sujets — tels que Michael Turner et Sarah Williams — à un composé neurochimique non testé.
"""

In [11]:

# %%
# Run the NER pipeline on the sample text. The result is a list of dicts, one
# per detected entity, whose start/end values are character offsets into `text`.
entities = ner_pipeline(text)

In [12]:

# %%
# Show every detected entity, one dict per output line
for detected in entities:
    print(detected)


{'entity_group': 'LOC', 'score': np.float32(0.5343956), 'word': '20', 'start': 4, 'end': 6}
{'entity_group': 'LOC', 'score': np.float32(0.53306395), 'word': '21', 'start': 6, 'end': 8}
{'entity_group': 'PER', 'score': np.float32(0.9198627), 'word': 'Jonathan Reed', 'start': 16, 'end': 29}
{'entity_group': 'PER', 'score': np.float32(0.9221792), 'word': 'Emily Carter', 'start': 49, 'end': 61}
{'entity_group': 'ORG', 'score': np.float32(0.8254844), 'word': 'centre de recherche Blackwood', 'start': 104, 'end': 133}
{'entity_group': 'LOC', 'score': np.float32(0.8911899), 'word': 'Allemagne', 'start': 137, 'end': 146}
{'entity_group': 'ORG', 'score': np.float32(0.8803517), 'word': 'Projet Genesis', 'start': 170, 'end': 184}
{'entity_group': 'PER', 'score': np.float32(0.9226653), 'word': 'Michael Turner', 'start': 229, 'end': 243}
{'entity_group': 'PER', 'score': np.float32(0.92147684), 'word': 'Sarah Williams', 'start': 247, 'end': 261}
{'entity_group': 'ORG', 'score': np.float32(0.42111385)

In [11]:
# %%
# Anonymize text by replacing PER and ORG entities with 'XXX'
def anonymize_text(text, entities, labels=("PER", "ORG"), mask="XXX", min_score=0.0):
    """Return a copy of ``text`` with the selected entity spans replaced by ``mask``.

    Args:
        text: The string that the entity character offsets refer to.
        entities: Iterable of dicts with ``"entity_group"``, ``"start"`` and
            ``"end"`` keys. A ``"score"`` key is optional; entities without
            one are treated as having a score of 1.0.
        labels: Entity groups to redact. Defaults to persons and organisations.
        mask: Replacement string written in place of each redacted span.
        min_score: Entities whose score is below this value are left in the
            text. The default of 0.0 keeps every detection.

    Returns:
        The redacted string. Spans that overlap (or are exact duplicates) are
        merged and replaced by a single ``mask``; spans that merely touch are
        masked separately.
    """
    # Keep only the (start, end) offsets of entities that should be redacted,
    # ordered from the beginning of the text.
    spans = sorted(
        (entity["start"], entity["end"])
        for entity in entities
        if entity["entity_group"] in labels
        and entity.get("score", 1.0) >= min_score
    )

    # Merge overlapping spans first: replacing them one at a time would apply
    # the second span's offsets to text that the first replacement already
    # shortened, corrupting the result.
    merged = []
    for start, end in spans:
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Rebuild the text in one pass: untouched text between spans, then the mask.
    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(mask)
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)

# Redact the sample text using the entity spans returned by the NER pipeline.
# NOTE(review): every PER/ORG detection is masked regardless of its score; the
# recorded output shows "composé XXX non testé", apparently caused by the
# low-confidence (0.42) ORG detection — confirm whether that is intended.
anonymized_text = anonymize_text(text, entities)

# Print the redacted text
print(anonymized_text)



En 2021, le Dr XXX et son assistante, XXX, ont mené une expérience non autorisée au XXX en Allemagne.
L'expérience, appelée XXX, consistait à exposer dix sujets — tels que XXX et XXX — à un composé XXX non testé.

