In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load pre-trained NER model
model_checkpoint = "dbmdz/bert-base-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample text
text = "Elon Musk founded Tesla in California in 2003."

# Get NER predictions
entities = ner_pipeline(text)

# Print entity results
for entity in entities:
    print(f"Word: {entity['word']}, Entity: {entity['entity_group']}, Score: {entity['score']:.2f}")


Word: Elon Musk, Entity: LABEL_4, Score: 0.90
Word: founded, Entity: LABEL_0, Score: 1.00
Word: Tesla, Entity: LABEL_6, Score: 0.96
Word: in, Entity: LABEL_0, Score: 1.00
Word: California, Entity: LABEL_8, Score: 1.00
Word: in 2003., Entity: LABEL_0, Score: 1.00


In [14]:
# Load pre-trained NER model
model_checkpoint = "dbmdz/bert-base-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
label_mapping = {
    "LABEL_0": "O",  # Outside any entity
    "LABEL_2": "MISC",
    "LABEL_4": "PER",  # Person
    "LABEL_6": "ORG",  # Organization
    "LABEL_8": "LOC"   # Location
}
# Sample text for anonymization
text = """In 2021, Dr. Jonathan Reed and his assistant, Emily Carter, conducted an unauthorized experiment at the Blackwood Research Facility in Germany. 
The experiment, named Project Genesis, involved exposing ten subjects—such as Michael Turner and Sarah Williams—to an untested neurochemical compound."""

# Get NER predictions
entities = ner_pipeline(text)
# Define which entity types to anonymize
anonymize_types = ["PER", "ORG", "LOC"]

# Replace detected entities if their type matches
for entity in entities:
    entity_label = label_mapping.get(entity['entity'], "UNKNOWN")
    if entity_label in anonymize_types:
        text = text.replace(entity["word"], "XXX")

print("Anonymized Text:\n", text)

Anonymized Text:
 In 2021, Dr. XXX XXX and his assistant, XXX XXX, conducted an unauthorized experiment at the XXXwood XXX XXX in XXX. 
The experiment, named Project Genesis, involved exposing ten subjects—such as XXX XXX and XXX XXX—to an untested neurochemical compound.


In [15]:
import os
import warnings
import logging
import re
import random
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForMaskedLM, pipeline, FillMaskPipeline

# Suppress transformers warnings
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
warnings.simplefilter("ignore")  # Ignore all warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

# Load pre-trained NER model (for detecting entities)
ner_model_checkpoint = "dbmdz/bert-base-cased-finetuned-conll03-english"
ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_checkpoint)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_checkpoint)
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

# Load pre-trained BERT MLM model (for replacements)
mlm_model_checkpoint = "bert-base-cased"
mlm_tokenizer = AutoTokenizer.from_pretrained(mlm_model_checkpoint)
mlm_model = AutoModelForMaskedLM.from_pretrained(mlm_model_checkpoint)
mlm_pipeline = pipeline("fill-mask", model=mlm_model, tokenizer=mlm_tokenizer)

# Label mapping based on observed outputs
label_mapping = {
    "LABEL_0": "O",  # Outside any entity
    "LABEL_2": "MISC",
    "LABEL_4": "PER",  # Person
    "LABEL_6": "ORG",  # Organization
    "LABEL_8": "LOC"   # Location
}

# Choose which entity types to anonymize
anonymize_types = ["PER", "ORG", "LOC"]  # Modify this list as needed

# Sample text for NER
text = """In 2021, Dr. Jonathan Reed and his assistant, Emily Carter, conducted an unauthorized experiment at the Blackwood Research Facility in Germany. 
The experiment, named Project Genesis, involved exposing ten subjects—such as Michael Turner and Sarah Williams to an untested neurochemical compound.
"""

# Get NER predictions
entities = ner_pipeline(text)

# Convert entity labels using the mapping
for entity in entities:
    entity_label = entity.get("entity_group", entity.get("entity", "O"))
    entity["entity_group"] = label_mapping.get(entity_label, entity_label)

# Function to predict replacements using Masked Language Model (MLM)
def get_mlm_replacement(original_text, masked_text):
    """ Uses BERT MLM to predict a contextual replacement for a masked entity """
    prediction = mlm_pipeline(masked_text, top_k=5)  # Get top 5 predictions
    for pred in prediction:
        if pred["token_str"] not in original_text:  # Ensure it's a different word
            return pred["token_str"]
    return "UNKNOWN"

# Replace entities with BERT-generated alternatives
masked_text = text
for entity in sorted(entities, key=lambda x: x['start'], reverse=True):
    if entity["entity_group"] in anonymize_types:
        masked_version = masked_text[:entity["start"]] + "[MASK]" + masked_text[entity["end"]:]
        replacement = get_mlm_replacement(text, masked_version)
        masked_text = masked_text[:entity["start"]] + replacement + masked_text[entity["end"]:]

# Print results
print("Original Text:\n", text)
print("\nAnonymized Text:\n", masked_text)


Original Text:
 In 2021, Dr. Jonathan Reed and his assistant, Emily Carter, conducted an unauthorized experiment at the Blackwood Research Facility in Germany. 
The experiment, named Project Genesis, involved exposing ten subjects—such as Michael Turner and Sarah Williams to an untested neurochemical compound.


Anonymized Text:
 In 2021, Dr. Smith and his assistant, James, conducted an unauthorized experiment at the university in California. 
The experiment, named Project Genesis, involved exposing ten subjects—such as himself and others to an untested neurochemical compound.

