In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Hub identifier of the token-classification checkpoint used throughout this cell
model_checkpoint = "dbmdz/bert-base-cased-finetuned-conll03-english"

# Tokenizer and model are both loaded from the same checkpoint id
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Wrap model + tokenizer in an NER pipeline. With aggregation_strategy="simple"
# the results are multi-token spans keyed by `entity_group` (see the printed output).
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Label mapping: raw label reported by the model -> readable entity type.
#
# This checkpoint reports generic names ("LABEL_0", "LABEL_4", "LABEL_8", ...)
# instead of CoNLL-03 tags, so a table keyed only by "B-PER"/"I-PER"/... never
# matches and every label is passed through unchanged. Both spellings are
# therefore listed here.
#
# Ids 0, 4 and 8 are confirmed by this cell's own output (non-entity text,
# person names, and locations respectively).
# NOTE(review): the remaining ids assume the usual dbmdz CoNLL-03 tag order
# (O, B-MISC, I-MISC, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC) — confirm
# against the checkpoint before relying on them.
label_mapping = {
    # Standard CoNLL-03 tags (B- = beginning of a span, I- = inside a span)
    "O": "O",  # Outside any entity
    "B-PER": "PER", "I-PER": "PER",
    "B-ORG": "ORG", "I-ORG": "ORG",
    "B-LOC": "LOC", "I-LOC": "LOC",
    "B-MISC": "MISC", "I-MISC": "MISC",
    # Generic id-based names emitted by this checkpoint
    "LABEL_0": "O",
    "LABEL_1": "MISC", "LABEL_2": "MISC",
    "LABEL_3": "PER", "LABEL_4": "PER",
    "LABEL_5": "ORG", "LABEL_6": "ORG",
    "LABEL_7": "LOC", "LABEL_8": "LOC",
}

# Sample passage fed to the NER pipeline. Written as adjacent string literals so
# the trailing space and newline that end each sentence are explicit rather than
# relying on invisible end-of-line whitespace inside a triple-quoted block.
text = (
    "In 2021, Dr. Jonathan Reed and his assistant, Emily Carter, conducted an unauthorized experiment at the Blackwood Research Facility in Germany. \n"
    "The experiment, named Project Genesis, involved exposing ten subjects—such as Michael Turner and Sarah Williams—to an untested neurochemical compound. \n"
    "Reports suggest that some participants, including David Larson, suffered severe cognitive impairment. \n"
    "The lead scientist, Dr. Reed, allegedly received funding from Orion Pharmaceuticals, a company with ties to undisclosed military projects. \n"
    "After an anonymous tip, Interpol arrested the researchers on July 15, 2022, leading to a controversial trial in Berlin."
)

# Run the pipeline over the whole passage; the result is a list of span dicts
entities = ner_pipeline(text)

# Dump the raw result so the structure (keys, label names, offsets) can be inspected
print(entities)  # Check the output structure

# Normalize each span's label through `label_mapping` and report it in one pass.
# The span dicts are updated in place, so `entity_group` holds the mapped label afterwards.
for entity in entities:
    # Aggregated results carry "entity_group"; fall back to "entity", then to "O"
    if "entity_group" in entity:
        raw_label = entity["entity_group"]
    else:
        raw_label = entity.get("entity", "O")

    # Labels missing from the mapping are kept as-is
    entity["entity_group"] = label_mapping.get(raw_label, raw_label)

    print(f"Entity: {entity['word']} | Label: {entity['entity_group']} | Score: {entity['score']:.4f}")


Some weights of the model checkpoint at dbmdz/bert-base-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity_group': 'LABEL_0', 'score': np.float32(0.99259037), 'word': 'In 2021, Dr.', 'start': 0, 'end': 12}, {'entity_group': 'LABEL_4', 'score': np.float32(0.9992299), 'word': 'Jonathan Reed', 'start': 13, 'end': 26}, {'entity_group': 'LABEL_0', 'score': np.float32(0.9999364), 'word': 'and his assistant,', 'start': 27, 'end': 45}, {'entity_group': 'LABEL_4', 'score': np.float32(0.99899566), 'word': 'Emily Carter', 'start': 46, 'end': 58}, {'entity_group': 'LABEL_0', 'score': np.float32(0.9999281), 'word': ', conducted an unauthorized experiment at the', 'start': 58, 'end': 103}, {'entity_group': 'LABEL_8', 'score': np.float32(0.9180295), 'word': 'Blackwood Research Facility', 'start': 104, 'end': 131}, {'entity_group': 'LABEL_0', 'score': np.float32(0.9999167), 'word': 'in', 'start': 132, 'end': 134}, {'entity_group': 'LABEL_8', 'score': np.float32(0.99896014), 'word': 'Germany', 'start': 135, 'end': 142}, {'entity_group': 'LABEL_0', 'score': np.float32(0.99993384), 'word': '. The ex