In [5]:
from transformers import pipeline
import pandas as pd

class NERExtractor:
    def __init__(self, model_name="dbmdz/bert-large-cased-finetuned-conll03-english"):
        # Load the NER pipeline with the specified model
        self.ner_pipeline = pipeline("ner", model=model_name, aggregation_strategy="simple")
    
    def extract_entities(self, text):
        """Extract named entities from the provided text."""
        return self.ner_pipeline(text)

    def format_entities(self, entities):
        """Format extracted entities for easy integration into a knowledge graph."""
        formatted_entities = []
        for entity in entities:
            # Standardize entity type names
            entity_type = self.map_entity_type(entity['entity_group'])
            formatted_entities.append({
                "entity": entity['word'],
                "type": entity_type,
                "score": round(entity['score'], 2)
            })
        return formatted_entities

    def map_entity_type(self, entity_group):
        """Map entity types to more readable names."""
        entity_mapping = {
            "PER": "PERSON",
            "ORG": "ORGANIZATION",
            "LOC": "LOCATION",
            "MISC": "MISCELLANEOUS"
        }
        return entity_mapping.get(entity_group, entity_group)  # Default to original if not found

    def filter_entities_by_type(self, entities, entity_types):
        """Filter entities by one or more specific types."""
        if isinstance(entity_types, str):
            entity_types = [entity_types]  # Convert single type to a list
        return [entity for entity in entities if entity['type'] in entity_types]

def main():
    # Sample text for NER (NASA-related)
    text = """
    NASA's Artemis program aims to return humans to the Moon by 2024. 
    On July 29, 2023, NASA launched the Orion spacecraft from Kennedy Space Center in Florida. 
    The mission, known as Artemis II, will carry astronauts to orbit the Moon, including commander Charles Duke.
    """
    
    # Initialize the NER extractor
    ner_extractor = NERExtractor()

    # Extract entities from the text
    extracted_entities = ner_extractor.extract_entities(text)

    # Format the extracted entities
    formatted_entities = ner_extractor.format_entities(extracted_entities)

    # Print all extracted entities
    print("Extracted Entities:")
    for entity in formatted_entities:
        print(f"Entity: {entity['entity']}, Type: {entity['type']}, Score: {entity['score']}")

    # Example: Filter entities by type
    print("\nFiltered Entities (Person):")
    person_entities = ner_extractor.filter_entities_by_type(formatted_entities, "PERSON")
    for entity in person_entities:
        print(entity)

    print("\nFiltered Entities (Organization):")
    organization_entities = ner_extractor.filter_entities_by_type(formatted_entities, "ORGANIZATION")
    for entity in organization_entities:
        print(entity)

    print("\nFiltered Entities (Location):")
    location_entities = ner_extractor.filter_entities_by_type(formatted_entities, "LOCATION")
    for entity in location_entities:
        print(entity)

if __name__ == "__main__":
    main()

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Extracted Entities:
Entity: NASA, Type: ORGANIZATION, Score: 0.9900000095367432
Entity: Artemis, Type: MISCELLANEOUS, Score: 1.0
Entity: Moon, Type: MISCELLANEOUS, Score: 0.5199999809265137
Entity: NASA, Type: ORGANIZATION, Score: 1.0
Entity: Orion, Type: MISCELLANEOUS, Score: 0.9800000190734863
Entity: Kennedy Space Center, Type: LOCATION, Score: 0.8799999952316284
Entity: Florida, Type: LOCATION, Score: 1.0
Entity: Artemis II, Type: MISCELLANEOUS, Score: 0.9900000095367432
Entity: Moon, Type: LOCATION, Score: 0.800000011920929
Entity: Charles Duke, Type: PERSON, Score: 1.0

Filtered Entities (Person):
{'entity': 'Charles Duke', 'type': 'PERSON', 'score': 1.0}

Filtered Entities (Organization):
{'entity': 'NASA', 'type': 'ORGANIZATION', 'score': 0.99}
{'entity': 'NASA', 'type': 'ORGANIZATION', 'score': 1.0}

Filtered Entities (Location):
{'entity': 'Kennedy Space Center', 'type': 'LOCATION', 'score': 0.88}
{'entity': 'Florida', 'type': 'LOCATION', 'score': 1.0}
{'entity': 'Moon', 'typ