# Named Entity Recognition and Linking with Impresso BERT models

## Good to know before starting


In [None]:
# Import necessary modules from the transformers library
from transformers import pipeline
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Identifier of the Impresso NER model used for token classification.
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# Tokenizer that belongs to MODEL_NAME; it is handed to the NER pipeline later.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
)


In [None]:
# Build the named entity recognition (NER) pipeline from MODEL_NAME and the
# tokenizer loaded for it.
# NOTE(review): trust_remote_code=True lets transformers run code that ships
# with the model repository — confirm the repository is trusted.
nlp = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    trust_remote_code=True,
)


In [None]:
def print_nicely(results):
    """Print each non-empty entity group of ``results`` as a fixed-width table.

    Args:
        results: Mapping from a group name to a list of entity records. Every
            record is looked up by the keys ``word``, ``entity``, ``score``,
            ``index``, ``start`` and ``end``.

    Groups whose entity list is empty are skipped. Nothing is returned; all
    output is written with ``print``.
    """
    header = f"{'Entity':<15} {'Type':<10} {'Score':<8} {'Index':<5} {'Word':<20} {'Start':<5} {'End':<5}"
    rule = "-" * 70
    # The surface form is shown twice: under "Entity" and again under "Word".
    row_template = "{word:<15} {entity:<10} {score:<8.4f} {index:<5} {word:<20} {start:<5} {end:<5}"

    for key, entities in results.items():
        if not entities:
            continue
        print(f"\n**{key}**\n")
        print(header)
        print(rule)
        for entity in entities:
            print(row_template.format_map(entity))

In [None]:
# Sample input texts shared by the NER and entity-linking cells.
# NOTE: the triple-quoted literal keeps its line breaks and the leading
# indentation of the continuation lines, so they are part of the text that is
# printed and passed to the pipeline.
sentences = ["""Apple est créée le 1er avril 1976 dans le garage de la maison 
            d'enfance de Steve Jobs à Los Altos en Californie par Steve Jobs, Steve Wozniak 
            et Ronald Wayne, puis constituée sous forme de société le 3 janvier 1977 à l'origine 
            sous le nom d'Apple Computer, mais pour ses 30 ans et pour refléter la diversification 
            de ses produits, le mot « computer » est retiré le 9 janvier 2015.
            """]

# Run the NER pipeline on every sample text and show what it found.
for sentence in sentences:
    print(f'Sentence: {sentence}')

    # Keep the raw pipeline output in `results`, then render it as tables.
    results = nlp(sentence)
    print_nicely(results)

## Entity Linking

In [None]:
def add_entity_tags(text, results):
    """Wrap every occurrence of each detected entity in ``[START] ... [END]``.

    All distinct entity surface forms found in ``results`` are tagged in a
    single left-to-right pass over ``text``. When several surface forms could
    match at the same position, the longest one wins, so an entity that is a
    substring of another (e.g. "Steve" vs. "Steve Jobs") never produces nested
    or broken tags, and text inserted by the tagging itself is never
    re-scanned.

    Args:
        text: The text in which entities should be marked.
        results: Mapping from a task name to a list of entity records; only
            the ``word`` key (the entity's surface form) of each record is
            used.

    Returns:
        ``text`` with each occurrence of an entity surface form replaced by
        ``[START] <surface form> [END]``. If there is nothing to tag, ``text``
        is returned unchanged.
    """
    import re  # local import so the function does not depend on another cell

    # Distinct, non-empty surface forms. Empty forms are dropped because an
    # empty pattern would match between every pair of characters.
    words = dict.fromkeys(
        entity['word']
        for ents in results.values()
        for entity in ents
        if entity['word']
    )
    if not words:
        return text

    # Longest alternatives first: regex alternation takes the first branch
    # that matches, so this makes "Steve Jobs" win over "Steve".
    alternatives = sorted(words, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(word) for word in alternatives))

    return pattern.sub(lambda match: f"[START] {match.group(0)} [END]", text)


In [None]:
# Import the necessary modules from the transformers library
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Identifier of the Impresso entity-linking (NEL) model, named once so the
# tokenizer and the model are guaranteed to come from the same checkpoint.
NEL_MODEL_NAME = "impresso-project/nel-mgenre-multilingual"

# Load the tokenizer and the sequence-to-sequence model for entity linking.
# NOTE: this rebinds the notebook-level name `tokenizer`, which was bound to
# the NER tokenizer earlier; re-running the NER pipeline cell after this one
# would pass the NEL tokenizer to it.
tokenizer = AutoTokenizer.from_pretrained(NEL_MODEL_NAME)

# .eval() switches the model to evaluation mode before it is used for generation.
model = AutoModelForSeq2SeqLM.from_pretrained(NEL_MODEL_NAME).eval()


In [None]:
# Run NER on every sample text, then ask the linking model for Wikipedia
# candidates for each distinct entity surface form.
for sentence in sentences:
    # Input text to be processed for named entity recognition
    print(f'Sentence: {sentence}')

    # Run the NER pipeline on the input sentence and store the results
    results = nlp(sentence)

    # Flatten the per-task entity lists into (start, end, surface form) triples
    entities = [
        (entity['start'], entity['end'], entity['word'])
        for ents in results.values()
        for entity in ents
    ]

    # Surface forms that were already linked; each distinct form is linked once
    already_done = set()

    for start, end, word in entities:
        if word in already_done:
            continue
        already_done.add(word)

        # Tag only the detected mention when the reported offsets really point
        # at this surface form in `sentence`. A plain str.replace would also
        # tag every other occurrence of the word, including occurrences inside
        # longer entities (e.g. "Apple" inside "Apple Computer").
        offsets_match = (
            isinstance(start, int)
            and isinstance(end, int)
            and 0 <= start <= end
            and sentence[start:end] == word
        )
        if offsets_match:
            entity_text = f"{sentence[:start]}[START] {word} [END]{sentence[end:]}"
        else:
            # Offsets are missing or do not line up with the raw text: fall
            # back to tagging by surface form.
            entity_text = sentence.replace(word, f"[START] {word} [END]")
        print(f"\nEntity: {word}, Tagged Text: {entity_text}\n")

        # Generate candidate Wikipedia links for the tagged text
        # (beam search with 5 beams, returning all 5 sequences)
        outputs = model.generate(
            **tokenizer([entity_text], return_tensors="pt"),
            num_beams=5,
            num_return_sequences=5
        )

        # Decode the generated output to get the Wikipedia links
        wikipedia_links = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print(f"\nEntity: {word}, Wikipedia Links: {wikipedia_links}")


In [None]:
from utils import get_wikipedia_page_props

# Run NER on every sample text, ask the linking model for Wikipedia candidates
# for each distinct entity surface form, and look up a Wikidata QID for every
# candidate.
for sentence in sentences:
    # Input text to be processed for named entity recognition
    print(f'Sentence: {sentence}')

    # Run the NER pipeline on the input sentence and store the results
    results = nlp(sentence)

    # Flatten the per-task entity lists into (start, end, surface form) triples
    entities = [
        (entity['start'], entity['end'], entity['word'])
        for ents in results.values()
        for entity in ents
    ]

    # Surface forms that were already linked; each distinct form is linked once
    already_done = set()

    for start, end, word in entities:
        if word in already_done:
            continue
        already_done.add(word)

        # Tag only the detected mention when the reported offsets really point
        # at this surface form in `sentence`. A plain str.replace would also
        # tag every other occurrence of the word, including occurrences inside
        # longer entities (e.g. "Apple" inside "Apple Computer").
        offsets_match = (
            isinstance(start, int)
            and isinstance(end, int)
            and 0 <= start <= end
            and sentence[start:end] == word
        )
        if offsets_match:
            entity_text = f"{sentence[:start]}[START] {word} [END]{sentence[end:]}"
        else:
            # Offsets are missing or do not line up with the raw text: fall
            # back to tagging by surface form.
            entity_text = sentence.replace(word, f"[START] {word} [END]")
        print(f"\nEntity: {word}, Tagged Text: {entity_text}\n")

        # Generate candidate Wikipedia links for the tagged text
        # (beam search with 5 beams, returning all 5 sequences)
        outputs = model.generate(
            **tokenizer([entity_text], return_tensors="pt"),
            num_beams=5,
            num_return_sequences=5
        )

        # Decode the generated output to get the Wikipedia links
        wikipedia_links = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print(f"\nEntity: {word}, Wikipedia Links: {wikipedia_links}")

        # Retrieve and print the Wikidata QID for each Wikipedia link
        for wikipedia_link in wikipedia_links:
            qid = get_wikipedia_page_props(wikipedia_link)
            print(f"  Wikidata: {wikipedia_link} -> {qid}")
