In [None]:
!pip install spacy

In [6]:
import spacy
from tqdm.auto import tqdm

# Supported two-letter language codes mapped to the spaCy pipeline package used for each.
# Every package name has the form "<code>_core_<genre>_sm"; only the Chinese and
# English entries use the "web" genre, all remaining entries use "news".
_WEB_GENRE_CODES = ('zh', 'en')

LANGUAGES = {
    code: f"{code}_core_{'web' if code in _WEB_GENRE_CODES else 'news'}_sm"
    for code in (
        'ca',  # Catalan
        'zh',  # Chinese
        'hr',  # Croatian
        'da',  # Danish
        'nl',  # Dutch
        'en',  # English
        'fi',  # Finnish
        'fr',  # French
        'de',  # German
        'el',  # Greek
        'it',  # Italian
        'ja',  # Japanese
        'ko',  # Korean
        'lt',  # Lithuanian
        'mk',  # Macedonian
        'nb',  # Norwegian Bokmål
        'pl',  # Polish
        'pt',  # Portuguese
        'ro',  # Romanian
        'ru',  # Russian
        'es',  # Spanish
        'sv',  # Swedish
    )
}

# spaCy pipelines already loaded in this kernel, keyed by package name, so that
# repeated calls do not pay the (slow) model-loading cost again.
_LOADED_PIPELINES = {}


def _get_pipeline(model_name):
    """Return the spaCy pipeline for ``model_name``, downloading and loading it at most once."""
    nlp = _LOADED_PIPELINES.get(model_name)
    if nlp is None:
        if not spacy.util.is_package(model_name):
            spacy.cli.download(model_name)
            # The package was installed after the interpreter started; refresh the
            # import system's finder caches so the following load can see it.
            import importlib
            importlib.invalidate_caches()
        nlp = spacy.load(model_name)
        _LOADED_PIPELINES[model_name] = nlp
    return nlp


def multi_lang_ner(text, lang):
    """Extract named entities from ``text`` with the spaCy pipeline registered for ``lang``.

    Args:
        text: The string to analyse.
        lang: A language code; must be a key of ``LANGUAGES`` to produce results.

    Returns:
        A list with one dict per entity, in document order, with the keys
        ``'text'`` (entity string), ``'type'`` (entity label), ``'start_pos'``
        and ``'end_pos'`` (character offsets into ``text``; the end offset is
        exclusive). An empty list is returned when ``lang`` is not in
        ``LANGUAGES``.
    """
    model_name = LANGUAGES.get(lang)
    if model_name is None:
        # Unsupported language: deliberately best-effort, no entities rather than an error.
        return []

    doc = _get_pipeline(model_name)(text)

    return [
        {
            'text': ent.text,
            'type': ent.label_,
            'start_pos': ent.start_char,
            'end_pos': ent.end_char,
        }
        for ent in doc.ents
    ]


In [8]:
# Run the extractor on an English sample sentence and print the raw entity list.
lang = 'en'
text = 'Khalid Kahloot is from Budapest, but he works in New York for Cactus Communications.'
entities = multi_lang_ner(text=text, lang=lang)
print(entities)


  0%|          | 0/22 [00:00<?, ?it/s]


[{'text': 'Khalid Kahloot', 'type': 'PERSON', 'start_pos': 0, 'end_pos': 14}, {'text': 'Budapest', 'type': 'GPE', 'start_pos': 23, 'end_pos': 31}, {'text': 'New York', 'type': 'GPE', 'start_pos': 49, 'end_pos': 57}]


# Bonus: visualizing the extracted entities with displaCy

In [10]:
from spacy import displacy

def visualize_ner(text, lang):
    """Display ``text`` in the notebook with its named entities highlighted.

    The entities come from ``multi_lang_ner()`` and are handed to displaCy in
    its "manual" input format, so the pipeline is not loaded and the text is
    not parsed a second time just for rendering.

    Args:
        text: The string to analyse and display.
        lang: A language code; must be a key of ``LANGUAGES``.

    Returns:
        None. The markup is displayed directly in the Jupyter output area.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANGUAGES``.
    """
    if lang not in LANGUAGES:
        # multi_lang_ner() would quietly return [] for this input; a visualisation
        # request for an unknown language should fail loudly and before any work.
        raise KeyError(f'Unsupported language code: {lang!r}')

    entities = multi_lang_ner(text, lang)

    # displaCy's manual "ent" format: the raw text plus character-offset spans.
    # multi_lang_ner() returns entities in document order, which displaCy expects.
    parsed = {
        'text': text,
        'ents': [
            {'start': ent['start_pos'], 'end': ent['end_pos'], 'label': ent['type']}
            for ent in entities
        ],
        'title': None,
    }

    # Custom colours for the most common labels; other labels use displaCy's defaults.
    # No 'ents' filter is passed: every entity found above should be highlighted,
    # which is what displaCy does when no filter is given.
    colors = {'PERSON': 'yellow', 'ORG': 'orange', 'GPE': 'pink'}
    displacy.render(parsed, style='ent', manual=True, options={'colors': colors}, jupyter=True)


In [12]:
# Render the English sample sentence with its entities highlighted inline.
lang = 'en'
text = 'Khalid Kahloot is from Budapest, but he works in New York for Cactus Communications.'
visualize_ner(text=text, lang=lang)


  0%|          | 0/22 [00:00<?, ?it/s]
