# Named-entity recognition with spaCy

In [1]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle français de spaCy : `python -m spacy download fr_core_news_sm`

In [2]:
# Load the 'fr_core_news_sm' pipeline once; test() and search() below both
# rely on this module-level `nlp` object.
nlp = spacy.load('fr_core_news_sm')

## Fonctions

In [3]:
def test():
    """Run the pipeline on each example sentence and report what it finds.

    For every sentence in ``sentences`` the module-level ``nlp`` pipeline is
    applied and one line is printed: either the list of recognised entities
    formatted as ``text (LABEL)``, or a notice that none were found.
    """
    for sample in sentences:
        parsed = nlp(sample)
        labelled = [f"{span.text} ({span.label_})" for span in parsed.ents]
        # Guard clause: nothing recognised in this sentence.
        if not labelled:
            print(f"'{parsed.text}' contains no entities")
            continue
        print(f"'{parsed.text}' contains the following entities: {', '.join(labelled)}")

In [4]:
def search(n=1000000, path="1850.txt", top=10):
    """Print the most frequent person, location and organisation entities.

    The first ``n`` characters of the file at ``path`` (decoded as latin-1)
    are run through the module-level ``nlp`` pipeline. Entities labelled
    ``PER``, ``LOC`` or ``ORG`` whose text is longer than 3 characters are
    counted. For each label, in that order, the ``top`` most frequent
    entities are printed, most frequent first; entities with equal counts
    keep the order in which they were first seen.

    Args:
        n: Number of leading characters of the file to analyse.
        path: Corpus file to read. Defaults to the original hard-coded file.
        top: Maximum number of entities printed per label.

    Note:
        NOTE(review): spaCy rejects texts longer than ``nlp.max_length``
        (1,000,000 characters by default according to the spaCy docs), so a
        larger ``n`` presumably requires raising that limit first -- confirm.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(path, encoding='latin-1') as corpus:
        text = corpus.read()[:n]
    doc = nlp(text)

    # Labels in the order their rankings are printed.
    labels = ("PER", "LOC", "ORG")
    counts = {label: defaultdict(int) for label in labels}

    # Single pass over the entities instead of one pass per label.
    for ent in doc.ents:
        # Mentions of 3 characters or fewer are ignored
        # (presumably to drop OCR noise -- confirm).
        if ent.label_ in counts and len(ent.text) > 3:
            counts[ent.label_][ent.text] += 1

    for label in labels:
        # sorted() is stable, so ties stay in first-seen order.
        ranked = sorted(counts[label].items(), key=lambda kv: kv[1], reverse=True)
        for name, freq in ranked[:top]:
            print(f"{name} appears {freq} times in the corpus")

## NER sur des données de test

In [5]:
# Sanity check: run NER on the French example sentences imported from spaCy.
test()

'Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars' contains the following entities: Apple (ORG)
'Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs' contains no entities
'San Francisco envisage d'interdire les robots coursiers sur les trottoirs' contains the following entities: San Francisco (LOC)
'Londres est une grande ville du Royaume-Uni' contains the following entities: Londres (LOC), Royaume-Uni (LOC)
'L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe' contains the following entities: ArcelorMittal (MISC), Europe (LOC)
'Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon' contains the following entities: Apple (ORG), HomePod (MISC), Echo (MISC)
'La France ne devrait pas manquer d'électricité cet été, même en cas de canicule' contains the following entities: La France (LOC)
'Nouvelles attaques de Trump contre le maire de Londres' contains the following entities: Trump (LOC), Lo

## NER sur le corpus des bulletins communaux

In [6]:
# Analyse only the first 10,000 characters of the corpus file.
search(n=10000)

Fourniture appears 3 times in the corpus
Construction appears 2 times in the corpus
Isabelle appears 2 times in the corpus
Approbation appears 2 times in the corpus
Autorisation appears 2 times in the corpus
Adjudications appears 1 times in the corpus
Présidence de M appears 1 times in the corpus
Communications de M. le Bourgmestre appears 1 times in the corpus
Rapport appears 1 times in the corpus
M. Téchcvin Hlaes appears 1 times in the corpus
Bruxelles appears 3 times in the corpus
BRUXELLES appears 2 times in the corpus
1 U K appears 1 times in the corpus
Echevins appears 1 times in the corpus
caserne du Petit-Château appears 1 times in the corpus
Panoramas appears 1 times in the corpus
Trois-Trous appears 1 times in the corpus
Quais appears 1 times in the corpus
Canal appears 1 times in the corpus
Parc appears 1 times in the corpus
Conseil appears 9 times in the corpus
COMMUNAL appears 2 times in the corpus
Collège des Bourgmestre appears 2 times in the corpus
Collège appears 2 ti