# Named-entity recognition with spaCy — année 1903

In [7]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle spaCy pour le français : `python -m spacy download fr_core_news_sm`

In [8]:
# Load the small French spaCy pipeline; it must have been downloaded beforehand
# (python -m spacy download fr_core_news_sm).
nlp = spacy.load('fr_core_news_sm')

# Exemple sur un corpus de Spacy

In [9]:
def test():
    """Run the pipeline on spaCy's French example sentences and print the result.

    For every sentence, print its entities as "text (LABEL)" pairs, or a
    message saying that the sentence contains no entities.
    """
    for sample in sentences:
        parsed = nlp(sample)
        labelled = [f"{span.text} ({span.label_})" for span in parsed.ents]
        if not labelled:
            print(f"'{parsed.text}' contains no entities")
            continue
        print(f"'{parsed.text}' contains the following entities: {', '.join(labelled)}")

In [10]:
def search_people(n=1000000):
    """Print the 20 most frequent person entities found in the 1903 corpus.

    Reads "1903_keywords.txt", keeps its first ``n`` characters, runs the
    spaCy pipeline on them and counts every entity labelled "PER" whose text
    is longer than 3 characters.

    Args:
        n: Number of leading characters of the file to analyse.
    """
    # The context manager closes the file handle even if reading fails.
    with open("1903_keywords.txt", encoding='utf-8') as corpus:
        text = corpus.read()[:n]
    doc = nlp(text)
    people = defaultdict(int)
    for ent in doc.ents:
        # Only entities tagged PER with more than 3 characters are counted.
        if ent.label_ == "PER" and len(ent.text) > 3:
            people[ent.text] += 1
    # sorted() is stable, so entities with equal counts keep first-seen order.
    sorted_people = sorted(people.items(), key=lambda kv: kv[1], reverse=True)
    for person, freq in sorted_people[:20]:
        print(f"{person} appears {freq} times in the corpus")

In [11]:
def search_locations(n=1000000):
    """Print the 20 most frequent location entities found in the 1903 corpus.

    Reads "1903_keywords.txt", keeps its first ``n`` characters, runs the
    spaCy pipeline on them and counts every entity labelled "LOC" whose text
    is longer than 3 characters.

    Args:
        n: Number of leading characters of the file to analyse.
    """
    # The context manager closes the file handle even if reading fails.
    with open("1903_keywords.txt", encoding='utf-8') as corpus:
        text = corpus.read()[:n]
    doc = nlp(text)
    location_counts = defaultdict(int)
    for ent in doc.ents:
        # Only entities tagged LOC with more than 3 characters are counted.
        if ent.label_ == "LOC" and len(ent.text) > 3:
            location_counts[ent.text] += 1
    # sorted() is stable, so entities with equal counts keep first-seen order.
    sorted_location = sorted(location_counts.items(), key=lambda kv: kv[1], reverse=True)
    # A distinct loop name avoids rebinding the counting dict.
    for place, freq in sorted_location[:20]:
        print(f"{place} appears {freq} times in the corpus")

In [12]:
def search_organisation(n=1000000):
    """Print the 20 most frequent organisation entities found in the 1903 corpus.

    Reads "1903_keywords.txt", keeps its first ``n`` characters, runs the
    spaCy pipeline on them and counts every entity labelled "ORG" whose text
    is longer than 3 characters.

    Args:
        n: Number of leading characters of the file to analyse.
    """
    # The context manager closes the file handle even if reading fails.
    with open("1903_keywords.txt", encoding='utf-8') as corpus:
        text = corpus.read()[:n]
    doc = nlp(text)
    organisation_counts = defaultdict(int)
    for ent in doc.ents:
        # Only entities tagged ORG with more than 3 characters are counted.
        if ent.label_ == "ORG" and len(ent.text) > 3:
            organisation_counts[ent.text] += 1
    # sorted() is stable, so entities with equal counts keep first-seen order.
    sorted_organisation = sorted(organisation_counts.items(), key=lambda kv: kv[1], reverse=True)
    # A distinct loop name avoids rebinding the counting dict.
    for org_name, freq in sorted_organisation[:20]:
        print(f"{org_name} appears {freq} times in the corpus")

In [13]:
# Show the entity labels known to the NER component.
# The `nlp.entity` shortcut only exists in spaCy v2; `get_pipe("ner")` works in v2 and v3.
nlp.get_pipe("ner").labels

('LOC', 'MISC', 'ORG', 'PER')

## NER sur le corpus des bulletins communaux

### Trouver les personnes, organisations et lieux les plus fréquents.

In [14]:
# Print the 20 most frequent PER entities from the first 1,000,000 characters of 1903_keywords.txt.
search_people()

renvoi appears 19 times in the corpus
hectare appears 18 times in the corpus
roger grimberghe appears 18 times in the corpus
loyer appears 15 times in the corpus
lemonnier appears 14 times in the corpus
voulez appears 14 times in the corpus
brabandt appears 14 times in the corpus
grimard hallet appears 13 times in the corpus
grimard appears 13 times in the corpus
verheven bosquet appears 13 times in the corpus
solde appears 13 times in the corpus
vandenbosch lemonnier appears 10 times in the corpus
hubert appears 10 times in the corpus
michel gudule appears 9 times in the corpus
lorsqu appears 9 times in the corpus
wauwermans appears 9 times in the corpus
theodor locht appears 8 times in the corpus
brabandt theodor locht appears 8 times in the corpus
saint gilles appears 8 times in the corpus
brabandt locht appears 6 times in the corpus


In [None]:
# Load the text, keeping only the first n characters.

n = 1000000
# The context manager closes the file handle once the text has been read.
with open("../data/all.txt", encoding='utf-8') as f:
    text = f.read()[:n]

In [None]:
%%time
# Process the text: run the spaCy pipeline on the loaded characters.

doc = nlp(text)

In [None]:
# Print the 20 most frequent LOC entities from the first 1,000,000 characters of 1903_keywords.txt.
search_locations()

In [16]:
# Print the 20 most frequent ORG entities from the first 1,000,000 characters of 1903_keywords.txt.
search_organisation()

crédit supplémentaire francs appears 6 times in the corpus
acte seing appears 5 times in the corpus
legs genechten appears 3 times in the corpus
section police appears 3 times in the corpus
sis schaerbeek appears 3 times in the corpus
treurenberg appears 3 times in the corpus
legs brugmann appears 3 times in the corpus
union syndicale appears 3 times in the corpus
acte seing privé contenance appears 2 times in the corpus
conrardy conrardy appears 2 times in the corpus
legs appears 2 times in the corpus
école normale appears 2 times in the corpus
hospice pachéco appears 2 times in the corpus
commission spéciale appears 2 times in the corpus
legs dauwé appears 2 times in the corpus
réparation église appears 2 times in the corpus
crédit supplémentaire exercice appears 2 times in the corpus
groupe socialiste appears 2 times in the corpus
locht appears 2 times in the corpus
woluwe saint appears 2 times in the corpus
