In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's `src` package) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
# Batch identifiers; each one is handed to `serialize_batch` in the final cell.
batches = [
    'leeuwarder_courant',
    'telegraaf',
    'trouw',
    'volkskrant',
]

In [None]:
import spacy
import pandas as pd
from itertools import combinations
from spacy.pipeline import EntityRuler
from src.config import PATH_MODEL, PATH_RESOURCES, PARAM
from src.spacy_helpers import serialize_batch
from src.geo_data import load_rest_countries, load_cbs_dutch_cities, parse_wiki_place_lists
from src.geo_data import create_geonames_datasets

In [None]:
# Generate the GeoNames datasets for language code 'nl'.
# NOTE(review): presumably this produces the `df_alt.pkl` / `df_geo.pkl`
# files that the next cell reads from PATH_RESOURCES — confirm in src.geo_data.
create_geonames_datasets(language='nl')

In [None]:
# Raw inputs: the CBS table plus the two pickled GeoNames frames, which are
# joined on their shared `geoname_id` column.
cbs_data = load_cbs_dutch_cities()
df_alt = pd.read_pickle(PATH_RESOURCES / 'df_alt.pkl')
df_geo = pd.read_pickle(PATH_RESOURCES / 'df_geo.pkl')
geonames_data = df_alt.merge(df_geo, on='geoname_id')

# Country codes filtered out of the generic GeoNames 'places' list
# (referenced as `@exclude` inside the query string below).
exclude = ['GB', 'NL']

# One collection of name strings per entity label. Key order is kept as-is
# because later cells iterate over this mapping in insertion order.
topo = dict(
    countries=load_rest_countries(language='nl', alts_json='alts_countries.json'),
    places=geonames_data.query("country_code not in @exclude")['alternate_name'].values,
    places_uk=parse_wiki_place_lists('https://nl.wikipedia.org/wiki/Lijst_van_plaatsen_in_Engeland'),
    places_nl=cbs_data.query("provincie != 'Friesland'")['gemeentenaam'].values,
    places_fr=cbs_data.query("provincie == 'Friesland'")['gemeentenaam'].values,
)

# Turn every name into a {'label': ..., 'pattern': ...} dict, grouped by
# label; these lists are what the EntityRuler receives later on.
geo_ents = {
    label: [{'label': label, 'pattern': name} for name in names]
    for label, names in topo.items()
}

In [None]:
# Distinct pattern strings per label, so labels can be compared as sets.
place_check = {
    key: {entry['pattern'] for entry in entries}
    for key, entries in geo_ents.items()
}

# Print the shared patterns for every pair of labels that overlap.
for key1, key2 in combinations(place_check, r=2):
    overlap = place_check[key1] & place_check[key2]
    if overlap:
        print(f"{key1}-{key2}\n", overlap, "\n")

In [None]:
# Patterns that the generic 'places' label shares with any other label.
# They are dropped from 'places' so each of them stays only under the
# other (non-'places') label.
removal = set()
for key in place_check:
    if key == 'places':
        continue
    removal |= place_check[key] & place_check['places']

# Rebuild the list instead of calling `.remove()` while looping over it:
# removing during iteration shifts the following items one slot to the left,
# so the element right after each removed one is never examined and some
# overlapping patterns would survive. Filtering is also linear rather than
# quadratic in the size of the list.
geo_ents['places'] = [
    item for item in geo_ents['places'] if item['pattern'] not in removal
]

In [None]:
# Load the 'nl' pipeline with its 'ner' component disabled, then attach an
# EntityRuler that carries the gazetteer patterns of every label.
nlp = spacy.load('nl', disable=['ner'])
ruler = EntityRuler(nlp)
for patterns in geo_ents.values():
    ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

# Persist the assembled pipeline to the configured model directory.
nlp.to_disk(PATH_MODEL)

In [None]:
# Call `serialize_batch` once per entry in `batches`, passing the `nlp` pipeline.
# NOTE(review): presumably this runs each batch's texts through `nlp` and
# stores the result on disk — confirm in src.spacy_helpers.
for batch in batches:
    serialize_batch(nlp, batch)