In [None]:
# Put the parent directory first on the module search path; presumably this is
# what lets the `src.*` imports in the next cell resolve when the notebook is
# run from its own sub-directory.
import sys
sys.path.insert(0, '../')

In [None]:
import json
from itertools import combinations

import spacy
from spacy.pipeline import EntityRuler

from src.config import PATH_RESOURCES, PATH_MODEL
from src.geo_data import load_geonames, load_rest_countries, load_cbs_municipalities

In [None]:
# Source tables. `geonames` is queried below like a pandas DataFrame with
# `country_code`, `admin_name1` and `alternate_name` columns.
geonames = load_geonames()
cbs_data = load_cbs_municipalities(alts_json='../alts_places_nl.json')

# NOTE: `exclude` is referenced by name from inside the first query string
# (`@exclude`), so it must keep this exact name.
exclude = ['GB', 'NL']

# One collection of names per entity label. Insertion order is the order in
# which the labels are later handed to the entity ruler.
topo = dict(
    countries=load_rest_countries(language='nl', alts_json='../alts_countries.json'),
    places=geonames.query("country_code not in @exclude").alternate_name.values,
    places_uk=geonames.query("country_code == 'GB'").alternate_name.values,
    places_nl=geonames.query("country_code == 'NL' and admin_name1 != 'Friesland'").alternate_name.values,
    places_fr=geonames.query("country_code == 'NL' and admin_name1 == 'Friesland'").alternate_name.values,
)

# Wrap every name in a {'label', 'pattern'} dict, grouped by label.
geo_ents = {
    label: [{'label': label, 'pattern': name} for name in names]
    for label, names in topo.items()
}

In [None]:
# Distinct patterns per label, so that labels can be compared pairwise.
# The dict is named `place_check` because that is the name the rest of the
# notebook reads; the original built it as `places` and then looked up the
# undefined `place_check`, which raised NameError on a fresh kernel.
place_check = {
    key: {ent['pattern'] for ent in ents}
    for key, ents in geo_ents.items()
}

# For every pair of labels, record the patterns they have in common.
problems = dict()
for key1, key2 in combinations(place_check, r=2):
    duplicates = place_check[key1] & place_check[key2]
    if not duplicates:
        continue
    new_key = f"{key1}-{key2}"
    print(new_key, "\n", duplicates, "\n")
    # Set iteration order is not stable between runs; sort so the JSON report
    # is reproducible. `key=str` keeps the sort safe if a non-string value
    # ever ends up among the patterns.
    problems[new_key] = sorted(duplicates, key=str)

# Persist the overlap report next to the other resource files.
with open('../duplicate_places.json', 'w', encoding='utf8') as f:
    json.dump(problems, f)

In [None]:
# Patterns listed under 'places' that also occur under any other label.
# Derived directly from `geo_ents` so this cell does not depend on variables
# left behind by another cell.
place_patterns = {item['pattern'] for item in geo_ents['places']}
other_patterns = {
    item['pattern']
    for key, items in geo_ents.items()
    if key != 'places'
    for item in items
}
removal = place_patterns & other_patterns

# Rebuild the list instead of calling `remove()` while iterating over it:
# removing during iteration shifts the remaining items and skips the element
# that follows each removal, so some overlapping patterns used to survive.
# Rebuilding is also linear and makes the cell safe to re-run.
geo_ents['places'] = [
    item for item in geo_ents['places'] if item['pattern'] not in removal
]

In [None]:
# Load the 'nl' pipeline without its 'ner' component; entities come from the
# rule-based matcher added below instead.
nlp = spacy.load('nl', disable=['ner'])

# Register the patterns one label group at a time, in `geo_ents` order.
ruler = EntityRuler(nlp)
for patterns in geo_ents.values():
    ruler.add_patterns(patterns)

# Attach the ruler to the pipeline and write the assembled model to PATH_MODEL.
nlp.add_pipe(ruler)
nlp.to_disk(PATH_MODEL)