In [None]:
import sys
# Put the parent directory at the front of the module search path --
# presumably so that the `src` package imported in the next cell resolves when
# the notebook is run from its own directory (TODO confirm the repo layout).
sys.path.insert(0, '../')

In [None]:
import json
from itertools import combinations

import pandas as pd
import spacy
from spacy.pipeline import EntityRuler

from src.config import PATH_RESOURCES, PATH_RESULTS, PATH_MODEL
from src.geo_data import load_geonames, load_rest_countries, load_cbs_municipalities
from src.doc_analysis import get_positives

## Prepare geo entities

In [None]:
# Country names are loaded first because the geonames filter below refers to them.
countries = load_rest_countries(language='nl', alts_json='../alts_countries.json')

# Load the geonames table and, in the same chain, drop every row whose
# alternate_name also occurs among the keys of `countries`.
geonames = load_geonames().query("alternate_name not in @countries.keys()")

# Persist the filtered table as a pickle.
geonames.to_pickle(PATH_RESOURCES / 'geonames/df_geonames.pkl')

In [None]:
# Country codes that are left out of the generic 'places' collection; GB and NL
# rows are split into their own collections below.
exclude = ['GB', 'NL']

# One collection of names per entity label. NL rows are further divided on
# whether admin_name1 equals 'Friesland'.
topo = {
    'countries': countries,
    'places': geonames.query("country_code not in @exclude").alternate_name.values,
    'places_uk': geonames.query("country_code == 'GB'").alternate_name.values,
    'places_nl': geonames.query("country_code == 'NL' and admin_name1 != 'Friesland'").alternate_name.values,
    'places_fr': geonames.query("country_code == 'NL' and admin_name1 == 'Friesland'").alternate_name.values,
}

# Wrap every name in a {'label': ..., 'pattern': ...} dict, grouped by label;
# these lists are what gets handed to the entity ruler further down.
geo_ents = {
    label: [{'label': label, 'pattern': name} for name in names]
    for label, names in topo.items()
}

## Check for duplicates between geo_ent collections

In [None]:
# Collect the distinct pattern strings of each collection.
place_names = {
    key: {entry['pattern'] for entry in entries}
    for key, entries in geo_ents.items()
}

# Compare every pair of collections and record the names they share.
problems = {}
for key1, key2 in combinations(place_names, r=2):
    # Compute the overlap once. Sorting makes the printed report and the JSON
    # file identical from run to run (a set of strings has no stable iteration
    # order across interpreter runs); key=str keeps the sort from failing
    # should a non-string pattern ever slip in.
    duplicates = sorted(place_names[key1] & place_names[key2], key=str)
    if not duplicates:
        continue
    new_key = f"{key1}-{key2}"
    print(new_key, "\n", duplicates, "\n")
    problems[new_key] = duplicates

# The file is opened as UTF-8, so write non-ASCII place names as-is instead of
# as \uXXXX escapes; the parsed JSON content is the same either way.
with open('../duplicate_place_names.json', 'w', encoding='utf8') as f:
    json.dump(problems, f, indent=4, ensure_ascii=False)

## Select geo_ents based on annotation results

In [None]:
# Narrow each collection down to the patterns that were annotated as positive.
for key in topo:
    annotation = PATH_RESULTS / f"df_annotations_{key}.pkl"
    try:
        positives = get_positives(pd.read_pickle(annotation))
    except FileNotFoundError:
        # No annotation file for this collection: leave it unfiltered.
        continue

    # NOTE(review): what `in positives` tests depends on the type returned by
    # get_positives (for a pandas Series it would test the index) -- confirm.
    geo_ents[key] = [item for item in geo_ents[key] if item['pattern'] in positives]

## Create model

In [None]:
# Load the 'nl' pipeline with its 'ner' component disabled.
nlp = spacy.load('nl', disable=['ner'])

# Feed the patterns of every collection, one collection at a time, into a
# single entity ruler.
ruler = EntityRuler(nlp)
for patterns in geo_ents.values():
    ruler.add_patterns(patterns)

# Attach the ruler to the pipeline and save the pipeline to PATH_MODEL.
nlp.add_pipe(ruler)
nlp.to_disk(PATH_MODEL)