<font size="6">1. Create identifier list</font> 

In [None]:
#Load packages
import pandas as pd
import numpy as np
import random
import re
import timeit
import io
import os
from datetime import datetime

In [None]:
#Show all outputs
#With ast_node_interactivity set to "all", every expression statement in a cell is displayed, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#Progress bar
from tqdm.auto import tqdm  # for notebooks
#Registers the progress_apply methods on pandas objects; later cells in this notebook call progress_apply
tqdm.pandas()

# Non-identifiers

## Danish orthographic dictionary

In [None]:
#Danish dictionary: identifiers that are also ordinary Danish words are ambiguous and get flagged later
#Only the second ';'-separated field is read; anything after a '#' is treated as a comment
words = pd.read_csv(
    'Databases/RO2012 fuldformer 2019.txt',
    sep=';',
    comment='#',
    keep_default_na=False,
    usecols=[1],
    names=['word'],
)

In [None]:
#Lowercase the words and keep each distinct word once
words = words.assign(word=words['word'].str.lower()).drop_duplicates(subset=['word'])

## Product names

In [None]:
#Product (drug) names: identifiers that coincide with a product name are ambiguous and get flagged later
#Only the second column of the first sheet is read
drugs = pd.read_excel(
    'Databases/ListeOverGodkendteLaegemidler.xlsx',
    sheet_name=0,
    keep_default_na=False,
    comment='#',
    usecols=[1],
    names=['drug'],
)

In [None]:
#Cleaning: Removing empty lines
#NOTE(review): the row labels below are hard-coded for one specific version of the spreadsheet - verify them
#whenever the file is updated. Re-running this cell raises a KeyError because the labels are already gone.
drugs.drop([13783, 13784], inplace=True)

In [None]:
#Lowercase the product names and keep each distinct name once
drugs = drugs.assign(drug=drugs['drug'].str.lower()).drop_duplicates(subset=['drug'])

## Medical abbreviations

In [None]:
#Load medical abbreviations (sheet "All appended", no header row), lowercase them and drop repeats
abb = pd.read_excel(
    'Databases/Abbreviations.xlsx',
    sheet_name="All appended",
    header=None,
)
abb.columns = ['abb']
abb['abb'] = abb['abb'].str.lower()
abb = abb.drop_duplicates(subset=['abb'])

## SNOMED CT

In [None]:
#Load snomed descriptions: the 'term' column of three tab-separated description snapshot files
#quoting=3 is csv.QUOTE_NONE, so quote characters inside terms are kept as plain text
snomed_files = [
    'Databases/sct2_Description_Snapshot-da_DK1000005_20200930.txt',
    'Databases/sct2_Description_Snapshot-en_DK1000005_20200930.txt',
    'Databases/sct2_Description_Snapshot-en_INT_20200731.txt',
]
#pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like append, it keeps each file's own row labels
snomed = pd.concat(
    [
        pd.read_csv(path, sep='\t', keep_default_na=False, quoting=3, usecols=['term'])
        for path in snomed_files
    ]
)

In [None]:
#Lowercase the terms, keep each distinct term once and renumber the rows from 0
snomed = (
    snomed.assign(term=snomed['term'].str.lower())
    .drop_duplicates(subset=['term'])
    .reset_index(drop=True)
)

## The Danish healthcare system’s classification system

In [None]:
#Load sks descriptions from the fixed-width file; only the 120-character field at position 5 is kept
sks = pd.read_fwf(
    'Databases/SKScomplete.txt',
    encoding='cp1252',
    widths=[3,20,8,8,8,120,3,3,3,3,3,1,2,2,1,25,1],
    usecols=[5],
    names=['kodetekst'],
)

In [None]:
#Lowercase the code texts, keep each distinct text once and renumber the rows from 0
sks = (
    sks.assign(kodetekst=sks['kodetekst'].str.lower())
    .drop_duplicates(subset=['kodetekst'])
    .reset_index(drop=True)
)

# Identifiers and rate of occurrence in population

## Names

### Last names

In [None]:
#Load last names and their counts (first two tab-separated fields)
#keep_default_na=False keeps names such as "NA" or "null" as text instead of turning them into missing values
last = pd.read_csv(
    'Databases/efternavne 2021.txt',
    encoding='cp1252',
    comment='#',
    keep_default_na=False,
    sep='\t',
    usecols=[0,1],
    names=['entity','count'],
    dtype={'entity': 'object'},
)

In [None]:
last

In [None]:
#Cleaning: Removing 000, 'Ej efternavn', and empty names
#NOTE(review): the row labels below are hard-coded for one specific version of the file - verify them
#whenever the file is updated. Re-running this cell raises a KeyError because the labels are already gone.
last.drop([35, 532, 905, 19884, 87391, 195990, 195997, 216509, 286667], inplace=True)

### Male names

In [None]:
#Load male first names and their counts (first two tab-separated fields)
#keep_default_na=False keeps names such as "NA" or "null" as text instead of turning them into missing values
male = pd.read_csv(
    'Databases/fornavne 2021 - mænd.txt',
    encoding='cp1252',
    comment='#',
    keep_default_na=False,
    sep='\t',
    usecols=[0,1],
    names=['entity','count'],
    dtype={'entity': 'object'},
)

In [None]:
male

In [None]:
#Cleaning: Removing 000, and empty names
#NOTE(review): the row labels below are hard-coded for one specific version of the file - verify them
#whenever the file is updated. Re-running this cell raises a KeyError because the labels are already gone.
male.drop([545, 102, 58924], inplace=True)

### Female names

In [None]:
#Load female first names and their counts (first two tab-separated fields)
#keep_default_na=False keeps names such as "NA" or "null" as text instead of turning them into missing values
female = pd.read_csv(
    'Databases/fornavne 2021 - kvinder.txt',
    encoding='cp1252',
    comment='#',
    keep_default_na=False,
    sep='\t',
    usecols=[0,1],
    names=['entity','count'],
    dtype={'entity': 'object'},
)

In [None]:
female

In [None]:
#Cleaning: Removing 000, and empty names
#NOTE(review): the row labels below are hard-coded for one specific version of the file - verify them
#whenever the file is updated. Re-running this cell raises a KeyError because the labels are already gone.
female.drop([654, 121, 28485], inplace=True)

### All names

In [None]:
#Stack last, male and female names into one table (the original row labels are kept)
frames = [last, male, female]
names = pd.concat(frames, axis=0)

In [None]:
#Lowercase all names so the same name from different lists compares equal
names = names.assign(entity=names['entity'].str.lower())

In [None]:
#Merge duplicated names (e.g. a name that is both a first and a last name) by adding up their counts,
#then order by count, most frequent first
#.sum() is the built-in aggregation; .apply(sum) relied on pandas aliasing the Python builtin, which is deprecated
names = (
    names.groupby(['entity'], dropna=False)['count']
    .sum()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False, ignore_index=True)
)

In [None]:
#Removing single-letter names
#names = names[names['entity'].str.len()>1]

In [None]:
names

In [None]:
#Total of all name counts; denominator for the population probability of a name
names_total = names.loc[:, 'count'].sum()

In [None]:
#Share of the total name count that each name accounts for
names['prob_pop'] = names['count'].div(names_total)

In [None]:
#Label every row of this table as a person name
names = names.assign(tag='NAME')

In [None]:
names.head()

## Locations

In [None]:
#Address register: read only the street and place-name columns, all as text
#With keep_default_na=False and na_values='', only empty fields become missing values
use_cols = [
    'vejnavn',
    'adresseringsvejnavn',
    'supplerendebynavn',
    'postnrnavn',
    'kommunenavn',
    'regionsnavn',
    'landsdelsnavn',
]
cities_streets = pd.read_csv(
    'Databases/adresser.csv',
    sep=',',
    usecols=use_cols,
    keep_default_na=False,
    na_values='',
    dtype='str',
)

In [None]:
cities_streets

In [None]:
#Lowercase every address component so that names compare case-insensitively
for column in ['vejnavn', 'adresseringsvejnavn', 'supplerendebynavn', 'postnrnavn', 'kommunenavn', 'regionsnavn', 'landsdelsnavn']:
    cities_streets[column] = cities_streets[column].str.lower()
cities_streets

In [None]:
#Removal of duplicates per row

In [None]:
#Blank 'supplerendebynavn' where it repeats 'postnrnavn' or 'kommunenavn' on the same row,
#so the same place name is not counted twice for one address
cities_streets.loc[
    cities_streets['supplerendebynavn'].eq(cities_streets['postnrnavn'])
    | cities_streets['supplerendebynavn'].eq(cities_streets['kommunenavn']),
    'supplerendebynavn',
] = np.nan

In [None]:
#Blank 'postnrnavn' where it repeats 'kommunenavn' on the same row
cities_streets.loc[
    cities_streets['postnrnavn'].eq(cities_streets['kommunenavn']),
    'postnrnavn',
] = np.nan

In [None]:
#Blank 'kommunenavn' where it repeats 'landsdelsnavn' on the same row
cities_streets.loc[
    cities_streets['kommunenavn'].eq(cities_streets['landsdelsnavn']),
    'kommunenavn',
] = np.nan

In [None]:
#Number of address rows; denominator for the probabilities of cities and streets
total_cities_streets = cities_streets.shape[0]

In [None]:
#Number of address rows per 'supplerendebynavn' value (missing values are left out)
supplerendebynavn = (
    cities_streets.groupby(['supplerendebynavn'], dropna=True)
    .size()
    .to_frame(name='count')
)

In [None]:
#Number of address rows per 'postnrnavn' value (missing values are left out)
postnrnavn = (
    cities_streets.groupby(['postnrnavn'], dropna=True)
    .size()
    .to_frame(name='count')
)

In [None]:
#Number of address rows per 'kommunenavn' value (missing values are left out)
kommunenavn = (
    cities_streets.groupby(['kommunenavn'], dropna=True)
    .size()
    .to_frame(name='count')
)

In [None]:
#Number of address rows per 'regionsnavn' value (missing values are left out)
regionsnavn = (
    cities_streets.groupby(['regionsnavn'], dropna=True)
    .size()
    .to_frame(name='count')
)

In [None]:
#Number of address rows per 'landsdelsnavn' value (missing values are left out)
landsdelsnavn = (
    cities_streets.groupby(['landsdelsnavn'], dropna=True)
    .size()
    .to_frame(name='count')
)

In [None]:
#Stack the per-column place-name counts; the place names stay in the index
frames = [supplerendebynavn, postnrnavn, kommunenavn, regionsnavn, landsdelsnavn]
cities = pd.concat(frames, axis=0)

In [None]:
#Move the place names out of the index into a regular column called 'entity'
cities = cities.reset_index().rename(columns={'index': 'entity'})

In [None]:
#The same place name can come from several columns (e.g. københavn as municipality x times and as city y times),
#so the counts are added up per name (x+y) and ordered with the most frequent first
#.sum() is the built-in aggregation; .apply(sum) relied on pandas aliasing the Python builtin, which is deprecated
cities = (
    cities.groupby(['entity'], dropna=False)['count']
    .sum()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False, ignore_index=True)
)

In [None]:
#Share of all address rows on which the place name occurs
cities['prob_pop'] = cities['count'].div(total_cities_streets)

In [None]:
#Label every row of this table as a city
cities = cities.assign(tag='CITY')

In [None]:
cities

## Streets

In [None]:
cities_streets

In [None]:
#Blank 'vejnavn' where it repeats 'adresseringsvejnavn' on the same row, so the street is counted once
cities_streets.loc[
    cities_streets['vejnavn'].eq(cities_streets['adresseringsvejnavn']),
    'vejnavn',
] = np.nan

In [None]:
cities_streets

In [None]:
#Number of address rows per 'vejnavn' value (missing values are left out)
vejnavn = (
    cities_streets.groupby(['vejnavn'], dropna=True)
    .size()
    .to_frame(name='count')
)

In [None]:
#Number of address rows per 'adresseringsvejnavn' value (missing values are left out)
adresseringsvejnavn = (
    cities_streets.groupby(['adresseringsvejnavn'], dropna=True)
    .size()
    .to_frame(name='count')
)

In [None]:
#Stack the two street-name count tables; the street names stay in the index
frames = [vejnavn, adresseringsvejnavn]
streets = pd.concat(frames, axis=0)

In [None]:
#Move the street names out of the index into a regular column called 'entity'
streets = streets.reset_index().rename(columns={'index': 'entity'})

In [None]:
#The same street name can come from both 'vejnavn' and 'adresseringsvejnavn',
#so the counts are added up per name and ordered with the most frequent first
#.sum() is the built-in aggregation; .apply(sum) relied on pandas aliasing the Python builtin, which is deprecated
streets = (
    streets.groupby(['entity'], dropna=False)['count']
    .sum()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False, ignore_index=True)
)

In [None]:
#Share of all address rows on which the street name occurs
streets['prob_pop'] = streets['count'].div(total_cities_streets)

In [None]:
#Label every row of this table as a street
streets = streets.assign(tag='STREET')

In [None]:
streets

## Append all identifiers

In [None]:
#One table with all identifiers: names, cities and streets (row labels repeat across the three sources)
frames = [names, cities, streets]
entities = pd.concat(frames, axis=0)

In [None]:
#The raw counts are dropped here; only prob_pop is carried forward
entities = entities.drop(columns='count')

In [None]:
entities.sort_values(by='prob_pop')

# Ambiguous identifiers

## Ambiguous with multiple types

In [None]:
#An entity that appears under more than one tag (e.g. both NAME and CITY) is ambiguous; keep all of its rows
ambiguous = entities[entities.duplicated(subset='entity', keep=False)]

In [None]:
ambiguous.sort_values(by='entity',inplace=False)

In [None]:
#Collect, per entity, the list of tags it occurs under
#Must run before the cell that overwrites 'ambiguous' with the summed probabilities
tag_series = (
    ambiguous.groupby(['entity'], dropna=False)['tag']
    .apply(list)
    .reset_index(name='tag')
    .loc[:, 'tag']
)

In [None]:
#Collapse each ambiguous entity to one row whose probability is the sum over its tags
ambiguous = (
    ambiguous.groupby(['entity'], dropna=False)['prob_pop']
    .apply(sum)
    .reset_index(name='prob_pop')
)

In [None]:
#Attach the tag lists; both tables come from a groupby on 'entity' followed by reset_index, so the rows line up
ambiguous = ambiguous.assign(tag=tag_series)

In [None]:
ambiguous

In [None]:
#Remove every entity that occurs more than once; those now live in 'ambiguous'
entities = entities.drop_duplicates(subset='entity', keep=False).copy()

In [None]:
#Wrap each single tag in a list so it has the same shape as the tag lists of the ambiguous entities
#Values that already are lists are kept as they are, so re-running the cell does not nest them ([['NAME']])
entities['tag'] = entities['tag'].progress_apply(lambda x: x if isinstance(x, list) else [x])

In [None]:
entities

In [None]:
#CONCAT AMBIGUOUS AND ENTITIES

In [None]:
#Recombine the single-tag entities with the collapsed multi-tag entities
frames = [entities, ambiguous]
all_tags = pd.concat(frames, axis=0)

In [None]:
all_tags.sort_values(by='prob_pop',inplace=False, ascending=False)

## Ambiguous with non-identifier

In [None]:
class Trie():
    """
    Source: https://stackoverflow.com/questions/42742810/speed-up-millions-of-regex-replacements-in-python-3/42789508#42789508
    
    Regex::Trie in Python. Creates a Trie out of a list of words. The trie can be exported to a Regex pattern.
    The corresponding Regex should match much faster than a simple Regex union.
    """

    def __init__(self):
        #Nested dicts keyed by character; the key '' marks that a stored word ends at that node
        self.data = {}

    def add(self, word):
        """Insert word into the trie, one level per character."""
        ref = self.data
        for char in word:
            ref = ref.setdefault(char, {})
        ref[''] = 1

    def dump(self):
        """Return the nested-dict representation of the trie."""
        return self.data

    def quote(self, char):
        """Escape char so that it is matched literally inside the pattern."""
        return re.escape(char)

    def _pattern(self, pData):
        """
        Build the regex fragment for the sub-trie pData.

        Returns None for a leaf (a node that only holds the end-of-word marker),
        otherwise a string matching every word suffix stored below this node.
        """
        data = pData
        if "" in data and len(data) == 1:
            return None

        alt = []  #alternatives that continue with further characters
        cc = []   #single characters after which a word ends (emitted as a character class)
        q = 0     #set to 1 when a stored word ends at this node, making the remainder optional
        for char in sorted(data):
            if isinstance(data[char], dict):
                recurse = self._pattern(data[char])
                #Explicit leaf check instead of a bare except, so real errors are no longer swallowed
                if recurse is None:
                    cc.append(self.quote(char))
                else:
                    alt.append(self.quote(char) + recurse)
            else:
                q = 1
        cconly = not alt

        if len(cc) > 0:
            if len(cc) == 1:
                alt.append(cc[0])
            else:
                alt.append('[' + ''.join(cc) + ']')

        if len(alt) == 1:
            result = alt[0]
        else:
            result = "(?:" + "|".join(alt) + ")"

        if q:
            if cconly:
                result += "?"
            else:
                result = "(?:%s)?" % result
        return result

    def pattern(self):
        """Return the regex pattern (as a string) that matches every word added to the trie."""
        return self._pattern(self.dump())

In [None]:
def print_screen_terminal(string):
    """
    Emit a timestamped message twice: through print() and directly to file descriptor 1.

    The direct write presumably makes the message also show up in the terminal that
    runs the kernel, where print() output of a notebook cell is not visible.
    """
    stamp = datetime.now().strftime("[%d/%m/%Y %H:%M:%S]")
    line = stamp + " " + string
    print(line)
    os.write(1, (line + "\n").encode('utf-8'))

In [None]:
def findAmbi(names_series, ambi_series):
    '''
    Find the identifiers that also occur as whole words inside a list of non-identifier terms.

    Every string in ambi_series is searched, case-insensitively, for the identifiers in names_series.
    The matched identifiers are lowercased and returned; the terms each identifier was found in
    are appended to the file 'ambi_examples.txt' (one line per identifier).

    Genitive handling: an identifier is also matched when it is followed by a genitive ending. In Danish
    all words not ending in s, x, or z take a possessive ending s; words that do end in s, x, or z take
    an apostrophe. This introduced a problem for identifiers only differentiated by an ending s, e.g. the
    names (lowercased) "han" and "hans": with a single regex the latter was always returned as the longest
    match even if it carried the meaning of "han" taking a possessive ending s.
    For this reason the trie-based regex is split into two: trie_s holds the identifiers ending in s and
    trie holds all other identifiers. Both regexes are run on every term and their matches are combined.

    Parameters:
        names_series: pandas Series of identifier strings. Missing values and empty strings are skipped.
        ambi_series: pandas Series of terms to search in. Non-string values never match.

    Returns:
        List of the (lowercased) identifiers that were found in at least one term.

    Requires tqdm.pandas() to have been called, because progress_apply is used.
    '''
    
    print_screen_terminal('Creating regex')
    trie = Trie()
    trie_s = Trie()
    for key in names_series.tolist():
        #An empty key would make the whole pattern optional; a non-string key (NaN) has no .endswith
        if not isinstance(key, str) or not key:
            continue
        if key.endswith('s'):
            trie_s.add(key)
        else:
            trie.add(key)

    #Right-hand boundary shared by both patterns:
    # - match not ending in s/z/x: followed by a non-word character, or by a genitive s and then a non-word character
    # - match ending in s/z/x: followed by a non-word character (which includes the apostrophe)
    boundary = r"(?:(?:(?<![szx])(?:(?!\w)|(?=s(?!\w))))|(?:(?<=[szx])(?!\w)))"
    #An empty trie would produce a pattern that matches the empty string, so only non-empty tries are compiled
    regexes = [
        re.compile(r"(?<!\w)" + t.pattern() + boundary, re.IGNORECASE)
        for t in (trie, trie_s)
        if t.dump()
    ]

    def _entities_in(text):
        #Non-string cells (e.g. NaN from a source file) cannot contain a match
        if not isinstance(text, str):
            return []
        #if name appears multiple times, we lower all, and remove duplicates
        return list({hit.lower() for rx in regexes for hit in rx.findall(text)})

    frame = pd.DataFrame({'ambi': ambi_series})
    
    # Extract the names that occur in the example
    print_screen_terminal('Creating column with list of entites that match ambi for each row')
    frame['entity'] = frame['ambi'].progress_apply(_entities_in)
    
    #Split those names so that they get a row each (with example exploded)
    print_screen_terminal('Exploding the lists of entities and dropping resulting rows with nas')
    frame = frame.explode('entity', ignore_index=True)
    #Only 'subset' is passed: giving both how= and thresh= raises a TypeError on recent pandas versions
    frame = frame.dropna(subset=['entity'])
    
    #Nothing matched: there is nothing to group or to write
    if frame.empty:
        return []
    
    #Join all the examples for each name
    print_screen_terminal('Grouping by entity and appending ambi rows that it was matched again in list')
    frame = frame.groupby(['entity'], dropna=False)['ambi'].apply(list).reset_index(name='ambis')
    
    #mode='a' appends, so the examples of successive calls (and of re-runs) accumulate in the same file
    frame.to_csv('ambi_examples.txt', header=None, index=None, sep=' ', mode='a')
    
    return frame['entity'].tolist()

In [None]:
#Tag the entities that also occur in the Danish dictionary
print_screen_terminal('CHECKING AMBIGUOUS WORDS')
ambiguous_words_list = findAmbi(all_tags['entity'], words['word'])
#A set gives constant-time membership tests; testing against the list rescanned it for every entity
ambiguous_words_set = set(ambiguous_words_list)
print_screen_terminal('APPENDING AMBIGUOUS WORDS TAG TO ENTITIES')
#The 'not in' check keeps the tag from being appended a second time when the cell is re-run
all_tags['tag'] = all_tags.progress_apply(lambda x: x['tag']+['WORDS'] if (x['entity'] in ambiguous_words_set and 'WORDS' not in x['tag']) else x['tag'],axis=1)

In [None]:
#Tag the entities that also occur in the medical abbreviations
print_screen_terminal('CHECKING AMBIGUOUS ABBREVIATIONS')
ambiguous_abb_list = findAmbi(all_tags['entity'], abb['abb'])
#A set gives constant-time membership tests; testing against the list rescanned it for every entity
ambiguous_abb_set = set(ambiguous_abb_list)
print_screen_terminal('APPENDING AMBIGUOUS ABBREVIATIONS TAG TO ENTITIES')
#The 'not in' check keeps the tag from being appended a second time when the cell is re-run
all_tags['tag'] = all_tags.progress_apply(lambda x: x['tag']+['ABB'] if (x['entity'] in ambiguous_abb_set and 'ABB' not in x['tag']) else x['tag'],axis=1)

In [None]:
#Tag the entities that also occur in the product (drug) names
print_screen_terminal('CHECKING AMBIGUOUS DRUGS')
ambiguous_drugs_list = findAmbi(all_tags['entity'], drugs['drug'])
#A set gives constant-time membership tests; testing against the list rescanned it for every entity
ambiguous_drugs_set = set(ambiguous_drugs_list)
print_screen_terminal('APPENDING AMBIGUOUS DRUGS TAG TO ENTITIES')
#The 'not in' check keeps the tag from being appended a second time when the cell is re-run
all_tags['tag'] = all_tags.progress_apply(lambda x: x['tag']+['DRUGS'] if (x['entity'] in ambiguous_drugs_set and 'DRUGS' not in x['tag']) else x['tag'],axis=1)

In [None]:
#Tag the entities that also occur in the SNOMED CT terms
print_screen_terminal('CHECKING AMBIGUOUS SNOMED TERMS')
ambiguous_snomed_list = findAmbi(all_tags['entity'], snomed['term'])
#A set gives constant-time membership tests; testing against the list rescanned it for every entity
ambiguous_snomed_set = set(ambiguous_snomed_list)
print_screen_terminal('APPENDING AMBIGUOUS SNOMED TERMS TAG TO ENTITIES')
#The 'not in' check keeps the tag from being appended a second time when the cell is re-run
all_tags['tag'] = all_tags.progress_apply(lambda x: x['tag']+['SNOMED'] if (x['entity'] in ambiguous_snomed_set and 'SNOMED' not in x['tag']) else x['tag'],axis=1)

In [None]:
#Tag the entities that also occur in the SKS code texts
print_screen_terminal('CHECKING AMBIGUOUS SKS TERMS')
ambiguous_sks_list = findAmbi(all_tags['entity'], sks['kodetekst'])
#A set gives constant-time membership tests; testing against the list rescanned it for every entity
ambiguous_sks_set = set(ambiguous_sks_list)
print_screen_terminal('APPENDING AMBIGUOUS SKS TERMS TAG TO ENTITIES')
#The 'not in' check keeps the tag from being appended a second time when the cell is re-run
all_tags['tag'] = all_tags.progress_apply(lambda x: x['tag']+['SKS'] if (x['entity'] in ambiguous_sks_set and 'SKS' not in x['tag']) else x['tag'],axis=1)

In [None]:
all_tags

In [None]:
all_tags.sort_values(by='prob_pop',ascending=False,inplace=False, ignore_index=True).tail(50)

In [None]:
#Write the final identifier table to 'all_tags.txt' as CSV, without the row index
all_tags.to_csv('all_tags.txt', index=False)