In [25]:
def clean_accent(phone):
    """Drop a single leading accent mark (', , or ") from a phone symbol.

    Only the first character is examined; a phone that does not start with
    one of the marks is returned unchanged.
    """
    accent_marks = ("'", ",", '"')
    return phone[1:] if phone.startswith(accent_marks) else phone

def clean_phone(phones):
    """Remove the leading accent mark from every phone in a space-separated string.

    Surrounding whitespace is trimmed first; the phones are then split on
    single spaces, passed through ``clean_accent`` and re-joined with spaces.
    """
    return " ".join(clean_accent(symbol) for symbol in phones.strip().split(" "))

def clean_phones(pron_parts):
    """Apply ``clean_phone`` to each pronunciation part and return the results as a list."""
    return list(map(clean_phone, pron_parts))

def ends_with_list(word, endings):
    """Return the first entry of ``endings`` that ``word`` ends with, or "" if none match."""
    matches = (candidate for candidate in endings if word.endswith(candidate))
    return next(matches, "")

def split_acronym(word, pron):
    """Pair each letter of an acronym with its part of the pronunciation.

    ``pron`` is split on "~" and every part is cleaned with ``clean_phones``.
    A trailing ":s" on ``word`` is not counted as letters of its own; it is
    attached to the last letter instead.

    Returns:
        A list of ``(letter, phones)`` tuples, or ``None`` when no alignment
        is made: ``pron`` contains "|", ``word`` has no letters besides the
        ending, or the number of letters differs from the number of
        pronunciation parts.
    """
    if "|" in pron:
        return None

    ending = ends_with_list(word, [":s"])
    base_word = word[:-len(ending)] if ending else word
    letters = list(base_word)
    if not letters:
        # Nothing to align (e.g. the word is only the ending); indexing
        # letters[-1] below would otherwise raise IndexError.
        return None
    if ending:
        letters[-1] += ending

    pron_parts = clean_phones(pron.split("~"))
    if len(pron_parts) != len(letters):
        return None
    return list(zip(letters, pron_parts))


In [26]:
# Example: one "~"-separated pronunciation part per letter; the ":s" ending stays on the last letter.
split_acronym("sRPE:s", "e s ~ ae r ~ p e: ~ 'e s")

[('s', 'e s'), ('R', 'ae r'), ('P', 'p e:'), ('E:s', 'e s')]

In [41]:
# Load the dictionary: one tab-separated entry per line; lines starting with "#" are skipped.
parts = []
# Explicit UTF-8 so that non-ASCII symbols (e.g. "ä") do not depend on the locale default.
with open("/Users/joregan/Playing/braxen/dict/braxen-sv.tsv", encoding="utf-8") as braxen_file:
    for line in braxen_file:
        if line.startswith("#"):
            continue
        line = line.strip()
        if not line:
            # A blank line has no fields; indexing pieces[1] would raise IndexError.
            continue
        pieces = line.split("\t")

        parts.append(
            {
                # The id is taken from the last column, whatever the row length.
                "id": pieces[-1],
                "orth": pieces[0],
                "phones": pieces[1],
                "pos": pieces[2],
                "lang": pieces[3],
                # Same tag with "|" instead of spaces; used as the lookup key into the tag table.
                "pos_tb": pieces[2].replace(" ", "|"),
            }
        )

```bash
cat ~/Playing/UD/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu |awk -F'\t' '{print $5 "\t" $4 "\t" $6}'|sort|uniq > /tmp/sv_talbanken-tags.txt
```

In [42]:
def read_talbanken_tags(file="/tmp/sv_talbanken-tags.txt"):
    """Read a tab-separated tag file and group its rows by their first column.

    Blank lines are skipped. The first three tab-separated fields of every
    other line are stored under the keys "tag", "pos" and "ud_tags".

    Args:
        file: Path of the tag file to read.

    Returns:
        A dict mapping each tag to the list of row dicts (keys "tag", "pos",
        "ud_tags") that share it, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    tagset = {}
    with open(file, encoding="utf-8") as tags_file:
        for line in tags_file:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            row = {
                "tag": fields[0],
                "pos": fields[1],
                "ud_tags": fields[2],
            }
            # One tag may occur with several pos / ud_tags combinations.
            tagset.setdefault(fields[0], []).append(row)
    return tagset

In [43]:
# Load the tag table from the function's default path (/tmp/sv_talbanken-tags.txt).
tagset = read_talbanken_tags()

In [44]:
# Attach the tag record to every entry whose "pos_tb" key has exactly one candidate in the table.
for entry in parts:
    candidates = tagset.get(entry["pos_tb"], [])
    if len(candidates) == 1:
        entry["tags_ud"] = candidates[0]

In [45]:
import json

# Save the annotated entries as indented JSON.
with open("/tmp/braxen-sv-with-ud.json", "w") as json_out:
    json.dump(parts, json_out, indent=2)

```
5-vxl	f "e m - v ,ä k . s l a d	AB	swe	-	-	-	733024
5-växlad	f "e m - v ,ä k . s l a d	JJ POS UTR SIN IND NOM	swe	751234
femväxlad	f "ä m - v ,ä k . s l a d	JJ POS UTR SIN IND NOM	swe	115986
```

In [46]:
# Index the entries by orthography; each spelling maps to the list of its entries.
braxen = {}
for part in parts:
    braxen.setdefault(part["orth"], []).append(part)

# Save the index as indented JSON.
with open("/tmp/braxen-sv-by-orth.json", "w") as json_out:
    json.dump(braxen, json_out, indent=2)

In [2]:
import json
# Reload the by-orthography index from its JSON file.
with open("/tmp/braxen-sv-by-orth.json") as json_in:
    braxen = json.load(json_in)

In [3]:
# Keep only the spellings that have more than one entry.
ambiguous = {
    orth: entries
    for orth, entries in braxen.items()
    if len(entries) > 1
}

In [52]:
# Save the multi-entry spellings as indented JSON.
with open("/tmp/braxen-sv-ambiguous.json", "w") as json_out:
    json.dump(ambiguous, json_out, indent=2)

In [7]:
# Print every entry of each spelling that has more than one distinct pronunciation.
# NOTE(review): the earlier form also tested `len(languages) != 1` on a set that was
# never filled, so that test was always true; it is dropped here and the printed
# result is unchanged. If a language-based filter is wanted, collect entry["lang"]
# into a set and test it explicitly.
for amb, entries in ambiguous.items():
    pronunciations = {entry["phones"] for entry in entries}
    if len(pronunciations) > 1:
        for entry in entries:
            print(amb, entry["phones"], entry["lang"])

Aaron 'a: . r o n swe
Aaron 'ae . rh ex n eng
Abagail 'ae . b ex . g e j l eng
Abagail 'ae . b ex . g ei l eng
Abaljarnasdottir 'a . b a l . j a r . n a s ~ d o . t i r ice
Abaljarnasdottir a . b "a l . j a . rn a s - d ,o . t i r ice
Abbi 'a . b i swe
Abbi a . b 'i: unk
Abel 'a: . b ex l swe
Abel 'ei . b ex l eng
Abramovitj a . b r 'a: . m u . v i tc sla
Abramovitj a . b r a . m 'o: . v i tc rus
Abul-Qasem a . b uu l ~ k 'a . s ex m per
Abul-Qasem a: . b uu l ~ k 'a . s ex m swe
Acheson 'a . tc i . s o n eng
Acheson 'ae . tc i . s o n swe
Achille a . k 'i . l ex swe
Achille a . k 'i l swe
Actionaid 'a k . rs ex n ~ e j d eng
Actionaid 'ae k . rs ex n ~ ei d eng
Adam "a: . d ,a m swe
Adam 'ae . d ex m eng
Adams "a: . d ,a m s swe
Adams 'ae . d ex m s eng
Adamson "a: . d a m . s ,o n swe
Adamson 'ae . d ex m . s ex n eng
Adewale a . d ex . w 'a . l ex afr
Adewale ae . d i . v 'a: . l ei eng
Adrians 'a: . d r ih . a n s swe
Adrians 'e j . d r ih . a n s swe
Adwords 'ae d ~ w oe: r0 d s e

In [None]:
# Replacement table kept as a multi-line string, one entry per line.
# NOTE(review): fields look like "current spelling;corrected spelling;language code" -
# confirm against the code that parses this string.
REPLACEMENTS = """
Hlavac;Hlaváč;cze
"""