In [15]:
import os

# Root of the local Braxen checkout. Set the BRAXEN_DIR environment variable
# to point at your own copy; the original author's path is kept as the
# fallback so existing runs behave exactly as before.
BRAXEN_DIR = os.environ.get("BRAXEN_DIR", "/Users/joregan/Playing/braxen/")

In [3]:
# Raw Braxen-symbol -> IPA table: one entry per line, with the Braxen symbol
# and its IPA rendering separated by a single tab.
# NOTE(review): the symbol "g" appears on two consecutive rows with two
# different glyphs on the IPA side; a dict built from these rows keeps
# whichever comes last -- confirm which mapping is intended.
BRAXEN_IPA_RAW = """
p	p
b	b
t	t
rt	ʈ
d	d
rd	ɖ
k	k
g	ɡ
g	g
f	f
v	v
s	s
rs	ʂ
sh	ʃ
zh	ʒ
z	z
dh	ð
th	θ
h	h
x	ɧ
xx	x
c	ɕ
tc	t͡ʃ
dj	d͡ʒ
m	m
n	n
rn	ɳ
ng	ŋ
r	r
l	l
rl	ɭ
j	j
w	w
rh	ɾ
r0	r0
rx	ʀ
i:	iː
i	ɪ
ih	ɪ̯
y:	yː
y	ʏ
e:	eː
e	e
eh	e̝
ex	ə
ä:	ɛː
ä	ɛ
ae:	æː
ae	æ
ö:	øː
ö	ø
oe:	ɶː
oe	ɶ
u:	uː
u	u
oh	o
o:	oː
o	ɔ
uu:	ʉː
uu	ɵ
uuh	ʉ
uw:	ʊː
uw	ʊ
a:	ɑː
a	a
aa:	aː
au	aʊ
eu	ɛʊ
ei	eɪ
ai	aɪ
oi	ɔɪ
ou	əʊ
eex	eə
iex	ɪə
uex	ʊə
an	ã
en	ɛ̃
on	õ
un	œ̃
.	.
"""

In [4]:
# Parse the raw table into a lookup dict. Blank rows and rows that do not
# split into exactly two tab-separated fields are ignored; when a symbol is
# listed more than once, the last row wins.
BRAXEN_IPA = dict(
    fields
    for fields in (
        row.split("\t")
        for row in BRAXEN_IPA_RAW.strip().split("\n")
        if row.strip()
    )
    if len(fields) == 2
)

In [12]:
# Stress marks per output convention, keyed by the Braxen prefix character
# (' = primary/accent 1, " = primary/accent 2, , = secondary).
_BRAXEN_STRESS_MARKS = {
    'braxen': {"'": 'ˈ́', '"': 'ˈ̀', ',': 'ˌ'},
    'wiktionary': {"'": '¹', '"': '²', ',': 'ˌ'},
    'default': {"'": 'ˈ', '"': 'ˈ', ',': 'ˌ'},
}


def braxen_encode(phoneme_string, phoneme_to_ipa, strictly_braxen=False, stress_type=None):
    """
    Convert a string of symbolic phonemes to IPA using stress handling.

    Args:
        phoneme_string (str): whitespace-separated Braxen symbols,
            e.g., "'a: . r ex n". A leading ', " or , on a symbol marks stress.
        phoneme_to_ipa (dict): mapping of phoneme symbols to IPA. It is only
            read, never modified.
        strictly_braxen (bool): if True, bare 'r0' symbols are dropped and the
            two primary stress markers are rendered as distinct accent-marked
            stress signs. If False, 'r0' is rendered as 'ɹ'.
        stress_type (str | None): only consulted when strictly_braxen is
            False; 'wiktionary' renders the primary stress markers as
            superscript ¹ / ², anything else renders both as 'ˈ'.

    Returns:
        str: IPA transcription. Symbols missing from the mapping are skipped
        and reported with a "[WARN]" line on stdout.
    """
    # Replace morpheme/compound boundaries with syllable breaks
    for boundary in ('-', '~', '|'):
        phoneme_string = phoneme_string.replace(boundary, '.')

    if strictly_braxen:
        stress_marks = _BRAXEN_STRESS_MARKS['braxen']
    elif stress_type == 'wiktionary':
        stress_marks = _BRAXEN_STRESS_MARKS['wiktionary']
    else:
        stress_marks = _BRAXEN_STRESS_MARKS['default']

    ipa_output = []

    for p in phoneme_string.split():
        # In strict mode a bare r0 produces no output at all.
        if strictly_braxen and p == 'r0':
            continue

        # Peel off a leading stress marker, if any (tokens are never empty).
        stress = stress_marks.get(p[0])
        if stress is not None:
            p = p[1:]

        if p == 'r0' and not strictly_braxen:
            # Applied locally instead of writing into a shared table, so the
            # override works for any mapping and leaves the caller's dict alone.
            ipa = 'ɹ'
        elif p in phoneme_to_ipa:
            ipa = phoneme_to_ipa[p]
        else:
            print(f"[WARN] No match for phoneme: {p}")
            continue

        ipa_output.append(stress + ipa if stress else ipa)

    return ''.join(ipa_output)


In [16]:
# Smoke test on one transcription: '~' becomes a syllable break, the primary
# stress marker becomes 'ˈ', and the final r0 is rendered as 'ɹ'.
sample_ipa = braxen_encode("""g rh 'ae n d ~ m a: . s t ex r0""", BRAXEN_IPA)
assert sample_ipa == 'gɾˈænd.mɑː.stəɹ'

In [None]:
from pathlib import Path

DICT_PATH = Path(BRAXEN_DIR) / "dict" / "braxen-sv.tsv"

# Print word, IPA and the third column for every lexicon entry.
# The file is streamed line by line rather than loaded whole, and rows with
# fewer than three tab-separated fields are reported and skipped instead of
# aborting the run with an IndexError.
with open(DICT_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip comment lines and blank lines.
        if line.startswith("#") or not line.strip():
            continue
        parts = line.strip().split("\t")
        if len(parts) < 3:
            print(f"[WARN] Skipping malformed entry: {line.strip()}")
            continue
        word, transcription, pos_tags = parts[:3]
        ipa = braxen_encode(transcription, BRAXEN_IPA, strictly_braxen=False)
        print(f"{word}\t{ipa}\t{pos_tags}")