In [58]:
from pathlib import Path
import xml.etree.ElementTree as ET

In [1]:
# Directory that is globbed for `*.lab` label files.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
LABELS_PATH = Path("/media/storage/harald/git/abair-gitea/abair-software/nnmnkwii_tests/data/pmg_ga_demo/label_state_align")

In [2]:
# Index every label file by its stem (the file name without the `.lab` suffix).
label_files = {lab_path.stem: lab_path for lab_path in LABELS_PATH.glob('*.lab')}

In [4]:
# Directory that is globbed for the Gitea copy of the `*.xml` files.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
GITEA_XML_PATH = Path("/media/storage/harald/git/abair-gitea/abair-corpora/pmg_ga_co/RCPiarsachALL/xml")

In [7]:
# Index every Gitea XML file by its stem so it can be matched against the label stems.
gitea_xml_files = {xml_path.stem: xml_path for xml_path in GITEA_XML_PATH.glob('*.xml')}

In [9]:
# Directory that is globbed for the SVN copy of the `*.xml` files.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
SVN_XML_PATH = Path("/media/storage/jim/speech-syn/Corpora/ga_CO/pmg/RC_ALL_141118/xml")

In [10]:
# Index every SVN XML file by its stem so it can be matched against the label stems.
svn_xml_files = {xml_path.stem: xml_path for xml_path in SVN_XML_PATH.glob('*.xml')}

Filter both sets of XML files down to the utterances that also have a label file (matched by file stem).

In [11]:
# Keep only the SVN XML files whose stem has a label file.
# A label stem with no matching XML file raises KeyError here.
svn_xml_filt = {stem: svn_xml_files[stem] for stem in label_files}

In [14]:
# Keep only the Gitea XML files whose stem has a label file.
# A label stem with no matching XML file raises KeyError here.
gitea_xml_filt = {stem: gitea_xml_files[stem] for stem in label_files}

Sanity check: the label files and both filtered XML collections should have the same number of entries and the same set of stems.

In [12]:
# Number of `*.lab` files found.
len(label_files)

91

In [13]:
# Number of SVN XML files kept after filtering by label stem.
len(svn_xml_filt)

91

In [15]:
# Number of Gitea XML files kept after filtering by label stem.
len(gitea_xml_filt)

91

In [17]:
# Dict key views compare like sets, so no explicit set() conversion is needed.
# Both filtered dicts were built by iterating over label_files' keys, so this
# can only fail if one of them is rebuilt differently later on.
assert label_files.keys() == svn_xml_filt.keys() == gitea_xml_filt.keys(), "Nope"

In [37]:
def break_phones(string):
    """Split the five-phone context at the start of a label into its phones.

    The label is expected to begin with ``p1^p2-p3+p4=p5@...``. The phones are
    returned in that order and everything after the ``@`` that closes ``p5``
    is ignored.

    A phone may itself be the symbol ``@``: in ``...=@@3_2`` the first ``@`` is
    the phone and the second one is the separator. An ``@`` is therefore read
    as the phone only when nothing precedes it in the field and it is not
    directly followed by a digit.

    Args:
        string: Label text beginning with the five-phone context.

    Returns:
        list[str]: The phones found, in order. The list is shorter than five
        when a delimiter is missing (scanning stops at the first delimiter
        that cannot be found) and empty for an empty string.
    """
    toks = []
    mark = 0  # index where the field currently being read starts

    # The first four delimiters are unambiguous: each one closes a phone field.
    for delimiter in "^-+=":
        pos = string.find(delimiter, mark)
        if pos == -1:
            return toks
        toks.append(string[mark:pos])
        mark = pos + 1

    pos = string.find("@", mark)
    if pos == -1:
        return toks

    followed_by_digit = string[pos + 1:pos + 2].isdigit()
    if followed_by_digit or pos > mark:
        # Genuine separator. Without the `pos > mark` test a non-empty phone
        # followed by a non-numeric field (e.g. "=e@x_x") would be thrown away
        # and reported as "@".
        toks.append(string[mark:pos])
    else:
        # Empty field and no digit after the "@": the "@" is the phone itself.
        toks.append("@")
    return toks

In [82]:
# Worked examples for break_phones; the first one has "@" as the fifth phone.
assert all(
    break_phones(label) == expected
    for label, expected in (
        ('nnj^ii-lj+sil=@@3_2/', ['nnj', 'ii', 'lj', 'sil', '@']),
        ("x^sil-nnj+ii=lj@1_4", ['x', 'sil', 'nnj', 'ii', 'lj']),
        ("oo^r-sil+x=x@1_1", ['oo', 'r', 'sil', 'x', 'x']),
    )
)

In [41]:
def read_phonemes_lab(filename):
    """Read a label file and return the phone context of every non-blank line.

    Each non-blank line must contain at least three whitespace-separated
    fields. The third field is handed to ``break_phones``; the first two
    fields, and any fields after the third, are ignored. Blank lines are
    skipped, and fields may be separated by any run of spaces or tabs.

    Args:
        filename: Path of the label file to read.

    Returns:
        list[list[str]]: One ``break_phones`` result per non-blank line, in
        file order.

    Raises:
        ValueError: If a non-blank line has fewer than three fields.
    """
    phn_bits = []
    with open(filename, "r") as f:
        # Iterate the file lazily; there is no need to load it with readlines().
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line, e.g. a trailing newline at the end of the file.
                continue
            if len(fields) < 3:
                raise ValueError(
                    f"{filename}: line {lineno} has {len(fields)} field(s), expected at least 3"
                )
            phn_bits.append(break_phones(fields[2]))
    return phn_bits

In [51]:
def check_len(phone_list):
    """Return True when the length of ``phone_list`` is a multiple of five.

    Args:
        phone_list: Any sized collection.

    Returns:
        bool: ``True`` if ``len(phone_list)`` is divisible by 5 (including an
        empty collection), ``False`` otherwise.
    """
    # A decimal number ends in 0 or 5 exactly when it is divisible by 5, so the
    # modulo test gives the same answer as inspecting the last digit.
    return len(phone_list) % 5 == 0

In [56]:
def prune_phones(phone_list, group_size=5):
    """Keep the centre phone (index 2) of every ``group_size``-th entry.

    Args:
        phone_list: Sequence of phone contexts, each indexable at position 2
            (for example the lists returned by ``break_phones``).
        group_size: Number of consecutive entries that form one group; only
            the first entry of each group is used. Defaults to 5.
            NOTE(review): presumably the number of lines each phone occupies
            in the label files; confirm against the data.

    Returns:
        list: The index-2 element of entries 0, ``group_size``,
        ``2 * group_size``, and so on. An empty list is returned when
        ``len(phone_list)`` is not a multiple of ``group_size``.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")
    # An incomplete trailing group means the input is not aligned; give up.
    if len(phone_list) % group_size != 0:
        return []
    return [context[2] for context in phone_list[::group_size]]

In [84]:
# Parse every label file: stem -> list of per-line phone contexts.
lab_phonemes_raw = {
    stem: read_phonemes_lab(lab_path)
    for stem, lab_path in label_files.items()
}

In [85]:
# Reduce each file to one phone per group of lines. prune_phones returns an
# empty list for a file whose line count is not a multiple of five.
lab_phonemes = {
    stem: prune_phones(contexts)
    for stem, contexts in lab_phonemes_raw.items()
}

In [74]:
def xml_phones(filename):
    """Collect the ``symbol`` attribute of every ``<phoneme>`` element in a file.

    Args:
        filename: Path of the XML file to parse.

    Returns:
        list[str]: The ``symbol`` values of all ``phoneme`` elements below the
        root, in document order. Elements without a ``symbol`` attribute are
        skipped.
    """
    root = ET.parse(filename).getroot()
    symbols = (node.get('symbol') for node in root.findall('.//phoneme'))
    # get() yields None only when the attribute is absent; an empty string is kept.
    return [symbol for symbol in symbols if symbol is not None]

In [87]:
# Phoneme symbols from the SVN XML files: stem -> list of symbols.
svn_phonemes = {
    stem: xml_phones(xml_path)
    for stem, xml_path in svn_xml_filt.items()
}

In [89]:
# Phoneme symbols from the Gitea XML files: stem -> list of symbols.
gitea_phonemes = {
    stem: xml_phones(xml_path)
    for stem, xml_path in gitea_xml_filt.items()
}