In [21]:
import soundfile as sf


In [30]:
def fix_text(text: str) -> str:
    """Replace ASCII stand-in characters with the Swedish letters they encode.

    The mapping is ``{`` -> ``ä``, ``}`` -> ``å``, ``|`` -> ``ö`` and a
    backslash -> ``Ö``. Every other character is returned unchanged.
    """
    stand_ins = {"{": "ä", "}": "å", "|": "ö", "\\": "Ö"}
    return text.translate(str.maketrans(stand_ins))


class FR:
    """A single ``FR`` line of a label file, parsed into attributes.

    The line is split on tab characters and classified by its field count:

    * 5 fields -> ``type == 'B'``: frame, phone, ``>pm`` field, ``>w`` word, time
    * 4 fields -> ``type == 'I'``: frame, phone, ``>pm`` field, time
    * 3 fields -> ``type == 'E'``: frame, the literal ``OK``, time
    * any other count -> ``type is None`` (the line is kept but unclassified)

    All parsed values are kept as strings. Only ``type`` and ``frame`` are
    always present; the remaining attributes exist only when the line
    carries the corresponding field:

    * ``phone_type`` / ``phone``: the second field split into its marker
      (``$#`` or a single leading character) and the rest.
    * ``pm_type`` / ``pm``: the character after ``>pm `` and the rest.
      NOTE(review): unlike ``phone``, a ``$#`` prefix is *not* treated as a
      two-character marker here -- confirm that this asymmetry is intended.
    * ``word``: the text after ``>w ``, passed through ``fix_text``.
    * ``seconds``: the last field without its trailing `` sec``.

    Raises:
        IOError: if the line does not start with ``FR``, if a 3-field line
            is not an ``OK`` line, or if the ``>pm`` / ``>w`` fields do not
            carry their expected prefixes.
    """

    # Number of tab-separated fields -> record type.
    _TYPE_BY_FIELD_COUNT = {5: 'B', 4: 'I', 3: 'E'}

    def __init__(self, text: str):
        if not text.startswith("FR"):
            raise IOError("Unknown line type (does not begin with 'FR'): " + text)
        parts = text.split("\t")
        # Always define ``type``: consumers read it unconditionally, so an
        # unexpected field count must not leave the attribute missing.
        self.type = self._TYPE_BY_FIELD_COUNT.get(len(parts))
        if self.type == 'E' and parts[1].strip() != "OK":
            raise IOError("Unexpected line: " + text)
        self.frame = parts[0][2:].strip()
        if len(parts) > 3:
            phone_field = parts[1].strip()
            # "$#" is a two-character marker; every other marker is one character.
            marker_len = 2 if phone_field.startswith("$#") else 1
            self.phone_type = phone_field[:marker_len]
            self.phone = phone_field[marker_len:]
            pm_field = parts[2].strip()
            if not pm_field.startswith(">pm "):
                raise IOError("Unexpected line: " + text)
            self.pm_type = pm_field[4:5]
            self.pm = pm_field[5:]
        if len(parts) == 5:
            word_field = parts[3].strip()
            if not word_field.startswith(">w "):
                raise IOError("Unexpected line: " + text)
            self.word = fix_text(word_field[3:])
        time_field = parts[-1].strip()
        if time_field.endswith(" sec"):
            self.seconds = time_field[0:-4]

    def __repr__(self):
        fields = [f"type: {self.type}", f"frame: {self.frame}"]
        # Optional attributes are shown only when the parsed line provided
        # them, so repr() never fails on an unusual line.
        optional = (
            ("phone", "phone"),
            ("word", "word"),
            ("pm_type", "pm_type"),
            ("pm", "pm"),
            ("sec", "seconds"),
        )
        for label, attr in optional:
            if attr in self.__dict__:
                fields.append(f"{label}: {self.__dict__[attr]}")
        return "FR(" + ", ".join(fields) + ")"


class Mix:
    """Contents of one label file: dialog path, utterance text, labels and FR records.

    Attributes:
        fr: list of ``FR`` objects, one per line starting with ``"FR "``.
        filepath: remainder of the ``Waxholm dialog.`` line.
        text: the utterance text announced by the ``TEXT:`` marker, passed
            through ``fix_text``.
        labels: content of the ``Labels: `` line plus any continuation lines.

    ``filepath``, ``text`` and ``labels`` are only set when the matching
    line occurs in the file.
    """

    def __init__(self, filepath: str):
        self.fr = []
        # NOTE(review): the file is opened with the platform default encoding,
        # as before -- confirm which encoding the label files actually use.
        with open(filepath) as inpf:
            expect_text = False
            in_labels = False
            for line in inpf:
                if expect_text:
                    # The utterance is on the line *after* the "TEXT:" marker.
                    self.text = fix_text(line.strip())
                    expect_text = False
                    continue
                if line.startswith("Waxholm dialog."):
                    self.filepath = line[15:].strip()
                elif line.startswith("TEXT:"):
                    inline_text = line[5:].strip()
                    if inline_text:
                        # Text given on the marker line itself is used directly.
                        self.text = fix_text(inline_text)
                    else:
                        expect_text = True
                elif line.startswith("FR "):
                    # The first FR line ends any running "Labels: " section.
                    in_labels = False
                    self.fr.append(FR(line))
                elif line.startswith("Labels: "):
                    self.labels = line[8:].strip()
                    in_labels = True
                elif in_labels and line.startswith(" "):
                    # NOTE(review): continuation lines are appended without a
                    # separator (unchanged) -- confirm this should not be a space.
                    self.labels += line.strip()

    def get_pronunciations(self):
        """Group the FR records into ``(word, [phones])`` tuples.

        A 'B' record starts a new word, 'I' records add phones to the current
        word, and an 'E' record closes it. The pending word is also emitted
        when the records end without an 'E'. Phones seen while no word is
        open are discarded.

        NOTE(review): the notebook's ``merge_fr_parts`` duplicates this logic;
        consider keeping only one of the two.

        Raises:
            Exception: if a record's ``type`` is missing or not 'B', 'I' or 'E'.
        """
        words = []
        curword = ""
        curphones = []
        for item in self.fr:
            # getattr keeps a record without ``type`` on the "unknown" path
            # below instead of failing with AttributeError.
            kind = getattr(item, "type", None)
            if kind == 'B':
                if curword != "":
                    words.append((curword, curphones))
                curword = item.word
                curphones = [item.phone]
            elif kind == 'I':
                curphones.append(item.phone)
            elif kind == 'E':
                if curword != "":
                    words.append((curword, curphones))
                # Reset so later records cannot re-emit the word just closed.
                curword = ""
                curphones = []
            else:
                raise Exception(f"Unknown FR type: {kind!r}")
        if curword != "":
            words.append((curword, curphones))
        return words
            

def smp_probe(filename: str) -> bool:
    """Return True when the file starts with the bytes ``file=samp``."""
    magic = b"file=samp"
    with open(filename, "rb") as handle:
        prefix = handle.read(len(magic))
    return prefix == magic


def smp_headers(filename: str):
    """Read the ``key=value`` header lines from the first 1024 bytes of a file.

    The header consists of CRLF-separated ``key=value`` lines and is ended
    by a line holding only ``=``. Parsing stops at that terminator, so
    padding or other bytes after it are never decoded. If no terminator is
    present, every ``key=value`` line in the block is returned.

    Args:
        filename: path of the file to read.

    Returns:
        dict mapping header names to values (both ``str``). Values are split
        at the first ``=`` only, so they may themselves contain ``=``.

    Raises:
        UnicodeDecodeError: if a header line contains non-ASCII bytes.
    """
    with open(filename, "rb") as f:
        raw_headers = f.read(1024).rstrip(b"\x00")

    headers = {}
    for raw_line in raw_headers.split(b"\r\n"):
        if raw_line == b"=":
            # Terminator line: everything after it is not header data.
            break
        key, sep, value = raw_line.partition(b"=")
        if not sep:
            # Not a key=value line (e.g. empty); nothing to record.
            continue
        headers[key.decode("ascii")] = value.decode("ascii")
    return headers


def smp_read_sf(filename: str):
    """Read the 16-bit PCM samples that follow the header, via ``soundfile``.

    Byte order and channel count come from the ``msb`` and ``nchans``
    header entries returned by ``smp_headers``.

    Args:
        filename: path of the file to read.

    Returns:
        ``(data, sr)`` as returned by ``soundfile.read``; ``data`` has dtype
        int16.

    Raises:
        KeyError: if the header lacks ``msb`` or ``nchans``.
        ValueError: if ``nchans`` is not positive, or the header length is
            not a whole number of frames for that channel count.
    """
    header_bytes = 1024     # size of the block smp_headers() parses
    bytes_per_sample = 2    # PCM_16

    headers = smp_headers(filename)
    endian = "LITTLE" if headers["msb"] == "last" else "BIG"

    nchans = int(headers["nchans"])
    if nchans < 1:
        raise ValueError(f"Invalid channel count in header: {nchans}")
    # soundfile counts ``start`` in frames, not bytes, so the number of
    # frames covering the header depends on the channel count (512 for mono).
    start_frame, misaligned = divmod(header_bytes, bytes_per_sample * nchans)
    if misaligned:
        raise ValueError(
            f"{header_bytes}-byte header is not a whole number of "
            f"{nchans}-channel frames"
        )

    # NOTE(review): the sample rate is fixed at 16 kHz rather than taken from
    # the header -- confirm that this holds for every input file.
    data, sr = sf.read(filename, channels=nchans,
                       samplerate=16000, endian=endian, start=start_frame,
                       dtype="int16", format="RAW", subtype="PCM_16")
    return (data, sr)


def write_wav(filename, arr, samplerate: int = 16000, nchannels: int = 1,
              sampwidth: int = 2):
    """Write raw audio frames to a WAV file.

    Args:
        filename: destination path (or file object) accepted by ``wave.open``.
        arr: bytes-like object holding the raw frames, written as-is.
            Presumably int16 samples such as those returned by
            ``smp_read_sf`` -- confirm against the caller.
        samplerate: frames per second stored in the WAV header.
        nchannels: number of interleaved channels.
        sampwidth: bytes per sample.

    The defaults (16 kHz, mono, 16-bit) reproduce the previous fixed format.
    """
    import wave

    with wave.open(filename, "w") as f:
        f.setnchannels(nchannels)
        f.setsampwidth(sampwidth)
        f.setframerate(samplerate)
        f.writeframes(arr)

In [23]:
# Parse one five-field line as a quick check of FR; the fields are tab-separated.
t = FR("FR       4188\t #N\t>pm #N\t>w n{r\t 0.262 sec")

In [24]:
# Example FR lines (tab-separated fields) used below to exercise the FR parser.
# The literal starts and ends with a newline, so splitting on "\n" yields
# empty first/last entries that the consumer has to skip.
sample = """
FR       4188	 #N	>pm #N	>w n{r	 0.262 sec
FR       5915	 $'[3	>pm $'[3	 0.370 sec
FR       8147	 $R	>pm $R+	 0.509 sec
FR       9125	 #G	>pm #G	>w g}r	 0.570 sec
FR      10400	 $g	>pm $g	 0.650 sec
FR      10728	 $']:	>pm $']:	 0.670 sec
FR      12377	 $R	>pm $R	 0.774 sec
FR      13176	 #p:	>pm #p:	>w XX	 0.823 sec
FR      13176	 #B	>pm #B	>w b}tarna	 0.823 sec
FR      14643	 $b	>pm $b	 0.915 sec
FR      14767	 $"]:	>pm $"]:	 0.923 sec
FR      18315	 $T	>pm $T	 1.145 sec
FR      20455	 $t	>pm $t	 1.278 sec
FR      20669	 $A	>pm $A	 1.292 sec
FR      21830	 $2N	>pm $2N	 1.364 sec
FR      22503	 $A	>pm $A	 1.406 sec
FR      23813	 #p:	>pm #p:	>w XX	 1.488 sec
FR      23813	 #T	>pm #T	>w till	 1.488 sec
FR      25208	 $t	>pm $t	 1.575 sec
FR      26404	 $L	>pm $L	 1.650 sec
FR      27778	 #p:	>pm #p:	>w XX	 1.736 sec
FR      27778	 #V	>pm #V	>w vaxholm	 1.736 sec
FR      28691	 $"A	>pm $"A	 1.793 sec
FR      30303	 $K	>pm $K	 1.894 sec
FR      31749	 $k	>pm $k	 1.984 sec
FR      32013	 $S	>pm $S	 2.001 sec
FR      35062	 $#H	>pm $#H	 2.191 sec
FR      35062	 $`]	>pm $`]	 2.191 sec
FR      36637	 $L	>pm $L	 2.290 sec
FR      37716	 $M	>pm $M	 2.357 sec
FR      39081	 #p:	>pm #p:	>w XX	 2.443 sec
FR      40506	 #I	>pm #I	>w ikv{ll	 2.532 sec
FR      41192	 $K	>pm $K	 2.574 sec
FR      42115	 $k	>pm $k	 2.632 sec
FR      43309	 $V	>pm $V	 2.707 sec
FR      43944	 $'[	>pm $'[	 2.746 sec
FR      45828	 $L	>pm $L	 2.864 sec
FR      47638	 #.	>pm #.	>w .	 2.977 sec
FR      48495	 OK	 3.031 sec
"""

In [25]:
# Turn every non-empty line of the sample into an FR record.
conv = [FR(row) for row in sample.split("\n") if row]

In [26]:
# Display the parsed records (rendered through FR.__repr__).
conv

[FR(type: B, frame: 4188, phone: N, word: när, pm_type: #, pm: N, sec: 0.262),
 FR(type: I, frame: 5915, phone: '[3, pm_type: $, pm: '[3, sec: 0.370),
 FR(type: I, frame: 8147, phone: R, pm_type: $, pm: R+, sec: 0.509),
 FR(type: B, frame: 9125, phone: G, word: går, pm_type: #, pm: G, sec: 0.570),
 FR(type: I, frame: 10400, phone: g, pm_type: $, pm: g, sec: 0.650),
 FR(type: I, frame: 10728, phone: ']:, pm_type: $, pm: ']:, sec: 0.670),
 FR(type: I, frame: 12377, phone: R, pm_type: $, pm: R, sec: 0.774),
 FR(type: B, frame: 13176, phone: p:, word: XX, pm_type: #, pm: p:, sec: 0.823),
 FR(type: B, frame: 13176, phone: B, word: båtarna, pm_type: #, pm: B, sec: 0.823),
 FR(type: I, frame: 14643, phone: b, pm_type: $, pm: b, sec: 0.915),
 FR(type: I, frame: 14767, phone: "]:, pm_type: $, pm: "]:, sec: 0.923),
 FR(type: I, frame: 18315, phone: T, pm_type: $, pm: T, sec: 1.145),
 FR(type: I, frame: 20455, phone: t, pm_type: $, pm: t, sec: 1.278),
 FR(type: I, frame: 20669, phone: A, pm_type:

In [27]:
def merge_fr_parts(input):
    """Group a sequence of FR records into ``(word, [phones])`` tuples.

    A record with ``type == 'B'`` starts a new word (its ``word`` and first
    ``phone``), ``'I'`` records add phones to the current word, and an
    ``'E'`` record closes it. The pending word is also emitted when the
    sequence ends without an ``'E'``. Phones seen while no word is open are
    discarded.

    NOTE(review): ``Mix.get_pronunciations`` duplicates this logic; consider
    keeping only one of the two.

    Args:
        input: iterable of records exposing ``type`` and, depending on the
            type, ``word`` and ``phone``. (The name shadows the builtin but
            is kept for keyword-argument compatibility.)

    Returns:
        list of ``(word, phones)`` tuples in order of appearance.

    Raises:
        Exception: if a record's ``type`` is missing or not 'B', 'I' or 'E'.
    """
    words = []
    curword = ""
    curphones = []
    for item in input:
        # getattr keeps a record without ``type`` on the "unknown" path
        # below instead of failing with AttributeError.
        kind = getattr(item, "type", None)
        if kind == 'B':
            if curword != "":
                words.append((curword, curphones))
            curword = item.word
            curphones = [item.phone]
        elif kind == 'I':
            curphones.append(item.phone)
        elif kind == 'E':
            if curword != "":
                words.append((curword, curphones))
            # Reset so later records cannot re-emit the word just closed.
            curword = ""
            curphones = []
        else:
            raise Exception(f"Unknown FR type: {kind!r}")
    if curword != "":
        words.append((curword, curphones))
    return words

In [28]:
# Group the parsed sample records into (word, phones) pairs.
wds = merge_fr_parts(conv)

In [29]:
# Display the (word, phones) pairs.
wds

[('när', ['N', "'[3", 'R']),
 ('går', ['G', 'g', "']:", 'R']),
 ('XX', ['p:']),
 ('båtarna', ['B', 'b', '"]:', 'T', 't', 'A', '2N', 'A']),
 ('XX', ['p:']),
 ('till', ['T', 't', 'L']),
 ('XX', ['p:']),
 ('vaxholm', ['V', '"A', 'K', 'k', 'S', 'H', '`]', 'L', 'M']),
 ('XX', ['p:']),
 ('ikväll', ['I', 'K', 'k', 'V', "'[", 'L']),
 ('.', ['.'])]