In [2]:
# Root directory that holds the formatted Waxholm scene folders.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
WAXHOLM_DIR = "/Users/joregan/Playing/waxholm/scenes_formatted/"

```bash
# Print every .mix transcript, dropping blank lines and metadata lines.
# `IFS= read -r` and the quoted "$i" keep paths containing spaces or
# backslashes intact; a single `grep -Ev` replaces the chain of `grep -v`.
find ~/Playing/waxholm/scenes_formatted/ -name '*.mix' | while IFS= read -r i; do
    grep -Ev '^$|^FR|^CT|^WIZARD|^AUTOLAB|CORRECTED:|DATA BANK MATERIAL|^SPEAKER' "$i"
done
```

```perl
#!/usr/bin/perl

use warnings;
use strict;
use utf8;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Parser state carried across input lines by the main loop.
# Sample name captured from the most recent "Waxholm dialog." header line.
my $smp = '';
# Which field was seen last ('smp', 'text', 'phone' or 'label'); lines without
# a recognised prefix are appended to that field.
my $last = '';
# Accumulated TEXT: content of the current record.
my $text = '';
# Accumulated PHONEME: content of the current record.
my $phone = '';
# Accumulated Labels: content of the current record.
my $label = '';

# Normalise one field of a record.
#
# Collapses every run of whitespace into a single space and replaces the
# ASCII stand-ins with the Swedish letters they represent:
#   {  -> ä     }  -> å     |  -> ö
#   [  -> Ä     ]  -> Å     \  -> Ö
# The upper-case Ä and Å mappings were previously missing although Ö was
# handled. Takes the raw string as its only argument and returns the
# cleaned copy; the caller's variable is not modified.
sub clean_up {
        my $text = shift;

        $text =~ s/\s+/ /g;
        # One character-for-character pass; `use utf8` makes the replacement
        # list be read as characters rather than bytes.
        $text =~ tr/{}|\\[]/äåöÖÄÅ/;
        return $text;
}

# Main loop: read the filtered .mix dump line by line and print one
# tab-separated row (sample, text, phonemes, labels) per sample.
#
# A "Waxholm dialog." header starts a new record and flushes the previous
# one. PHONEME:, Labels: and TEXT: lines start the corresponding field, and
# any other line is treated as a continuation of the field seen last.
while(<>) {
        chomp;
        if(m!Waxholm dialog. /u/wax/data/scenes/[^/]*/(.*)!) {
                my $newsmp = $1;
                $last = 'smp';
                $text = clean_up($text);
                $phone = clean_up($phone);
                $label = clean_up($label);

                # Nothing to flush before the very first header.
                if ($smp ne '') {
                        print "$smp\t$text\t$phone\t$label\n";
                }
                $smp = $newsmp;
                $text = '';
                $phone = '';
                $label = '';
        } elsif (m!^PHONEME:!) {
                s/^PHONEME:\s+//;
                $phone = $_;
                $last = 'phone';
        } elsif (m!^Labels:!) {
                s/^Labels:\s+//;
                $label = $_;
                $last = 'label';
        } elsif (m!^TEXT:!) {
                s/^TEXT:\s*//;
                $text = $_;
                $last = 'text';
        # `{` and `|` are regex metacharacters; unescaped, the `|` split the
        # pattern into two alternatives instead of matching the literal text.
        } elsif (m!^S\{g f\|ljande mening:!) {
                print STDERR "$smp:$_\n";
        } else {
                if($last eq 'text') {
                        $text = "$text $_";
                } elsif($last eq 'label') {
                        $label = "$label $_";
                } elsif($last eq 'phone') {
                        $phone = "$phone $_";
                } else {
                        # Diagnostics go to STDERR, newline-terminated, so
                        # they cannot run into the rows written to STDOUT.
                        print STDERR "DAFUQ: $_\n";
                }
        }
}

# Records are only flushed when the next header arrives, so the final one
# has to be written out explicitly once the input is exhausted.
if ($smp ne '') {
        $text = clean_up($text);
        $phone = clean_up($phone);
        $label = clean_up($label);
        print "$smp\t$text\t$phone\t$label\n";
}
```

In [29]:
from pathlib import Path
import re

# Path object for the corpus root; the parsing loop below globs it for .mix files.
WAXHOLM_PATH = Path(WAXHOLM_DIR)

def skippable(text):
    """Return True when a .mix line should be ignored by the parser.

    A line is ignored when it begins with one of the known metadata
    prefixes, when it begins with "Corrected" after surrounding whitespace
    is removed, or when it contains nothing but whitespace.
    """
    ignored_prefixes = (
        "CT",
        "CORRECTED:",
        "AUTOLABEL:",
        "DATA BANK MATERIAL:",
        "WIZARD:",
        "SPEAKER",
        "Digital recording",
        "FR",
        "S{g f|ljande mening:",
        "Correction",
    )
    # These prefixes are matched against the untouched line, so indented
    # occurrences are deliberately not caught here.
    if text.startswith(ignored_prefixes):
        return True
    trimmed = text.strip()
    return trimmed == "" or trimmed.startswith("Corrected")


def fix_text(text: str) -> str:
    """Clean one transcript field.

    Strips surrounding whitespace, replaces the ASCII stand-ins
    ``{ } | \\ [ ]`` with ``ä å ö Ö Ä Å``, collapses runs of whitespace
    into a single space and removes one trailing full stop.

    Args:
        text: Raw field content.

    Returns:
        The cleaned string; ``""`` for empty or whitespace-only input.
    """
    text = text.strip()
    # Check after stripping: whitespace-only input used to get past the
    # emptiness test and then fail when the last character was indexed.
    if not text:
        return ""
    replacements = str.maketrans("{}|\\[]", "äåöÖÄÅ")
    translated = text.translate(replacements)
    # Raw string so that \s reaches the regex engine as written.
    spaced = re.sub(r"\s+", " ", translated)
    if spaced.endswith("."):
        spaced = spaced[:-1]
    return spaced.strip()


# Parse every .mix transcript into a dict with the keys "smp" (when a
# header line was found), "text", "phoneme" and "labels", and collect the
# dicts in `mixfiles`.
SCENES = "/scenes/"

mixfiles = []
# Sorted so the record order does not depend on the filesystem.
for mixfile in sorted(WAXHOLM_PATH.glob("*/*.mix")):
    current = {}
    # Name of the field seen last; continuation lines are appended to it.
    last = ''
    with open(mixfile) as infile:
        for line in infile:
            if skippable(line):
                continue
            elif line.startswith("Waxholm dialog."):
                # Keep everything after "/scenes/" as the sample identifier.
                scene_start = line.find(SCENES)
                smp = line[scene_start+len(SCENES):].strip()
                current["smp"] = smp
            elif line.startswith("PHONEME:"):
                # Slice by the prefix length; the previous fixed offset of 9
                # dropped the first character when no space followed the colon.
                current["phoneme"] = line[len("PHONEME:"):].strip()
                last = "phoneme"
            elif line.startswith("Labels:"):
                current["labels"] = line[len("Labels:"):].strip()
                last = "labels"
            elif line.startswith("TEXT:"):
                current["text"] = line[len("TEXT:"):].strip()
                last = "text"
            # Special case for one sample: a line starting with "J'A:+" is
            # taken as its phoneme string. `.get` avoids a KeyError when no
            # header line has been seen yet.
            elif current.get("smp") == "fp2015/fp2015.1.03.smp" and line.startswith("J'A:+"):
                current["phoneme"] = line.strip()
            elif last == "":
                # No field has been opened yet, so there is nothing to append
                # to: report the stray line and move on instead of failing.
                print(current.get("smp", mixfile), line)
            else:
                current[last] = " ".join([current[last], line.strip()]).strip()
    # Clean each field on its own; absent fields become empty strings so
    # every record has the same keys.
    for key in ["text", "phoneme", "labels"]:
        current[key] = fix_text(current.get(key, ""))
    mixfiles.append(current)

In [30]:
import json

# Serialise the parsed records as one JSON document.
with open("/tmp/waxholm_raw_lexicon.json", "w") as lexicon_file:
    lexicon_file.write(json.dumps(mixfiles))