In [1]:
import soundfile as sf
import wave

In [2]:
def smp_headers(filename: str):
    """Parse the text header of an SMP file into a dictionary.

    The first 1024 bytes of the file are read, trailing NUL padding is
    removed, and the rest is decoded as ASCII and split on CRLF. Header
    lines have the form ``key=value``; a line consisting of a single ``=``
    terminates the header and everything after it is ignored.

    Args:
        filename: Path of the SMP file to inspect.

    Returns:
        dict mapping each header key to its value (both ``str``). If a key
        occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the header block contains non-ASCII bytes.
    """
    with open(filename, "rb") as f:
        raw_headers = f.read(1024).rstrip(b"\x00")
    lines = raw_headers.decode("ascii").split("\r\n")

    # Locate the terminator explicitly. The previous backward search never
    # ran (its loop condition was always false), so the code blindly dropped
    # the last two lines, which is only right when "=" is followed by exactly
    # one trailing fragment.
    if "=" in lines:
        lines = lines[:lines.index("=")]

    headers = {}
    for line in lines:
        # partition() splits on the first "=" only, so values may themselves
        # contain "="; lines without any "=" carry no key and are skipped.
        key, sep, value = line.partition("=")
        if sep:
            headers[key] = value
    return headers


def smp_read_sf(filename: str):
    """Read the samples of an SMP file with ``soundfile``.

    The header is parsed with ``smp_headers`` to obtain the byte order
    (``msb``) and channel count (``nchans``); the payload after the
    1024-byte header is then read as headerless 16-bit PCM.

    Args:
        filename: Path of the SMP file to read.

    Returns:
        Tuple ``(data, sr)`` as returned by ``soundfile.read``.

    Raises:
        KeyError: If the header lacks ``msb`` or ``nchans``.

    Note:
        The sample rate is fixed at 16000 and is not taken from the header.
    """
    headers = smp_headers(filename)
    # A header value of "last" selects little-endian; anything else big-endian.
    endian = "LITTLE" if headers["msb"] == "last" else "BIG"
    nchans = int(headers["nchans"])

    # soundfile counts `start` in frames, not bytes. One frame of 16-bit PCM
    # is 2 * nchans bytes, so the 1024-byte header is 512 frames only for
    # mono; derive the offset from the channel count instead of hard-coding it.
    # NOTE(review): exact only when 1024 is a multiple of the frame size
    # (1, 2, 4, ... channels).
    header_bytes = 1024
    start_frame = header_bytes // (2 * nchans)

    data, sr = sf.read(filename, channels=nchans,
                       samplerate=16000, endian=endian, start=start_frame,
                       dtype="int16", format="RAW", subtype="PCM_16")
    return (data, sr)


def write_wav(filename, arr, nchannels=1, sampwidth=2, framerate=16000):
    """Write raw PCM frames to a WAV file.

    Args:
        filename: Destination passed to ``wave.open``.
        arr: Bytes-like object holding the frames to write.
        nchannels: Number of channels; defaults to 1.
        sampwidth: Sample width in bytes; defaults to 2.
        framerate: Sampling rate in Hz; defaults to 16000.

    The defaults reproduce the previously hard-coded format, so existing
    two-argument calls behave exactly as before.
    """
    with wave.open(filename, "w") as f:
        f.setnchannels(nchannels)
        f.setsampwidth(sampwidth)
        f.setframerate(framerate)
        f.writeframes(arr)

In [3]:
from pathlib import Path

In [6]:
# WAXHOLM is the root directory the notebook reads its input from
# ("scenes_formatted", "alloktrainfiles" and "testfiles" are resolved
# against it); OUTPUT is the directory the generated files go to.
# NOTE(review): these are absolute, machine-specific paths; adjust them
# before running the notebook anywhere else.
WAXHOLM = "/Users/joregan/Playing/waxholm"
OUTPUT = "/Users/joregan/Playing/waxholm_fairseq"

In [7]:
# Directory that is globbed for the input recordings.
SCENES_PATH = Path(WAXHOLM) / "scenes_formatted"
# Directory that receives the generated files.
OUTPUT_PATH = Path(OUTPUT)
# parents=True also creates missing intermediate directories, and
# exist_ok=True makes the cell safe to re-run. A non-directory at this path
# still raises FileExistsError, as it did before.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [12]:
# Load the two file lists; each line of a list file becomes one entry with
# surrounding whitespace (including the newline) removed.
with open(Path(WAXHOLM) / "alloktrainfiles") as trainf:
    TRAIN_FILES = [entry.strip() for entry in trainf]
with open(Path(WAXHOLM) / "testfiles") as testf:
    TEST_FILES = [entry.strip() for entry in testf]

In [14]:
# Sanity check: number of entries in the train and test lists.
print(f"{len(TRAIN_FILES)} {len(TEST_FILES)}")

1835 327


In [20]:
import re

def get_labels(mixfile, encoding=None):
    """Extract the label string from a ``.mix`` file.

    Scanning starts at the first line that begins with ``labels:`` (matched
    case-insensitively). The remainder of that line and every following
    line are collected until a line starting with ``FR`` is reached. The
    collected pieces are joined with single spaces, runs of spaces are
    collapsed, and leading/trailing whitespace is removed.

    Args:
        mixfile: Path of the ``.mix`` file.
        encoding: Text encoding passed to ``open``; ``None`` (the default)
            keeps the platform default, as before.

    Returns:
        The label string, or ``""`` if the file has no ``labels:`` line.
    """
    parts = []
    saw_label = False
    with open(mixfile, encoding=encoding) as infile:
        # Iterate lazily; reading stops at the first "FR" line after the labels.
        for line in infile:
            if not saw_label:
                if line.lower().startswith("labels:"):
                    saw_label = True
                    # Skip the 7-character "labels:" prefix.
                    parts.append(line[7:].strip())
            elif line.startswith("FR"):
                break
            else:
                parts.append(line.strip())
    # The final strip() removes the stray space that an empty "Labels:" line
    # or a blank continuation line used to leave at either end.
    return re.sub("  +", " ", " ".join(parts)).strip()

In [21]:
# Spot-check the label parser on one file. The path is built from
# SCENES_PATH so it follows the configured corpus root instead of repeating
# the absolute location.
get_labels(SCENES_PATH / "fp2043" / "fp2043.16.03.smp.mix")

'A:H\'A: pa p: |h J\'A:Ggv V\'ILv pap: sm p:v S\'E: pa H\'U:R 2Dd\'EM Bb\']:TtE0NG Gg\']:R 2Tt\'I STt"A:VE0#STtR`\\M p: \']: p: \']M J\'A: Kk\'AN F"O#2S`[TtA Tt\'I F"IN#H`AM .'

In [10]:
# Open the four output files (train/test .tsv and .ltr) in one `with` so
# they are all closed when the loop ends. Mode "w" truncates existing files.
# NOTE(review): this cell looks unfinished. The loop derives file names but
# never writes to any of the four handles or to `outwav`, so running it as-is
# leaves four empty files. The "OK ..." lines recorded as this cell's output
# are not produced by the code shown here.
with open(OUTPUT_PATH / "train.tsv", "w") as train_tsv,\
     open(OUTPUT_PATH / "train.ltr", "w") as train_ltr,\
     open(OUTPUT_PATH / "test.tsv", "w") as test_tsv,\
     open(OUTPUT_PATH / "test.ltr", "w") as test_ltr:
    for smpfile in SCENES_PATH.glob("fp*/*.smp"):
        # The companion file is the full .smp path with ".mix" appended;
        # recordings without one are skipped.
        mixfile = f"{smpfile}.mix"
        if not Path(mixfile).exists():
            continue
        # .stem drops only the final ".smp" suffix.
        stem = smpfile.stem
        # Target WAV path, placed directly in OUTPUT_PATH.
        outwav = str(OUTPUT_PATH / f"{stem}.wav")
        

OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.2.02.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.2.03.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.2.01.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.pr.09.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.pr.08.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.2.00.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.1.03.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.1.02.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.1.00.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.1.01.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.5.00.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2024.pr.11.smp.mix
OK /Users/joregan/Playing/waxholm/scenes_formatted/fp2024/fp2