# Split into train/test/valid

In [1]:
# Validation manifest. The first line is the audio root directory; every
# following row is "<id>.wav<TAB><second column of frames.tsv>".
!echo /kaggle/input/ljspeech-for-asr/wav16/ > valid.tsv
# Rows are the last 1310 / 2 = 655 lines of frames.tsv, i.e. the second half of
# its final 1310 lines. awk appends ".wav" to column 1 and copies column 2.
!cat ../input/ljspeech-for-asr/frames.tsv | tail -n $((1310)) | tail -n $((1310 / 2)) |awk -F'\t' '{print $1 ".wav\t" $2}' >> valid.tsv

In [2]:
# Test manifest. The first line is the audio root directory; every following
# row is "<id>.wav<TAB><second column of frames.tsv>".
!echo /kaggle/input/ljspeech-for-asr/wav16/ > test.tsv
# Rows are the first 1310 / 2 = 655 of the final 1310 lines of frames.tsv.
# awk appends ".wav" to column 1 and copies column 2.
!cat ../input/ljspeech-for-asr/frames.tsv | tail -n $((1310)) | head -n $((1310 / 2)) |awk -F'\t' '{print $1 ".wav\t" $2}' >> test.tsv

In [3]:
# Training manifest. The first line is the audio root directory; every
# following row is "<id>.wav<TAB><second column of frames.tsv>".
!echo /kaggle/input/ljspeech-for-asr/wav16/ > train.tsv
# Rows are the first 13100 - 1310 = 11790 lines of frames.tsv.
# NOTE(review): train (head) and test/valid (tail of 1310) cover every row
# exactly once only if frames.tsv has exactly 13100 lines and no header row;
# TODO confirm against the dataset.
!cat ../input/ljspeech-for-asr/frames.tsv | head -n $((13100 - 1310)) |awk -F'\t' '{print $1 ".wav\t" $2}' >> train.tsv

# Load frame lengths

In [4]:
# Map utterance id -> frame count for every data row of the training manifest.
train_frames = {}

with open("train.tsv") as manifest:
    # Iterate the file lazily; there is no need to materialise it with readlines().
    for row in manifest:
        # The first line of train.tsv holds only the audio root directory and
        # therefore has no tab; skip it and any other tab-less line.
        if "\t" not in row:
            continue
        fields = row.strip().split("\t")
        assert len(fields) == 2, f"malformed row in train.tsv: {row!r}"
        # Named `utt_id` rather than `id` so the builtin id() is not rebound in
        # the notebook's global namespace.
        utt_id = fields[0].replace(".wav", "")
        train_frames[utt_id] = int(fields[1])

In [5]:
# Target sizes of the training subsets written by the split cells.
MINS = list(range(5, 125, 5))    # 5, 10, ..., 120 -> "<N>mins.tsv"
HOURS = list(range(2, 18, 2))    # 2, 4, ..., 16  -> "<N>hrs.tsv"

In [6]:
# Audio root directory, written as the first line of every duration-split
# manifest ("<N>mins.tsv" / "<N>hrs.tsv"). Presumably the "<id>.wav" rows that
# follow are resolved relative to it by the consumer of the manifest.
WAVDIR = "/kaggle/input/ljspeech-for-asr/wav16"

# Minute splits

In [7]:
# Write one "<N>mins.tsv" manifest per entry of MINS. Each manifest holds as
# close to N minutes of training audio as the selection below can reach,
# without ever exceeding it.
for minutes in MINS:
    # Remaining frame budget for this split: 60 s per minute x 16000
    # (presumably the sample rate of the wav16 audio).
    budget = minutes * 60 * 16000
    # (utterance id, frame count) pairs in train_frames insertion order.
    pool = list(train_frames.items())

    with open(f"{minutes}mins.tsv", "w") as manifest:
        # The first line of a manifest is the audio root directory.
        manifest.write(f"{WAVDIR}\n")

        # Take utterances in order for as long as the next one still fits.
        # Walking an index (instead of list.pop(0)) avoids the O(n) shift per
        # utterance and cannot raise if the pool runs out before the budget.
        taken = 0
        while taken < len(pool) and pool[taken][1] <= budget:
            utt_id, n_frames = pool[taken]
            manifest.write(f"{utt_id}.wav\t{n_frames}\n")
            budget -= n_frames
            taken += 1

        # Top up with the single longest unused utterance that fits into what
        # is left of the budget (the first one wins on ties). This is an
        # explicit loop rather than max(...) so that it still works if `max`
        # has been rebound to a number elsewhere in the kernel.
        filler_id, filler_frames = None, 0
        for utt_id, n_frames in pool[taken:]:
            if filler_frames < n_frames <= budget:
                filler_id, filler_frames = utt_id, n_frames

        # Only emit a top-up row when one was found; a row with an empty id
        # would have no transcript to pair with.
        if filler_id is not None:
            manifest.write(f"{filler_id}.wav\t{filler_frames}\n")

# Hour splits

In [8]:
# Write one "<N>hrs.tsv" manifest per entry of HOURS. Each manifest holds as
# close to N hours of training audio as the selection below can reach, without
# ever exceeding it.
for hours in HOURS:
    # Remaining frame budget for this split: 60 min x 60 s x 16000
    # (presumably the sample rate of the wav16 audio).
    budget = hours * 60 * 60 * 16000
    # (utterance id, frame count) pairs in train_frames insertion order.
    pool = list(train_frames.items())

    with open(f"{hours}hrs.tsv", "w") as manifest:
        # The first line of a manifest is the audio root directory.
        manifest.write(f"{WAVDIR}\n")

        # Take utterances in order for as long as the next one still fits.
        # Walking an index (instead of list.pop(0)) avoids the O(n) shift per
        # utterance and cannot raise if the pool runs out before the budget.
        taken = 0
        while taken < len(pool) and pool[taken][1] <= budget:
            utt_id, n_frames = pool[taken]
            manifest.write(f"{utt_id}.wav\t{n_frames}\n")
            budget -= n_frames
            taken += 1

        # Top up with the single longest unused utterance that fits into what
        # is left of the budget (the first one wins on ties). This is an
        # explicit loop rather than max(...) so that it still works if `max`
        # has been rebound to a number elsewhere in the kernel.
        filler_id, filler_frames = None, 0
        for utt_id, n_frames in pool[taken:]:
            if filler_frames < n_frames <= budget:
                filler_id, filler_frames = utt_id, n_frames

        # Only emit a top-up row when one was found; a row with an empty id
        # would have no transcript to pair with.
        if filler_id is not None:
            manifest.write(f"{filler_id}.wav\t{filler_frames}\n")

# Generate `ltr` files

In [9]:
def fairseqify(text):
    """Convert a transcript into the letter-level format of the `ltr` files.

    The characters of each word are separated by single spaces and every word,
    including the last one, is followed by a ``|`` marker, e.g.
    ``"hi there"`` -> ``"h i | t h e r e |"``.

    Args:
        text: Transcript string. Leading and trailing whitespace is ignored
            and any run of whitespace between words counts as one separator.

    Returns:
        The letter-level string. Blank input yields ``" |"``.
    """
    # split() without an argument collapses whitespace runs of any length, so
    # three or more consecutive spaces can no longer produce empty "words"
    # (and with them stray "|" markers) the way a single replace("  ", " ") did.
    words = text.split()
    return " | ".join(" ".join(word) for word in words) + " |"

In [10]:
# Map utterance id -> letter-level transcript for every row of transcripts.tsv.
transcripts = {}
with open("../input/ljspeech-for-asr/transcripts.tsv") as transcript_file:
    # Iterate the file lazily; there is no need to materialise it with readlines().
    for raw in transcript_file:
        entry = raw.strip()
        # Skip lines without a tab (e.g. blank lines). This has to be
        # `continue`: with `pass` such a line falls through to the assert
        # below and aborts the whole load.
        if "\t" not in entry:
            continue
        fields = entry.split("\t")
        assert len(fields) == 2, f"malformed row in transcripts.tsv: {entry!r}"
        transcripts[fields[0]] = fairseqify(fields[1])

In [11]:
import glob

# For every "<name>.tsv" manifest in the working directory write a matching
# "<name>.ltr" holding one transcript line per manifest row, in the same order.
for tsv_path in glob.glob("*.tsv"):
    # Swap only the trailing extension; str.replace would also rewrite a
    # ".tsv" that happens to occur earlier in the file name.
    ltr_path = tsv_path[: -len(".tsv")] + ".ltr"
    with open(tsv_path) as manifest, open(ltr_path, "w") as letters:
        rows = iter(manifest)
        # The first line is the audio root directory, not an utterance.
        next(rows, None)
        for row in rows:
            # Each row must be exactly "<id>.wav<TAB><frames>"; anything else
            # raises ValueError on unpacking.
            wav_name, _frame_count = row.split("\t")
            # Named `utt_id` rather than `id` so the builtin id() is not
            # rebound in the notebook's global namespace.
            utt_id = wav_name.replace(".wav", "")
            if utt_id not in transcripts:
                # Still a KeyError, but now saying which manifest is affected.
                raise KeyError(f"{tsv_path}: no transcript for utterance {utt_id!r}")
            letters.write(f"{transcripts[utt_id]}\n")

# Tidy up

In [12]:
# Give every duration split its own directory: for each "<N>mins.tsv", create
# "<N>mins/", move "<N>mins.tsv" and "<N>mins.ltr" into it, and copy the shared
# test.* and valid.* files alongside them.
# NOTE(review): IPython can substitute "$name" in "!" lines with a same-named
# Python variable; these loops presumably rely on `i` and `b` not being
# defined in the kernel so that the shell sees them; confirm.
!for i in *mins.tsv;do b=$(basename $i ".tsv");mkdir $b; mv $b.tsv $b.ltr $b/; cp test.* valid.* $b/;done
# Same layout for the "<N>hrs" splits.
!for i in *hrs.tsv;do b=$(basename $i ".tsv");mkdir $b; mv $b.tsv $b.ltr $b/; cp test.* valid.* $b/;done