In [1]:
# License: Apache 2.0
# Corpus landing page and the two archives (audio, manual transcripts)
# that the download cells of this notebook fetch.
_URL = 'https://nlp.kemt.fei.tuke.sk/tedx'
_AUDIO = 'https://nlp.kemt.fei.tuke.sk/static/files/_TEDxSK/training_set/training_set.zip'
_TEXT = 'https://nlp.kemt.fei.tuke.sk/static/files/_TEDxSK/manual_transcriptions_v1/manual_transcriptions_v1.zip'

# BibTeX entries for the corpus.
# This must be a raw string: in a normal literal Python turns "\v" into a
# vertical-tab character and "\'" into a bare quote, which destroys the
# accent commands in the author names. A raw string also keeps a trailing
# backslash verbatim, so the text starts right after the opening quotes
# instead of relying on a backslash line continuation.
_CITATION = r"""@article{stavs2017tedxsk,
  title={TEDxSK and JumpSK: A new Slovak speech recognition dedicated corpus},
  author={Sta{\v{s}}, J{\'a}n and Hl{\'a}dek, Daniel and Viszlay, Peter and Koct{\'u}r, Tom{\'a}{\v{s}}},
  journal={Journal of Linguistics/Jazykovedn{\`y} \v{c}asopis},
  volume={68},
  number={2},
  pages={346--354},
  year={2017},
  publisher={Sciendo}
}
@inproceedings{stas_automaticka_2016,
  address = {Smolenice, Slovakia},
  title = {Automatick{\'a} anot{\'a}cia a tvorba re\v{c}ov{\'e}ho korpusu predn{\'a}\v{s}ok {TEDxSK} a {JumpSK}},
  isbn = {978-80-227-4619-9},
  url = {https://wikt-daz2016.fiit.stuba.sk/wp-content/uploads/2016/11/WIKT-DaZ-2016_Proceedings.pdf},
  language = {Slovakian},
  booktitle = {Proc. of 11th {Workshop} on {Intelligent} and {Knowledge} {Oriented} {Technologies}},
  author = {Sta{\v{s}}, J{\'a}n and Koct{\'u}r, Tom{\'a}{\v{s}} and Viszlay, Peter},
  year = {2016},
  pages = {127--132},
}
"""

In [None]:
!wget https://nlp.kemt.fei.tuke.sk/static/files/_TEDxSK/training_set/training_set.zip -O /workspace/data/slovakian/training_set.zip
!mkdir -p /workspace/data/slovakian/training
!unzip /workspace/data/slovakian/training_set.zip -d /workspace/data/slovakian/training

In [None]:
!wget https://nlp.kemt.fei.tuke.sk/static/files/_TEDxSK/manual_transcriptions_v1/manual_transcriptions_v1.zip -O /workspace/data/slovakian/transcript.zip
!mkdir -p /workspace/data/slovakian/transcript
!unzip /workspace/data/slovakian/transcript.zip -d /workspace/data/slovakian/transcript

In [56]:
import re
def split_pronounced(text):
    """Cut ``text`` at its first '*/', '/*' or '/' separator.

    The separators are tried in that order. For the first one present,
    everything from its first occurrence onward is dropped and underscores
    in the remaining head are turned into spaces. Text without any of the
    separators is returned untouched (underscores included).
    """
    for separator in ('*/', '/*', '/'):
        if separator in text:
            head = text.partition(separator)[0]
            return head.replace('_', ' ')
    return text
# might be useful for other ASR; '[i]' is 'inhalation'; '[ex]', exhalation, etc.
# see: http://trans.sourceforge.net/en/transguidFR.php
# for now, only keeping 'mm'
def do_bracket(tok):
    """Handle a square-bracketed token.

    '[mm]' is kept as the word 'mm'. Any other token longer than two
    characters that starts with '[' and ends with ']' is replaced by an
    empty string. Everything else is returned unchanged.
    """
    if tok == '[mm]':
        return 'mm'
    is_bracketed = len(tok) > 2 and tok.startswith('[') and tok.endswith(']')
    return '' if is_bracketed else tok
def do_carets(tok):
    """Drop a leading '^^' or '^' from ``tok`` and pass the rest to split_pronounced.

    At most two carets are removed; a token without a leading caret is
    forwarded as is.
    """
    if tok.startswith('^^'):
        skip = 2
    elif tok.startswith('^'):
        skip = 1
    else:
        skip = 0
    return split_pronounced(tok[skip:])
def unknown_mark(tok):
    """Return ``tok`` with every '%@' sequence removed.

    A token that is exactly '%@' therefore becomes the empty string, and a
    token without the sequence comes back unchanged.
    """
    return tok.replace('%@', '')
def do_parens(tok):
    """Unwrap a '((...))' token, then hand the result to do_carets.

    Only a token that both starts with '((' and ends with '))' loses those
    four characters; any other token is forwarded unchanged.
    """
    wrapped = tok.startswith('((') and tok.endswith('))')
    inner = tok[2:-2] if wrapped else tok
    return do_carets(inner)
def clean_text(text):
    """Turn the transcript tokens of one segment into a plain sentence.

    Args:
        text: iterable of str, the whitespace-split transcript words.

    Returns:
        A lower-cased string in which the annotation markers handled by
        unknown_mark, do_bracket and do_parens have been resolved, the
        characters ``. ! ? , % @ : ;`` and line breaks are removed, and words
        are separated by exactly one space (no leading/trailing space).
    """
    joined = ' '.join(text)
    # Re-attach bracket/paren/slash markers that were split off by spaces so
    # each annotated unit ends up as a single token.
    joined = joined.replace('[ ', '[').replace(' ]', ']')
    joined = joined.replace('(( ', ' ((').replace(' ))', ')) ')
    # make sure there's a space before these
    joined = joined.replace('((', ' ((').replace('))', ')) ')
    joined = joined.replace('/ ', '/').replace(' /', '/')
    out = list()
    for tok in joined.split():
        tok = unknown_mark(tok)
        tok = do_bracket(tok)
        tok = do_parens(tok)
        # Strip a single leading '@' or '~' marker, but keep a lone character.
        if len(tok) > 1 and tok[0] in ['@', '~']:
            tok = tok[1:]
        out.append(tok)
    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences (\. \! \?), which newer Python versions warn about. The
    # character class itself is unchanged.
    cleaned = re.sub(r'[.!?,%@\n\r:;]', '', ' '.join(out))
    # Tokens that were dropped (e.g. noise tags) and punctuation-only tokens
    # leave empty slots behind; split/join collapses the resulting runs of
    # spaces and also trims both ends.
    return ' '.join(cleaned.lower().split())
def is_unclear(text):
    """Tell whether ``text`` contains one of the '((xx...' markers.

    Matches '((xx))' through '((xxxxx))' exactly, plus anything containing
    '((xxxxxx' (six or more x's, closing parentheses not required).
    """
    markers = ('((xx))', '((xxx))', '((xxxx))', '((xxxxx))', '((xxxxxx')
    return any(marker in text for marker in markers)
def has_control_char(text):
    """Tell whether ``text`` still contains any annotation character.

    The characters checked are: ( ) @ ^ [ ] ´ * ~ _
    """
    leftovers = ('(', ')', '@', '^', '[', ']', '´', '*', '~', '_')
    return any(mark in text for mark in leftovers)

In [57]:
import glob
# Parse every .stm transcript into a list of segment dicts with the keys
# 'source_wav', 'speaker_id', 'start', 'end', optionally 'gender', 'text', 'id'.
items = list()
# sorted(): glob returns matches in arbitrary order; sorting keeps the order
# of `items` (and therefore of the saved dataset) the same on every run.
# The loop variable is deliberately not called `path`: a later cell stores the
# audio directory in `path`, and re-running this cell must not overwrite it.
for stm_path in sorted(glob.glob('/workspace/data/slovakian/transcript/*.stm')):
    # The context manager closes each transcript once it has been read.
    # NOTE(review): no encoding is passed, so the platform default applies --
    # confirm it matches the encoding of the .stm files.
    with open(stm_path, 'r') as stm_file:
        for line in stm_file:
            line = line.rstrip()
            # Lines starting with ';' are skipped.
            if line.startswith(';'):
                continue
            # Fields used below: [0] source wav name, [2] speaker id,
            # [3] start, [4] end, [5] '<...>' label, [6:] transcript words.
            tmp = line.split(' ')
            if len(tmp) < 6:
                continue
            if tmp[2] in ['excluded_region', 'inter_segment_gap']:
                continue
            item = dict()
            item['source_wav'] = tmp[0]
            item['speaker_id'] = tmp[2]
            item['start'] = tmp[3]
            item['end'] = tmp[4]
            ctrl = tmp[5]
            # startswith/endswith and the length check avoid an IndexError on
            # an empty field or on a label with fewer than three entries.
            # NOTE(review): when field 5 is not a '<...>' label it is still
            # left out of the text -- confirm the corpus always has a label.
            if ctrl.startswith('<') and ctrl.endswith('>'):
                ctrl_inner = ctrl[1:-1].split(',')
                if len(ctrl_inner) > 2 and ctrl_inner[2] in ['male', 'female']:
                    item['gender'] = ctrl_inner[2]
            raw_text = ' '.join(tmp[6:])
            # Skip segments marked as unclear and segments without any text.
            if is_unclear(raw_text):
                continue
            if len(raw_text) == 0:
                continue
            item['text'] = clean_text(tmp[6:])
            # there's more processing that could be done to recover more
            # but for now, skip the remaining noise
            if has_control_char(item['text']):
                continue
            item['id'] = '{}_{}_{}_{}'.format(tmp[2], tmp[0], tmp[3], tmp[4])
            if item['text'] != '':
                items.append(item)

In [65]:
import shlex

# Directory holding the unpacked source recordings; the per-segment clips are
# written next to them. Later cells read `path` to locate the clips.
path = '/workspace/data/slovakian/training'
# Generate a shell script with one ffmpeg call per segment: seek to the
# segment start (-ss), copy `dur` seconds (-t), overwrite existing output (-y).
with open('write-wavs.sh', 'w') as gensh:
    gensh.write('#!/bin/sh\n')
    for item in items:
        dur = float(item['end']) - float(item['start'])
        inwav = '{}/{}.wav'.format(path, item['source_wav'])
        outwav = '{}/{}.wav'.format(path, item['id'])
        # The values come from the transcript files, so they are shell-quoted
        # before being placed on a command line. shlex.quote leaves strings
        # made only of safe characters unchanged.
        gensh.write('ffmpeg -y -ss {} -t {} -i {} {}\n'.format(
            shlex.quote(item['start']), dur,
            shlex.quote(inwav), shlex.quote(outwav)))

In [None]:
# Run the script generated by the previous cell (one ffmpeg call per segment).
!sh write-wavs.sh

In [71]:
# Two parallel columns, one entry per kept segment:
# the cleaned transcript ...
sentences = [entry['text'] for entry in items]
# ... and the location of the matching clip inside `path`.
paths = ['%s/%s.wav' % (path, entry['id']) for entry in items]

In [72]:
# Column-name -> values mapping used to build the dataset.
datain = dict(sentence=sentences, path=paths)

In [73]:
# Build a `datasets` Dataset from the column dict ('sentence' and 'path').
from datasets import Dataset
dataset = Dataset.from_dict(datain)

In [74]:
# Write the dataset to disk next to the downloaded corpus files.
dataset.save_to_disk('/workspace/data/slovakian/tedxsk')