In [1]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/audio-iium.zip
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/collections/shuffled-iium.json
# !unzip audio-iium.zip -d iium

In [2]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/audio-wattpad.zip
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/collections/transcript-wattpad.json
# !unzip audio-wattpad.zip -d wattpad

In [3]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/text-audiobook.tar.gz
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/testset-audiobook.tar.gz
# !tar -zxf text-audiobook.tar.gz
# !tar -xf testset-audiobook.tar.gz

In [1]:
import os
from glob import glob
from tqdm import tqdm
import json

In [2]:
import unicodedata
import re
import itertools

# Characters kept in transcripts: space, lowercase a-z and the digits 0-9.
vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    """Normalise a transcript to the character set listed in `vocabs`.

    Steps, in order:
      1. lowercase the text and apply Unicode NFC normalisation;
      2. replace every character that is not in `vocabs` with a space;
      3. collapse runs of spaces into one and strip both ends;
      4. cap every run of one repeated character at two characters.

    Step 4 applies to digits as well, e.g. '1000' becomes '100'.

    Parameters
    ----------
    string : str
        Raw transcript.

    Returns
    -------
    str
        The cleaned transcript.
    """
    normalised = unicodedata.normalize('NFC', string.lower())

    kept = []
    for char in normalised:
        kept.append(char if char in vocabs else ' ')

    squeezed = re.sub(r'[ ]+', ' ', ''.join(kept)).strip()

    pieces = []
    for _, run in itertools.groupby(squeezed):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

In [3]:
# Root folder holding the locally stored corpora read by the cells below.
# The original machine-specific path remains the default; set the
# MALAYA_SPEECH_BASE_DIR environment variable to run the notebook elsewhere
# without editing this cell.
base_directory = os.environ.get(
    'MALAYA_SPEECH_BASE_DIR',
    '/c/Users/liana/Documents/Projects/malaya-speech/pretrained-model',
)

In [7]:
# Wattpad set: every clip is used. The stem of each wav file name is parsed as
# an integer and used to index into the transcripts loaded from
# transcript-wattpad.json.
wattpad = []
wavs = glob('wattpad/audio-wattpad/*wav')

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open('transcript-wattpad.json', encoding = 'utf-8') as fopen:
    transcript = json.load(fopen)

for i in tqdm(wavs):
    # os.path.basename copes with whichever path separator glob returned,
    # unlike splitting on a literal '/'.
    index = os.path.basename(i).replace('.wav','')
    text = transcript[int(index)]
    wattpad.append((i, text))

100%|██████████| 146/146 [00:00<00:00, 99604.49it/s]


In [8]:
# IIUM set: every clip is used. The stem of each wav file name is parsed as an
# integer and used to index into the transcripts loaded from
# shuffled-iium.json.
iium = []
wavs = glob('iium/audio-iium/*wav')

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open('shuffled-iium.json', encoding = 'utf-8') as fopen:
    transcript = json.load(fopen)

for i in tqdm(wavs):
    # os.path.basename copes with whichever path separator glob returned,
    # unlike splitting on a literal '/'.
    index = os.path.basename(i).replace('.wav','')
    text = transcript[int(index)]
    iium.append((i, text))

100%|██████████| 97/97 [00:00<00:00, 141956.56it/s]


In [9]:
# Collect the wav files of the three word-level recording folders. The cells
# further down build each transcript from the file name.
# NOTE(review): glob() does not sort its results, so the "last 5%" slices taken
# from these lists below depend on directory listing order - confirm this
# matches the split used for training.
husein = glob(f'{base_directory}/sebut-perkataan-man/*.wav')
mas = glob(f'{base_directory}/sebut-perkataan-woman/*.wav')
khalil = glob(f'{base_directory}/tolong-sebut/*.wav')

# Show how many files each folder contributed.
tuple(map(len, (khalil, mas, husein)))

(565, 200, 698)

In [10]:
# Test portion of the 'tolong-sebut' recordings: the last 5% of `khalil`.
# The transcript is 'tolong sebut ' followed by the file name without '.wav'.
khalils = []
n_test = int(len(khalil) * 0.05)
# Slice from an explicit start index: `khalil[-n_test:]` would return the WHOLE
# list when n_test == 0 (fewer than 20 files), because -0 == 0.
for i in tqdm(khalil[len(khalil) - n_test:]):
    # os.path.basename copes with whichever path separator glob returned.
    t = os.path.basename(i).replace('.wav','')
    text = f'tolong sebut {t}'
    khalils.append((i, text))

100%|██████████| 28/28 [00:00<00:00, 31714.96it/s]


In [11]:
# Test portion of the 'sebut-perkataan-woman' recordings: the last 5% of `mas`.
# The transcript is 'sebut perkataan ' followed by the file name without '.wav'.
mass = []
n_test = int(len(mas) * 0.05)
# Slice from an explicit start index: `mas[-n_test:]` would return the WHOLE
# list when n_test == 0 (fewer than 20 files), because -0 == 0.
for i in tqdm(mas[len(mas) - n_test:]):
    # os.path.basename copes with whichever path separator glob returned.
    t = os.path.basename(i).replace('.wav','')
    text = f'sebut perkataan {t}'
    mass.append((i, text))

100%|██████████| 10/10 [00:00<00:00, 12606.87it/s]


In [12]:
# Test portion of the 'sebut-perkataan-man' recordings: the last 5% of `husein`.
# The transcript is 'sebut perkataan ' followed by the file name without '.wav'.
huseins = []
n_test = int(len(husein) * 0.05)
# Slice from an explicit start index: `husein[-n_test:]` would return the WHOLE
# list when n_test == 0 (fewer than 20 files), because -0 == 0.
for i in tqdm(husein[len(husein) - n_test:]):
    # os.path.basename copes with whichever path separator glob returned.
    t = os.path.basename(i).replace('.wav','')
    text = f'sebut perkataan {t}'
    huseins.append((i, text))

100%|██████████| 34/34 [00:00<00:00, 104092.22it/s]


In [13]:
# Test portion of the 'streaming' recordings: the last 5% of the globbed files.
# The transcript of each clip is its file name without '.wav'.
wikipedia = []
wavs = glob(f'{base_directory}/streaming/*wav')
n_test = int(len(wavs) * 0.05)
# Slice from an explicit start index: `wavs[-n_test:]` would return the WHOLE
# list when n_test == 0 (fewer than 20 files), because -0 == 0.
for i in tqdm(wavs[len(wavs) - n_test:]):
    text = os.path.split(i)[1].replace('.wav', '')
    wikipedia.append((i, text))

len(wikipedia)

100%|██████████| 144/144 [00:00<00:00, 53397.56it/s]


144

In [14]:
# Test portion of the news recordings: the last 5% of the globbed files. The
# stem of each wav file name is parsed as an integer and used to index into the
# transcripts loaded from transcript-news.json.
news = []
wavs = glob(f'{base_directory}/news/audio/*wav')

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(f'{base_directory}/transcript-news.json', encoding = 'utf-8') as fopen:
    transcript_news = json.load(fopen)

n_test = int(len(wavs) * 0.05)
# Slice from an explicit start index: `wavs[-n_test:]` would return the WHOLE
# list when n_test == 0 (fewer than 20 files), because -0 == 0.
for i in tqdm(wavs[len(wavs) - n_test:]):
    # os.path.basename copes with whichever path separator glob returned.
    index = os.path.basename(i).replace('.wav','')
    text = transcript_news[int(index)]
    news.append((i, text))

100%|██████████| 107/107 [00:00<00:00, 147725.65it/s]


In [15]:
import pandas as pd

# metadata.csv is read without a header row and with '|' as separator.
# Column 0 is used to build the wav path, column 1 is the transcript.
df = pd.read_csv(f'{base_directory}/haqkiem/metadata.csv', header = None, sep = '|')
txts = df.values.tolist()
haqkiem = []
n_test = int(len(txts) * 0.05)
# Slice from an explicit start index: `txts[-n_test:]` would return the WHOLE
# list when n_test == 0 (fewer than 20 rows), because -0 == 0.
for row in tqdm(txts[len(txts) - n_test:]):
    # Keep only the part of the transcript before the first '.,,' marker.
    text = row[1].split('.,,')[0]
    r = f'{base_directory}/haqkiem/{row[0]}.wav'
    haqkiem.append((r, text))

100%|██████████| 214/214 [00:00<00:00, 201658.29it/s]


In [16]:
# Merge every per-source list of (wav path, transcript) pairs, then split the
# pairs into two parallel tuples: paths in `audios`, transcripts in `texts`.
sources = [wattpad, iium, khalils, mass, wikipedia, news, haqkiem, huseins]
audios, texts = zip(*(pair for source in sources for pair in source))

In [17]:
# Clean every transcript with preprocessing_text, keeping the order of `texts`.
processed_text = list(map(preprocessing_text, texts))

In [18]:
from sklearn.utils import shuffle

# Shuffle paths and transcripts together so the pairs stay aligned.
# A fixed random_state makes the permutation reproducible: a fresh
# Restart & Run All over the same input lists writes the same ordering.
audios, processed_text = shuffle(audios, processed_text, random_state = 42)

In [19]:
# Write the manifest: 'X' holds the audio paths, 'Y' the cleaned transcripts.
payload = {'X': audios, 'Y':processed_text}
with open('bahasa-asr-test.json', 'w') as fopen:
    json.dump(payload, fopen)

In [20]:
import json

# Read the manifest written above back into `data`.
with open('bahasa-asr-test.json') as fopen:
    data = json.loads(fopen.read())

In [3]:
# Scratch arithmetic. NOTE(review): presumably a sample count divided by a
# 16 kHz sample rate, i.e. a duration in seconds - confirm or delete this cell.
6_000_000 / 16_000

375.0

In [21]:
# import malaya_speech

In [22]:
# tokenizer = malaya_speech.subword.load('transducer.subword')
# tokenizer

In [23]:
# malaya_speech.subword.decode(tokenizer, [0, 2, 133, 875])

In [24]:
# from pydub import AudioSegment
# import numpy as np

# sr = 16000

# def mp3_to_wav(file, sr = sr):
#     audio = AudioSegment.from_file(file)
#     audio = audio.set_frame_rate(sr).set_channels(1)
#     sample = np.array(audio.get_array_of_samples())
#     return malaya_speech.astype.int_to_float(sample), sr

# def generator(maxlen = 18, min_length_text = 2):
#     for i in tqdm(range(len(audios))):
#         try:
#             if audios[i].endswith('.mp3'):
#                 wav_data, _ = mp3_to_wav(audios[i])
#             else:
#                 wav_data, _ = malaya_speech.load(audios[i])
                
#             if (len(wav_data) / sr) > maxlen:
#                 print(f'skipped audio too long {audios[i]}')
#                 continue
                
#             if len(processed_text[i]) < min_length_text:
#                 print(f'skipped text too short {audios[i]}')
#                 continue    

#             yield {
#                 'waveforms': wav_data.tolist(),
#                 'waveform_lens': [len(wav_data)],
#                 'targets': malaya_speech.subword.encode(tokenizer, processed_text[i], add_blank = False),
#             }
#         except Exception as e:
#             print(e)
            
# generator = generator()

In [25]:
# import os
# import tensorflow as tf

# os.system('rm bahasa-asr-test/data/*')
# DATA_DIR = os.path.expanduser('bahasa-asr-test/data')
# tf.gfile.MakeDirs(DATA_DIR)

In [26]:
# shards = [{'split': 'dev', 'shards': 10}]

In [27]:
# import malaya_speech.train as train

# train.prepare_dataset(generator, DATA_DIR, shards, prefix = 'bahasa-asr')