In [1]:
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Osman/resolve/main/tts-malay-osman.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/normalized-texts.json
# !tar -xf tts-malay-osman.tar.gz
# !rm tts-malay-osman.tar.gz

In [2]:
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Osman/resolve/main/tts-malay-osman-parliament.tar.gz
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/normalized-parliaments.json
# !tar -xf tts-malay-osman-parliament.tar.gz
# !rm tts-malay-osman-parliament.tar.gz

In [3]:
import os
import malaya_speech
from malaya_speech import Pipeline
from tqdm import tqdm
import numpy as np
import soundfile as sf
from glob import glob

In [4]:
config = {'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 0,
 'fmax': None,
 'global_gain_scale': 1.0,
 'trim_silence': True}

In [9]:
directory = 'output-osman'
os.system(f'mkdir {directory}')
directories = ['audios']
for d in directories:
    os.system(f'mkdir {directory}/{d}')

In [5]:
import json

with open('normalized-texts.json') as fopen:
    texts = json.load(fopen)
    
with open('normalized-parliaments.json') as fopen:
    parliament = json.load(fopen)

In [6]:
len(texts), len(parliament)

(50000, 59601)

In [8]:
def process(txts, 
            start_silent_trail = int(0.05 * config['sampling_rate']),
            middle_silent_trail = int(0.12 * config['sampling_rate']),
            end_silent_trail = int(0.1 * config['sampling_rate']),
            process_middle_silent = True):
    
    txts = txts[0]
    vad = malaya_speech.vad.webrtc()

    for f in txts:
        directory = f[2]
        index = f[1]
        f = f[0]
        
        audio, _ = malaya_speech.load(f, sr = config['sampling_rate'])
        audio = audio[start_silent_trail:]

        if config['trim_silence']:
            y_= malaya_speech.resample(audio, config['sampling_rate'], 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = list(malaya_speech.generator.frames(audio, 30, config['sampling_rate']))
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            grouped_deep = malaya_speech.group.group_frames_threshold(grouped_deep, 0.15)
            r = []
            for no, g in enumerate(grouped_deep):
                if g[1]:
                    g = g[0].array
                else:
                    if no == 0:
                        g = g[0].array[-start_silent_trail:]
                    elif no == (len(grouped_deep) - 1):
                        g = g[0].array[:end_silent_trail]
                    else:
                        if process_middle_silent:
                            g = np.concatenate([g[0].array[:middle_silent_trail], g[0].array[-middle_silent_trail:]])
                        else:
                            g = g[0].array
                        
                r.append(g)
            audio = np.concatenate(r)
        
        
        sf.write(f'{directory}/audios/{index}.wav', audio, config['sampling_rate'])

In [37]:
txts = [(f"male/{t['index']}.wav", t['index'], directory) for t in texts]

In [62]:
import mp

for i in tqdm(range(0, len(txts), 1000)):
    index = min(i + 1000, len(txts))
    b = txts[i: index]
    mp.multiprocessing(b, process, cores = 15, returned = False)

100%|██████████| 50/50 [04:55<00:00,  5.91s/it]


In [11]:
directory = 'output-osman-parliament'
os.system(f'mkdir {directory}')
directories = ['audios']
for d in directories:
    os.system(f'mkdir {directory}/{d}')

In [36]:
txts = [(f"male-parliament/{t['index']}.wav", t['index'], directory) for t in parliament]

In [38]:
len(txts)

59601

In [13]:
i = 1508
process((txts[i: i + 10], 0))

In [19]:
import mp

for i in tqdm(range(0, len(txts), 1000)):
    index = min(i + 1000, len(txts))
    b = txts[i: index]
    mp.multiprocessing(b, process, cores = 15, returned = False)

100%|██████████| 60/60 [09:09<00:00,  9.16s/it]


In [7]:
directory = 'output-osman'
wavs = glob(f'/home/husein/speech-bahasa/{directory}/audios/*.wav')
osman = []

for f in wavs:
    left = f
    index = int(os.path.split(f)[1].replace('.wav', ''))
    right = texts[index]['normalized']
    osman.append((left, right))
         
osman[:2]

[('/home/husein/speech-bahasa/output-osman/audios/9176.wav',
  'Beliau juga menyusun kurikulum Sekolah - Sekolah Rendah di Kedah .'),
 ('/home/husein/speech-bahasa/output-osman/audios/31535.wav',
  'Ainina turut menyifatkan Mukhriz sebagai pemimpin muda paling gagal dan tidak layak mendepani masa depan generasi milennial Malaysia " .')]

In [44]:
ipd.Audio(osman[1][0])

In [9]:
parliament_dict = {i['index']: i for i in parliament}

In [10]:
parliament_dict[index]

{'index': 47044,
 'text': 'Hasil bersih bermaksud hasil yang terakru kepada Persekutuan , ditolak amaun yang diterima oleh Sabah berkenaan dengan penyerahhakan hasil itu .',
 'normalized': 'Hasil bersih bermaksud hasil yang terakru kepada Persekutuan , ditolak amaun yang diterima oleh Sabah berkenaan dengan penyerahhakan hasil itu .'}

In [11]:
directory = 'output-osman-parliament'
wavs = glob(f'/home/husein/speech-bahasa/{directory}/audios/*.wav')

for f in wavs:
    left = f
    try:
        index = int(os.path.split(f)[1].replace('.wav', ''))
        right = parliament_dict[index]['normalized']
        osman.append((left, right))
    except Exception as e:
        print(e)
         
osman[-2:]

[('/home/husein/speech-bahasa/output-osman-parliament/audios/41760.wav',
  'Selain itu , faktor - faktor utama yang menyokong kekuatan ringgit termasuk kadar .'),
 ('/home/husein/speech-bahasa/output-osman-parliament/audios/4855.wav',
  'Pada masa ini pengeluaran padi dalam negara adalah tujuh puluh satu perpuluhan empat %.')]

In [48]:
ipd.Audio(osman[-2][0])

In [12]:
len(osman)

109601

In [13]:
from sklearn.model_selection import train_test_split

osman_train, osman_test = train_test_split(osman, test_size = 2000)

In [14]:
with open('osman-vits-test-set.txt', 'w') as fopen:
    json.dump(osman_test, fopen)

In [None]:
with open('osman-vits-train-set.txt', 'w') as fopen:
    json.dump(osman_train, fopen)