In [2]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/dari-pasentran-ke-istana.gz
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/salina.gz

In [16]:
# !tar -xf dari-pasentran-ke-istana.gz
# !tar -xf salina.gz

In [17]:
# !pip3 install malaya-speech --no-deps -U

In [1]:
from glob import glob

import soundfile as sf
import os
import numpy as np
import malaya_speech
import malaya_speech.augmentation.waveform as augmentation
import random
from tqdm import tqdm
import pyrubberband as pyrb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wavs = glob('../data/salina/output-wav-salina/*.wav')
# wavs.extend(glob('dari-pasentran-ke-istana/output-wav-dari-pasentran-ke-istana/*.wav'))
random.shuffle(wavs)

In [3]:
wavs[:10]

['../data/salina/output-wav-salina\\dua-puluh-enam-27.mp3-454.wav',
 '../data/salina/output-wav-salina\\tiga-belas-14.mp3-73.wav',
 '../data/salina/output-wav-salina\\tiga-puluh-31.mp3-31.wav',
 '../data/salina/output-wav-salina\\lapan-belas-19.mp3-281.wav',
 '../data/salina/output-wav-salina\\sebelas-12.mp3-287.wav',
 '../data/salina/output-wav-salina\\sebelas-12.mp3-125.wav',
 '../data/salina/output-wav-salina\\enam-belas-17.mp3-217.wav',
 '../data/salina/output-wav-salina\\sebelas-12.mp3-135.wav',
 '../data/salina/output-wav-salina\\tiga-belas-14.mp3-119.wav',
 '../data/salina/output-wav-salina\\lima-6.mp3-346.wav']

In [4]:
test_size = 300
test_set = wavs[:test_size]
train_set = wavs[test_size:]

In [5]:
import json

with open('../data/salina.json', 'w') as fopen:
    json.dump({'train': train_set, 'test': test_set}, fopen)

In [6]:
lengths = []
for wav in tqdm(test_set):
    y, sr = sf.read(wav)
    lengths.append(len(y) / sr)
    
sum(lengths) / 60

100%|██████████| 300/300 [00:03<00:00, 98.98it/s] 


35.525499999999994

In [23]:
!mkdir text-audiobook
!cp -r salina/output-text-salina text-audiobook
!cp -r dari-pasentran-ke-istana/output-text-dari-pasentran-ke-istana text-audiobook
!tar -czf text-audiobook.tar.gz text-audiobook

mkdir: cannot create directory ‘text-audiobook’: File exists


In [26]:
not_music = glob('../not-music/clean-wav/*.wav') + glob('../musan/music/**/*.wav', recursive = True) \
+ glob('../musan/noise/**/*.wav', recursive = True)

In [7]:
sr = 44100

def read_wav(f):
    return malaya_speech.load(f, sr = sr)

def random_sampling(s, length):
    return augmentation.random_sampling(s, sr = sr, length = length)

def combine_speakers(files, n = 5):
    w_samples = random.sample(files, n)
    w_samples = [
        random_sampling(
            read_wav(f)[0],
            length = min(
                random.randint(20000 // n, 240_000 // n), 100_000 // n
            ),
        )
        for f in w_samples
    ]
    y = [w_samples[0]]
    left = w_samples[0].copy() * random.uniform(0.5, 1.0)
    for i in range(1, n):

        right = w_samples[i].copy() * random.uniform(0.5, 1.0)

        overlap = random.uniform(0.01, 1.25)
        left_len = int(overlap * len(left))

        padded_right = np.pad(right, (left_len, 0))

        if len(left) > len(padded_right):
            padded_right = np.pad(
                padded_right, (0, len(left) - len(padded_right))
            )
        else:
            left = np.pad(left, (0, len(padded_right) - len(left)))

        y.append(padded_right)
        left = left + padded_right
    return left, y

def random_amplitude(sample, low = 3, high = 5):
    y_aug = sample.copy()
    dyn_change = np.random.uniform(low = low, high = high)
    y_aug = y_aug * dyn_change
    return np.clip(y_aug, -1, 1)


def random_amplitude_threshold(sample, low = 1, high = 2, threshold = 0.4):
    y_aug = sample.copy()
    y_aug = y_aug / (np.max(np.abs(y_aug)) + 1e-9)
    dyn_change = np.random.uniform(low = low, high = high)
    y_aug[np.abs(y_aug) >= threshold] = (
        y_aug[np.abs(y_aug) >= threshold] * dyn_change
    )
    return np.clip(y_aug, -1, 1)


def downsample(y, sr, down_sr):
    y_ = malaya_speech.resample(y, sr, down_sr)
    return malaya_speech.resample(y_, down_sr, sr)

def calc(signal, sr, seed, add_uniform = False):
    random.seed(seed)
    
    signal = pyrb.pitch_shift(signal, sr, np.random.randint(-10, 10))
    signal = pyrb.time_stretch(signal, sr, random.uniform(0.5, 1.3))

    if not add_uniform:
        signal = downsample(signal, sr, random.randint(8000, 16000))

    choice = random.randint(0, 6)
    # print('choice', choice)
    if choice == 0:

        x = augmentation.sox_augment_high(
            signal,
            min_bass_gain = random.randint(25, 50),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = 1,
        )
    if choice == 1:
        x = augmentation.sox_augment_high(
            signal,
            min_bass_gain = random.randint(25, 70),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = 0,
        )
    if choice == 2:
        x = augmentation.sox_augment_low(
            signal,
            min_bass_gain = random.randint(5, 30),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = random.randint(0, 1),
        )
    if choice == 3:
        x = augmentation.sox_augment_combine(
            signal,
            min_bass_gain_high = random.randint(25, 70),
            min_bass_gain_low = random.randint(5, 30),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 90),
        )
    if choice == 4:
        x = augmentation.sox_reverb(
            signal,
            reverberance = random.randint(10, 80),
            hf_damping = 10,
            room_scale = random.randint(10, 90),
        )
    if choice == 5:
        x = random_amplitude_threshold(
            signal, threshold = random.uniform(0.35, 0.8)
        )

    if choice > 5:
        x = signal

    if choice not in [5] and random.gauss(0.5, 0.14) > 0.6:
        x = random_amplitude_threshold(
            x, low = 1.0, high = 2.0, threshold = random.uniform(0.6, 0.9)
        )

    if random.gauss(0.5, 0.14) > 0.6 and add_uniform:
        x = augmentation.add_uniform_noise(
            x, power = random.uniform(0.005, 0.015)
        )

    return x

In [8]:
y, sr = malaya_speech.load(train_set[0], sr = sr)

In [9]:
import IPython.display as ipd

In [71]:
!mkdir combined

In [87]:
def parallel(f, repeat = 2):
    for i in range(repeat):
        y, _ = malaya_speech.load(f, sr = sr)
        seed = random.randint(0, 100_000_000)
        x = calc(y.copy(), sr, seed)
        x = x / (np.max(np.abs(x)) + 1e-9)
        combined = x

        combined = malaya_speech.resample(combined, sr, 16000)

        sf.write(f"combined/{f.replace('/', '<>')}-aug-{i}.wav", combined, 16000)
    return True

In [88]:
c = parallel(train_set[0])

In [90]:
def loop(files):
    files = files[0]
    results = []
    for f in tqdm(files):
        results.append(parallel(f))
    return results

In [91]:
import mp
mp.multiprocessing(train_set, loop, cores = 6, returned = False)

100%|██████████| 3574/3574 [2:23:12<00:00,  2.40s/it]  
100%|██████████| 3/3 [00:07<00:00,  2.35s/it].37s/it]
100%|██████████| 3574/3574 [2:23:41<00:00,  2.41s/it]
100%|██████████| 3574/3574 [2:23:43<00:00,  2.41s/it]
100%|██████████| 3574/3574 [2:23:44<00:00,  2.41s/it]
100%|██████████| 3574/3574 [2:24:43<00:00,  2.43s/it]
100%|██████████| 3574/3574 [2:25:23<00:00,  2.44s/it]


In [93]:
def resample(f):
    y, _ = malaya_speech.load(f, sr = sr)
    combined = malaya_speech.resample(y, sr, 16000)
    sf.write(f"combined/{f.replace('/', '<>')}.wav", combined, 16000)
    
for f in tqdm(train_set):
    resample(f)

100%|██████████| 21447/21447 [09:54<00:00, 36.08it/s]


In [94]:
!tar -cf trainset-audiobook.tar.gz combined

In [95]:
!mkdir test-set

In [96]:
def resample(f):
    y, _ = malaya_speech.load(f, sr = sr)
    combined = malaya_speech.resample(y, sr, 16000)
    sf.write(f"test-set/{f.replace('/', '<>')}.wav", combined, 16000)
    
for f in tqdm(test_set):
    resample(f)

100%|██████████| 300/300 [00:28<00:00, 10.64it/s]


In [99]:
!tar -cf testset-audiobook.tar.gz test-set

In [102]:
wavs = glob('combined/*.wav')
len(wavs)

64341

In [103]:
wavs[0]

'combined/salina<>output-wav-salina<>dua-puluh-21.mp3-364.wav-aug-0.wav'

In [26]:
import IPython.display as ipd

In [116]:
t = wavs[0].replace('<>', '/').replace('combined/', '').split('-aug')[0]
t

'salina/output-wav-salina/dua-puluh-21.mp3-364.wav'

In [10]:
b = wavs[0]
ipd.Audio(b)

In [16]:
import wave
import numpy as np

ifile = wave.open(b)
rate = ifile.getsampwidth()
sample = ifile.getnframes()
audio = ifile.readframes(sample)
audio_float32 = np.frombuffer(audio, dtype=np.float32)

In [17]:
ipd.Audio(data=audio_float32, rate=rate)

In [18]:
from playsound import playsound

playsound(b) # Alternative to ipd

In [31]:
b

'../data/salina/output-wav-salina\\dua-puluh-sembilan-30.mp3-154.wav'

In [28]:
from vscode_audio import Audio

Audio(b, sr)

AttributeError: 'str' object has no attribute 'tolist'

In [115]:
with open(t.replace('output-wav', 'output-text') + '.txt') as fopen:
    print(fopen.read())

budak-budak di kampung ni pun Nampaknya tolong tampalkan poster-poster tu katanya diupah


In [117]:
ipd.Audio(t)

In [118]:
ipd.Audio(wavs[0])