## 資料夾結構
Dataset/train

Dataset/public_test

Dataset/augmented/{某個方法}/train

Dataset/augmented/{某個方法}/public_test

---

- origin
- 0.9
- 1.1
- +noise

In [21]:
from IPython.display import Audio, display
import random
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torchaudio
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Resample

# Root folder that holds both the original and the augmented audio.
DATA_PATH = Path('Dataset')

# Extension-less paths (relative to DATA_PATH) of every clip to augment:
# first the 1200 training clips, then the 10000 public-test clips.
merged_path = [f"train/train_{n:05d}" for n in range(1, 1201)]
merged_path += [f"public_test/public_{n:05d}" for n in range(1, 10001)]

In [22]:
# Create one output folder per (split, augmentation) pair under
# DATA_PATH/augmented. `exist_ok=True` makes the cell safe to re-run.
for split in ('train', 'public_test'):
    for method in ('0.9', '1.1', 'noise'):
        (DATA_PATH/'augmented'/method/split).mkdir(parents=True, exist_ok=True)

## Speed perturbation + Time Shift

In [23]:
def stretch_and_save(min_rate, max_rate, out_name):
    """Time-stretch (and randomly time-shift) every clip in `merged_path`.

    Each clip is loaded from DATA_PATH/<stem>.wav, passed through the
    augmentation chain, and written to DATA_PATH/augmented/<out_name>/<stem>.wav
    at its original sample rate.

    Args:
        min_rate: lower bound of the TimeStretch rate.
        max_rate: upper bound of the TimeStretch rate.
        out_name: sub-folder of DATA_PATH/augmented that receives the output.
    """
    augment = Compose([
        TimeStretch(min_rate=min_rate, max_rate=max_rate, p=1),
        # The shift fraction is drawn from [-1, 1]. Pinning it to exactly 1
        # with rollover=True rotates the clip by its full length, which leaves
        # the audio unchanged, so no time shift would ever be applied.
        Shift(min_fraction=-1, max_fraction=1, rollover=True, p=0.5),
    ])

    for f in tqdm(merged_path, desc=out_name):
        waveform, sr = torchaudio.load(DATA_PATH/f"{f}.wav")

        augmented_sample = augment(samples=waveform.numpy(), sample_rate=sr)

        torchaudio.save(DATA_PATH/f'augmented/{out_name}/{f}.wav',
                        torch.tensor(augmented_sample),
                        sample_rate=sr)


# Rates in [0.8, 1] go to the "0.9" folder, rates in [1, 1.2] to "1.1".
stretch_and_save(min_rate=0.8, max_rate=1, out_name='0.9')
stretch_and_save(min_rate=1, max_rate=1.2, out_name='1.1')

100%|██████████| 11200/11200 [03:13<00:00, 57.86it/s]
100%|██████████| 11200/11200 [02:49<00:00, 66.05it/s]


## + Musan Noise

In [24]:
# Collect the .wav files of the MUSAN "sound-bible" noise subset in sorted order.
musan_path = Path("Dataset/musan")
sound_bible_dir = musan_path / "noise/sound-bible"
noise_files = sorted(sound_bible_dir.glob("*.wav"))

# Preview: the first five paths and the total number of files found.
noise_files[:5], len(noise_files)

([PosixPath('Dataset/musan/noise/sound-bible/noise-sound-bible-0000.wav'),
  PosixPath('Dataset/musan/noise/sound-bible/noise-sound-bible-0001.wav'),
  PosixPath('Dataset/musan/noise/sound-bible/noise-sound-bible-0002.wav'),
  PosixPath('Dataset/musan/noise/sound-bible/noise-sound-bible-0003.wav'),
  PosixPath('Dataset/musan/noise/sound-bible/noise-sound-bible-0004.wav')],
 87)

In [26]:
# Length, in samples, of every mixed clip (counted at the noise file's sample
# rate). Presumably 5 s at 16 kHz -- TODO confirm against the dataset.
TARGET_LEN = 80000
# Number of noisy variants written per source clip.
N_noise = 5


def _fit_length(wav, n_samples):
    """Zero-pad or truncate a (channels, samples) tensor to `n_samples` samples."""
    channels, length = wav.shape
    if length < n_samples:
        padding = torch.zeros([channels, n_samples - length], dtype=wav.dtype)
        return torch.cat([wav, padding], dim=1)
    # Clips that are already long enough are cut; without this branch a longer
    # clip would request a negative-sized padding tensor and raise.
    return wav[:, :n_samples]


def _pick_noise(candidates):
    """Load a randomly chosen noise clip with at least TARGET_LEN samples.

    Files that turn out to be too short are removed from `candidates`, so they
    are not loaded again and the search always terminates.

    Returns:
        (noise_wav, noise_sr) as returned by torchaudio.load.

    Raises:
        RuntimeError: if no candidate is long enough.
    """
    while candidates:
        noise_f = random.choice(candidates)
        noise_wav, noise_sr = torchaudio.load(noise_f)
        if noise_wav.shape[1] >= TARGET_LEN:
            return noise_wav, noise_sr
        candidates.remove(noise_f)
    raise RuntimeError(f"no noise file has at least {TARGET_LEN} samples")


# Random circular shift, applied independently to the noise and to the clip.
augment = Compose([
    Shift(min_fraction=-1, max_fraction=1,rollover=True, p=0.5)
])

# Working copy, so pruning short files leaves `noise_files` untouched.
noise_candidates = list(noise_files)

for f in tqdm(merged_path):
    # Load the source clip once; every noisy variant starts from the same audio.
    clean_wav, sr = torchaudio.load(DATA_PATH/f"{f}.wav")

    for i in range(N_noise):
        noise_wav, noise_sr = _pick_noise(noise_candidates)
        noise_wav = augment(noise_wav.numpy(), sample_rate=noise_sr)

        # Bring the clip to the noise sample rate and the common length.
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=noise_sr)(clean_wav)
        wav = _fit_length(wav, TARGET_LEN)
        wav = augment(wav.numpy(), sample_rate=noise_sr)

        # Noise weight in [0.1, 0.4); dividing by (1 + alpha) keeps the mix a
        # weighted average of the clip and the noise.
        alpha = random.random()*0.3 + 0.1
        new_wav = (wav + noise_wav[:,:TARGET_LEN]*alpha)/(1+alpha)
        torchaudio.save(DATA_PATH/f'augmented/noise/{f}_{i}.wav',
                        torch.tensor(new_wav),
                        sample_rate=noise_sr)

100%|██████████| 11200/11200 [05:51<00:00, 31.88it/s]
