In [10]:
import os
import numpy as np
import librosa
import pandas as pd


from sklearn.model_selection import train_test_split

from tqdm import tqdm
import random
from sklearn.metrics import f1_score
import wandb
from audiomentations import AddBackgroundNoise
import soundfile as sf


In [11]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# raised by the audio libraries used below are hidden as well -- consider
# narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

In [12]:
class CONFIG:
    """Locations of the raw audio that the dataset-building code reads from."""
    # Directory containing the speech clips; the `path` values taken from the
    # corpus TSV manifests are resolved relative to it.
    VOICE_DIR = './cv-corpus-19.0-2024-09-13/ko/clips/'
    # Directory containing the noise recordings; the `filename` values taken
    # from esc50.csv are resolved relative to it.
    NOISE_DIR = './ESC-50-master/audio/'

In [13]:
# Read the tab-separated training manifest and split it 80/20 into train and
# validation frames in one step; the fixed random_state keeps the split stable
# across re-runs.
train_df, valid_df = train_test_split(
    pd.read_csv('./cv-corpus-19.0-2024-09-13/ko/train.tsv', sep='\t'),
    test_size=0.2,
    random_state=42,
)

In [14]:
# Load the corpus's test manifest (tab-separated); its `path` column is read
# later to locate the individual clips.
test_df = pd.read_csv('./cv-corpus-19.0-2024-09-13/ko/test.tsv', sep='\t')

In [15]:
# Load the noise-set metadata table; its `filename` column is read later to
# locate the individual noise recordings.
noise_df = pd.read_csv('./ESC-50-master/meta/esc50.csv')

In [16]:
# Clip file names for each split, taken from the manifests' `path` column.
train_voice_paths, valid_voice_paths, test_voice_paths = (
    split_df['path'].values for split_df in (train_df, valid_df, test_df)
)

# File names of the noise recordings, taken from the metadata's `filename` column.
noise_paths = noise_df['filename'].values

In [None]:
def get_dataset(voice_paths, noise_paths, dir="./train", sample_rate=16000,
                frame_length=51500, seed=None):
    """Build a paired noisy/clean speech dataset on disk.

    For every voice clip that is at least ``frame_length`` samples long, a
    random window of exactly ``frame_length`` samples is cut out and mixed
    with one randomly chosen noise recording. Both versions are written as
    WAV files:

    * ``{dir}/aug/{i}.wav`` -- the window with background noise added
    * ``{dir}/gt/{i}.wav``  -- the clean window (ground truth)

    ``i`` is the clip's position in ``voice_paths``. Clips shorter than
    ``frame_length`` are skipped, so the numbering may contain gaps. An index
    of all written pairs (columns ``aug`` and ``gt``) is saved to
    ``{dir}_dataset.csv``.

    Args:
        voice_paths: Sequence of clip file names, relative to
            ``CONFIG.VOICE_DIR``.
        noise_paths: Sequence of noise file names, relative to
            ``CONFIG.NOISE_DIR``.
        dir: Output directory prefix. ``{dir}/aug`` and ``{dir}/gt`` are
            created if they do not exist.
        sample_rate: Rate the clips are resampled to on load and written at.
        frame_length: Number of samples in each extracted window.
        seed: Optional integer. When given, Python's ``random`` module and
            NumPy's global RNG are seeded before any sampling takes place.

    Returns:
        None. All results are written to disk.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # sf.write does not create parent directories, so make sure they exist.
    os.makedirs(f"{dir}/aug", exist_ok=True)
    os.makedirs(f"{dir}/gt", exist_ok=True)

    records = []
    # Wrapping the sequence (not the enumerate object) lets tqdm see its
    # length and render a real progress bar with a total.
    for i, voice_path in enumerate(tqdm(voice_paths)):
        voice, _ = librosa.load(os.path.join(CONFIG.VOICE_DIR, voice_path), sr=sample_rate)
        max_start = len(voice) - frame_length
        if max_start < 0:
            # Clip is too short to yield a full window.
            continue

        # randint's upper bound is exclusive: "+ 1" makes the last valid
        # offset reachable and avoids a ValueError when the clip is exactly
        # frame_length samples long (max_start == 0).
        start = np.random.randint(0, max_start + 1)
        voice_frame = voice[start:start + frame_length]

        # One noise recording is drawn per clip; the SNR is sampled by the
        # transform within the given range.
        noise_path = os.path.join(CONFIG.NOISE_DIR, random.choice(noise_paths))
        transform = AddBackgroundNoise(
            sounds_path=noise_path,
            min_snr_db=3.0,
            max_snr_db=30.0,
            p=1.0,
        )
        voice_noise_frame = transform(voice_frame, sample_rate=sample_rate)

        aug_file = f"{dir}/aug/{i}.wav"
        gt_file = f"{dir}/gt/{i}.wav"
        sf.write(aug_file, voice_noise_frame, sample_rate)
        sf.write(gt_file, voice_frame, sample_rate)
        records.append({'aug': aug_file, 'gt': gt_file})

    # Build the index once instead of growing a DataFrame row by row.
    pd.DataFrame(records, columns=['aug', 'gt']).to_csv(f"{dir}_dataset.csv", index=False)


In [18]:
# Generate the noisy/clean pairs for the training split: WAVs go under
# ./train_denoiser/{aug,gt} and the index to ./train_denoiser_dataset.csv.
get_dataset(train_voice_paths, noise_paths, dir="./train_denoiser")

426it [00:38, 11.18it/s]


In [19]:
# Generate the noisy/clean pairs for the validation split: WAVs go under
# ./valid_denoiser/{aug,gt} and the index to ./valid_denoiser_dataset.csv.
get_dataset(valid_voice_paths, noise_paths, dir="./valid_denoiser")

107it [00:09, 11.58it/s]


In [20]:
# Generate the noisy/clean pairs for the test split: WAVs go under
# ./test_denoiser/{aug,gt} and the index to ./test_denoiser_dataset.csv.
get_dataset(test_voice_paths, noise_paths, dir="./test_denoiser")

352it [00:29, 12.01it/s]
