In [1]:
from pathlib import Path
import pandas as pd
TRAIN_AUDIO_DIR = Path("../input/rfcx-species-audio-detection/train/")
TEST_AUDIO_DIR = Path("../input/rfcx-species-audio-detection/test/")

train_tp = pd.read_csv("../input/rfcx-species-audio-detection/train_tp.csv")
train_fp = pd.read_csv("../input/rfcx-species-audio-detection/train_fp.csv")
train_tp["data_type"] = "tp"
train_fp["data_type"] = "fp"
train = pd.concat([train_tp, train_fp])
sub = pd.read_csv('../input/rfcx-species-audio-detection/sample_submission.csv')

In [14]:
# output dir
TRAIN_SPEC_DIR = Path("../melspec/train/")
TEST_SPEC_DIR = Path("../melspec/test/")
TRAIN_SPEC_DIR.mkdir(parents=True, exist_ok=True)
TEST_SPEC_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# # extract "recording_id" and  "species_id"
train_audio_infos = train["recording_id"].unique()
test_audio_infos = sub["recording_id"].unique()

In [4]:
import numpy as np

def wave2image(y, sr, width, height, mel_params):
    """
    通常のmelspectrogram変換
    """
    melspec = librosa.feature.melspectrogram(y, sr=sr, **mel_params)
    melspec = librosa.power_to_db(melspec).astype(np.float32)

    image = mono_to_color(melspec)
    image = cv2.resize(image, (width, height))
    image = np.moveaxis(image, 2, 0)
    image = (image / 255.0).astype(np.float32)
    return image

def normalize_melspec(X: np.ndarray):
    eps = 1e-6
    mean = X.mean()
    X = X - mean
    std = X.std()
    Xstd = X / (std + eps)
    norm_min, norm_max = Xstd.min(), Xstd.max()
    if (norm_max - norm_min) > eps:
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V


def mono_to_color(X: np.ndarray,
                  mean=None,
                  std=None,
                  norm_max=None,
                  norm_min=None,
                  eps=1e-6):
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize
    mean = mean or X.mean()
    X = X - mean
    std = std or X.std()
    Xstd = X / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = norm_max or _max
    norm_min = norm_min or _min
    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V


In [5]:
import soundfile as sf
import yaml
import numpy as np
import librosa
import cv2

mel_params = """
n_fft: 2048
n_mels: 128
fmin: 80
fmax: 15000
power: 2.
"""

mel_params =yaml.safe_load(mel_params)
mel_params

{'n_fft': 2048, 'n_mels': 128, 'fmin': 80, 'fmax': 15000, 'power': 2.0}

In [11]:
def train_process(recording_id):
    y, sr = sf.read(TRAIN_AUDIO_DIR / f"{recording_id}.flac")
    image = wave2image(y, sr, width=3072, height=512, mel_params=mel_params)
    np.savez_compressed(TRAIN_SPEC_DIR / f'{recording_id}', image)

In [12]:
def test_process(recording_id):
    y, sr = sf.read(TEST_AUDIO_DIR / f"{recording_id}.flac")
    image = wave2image(y, sr, width=3072, height=512, mel_params=mel_params)
    np.savez_compressed(TEST_SPEC_DIR / f'{recording_id}', image)

In [15]:
from joblib import Parallel, delayed
Parallel(n_jobs=24, verbose=1)(delayed(test_process)(recording_id) for recording_id in test_audio_infos)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    3.4s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:   16.1s
[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:   37.6s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:  1.1min
[Parallel(n_jobs=24)]: Done 1202 tasks      | elapsed:  1.8min
[Parallel(n_jobs=24)]: Done 1752 tasks      | elapsed:  2.7min
[Parallel(n_jobs=24)]: Done 1992 out of 1992 | elapsed:  3.1min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [16]:
from joblib import Parallel, delayed
Parallel(n_jobs=24, verbose=1)(delayed(train_process)(recording_id) for recording_id in train_audio_infos)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    1.9s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:   15.2s
[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:   37.1s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:  1.2min
[Parallel(n_jobs=24)]: Done 1202 tasks      | elapsed:  1.9min
[Parallel(n_jobs=24)]: Done 1752 tasks      | elapsed:  2.8min
[Parallel(n_jobs=24)]: Done 2402 tasks      | elapsed:  3.9min
[Parallel(n_jobs=24)]: Done 3152 tasks      | elapsed:  5.1min
[Parallel(n_jobs=24)]: Done 4002 tasks      | elapsed:  6.4min
[Parallel(n_jobs=24)]: Done 4727 out of 4727 | elapsed:  7.6min finished


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,