In [10]:
import os
import math
import librosa
import cv2
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
import pytz
import pickle
import csv
from joblib import Parallel, delayed
import warnings


In [11]:
# ===== Configuration class =====
class DatasetConfig:
    """Settings for turning soundscape recordings into mel-spectrogram images.

    Every setting is a plain instance attribute assigned in ``__init__``.
    The assignment order is significant: ``vars(config)`` is written to
    ``config.csv`` row by row in that order.

    Args:
        debug: When true, ``N_MAX_FILES`` is set to 5 so only a handful of
            files is processed; otherwise it is ``None`` (no limit).
    """

    def __init__(self, debug=False):
        self.debug = debug

        # Input / output locations
        self.RAW_DIR = '../data/raw/'
        self.PROCESSED_DIR = '../data/processed/'

        # Audio and spectrogram parameters
        self.FS = 32000                 # sample rate used when loading audio
        self.WINDOW_SIZE = 5.0          # segment length in seconds
        self.TARGET_SHAPE = (256, 256)  # shape each mel image is resized to
        self.N_FFT = 1024
        self.HOP_LENGTH = 512
        self.N_MELS = 128
        self.FMIN = 50
        self.FMAX = 14000

        # Run-size controls
        self.N_MAX_FILES = 5 if debug else None  # file cap, debug runs only
        self.N_JOBS = 16                         # joblib worker count

        self.seed = 42

# Notebook-wide configuration instance (full run, debug off).
# process_soundscape_file and the later cells read this global directly.
config = DatasetConfig(debug=False)
# Seed NumPy's global random number generator with the configured seed.
np.random.seed(config.seed)

In [12]:
# ===== Mel conversion =====
def audio_to_melspec(y, config):
    """Convert an audio signal into a min-max normalised log-mel spectrogram.

    Args:
        y: Audio samples, passed straight to ``librosa.feature.melspectrogram``.
        config: Object providing ``FS``, ``N_FFT``, ``HOP_LENGTH``, ``N_MELS``,
            ``FMIN`` and ``FMAX``.

    Returns:
        The dB-scaled mel spectrogram rescaled by its own minimum and maximum
        and clipped to ``[0, 1]``.
    """
    mel_params = dict(
        sr=config.FS,
        n_fft=config.N_FFT,
        hop_length=config.HOP_LENGTH,
        n_mels=config.N_MELS,
        fmin=config.FMIN,
        fmax=config.FMAX,
        power=2.0,
    )
    power_spec = librosa.feature.melspectrogram(y=y, **mel_params)

    # dB scale measured relative to the loudest bin of this spectrogram.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)

    # Per-spectrogram min-max scaling; the epsilon guards a zero dynamic range.
    scaled = (db_spec - db_spec.min()) / (db_spec.max() - db_spec.min() + 1e-6)
    return np.clip(scaled, 0, 1)

# ===== Split one file into fixed-length windows and convert each one =====
def process_soundscape_file(filepath, cfg=None):
    """Cut one recording into consecutive windows and build a mel image for each.

    The window length (and therefore the hop between windows) comes from
    ``WINDOW_SIZE``; windows do not overlap. Trailing audio that does not fill
    a whole window is dropped, so every segment has exactly
    ``int(WINDOW_SIZE * FS)`` samples and no padding is needed.

    Args:
        filepath: Path of the audio file to load.
        cfg: Optional settings object providing ``FS``, ``WINDOW_SIZE`` and
            ``TARGET_SHAPE`` (plus whatever ``audio_to_melspec`` reads).
            Defaults to the notebook-level ``config``.

    Returns:
        On success, a list with one ``(row_id, mel, None)`` tuple per window.
        ``row_id`` is ``"<file stem>_<window end in seconds>"`` and ``mel`` is
        a ``float32`` array of shape ``TARGET_SHAPE``.
        On any failure, ``[(None, None, (str(filepath), error message))]``.
        Errors are returned rather than raised so that one bad file does not
        abort the whole parallel run.
    """
    try:
        cfg = config if cfg is None else cfg

        y, _ = librosa.load(filepath, sr=cfg.FS)

        window_samples = int(cfg.WINDOW_SIZE * cfg.FS)
        if window_samples <= 0:
            raise ValueError(f"WINDOW_SIZE must be positive, got {cfg.WINDOW_SIZE}")

        stem = Path(filepath).stem  # loop-invariant, computed once
        n_windows = len(y) // window_samples  # whole windows only
        result = []

        for idx in range(n_windows):
            start = idx * window_samples
            end = start + window_samples
            y_seg = y[start:end]

            mel = audio_to_melspec(y_seg, cfg)
            if mel.shape != cfg.TARGET_SHAPE:
                # cv2.resize takes (width, height), i.e. (cols, rows).
                mel = cv2.resize(mel, cfg.TARGET_SHAPE[::-1])

            # Label each window by its end time; whole seconds print without a
            # decimal part ("name_5", "name_10", ...).
            end_sec = end / cfg.FS
            label = int(end_sec) if float(end_sec).is_integer() else end_sec
            row_id = f"{stem}_{label}"
            result.append((row_id, mel.astype(np.float32), None))

        return result
    except Exception as e:
        return [(None, None, (str(filepath), str(e)))]

In [13]:
# Collect the .ogg soundscapes in sorted order, capped in debug runs.
print("🔍 Loading soundscape files...")
soundscape_dir = Path(config.RAW_DIR + '/train_soundscapes')
files = sorted(soundscape_dir.glob("*.ogg"))
if config.N_MAX_FILES:
    files = files[:config.N_MAX_FILES]
print(f"🗂 Found {len(files)} files")

# Convert the files in parallel; each worker returns a list of result tuples.
print("🎧 Generating Mel spectrograms...")
parallel = Parallel(n_jobs=config.N_JOBS)
results_nested = parallel(
    delayed(process_soundscape_file)(path) for path in tqdm(files)
)

# Flatten the per-file lists into one list of (row_id, mel, err) tuples.
results = []
for per_file in results_nested:
    results.extend(per_file)

# Split successes (keyed by row_id) from failures (error tuples).
mel_dict = {}
errors = []
for row_id, mel, err in results:
    if row_id is not None:
        mel_dict[row_id] = mel
    if err is not None:
        errors.append(err)



🔍 Loading soundscape files...
🗂 Found 9726 files
🎧 Generating Mel spectrograms...


100%|██████████| 9726/9726 [01:31<00:00, 106.50it/s]


In [14]:
# Create a timestamped (JST) output directory for this run
jst = pytz.timezone('Asia/Tokyo')
now = datetime.now(jst)
timestamp = now.strftime("%Y%m%d_%H%M")
output_dir = Path(config.PROCESSED_DIR) / f"melspec_train_soundscapes_{timestamp}"
output_dir.mkdir(parents=True, exist_ok=True)

# Save the spectrograms
# NOTE(review): despite the ".npy" suffix this file is a pickle of the
# {row_id: ndarray} dict, not an np.save array -- read it back with
# pickle.load. The name is kept so existing consumers of this path still work.
melspec_path = output_dir / "train_soundscapes_melspecs.npy"
with open(melspec_path, 'wb') as f:
    pickle.dump(mel_dict, f, protocol=5)
print(f"\n✅ Mel-spectrograms saved to: {melspec_path}")
print(f"📦 File size: {os.path.getsize(melspec_path) / (1024 ** 2):.2f} MB")
if mel_dict:
    print(f"📐 Example shape: {next(iter(mel_dict.values())).shape}")
else:
    # Every file failed (or none was found): don't crash here, so the error
    # report below is still shown.
    print("📐 Example shape: n/a (no spectrograms were generated)")

# Save the configuration used for this run (one key/value row per attribute)
config_path = output_dir / "config.csv"
with open(config_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["key", "value"])
    for k, v in vars(config).items():
        writer.writerow([k, v])
print(f"📝 Config saved to: {config_path}")

# Report failures: show the first few and say how many were left out
if errors:
    max_shown = 5
    print("\n⚠️ Some files failed to process:")
    for err in errors[:max_shown]:
        print(f" - {err[0]}: {err[1]}")
    if len(errors) > max_shown:
        print(f" ... and {len(errors) - max_shown} more")


✅ Mel-spectrograms saved to: ../data/processed/melspec_train_soundscapes_20250407_1536/train_soundscapes_melspecs.npy
📦 File size: 29188.80 MB
📐 Example shape: (256, 256)
📝 Config saved to: ../data/processed/melspec_train_soundscapes_20250407_1536/config.csv
