# **BirdCLEF 2025 Data Preprocessing Notebook**

In [None]:
import os
import cv2
import math
import time
import librosa
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm
import sys
import os
import numpy as np
from datetime import datetime
import pytz
import random
import torch
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedGroupKFold

from joblib import Parallel, delayed



In [None]:
class DatasetConfig:
    """Paths and signal-processing parameters for building the mel-spectrogram dataset.

    Args:
        kaggle_notebook: If True, use the ``/kaggle/input`` directory layout;
            otherwise use the local ``../data`` layout.
        debug: If True, ``N_MAX`` is set to 50; otherwise it is ``None``.
    """

    def __init__(self, kaggle_notebook=False, debug=False):
        self.KAGGLE_NOTEBOOK = kaggle_notebook
        self.debug = debug

        # ===== Path Settings =====
        if self.KAGGLE_NOTEBOOK:
            self.OUTPUT_DIR = ''
            self.train_datadir = '/kaggle/input/birdclef-2025/train_audio'
            self.train_csv = '/kaggle/input/birdclef-2025/train.csv'
            self.test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
            self.submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
            self.taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
            # Defined in both branches so that reading `models_dir` never
            # depends on which environment the config was built for.
            self.models_dir = '/kaggle/input/birdclef-2025-0330'
            self.model_path = self.models_dir
            self.RAW_DIR = "/kaggle/input/birdclef-2025/"
            self.PROCESSED_DIR = ""
        else:
            self.OUTPUT_DIR = '../data/result/'
            self.train_datadir = '../data/raw/train_audio/'
            self.train_csv = '../data/raw/train.csv'
            self.test_soundscapes = '../data/raw/test_soundscapes/'
            self.submission_csv = '../data/raw/sample_submission.csv'
            self.taxonomy_csv = '../data/raw/taxonomy.csv'
            self.models_dir = "../models/"  # where all models are stored
            self.model_path = self.models_dir  # per-model location; changed dynamically during training
            self.RAW_DIR = '../data/raw/'
            self.PROCESSED_DIR = '../data/processed/'

        # ===== Audio Settings =====
        self.FS = 32000
        self.WINDOW_SIZE = 5.0  # window size used at inference time
        self.TARGET_DURATION = 5  # window size used when building the dataset
        self.TARGET_SHAPE = (256, 256)
        self.N_FFT = 1024
        self.HOP_LENGTH = 512
        self.N_MELS = 148
        self.FMIN = 20
        self.FMAX = 16000
        self.N_MAX = 50 if self.debug else None
        self.N_JOBS = 3
        self.LOAD_ENGINE = 'torchaudio'  # librosa or torchaudio
        self.SKIP_RESIZE = False  # set to True to skip resizing
        self.seed = 42
        self.n_fold = 5
        self.num_rare_samples = 50  # species with fewer samples than this are treated as rare
        self.is_crop_aug = False
            

In [None]:
# Shared settings object for the whole notebook: local (non-Kaggle) paths, debug off.
config = DatasetConfig(kaggle_notebook=False, debug=False)

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators.

    Also exports ``PYTHONHASHSEED`` and sets the cuDNN flags to
    ``deterministic=True`` / ``benchmark=False``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # CPU generator first, then the current and all CUDA devices.
    for torch_seed_fn in (torch.manual_seed,
                          torch.cuda.manual_seed,
                          torch.cuda.manual_seed_all):
        torch_seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    

# Seed every random generator once, using the seed stored in the config.
set_seed(config.seed)

In [None]:
# Report the run mode before any data is read.
debug_state = 'ON' if config.debug else 'OFF'
sample_limit = 'ALL' if config.N_MAX is None else config.N_MAX
print(f"Debug mode: {debug_state}")
print(f"Max samples to process: {sample_limit}")

# Taxonomy: maps each primary_label to its class_name.
print("Loading taxonomy data...")
taxonomy_df = pd.read_csv(f'{config.RAW_DIR}/taxonomy.csv')
taxonomy_labels = taxonomy_df['primary_label']
taxonomy_classes = taxonomy_df['class_name']
species_class_map = dict(zip(taxonomy_labels, taxonomy_classes))

# Per-recording training metadata.
print("Loading training metadata...")
train_df = pd.read_csv(f'{config.RAW_DIR}/train.csv')

In [None]:
# Label vocabulary: unique primary_label values in sorted order, numbered from 0.
label_list = sorted(train_df['primary_label'].unique())
label_id_list = list(range(len(label_list)))
label2id = {label: label_id for label_id, label in enumerate(label_list)}  # label string -> integer id
id2label = dict(enumerate(label_list))                                     # integer id -> label string

print(f'Found {len(label_list)} unique species')


def _make_samplename(filename):
    """Return '<first path component>-<file stem up to the first dot>'."""
    parts = filename.split('/')
    return parts[0] + '-' + parts[-1].split('.')[0]


# Work on a copy so train_df stays untouched.
working_df = train_df.copy()
working_df['target'] = working_df['primary_label'].map(label2id)
working_df['filepath'] = config.RAW_DIR + '/train_audio/' + working_df['filename']
working_df['samplename'] = working_df['filename'].map(_make_samplename)
working_df['class'] = [
    species_class_map.get(label, 'Unknown') for label in working_df['primary_label']
]
working_df["crop_strategy"] = "center"

n_available = len(working_df)
total_samples = min(n_available, config.N_MAX or n_available)
print(f'Total samples to process: {total_samples} out of {n_available} available')
print('Samples by class:')
print(working_df['class'].value_counts())

# Attach the per-file duration and fold assignment from an earlier preprocessing run.
duration_fold_df = pd.read_csv("../data/processed/train_mel0413.csv")
duration_fold_df = duration_fold_df[["filename", "duration_sec", "fold"]]
working_df = working_df.merge(duration_fold_df, on="filename", how="left")

# Rows without a match in the earlier run end up with NaN duration.
missing = working_df["duration_sec"].isna().sum()
print(f"✅ Added 'duration_sec'. Missing values: {missing}")

In [None]:
# Every recording starts out fully usable: [0, duration_sec].
# The start column is created as float (0.0 rather than 0) because fractional
# second offsets such as 4.0 are written into it afterwards; assigning a float
# into an int64 column is deprecated in recent pandas releases.
working_df["valid_start_sec"] = 0.0
working_df["valid_end_sec"] = working_df["duration_sec"]

In [None]:
# Notebook display only: inspect the merged per-file durations.
working_df['duration_sec']

In [None]:
# Manually exclude passages of human speech from the usable range.

# Recordings that open with spoken Spanish: usable audio starts at 4.0 s.
spanish_intro_filenames = [
    '50186/CSA28885.ogg',
    '52884/CSA14875.ogg'
]
has_spanish_intro = working_df['filename'].isin(spanish_intro_filenames)
working_df.loc[has_spanish_intro, 'valid_start_sec'] = 4.0


# Recordings that become speech-only from some point on: usable audio ends
# at the given second. One dict per reviewed batch.
voice_only_groups = [
    # Speech-only from the middle of the recording onwards.
    {
        '476537/CSA35459.ogg': 134,  # 2 min 14 s
        '476537/CSA35461.ogg': 259,  # 4 min 19 s
    },
    # Recordings by Eliana Barona-Cortés: talking after the given time.
    {
        '24292/CSA34649.ogg': 128,   # 2 min 8 s
        '24292/CSA34651.ogg': 93,    # 1 min 33 s
        '50186/CSA34622.ogg': 21,    # 21 s
        '50186/CSA34678.ogg': 43,    # 43 s
    },
    # Recordings by Alexandra Butrago-Cardona: talking after the given time.
    {
        '24292/CSA35021.ogg': 36,    # 36 s
        '52884/CSA34947.ogg': 13,    # 13 s
    },
]
for voice_only_ranges in voice_only_groups:
    for fname, end_sec in voice_only_ranges.items():
        working_df.loc[working_df['filename'] == fname, 'valid_end_sec'] = end_sec


# Recordings by Fabio A. Sarria-S: only 0-7 s is usable by default, because
# the rest is spoken explanation.
is_fabio = train_df['author'] == "Fabio A. Sarria-S"
fabio_filenames = train_df.loc[is_fabio, 'filename'].tolist()
working_df.loc[working_df['filename'].isin(fabio_filenames), 'valid_end_sec'] = 7.0

# Fabio recordings whose usable part does not end at 7 s.
fabio_override = {
    "48124/CSA36346.ogg": 24.0,
    "52884/CSA36344.ogg": 55.0,
    "52884/CSA36342.ogg": 14.0,  # added later
}
for fname, end_sec in fabio_override.items():
    working_df.loc[working_df['filename'] == fname, 'valid_end_sec'] = end_sec


# The crop strategy column defaults to "center" for every row.
working_df["crop_strategy"] = "center"

In [None]:
# Apply the reviewed safe-zone annotations (safezone_1000_0501.csv) to working_df.

# Keep only rows marked "checked", and only the first row per filename.
safe_zone = pd.read_csv("../data/processed/safezone_1000_0501.csv")
safe_zone = safe_zone.loc[safe_zone["check"] == "checked"].drop_duplicates(subset=["filename"])

# Convert the bounds to numbers; values that cannot be parsed become NaN.
for bound in ("start", "end"):
    safe_zone[bound] = pd.to_numeric(safe_zone[bound], errors="coerce")

# Only the columns needed for the update.
safe_zone_update = safe_zone[["filename", "start", "end"]]

# Bring start/end alongside each working_df row (NaN where no annotation exists).
working_df = working_df.merge(safe_zone_update, on="filename", how="left")

# Where an annotated bound exists it replaces the current valid_* value;
# otherwise the current value is kept.
for bound, valid_col in (("start", "valid_start_sec"), ("end", "valid_end_sec")):
    working_df[valid_col] = working_df[bound].combine_first(working_df[valid_col])

# The temporary start/end columns are no longer needed.
working_df = working_df.drop(columns=["start", "end"])

In [None]:
# Apply the reviewed safe-zone annotations (safezone_1000_2000.csv) to working_df.
# In this file the review flag lives in the "is_checked" column.

# Keep only rows marked "checked", and only the first row per filename.
safe_zone = pd.read_csv("../data/processed/safezone_1000_2000.csv")
safe_zone = safe_zone.loc[safe_zone["is_checked"] == "checked"].drop_duplicates(subset=["filename"])

# Convert the bounds to numbers; values that cannot be parsed become NaN.
for bound in ("start", "end"):
    safe_zone[bound] = pd.to_numeric(safe_zone[bound], errors="coerce")

# Only the columns needed for the update.
safe_zone_update = safe_zone[["filename", "start", "end"]]

# Bring start/end alongside each working_df row (NaN where no annotation exists).
working_df = working_df.merge(safe_zone_update, on="filename", how="left")

# Where an annotated bound exists it replaces the current valid_* value;
# otherwise the current value is kept.
for bound, valid_col in (("start", "valid_start_sec"), ("end", "valid_end_sec")):
    working_df[valid_col] = working_df[bound].combine_first(working_df[valid_col])

# The temporary start/end columns are no longer needed.
working_df = working_df.drop(columns=["start", "end"])

In [None]:
# Apply the reviewed safe-zone annotations (safezone_2000_3000.csv) to working_df.

# Keep only rows marked "checked", and only the first row per filename.
safe_zone = pd.read_csv("../data/processed/safezone_2000_3000.csv")
safe_zone = safe_zone.loc[safe_zone["check"] == "checked"].drop_duplicates(subset=["filename"])

# Convert the bounds to numbers; values that cannot be parsed become NaN.
for bound in ("start", "end"):
    safe_zone[bound] = pd.to_numeric(safe_zone[bound], errors="coerce")

# Only the columns needed for the update.
safe_zone_update = safe_zone[["filename", "start", "end"]]

# Bring start/end alongside each working_df row (NaN where no annotation exists).
working_df = working_df.merge(safe_zone_update, on="filename", how="left")

# Where an annotated bound exists it replaces the current valid_* value;
# otherwise the current value is kept.
for bound, valid_col in (("start", "valid_start_sec"), ("end", "valid_end_sec")):
    working_df[valid_col] = working_df[bound].combine_first(working_df[valid_col])

# The temporary start/end columns are no longer needed.
working_df = working_df.drop(columns=["start", "end"])

In [None]:
def prepare_augmented_rows(new_files, working_df, base_dir="../data/raw/bc2025_rare"):
    """Build metadata rows for extra audio files so they can be appended to ``working_df``.

    Each new row is cloned from the first existing row with the same
    ``primary_label`` (so label-level columns are inherited) and then given
    its own file identity and timing. Files whose label has no existing row
    are skipped.

    Args:
        new_files: Iterable of dicts with ``"filename"`` and ``"primary_label"``.
            The filename extension is replaced by ``.ogg``.
        working_df: Existing metadata; read only, never modified.
        base_dir: Directory the new files live under.

    Returns:
        DataFrame with one row per accepted file. ``duration_sec`` is the
        measured length of the file (0.0 if it cannot be read),
        ``valid_start_sec`` is 0.0, ``valid_end_sec`` is left missing so that
        it can be filled from ``duration_sec`` later, and ``fold`` is 1.
    """

    def _measure_duration_sec(path):
        # Decode at the file's native rate; only the length matters here.
        # An unreadable file gets 0.0 so that nothing downstream plans crops
        # for it; the load failure itself is reported when the file is processed.
        try:
            samples, native_sr = librosa.load(path, sr=None)
        except Exception:
            return 0.0
        return len(samples) / native_sr

    augmented_rows = []

    for item in new_files:
        label = item["primary_label"]

        # Normalise the extension and build the on-disk path.
        fname = os.path.splitext(item["filename"])[0] + '.ogg'
        full_path = os.path.join(base_dir, fname)

        # Existing rows with the same label serve as the template.
        match = working_df[working_df["primary_label"] == label]
        if match.empty:
            continue

        new_row = match.iloc[0].copy()
        new_row["filename"] = fname
        new_row["filepath"] = full_path
        sample_prefix = os.path.dirname(fname)
        sample_name = os.path.splitext(os.path.basename(fname))[0]
        new_row["samplename"] = f"{sample_prefix}-{sample_name}"
        new_row["fold"] = 1  # all extra files are placed in fold 1

        # Timing must describe THIS file, not the template recording:
        # the template's duration and trimmed start would otherwise be
        # inherited and truncate or offset the new audio.
        new_row["duration_sec"] = _measure_duration_sec(full_path)
        new_row["valid_start_sec"] = 0.0
        new_row["valid_end_sec"] = None

        augmented_rows.append(new_row)

    return pd.DataFrame(augmented_rows)

# Extra recordings to add, grouped by primary_label (file stems, no extension).
extra_recordings = {
    "1139490": ["2391"],
    "42113": ["XC975063"],
    "66016": ["vaillanti-escape1", "vaillanti-escape3", "vaillanti-escape4"],
    "66578": ["Pristimantis_bogotensis15"],
    "868458": ["2388"],
    "turvul": [
        "XC39894",
        "XC381486",
        "XC520288",
        "XC552488",
        "XC748979",
        "XC764680",
        "XC780516",
        "XC904279",
    ],
}
new_files = [
    {"filename": f"{label}/{stem}.ogg", "primary_label": label}
    for label, stems in extra_recordings.items()
    for stem in stems
]

# Build the rows and append them to working_df with a fresh 0..n-1 index.
augmented_df = prepare_augmented_rows(new_files, working_df)
working_df = pd.concat([working_df, augmented_df], ignore_index=True)


In [None]:
# Augmentation planning: decide up front how many extra crops each recording gets.

# Start with no augmentation anywhere.
working_df['n_augment'] = 0
working_df['multi_crop'] = False

target_samples = int(config.TARGET_DURATION * config.FS)

# Missing valid_end_sec falls back to the recording's duration. The columns
# are coerced to numeric first because rows appended with None can leave them
# as object dtype.
working_df['valid_end_sec'] = pd.to_numeric(
    working_df['valid_end_sec'], errors='coerce'
).fillna(pd.to_numeric(working_df['duration_sec'], errors='coerce'))

# Missing valid_start_sec falls back to 0 (just in case).
working_df['valid_start_sec'] = working_df['valid_start_sec'].fillna(0)

# Labels with fewer than config.num_rare_samples rows count as rare.
label_counts = working_df['primary_label'].value_counts().rename_axis("label").reset_index(name="sample_count")
rare_labels = label_counts[label_counts['sample_count'] < config.num_rare_samples]['label'].tolist()

# Assign crop counts per rare label. Skipped entirely when crop augmentation
# is disabled, in which case every row keeps n_augment=0 / multi_crop=False.
if config.is_crop_aug:
    for rare_label in rare_labels:
        base_rows = working_df[working_df['primary_label'] == rare_label]
        n_exist = len(base_rows)
        n_needed = config.num_rare_samples - n_exist
        # Spread the shortfall evenly over the existing recordings.
        n_aug_per_sample = math.ceil(n_needed / n_exist)

        for idx, row in base_rows.iterrows():
            usable_duration_sec = row['valid_end_sec'] - row['valid_start_sec']
            # Unknown (NaN) or empty usable range: nothing to crop from.
            if not np.isfinite(usable_duration_sec) or usable_duration_sec <= 0:
                continue
            usable_samples = int(usable_duration_sec * config.FS)

            # Never plan more crops than whole target-length windows fit in the usable range.
            max_possible = usable_samples // target_samples
            n_actual = min(n_aug_per_sample, max_possible)

            if n_actual > 0:
                working_df.at[idx, 'multi_crop'] = True
                working_df.at[idx, 'n_augment'] = n_actual

# Total number of planned extra crops.
print(f"Total number of augmentations: {working_df['n_augment'].sum()}")

In [None]:
def extract_maxdb(audio_data: np.ndarray,
                  sr: int = config.FS,
                  target_sec: float = config.TARGET_DURATION,
                  chunk_len: float = 0.05) -> tuple[int, int]:
    """Locate a ``target_sec`` window around the chunk with the largest energy.

    The signal is split into chunks of ``int(chunk_len * sr)`` samples (the
    last one zero-padded) and the chunk with the largest sum of squares is
    found. The window is centred on that chunk; if that would run past either
    end of the signal the window is shifted back inside instead of being cut
    short, so a full-length window is returned whenever the signal is long
    enough.

    Args:
        audio_data: 1-D waveform.
        sr: Sampling rate in Hz.
        target_sec: Window length in seconds.
        chunk_len: Chunk length in seconds.

    Returns:
        ``(start, end)`` sample indices. For a signal shorter than the window
        this is ``(0, len(audio_data))``; for an empty signal ``(0, 0)``.
    """
    n_total = len(audio_data)
    if n_total == 0:
        return 0, 0

    tgt_samples = int(target_sec * sr)
    chunk = int(chunk_len * sr)

    # ---- Per-chunk energy (sum of x^2) ----
    pad_len = -n_total % chunk
    power_sq = np.pad(audio_data ** 2, (0, pad_len))
    power_chunks = power_sq.reshape(-1, chunk).sum(axis=1)

    # ---- Centre the window on the middle of the loudest chunk ----
    max_idx = int(power_chunks.argmax())
    center = int((max_idx + 0.5) * chunk)
    start = center - tgt_samples // 2

    # ---- Keep the window inside the signal by shifting, not truncating ----
    start = max(0, min(start, n_total - tgt_samples))
    end = min(start + tgt_samples, n_total)
    return start, end


def extract_maxrms(audio_data: np.ndarray,
                   sr: int = config.FS,
                   target_sec: float = config.TARGET_DURATION,
                   chunk_len: float = 0.05) -> tuple[int, int]:
    """Locate a ``target_sec`` window around the chunk with the largest RMS.

    The signal is split into chunks of ``int(chunk_len * sr)`` samples (the
    last one zero-padded) and the chunk with the largest RMS is found. The
    window is centred on that chunk; if that would run past either end of the
    signal the window is shifted back inside instead of being cut short, so a
    full-length window is returned whenever the signal is long enough.

    Args:
        audio_data: 1-D waveform.
        sr: Sampling rate in Hz.
        target_sec: Window length in seconds.
        chunk_len: Chunk length in seconds.

    Returns:
        ``(start, end)`` sample indices. For a signal shorter than the window
        this is ``(0, len(audio_data))``; for an empty signal ``(0, 0)``.
    """
    n_total = len(audio_data)
    if n_total == 0:
        return 0, 0

    tgt_samples = int(target_sec * sr)
    chunk = int(chunk_len * sr)

    # ---- Per-chunk RMS ----
    pad_len = -n_total % chunk
    audio_pad = np.pad(audio_data, (0, pad_len))
    rms_chunks = np.sqrt(np.mean(audio_pad.reshape(-1, chunk) ** 2, axis=1))

    # ---- Centre the window on the middle of the loudest chunk ----
    max_idx = int(rms_chunks.argmax())
    center = int((max_idx + 0.5) * chunk)
    start = center - tgt_samples // 2

    # ---- Keep the window inside the signal by shifting, not truncating ----
    start = max(0, min(start, n_total - tgt_samples))
    end = min(start + tgt_samples, n_total)
    return start, end



In [None]:
def snr(signal: np.ndarray,
        peak_percentile: float = 99.5) -> float:
    """Return a simple peak-to-RMS ratio of ``signal`` in dB.

    Parameters
    ----------
    signal : np.ndarray
        Waveform samples. The computation is carried out on a float32 view
        or copy of the input.
    peak_percentile : float, optional
        Percentile of the absolute amplitude used as the "peak". The default
        of 99.5 is an approximate peak that ignores the most extreme samples.

    Returns
    -------
    float
        ``20 * log10(peak / rms)``. Larger values mean the peak stands out
        more strongly against the overall level.
    """
    samples = signal.astype(np.float32, copy=False)

    # Approximate peak amplitude: a high percentile of |x| rather than the maximum.
    magnitude = np.abs(samples)
    robust_peak = np.percentile(magnitude, peak_percentile)

    # RMS level; the small constant prevents division by zero.
    rms_level = np.sqrt(np.mean(samples**2)) + 1e-9

    # Amplitude ratio expressed in dB.
    return 20.0 * np.log10(robust_peak / rms_level)

def smart_crop(audio_data: np.ndarray,
               sr: int = config.FS,
               target_sec: float = config.TARGET_DURATION) -> tuple[int, int]:
    """Choose a ``target_sec`` window with a three-stage fallback.

    1) The window around the max-RMS chunk is used if it has full length
       and its ``snr`` is above 10 dB.
    2) Otherwise the window around the STFT frame with the most 4-10 kHz mel
       energy is used if it fits inside the signal and its ``snr`` is above 8 dB.
    3) Otherwise the window around the frame with the lowest spectral entropy
       is used, cut short at the end of the signal if necessary.

    Returns ``(start_sample, end_sample)``.
    """
    hop = 256
    window_len = int(target_sec * sr)
    n_audio = len(audio_data)

    # Stage 1: window around the loudest chunk.
    rms_start, rms_end = extract_maxrms(audio_data, sr=sr, target_sec=target_sec)
    rms_window = audio_data[rms_start:rms_end]
    if len(rms_window) == window_len and snr(rms_window) > 10:
        return rms_start, rms_end

    # Power spectrogram shared by stages 2 and 3.
    power_spec = np.abs(librosa.stft(audio_data, n_fft=1024, hop_length=hop)) ** 2

    # Stage 2: frame with the most energy in the 4-10 kHz mel bands.
    band_mel = librosa.feature.melspectrogram(S=power_spec, sr=sr, fmin=4000, fmax=10000)
    band_frame = band_mel.sum(0).argmax()
    band_start = max(int(band_frame * hop - window_len // 2), 0)
    band_end = band_start + window_len
    if band_end <= n_audio and snr(audio_data[band_start:band_end]) > 8:
        return band_start, band_end

    # Stage 3: frame with the lowest spectral entropy.
    frame_prob = power_spec / (power_spec.sum(0, keepdims=True) + 1e-12)
    frame_entropy = -(frame_prob * np.log(frame_prob + 1e-12)).sum(0)
    quiet_frame = frame_entropy.argmin()
    ent_start = max(int(quiet_frame * hop - window_len // 2), 0)
    ent_end = min(ent_start + window_len, n_audio)  # cut short at the end of the signal

    return ent_start, ent_end


In [None]:
def smart1_crop(audio_data: np.ndarray,
               sr: int = config.FS,
               target_sec: float = config.TARGET_DURATION) -> tuple[int, int]:
    """Choose a ``target_sec`` window, preferring the centre of the signal.

    1) The centred window is used if it has full length and its ``snr`` is
       above 10 dB.
    2) Otherwise the window around the STFT frame with the most 4-10 kHz mel
       energy is used if it fits inside the signal and its ``snr`` is above 8 dB.
    3) Otherwise the window around the frame with the lowest spectral entropy
       is used, cut short at the end of the signal if necessary.

    Returns ``(start_sample, end_sample)``.
    """
    hop = 256
    window_len = int(target_sec * sr)
    n_audio = len(audio_data)

    # Stage 1: window centred on the middle of the signal.
    mid_start = n_audio // 2 - window_len // 2
    mid_start = max(0, min(mid_start, n_audio - window_len))
    mid_end = mid_start + window_len
    mid_window = audio_data[mid_start:mid_end]
    if len(mid_window) == window_len and snr(mid_window) > 10:
        return mid_start, mid_end

    # Power spectrogram shared by stages 2 and 3.
    power_spec = np.abs(librosa.stft(audio_data, n_fft=1024, hop_length=hop)) ** 2

    # Stage 2: frame with the most energy in the 4-10 kHz mel bands.
    band_mel = librosa.feature.melspectrogram(S=power_spec, sr=sr, fmin=4000, fmax=10000)
    band_frame = band_mel.sum(0).argmax()
    band_start = max(int(band_frame * hop - window_len // 2), 0)
    band_end = band_start + window_len
    if band_end <= n_audio and snr(audio_data[band_start:band_end]) > 8:
        return band_start, band_end

    # Stage 3: frame with the lowest spectral entropy.
    frame_prob = power_spec / (power_spec.sum(0, keepdims=True) + 1e-12)
    frame_entropy = -(frame_prob * np.log(frame_prob + 1e-12)).sum(0)
    quiet_frame = frame_entropy.argmin()
    ent_start = max(int(quiet_frame * hop - window_len // 2), 0)
    ent_end = min(ent_start + window_len, n_audio)  # cut short at the end of the signal

    return ent_start, ent_end

In [None]:
# Cut a window out of a waveform according to a crop strategy.
def crop_audio(audio_data: np.ndarray, target_samples: int, strategy='center'):
    """Return a crop of ``audio_data`` selected by ``strategy``.

    Audio shorter than ``target_samples`` is first tiled (repeated end to end)
    until it is long enough.

    Args:
        audio_data: 1-D waveform; must not be empty.
        target_samples: Desired crop length in samples.
        strategy: One of
            ``'head'``   - start 1 s in (``config.FS`` samples), or earlier if it would not fit;
            ``'tail'``   - last ``target_samples`` samples;
            ``'center'`` - window centred on the middle of the audio;
            ``'random'`` - uniformly random start via ``np.random.randint``;
            a number     - start at that many seconds, clamped to fit;
            ``'maxRMS'`` / ``'maxdb'`` / ``'smart'`` / ``'smart1'`` - window chosen
            by the corresponding helper, called with its default window
            length, so the result can be shorter than ``target_samples``.

    Returns:
        The selected slice of the (possibly tiled) audio.

    Raises:
        ValueError: If ``audio_data`` is empty or ``strategy`` is unknown.
    """
    total_samples = len(audio_data)
    if total_samples == 0:
        # Tiling an empty array can never reach the target length.
        raise ValueError("Cannot crop an empty audio array")

    if total_samples < target_samples:
        n_copy = math.ceil(target_samples / total_samples)
        audio_data = np.concatenate([audio_data] * n_copy)
        total_samples = len(audio_data)

    # Latest start index at which a full window still fits.
    max_start = total_samples - target_samples

    if strategy == 'head':
        # Skip the first second unless the window would not fit.
        start_idx = min(int(1.0 * config.FS), max_start)
        end_idx = start_idx + target_samples
    elif strategy == 'tail':
        start_idx = max_start
        end_idx = start_idx + target_samples
    elif strategy == 'center':
        start_idx = total_samples // 2 - target_samples // 2
        start_idx = max(0, min(start_idx, max_start))
        end_idx = start_idx + target_samples
    elif strategy == 'random':
        start_idx = np.random.randint(0, max_start + 1)
        end_idx = start_idx + target_samples
    elif isinstance(strategy, (float, int)):
        start_idx = int(strategy * config.FS)
        start_idx = max(0, min(start_idx, max_start))
        end_idx = start_idx + target_samples
    elif strategy == "maxRMS":
        start_idx, end_idx = extract_maxrms(audio_data)
    elif strategy == "maxdb":
        start_idx, end_idx = extract_maxdb(audio_data)
    elif strategy == "smart":
        start_idx, end_idx = smart_crop(audio_data)    # max-RMS window, then fallbacks
    elif strategy == "smart1":
        start_idx, end_idx = smart1_crop(audio_data)   # centre window, then fallbacks
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    return audio_data[start_idx:end_idx]

In [None]:
# Convert a waveform into a normalised mel-spectrogram.
def audio2melspec(audio_data):
    """Return the dB-scaled mel-spectrogram of ``audio_data``, min-max normalised.

    NaN samples are replaced by the mean of the non-NaN samples before the
    transform. The mel parameters come from ``config``. The result is shifted
    by its minimum and divided by its range (plus 1e-8).
    """
    nan_mask = np.isnan(audio_data)
    if nan_mask.any():
        fill_value = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=fill_value)

    mel_params = dict(
        sr=config.FS,
        n_fft=config.N_FFT,
        hop_length=config.HOP_LENGTH,
        n_mels=config.N_MELS,
        fmin=config.FMIN,
        fmax=config.FMAX,
        power=2.0,
    )
    power_mel = librosa.feature.melspectrogram(y=audio_data, **mel_params)

    # dB relative to the loudest bin.
    mel_db = librosa.power_to_db(power_mel, ref=np.max)

    # Min-max scaling; the small constant guards against a zero range.
    db_floor = mel_db.min()
    db_range = mel_db.max() - db_floor + 1e-8
    return (mel_db - db_floor) / db_range

In [None]:
# Turn one recording into mel-spectrograms; written to be run from parallel jobs.
# Augmentation crops are taken at evenly spaced offsets.
def process_row(row):
    """Build the mel-spectrogram(s) for one metadata row.

    The file at ``row.filepath`` is loaded at ``config.FS`` and limited to
    ``[valid_start_sec, valid_end_sec]`` (a missing end means the end of the
    file). One main clip is chosen with the ``"smart"`` crop strategy; the
    row's ``crop_strategy`` column is not consulted. If ``n_augment`` > 0,
    that many additional fixed-offset crops are appended, named
    ``<samplename>_crop<i>``.

    Args:
        row: A metadata row providing ``filepath``, ``samplename`` and
            optionally ``valid_start_sec``, ``valid_end_sec``, ``n_augment``.

    Returns:
        ``(items, None)`` on success, where ``items`` is a list of
        ``(name, float32 mel)`` pairs; ``(None, (filepath, message))`` if
        anything raised.
    """
    strategy = "smart"

    def _clip_to_mel(clip, target_samples):
        # Zero-pad on the right up to the target length, convert to mel and
        # bring it to config.TARGET_SHAPE.
        if len(clip) < target_samples:
            clip = np.pad(clip, (0, target_samples - len(clip)), mode='constant')
        mel = audio2melspec(clip)
        if mel.shape != config.TARGET_SHAPE:
            # TARGET_SHAPE is compared against mel.shape, i.e. (height, width),
            # while cv2.resize expects dsize as (width, height).
            target_height, target_width = config.TARGET_SHAPE
            mel = cv2.resize(mel, (target_width, target_height), interpolation=cv2.INTER_LINEAR)
        return mel.astype(np.float32)

    try:
        audio_data, _ = librosa.load(row.filepath, sr=config.FS)
        target_samples = int(config.TARGET_DURATION * config.FS)

        mel_list = []
        name_list = []

        # === Usable range: seconds -> samples ===
        valid_start_sec = row.get("valid_start_sec", 0)
        valid_end_sec = row.get("valid_end_sec", None)
        duration_sec = len(audio_data) / config.FS

        if valid_end_sec is None or pd.isna(valid_end_sec):
            valid_end_sec = duration_sec

        valid_start_sample = int(valid_start_sec * config.FS)
        valid_end_sample = int(valid_end_sec * config.FS)

        usable_audio = audio_data[valid_start_sample:valid_end_sample]
        total_usable_samples = len(usable_audio)

        # === Main clip ===
        clip = crop_audio(usable_audio, target_samples, strategy=strategy)
        mel_list.append(_clip_to_mel(clip, target_samples))
        name_list.append(row.samplename)

        # === Additional crops according to n_augment ===
        n_aug = int(row.get("n_augment", 0))
        if n_aug <= 0:
            return list(zip(name_list, mel_list)), None

        # Latest start at which a full window fits; 0 when the usable audio is
        # shorter than one window, so the start index can never go negative.
        max_start = max(total_usable_samples - target_samples, 0)
        interval = max(max_start // (n_aug + 1), 1)

        for i in range(n_aug):
            start_idx = min(i * interval, max_start)
            clip = usable_audio[start_idx: start_idx + target_samples]
            mel_list.append(_clip_to_mel(clip, target_samples))
            name_list.append(f"{row.samplename}_crop{i}")

        return list(zip(name_list, mel_list)), None

    except Exception as e:
        # Report the failure to the caller instead of aborting the whole batch.
        return None, (row.filepath, str(e))

In [None]:
# Turn one recording into mel-spectrograms; written to be run from parallel jobs.
# The usable audio is split into equal blocks and each augmentation crop is
# taken from its own block with crop_audio.
def process_multisegment_row(row):
    """Build the mel-spectrogram(s) for one metadata row, one crop per block.

    The file at ``row.filepath`` is loaded at ``config.FS`` and limited to
    ``[valid_start_sec, valid_end_sec]`` (a missing end means the end of the
    file). One main clip is chosen from the whole usable audio with the
    ``"maxRMS"`` strategy. If ``n_augment`` > 0, the usable audio is split
    into ``n_augment + 1`` equal blocks and blocks 1..n_augment each
    contribute one crop: chosen with the same strategy when the block is at
    least one window long, otherwise drawn at random from the whole usable
    audio. Crops are named ``<samplename>_crop<k>`` with k starting at 0.

    Args:
        row: A metadata row providing ``filepath``, ``samplename`` and
            optionally ``valid_start_sec``, ``valid_end_sec``, ``n_augment``.

    Returns:
        ``(items, None)`` on success, where ``items`` is a list of
        ``(name, float32 mel)`` pairs; ``(None, (filepath, message))`` if
        anything raised.
    """
    # Change this to switch the crop strategy.
    strategy = "maxRMS"

    def _clip_to_mel(clip, target_samples):
        # Zero-pad on the right up to the target length (safety net), convert
        # to mel and bring it to config.TARGET_SHAPE.
        if len(clip) < target_samples:
            clip = np.pad(clip, (0, target_samples - len(clip)), mode='constant')
        mel = audio2melspec(clip)
        if mel.shape != config.TARGET_SHAPE:
            # TARGET_SHAPE is compared against mel.shape, i.e. (height, width),
            # while cv2.resize expects dsize as (width, height).
            target_height, target_width = config.TARGET_SHAPE
            mel = cv2.resize(mel, (target_width, target_height), interpolation=cv2.INTER_LINEAR)
        return mel.astype(np.float32)

    try:
        audio_data, _ = librosa.load(row.filepath, sr=config.FS)
        target_samples = int(config.TARGET_DURATION * config.FS)

        mel_list = []
        name_list = []

        # === Usable range: seconds -> samples ===
        valid_start_sec = row.get("valid_start_sec", 0)
        valid_end_sec = row.get("valid_end_sec", None)
        duration_sec = len(audio_data) / config.FS

        if valid_end_sec is None or pd.isna(valid_end_sec):
            valid_end_sec = duration_sec

        valid_start_sample = int(valid_start_sec * config.FS)
        valid_end_sample = int(valid_end_sec * config.FS)

        usable_audio = audio_data[valid_start_sample:valid_end_sample]
        total_usable_samples = len(usable_audio)

        # === Main clip ===
        clip = crop_audio(usable_audio, target_samples, strategy=strategy)
        mel_list.append(_clip_to_mel(clip, target_samples))
        name_list.append(row.samplename)

        # === Additional crops according to n_augment ===
        n_aug = int(row.get("n_augment", 0))
        if n_aug <= 0:
            return list(zip(name_list, mel_list)), None

        # n_aug + 2 border points delimit n_aug + 1 equal blocks.
        borders = np.linspace(0, total_usable_samples, n_aug + 2, dtype=int)

        for i in range(1, n_aug + 1):
            segment = usable_audio[borders[i]:borders[i + 1]]

            if len(segment) >= target_samples:
                # Block is long enough: crop inside it with the chosen strategy.
                clip = crop_audio(segment, target_samples, strategy=strategy)
            else:
                # Block too short: draw a random window from the whole usable audio.
                clip = crop_audio(usable_audio, target_samples, strategy="random")

            mel_list.append(_clip_to_mel(clip, target_samples))
            name_list.append(f"{row.samplename}_crop{i-1}")

        return list(zip(name_list, mel_list)), None

    except Exception as e:
        # Report the failure to the caller instead of aborting the whole batch.
        return None, (row.filepath, str(e))

In [None]:
# Run the mel conversion for every row with joblib, showing a progress bar.
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib


tqdm.monitor_interval = 0


# Every row of working_df is processed.
total_samples = len(working_df)

progress_bar = tqdm(desc="Processing", total=total_samples, dynamic_ncols=True, disable=False)
row_jobs = (delayed(process_row)(row) for _, row in working_df.iterrows())
with tqdm_joblib(progress_bar):
    results = Parallel(n_jobs=config.N_JOBS, verbose=0)(row_jobs)


# Collect the results: name -> mel for successes, (filepath, message) for failures.
all_bird_data = {}
errors = []

for mel_items, failure in results:
    if mel_items is not None:
        all_bird_data.update(mel_items)
    if failure is not None:
        errors.append(failure)

print(f"Total errors: {len(errors)}")
if errors:
    print("Errors:")
    for failed_path, message in errors:
        print(f"  {failed_path}: {message}")

In [None]:
# Add one metadata row per augmentation crop, named to match the crop's mel key.
augmented_rows = []

for _, row in working_df.iterrows():
    # range() is empty for n_augment <= 0, so such rows contribute nothing.
    for crop_idx in range(int(row.get('n_augment', 0))):
        crop_row = row.copy()
        crop_row['samplename'] = f"{row.samplename}_crop{crop_idx}"
        augmented_rows.append(crop_row)

# Original rows first, crop rows after, with a fresh index.
augmented_rows = pd.DataFrame(augmented_rows)
working_df_augmented = pd.concat([working_df, augmented_rows], ignore_index=True)
print(f"✅ working_df_augmented created with {len(augmented_rows)} augmented rows.")

In [None]:
# Drop every row whose file failed during conversion.

# File paths of the failed conversions.
error_files = [e[0] for e in errors]

# Rows (originals and their crops) that point at a failed file.
is_failed = working_df_augmented['filepath'].isin(error_files)
to_remove = working_df_augmented[is_failed]

# List what is about to be removed.
print("削除されるファイル:")
for fname in to_remove['filename']:
    print(fname)

# Keep only the rows whose file converted successfully.
working_df_augmented = working_df_augmented[~is_failed]

In [None]:
# melとworking_dfを保存．working_dfはtrain.csvとして保存


import os
import numpy as np
import pickle
import csv
from datetime import datetime
import pytz

# === Output directory, named after the current time in JST ===
jst = pytz.timezone('Asia/Tokyo')
now = datetime.now(jst)
timestamp = now.strftime("%Y%m%d_%H%M")

# Debug runs always go to the same folder; real runs get a timestamped one.
if config.debug:
    output_dir = os.path.join(config.PROCESSED_DIR, "data_debugs")
else:
    output_dir = os.path.join(config.PROCESSED_DIR, f"melspec_{timestamp}")
os.makedirs(output_dir, exist_ok=True)

# === 1. Save the mel-spectrograms ===
# NOTE: the name -> mel dict is wrapped in an object array and written with
# pickle, so the file is a pickle stream rather than the .npy format, despite
# its extension.
output_path = os.path.join(output_dir, "birdclef2025_melspec_5sec_256_256.npy")
wrapped_array = np.array(all_bird_data, dtype=object)

with open(output_path, 'wb') as f:
    pickle.dump(wrapped_array, f, protocol=5)

print(f"\n✅ Mel-spectrograms saved to: {output_path}")
print(f"📦 File size: {os.path.getsize(output_path) / (1024 ** 2):.2f} MB")
if all_bird_data:
    print(f"📐 Example shape: {next(iter(all_bird_data.values())).shape}")
else:
    # Nothing was converted; keep going so the config and train.csv are still written.
    print("⚠️ No mel-spectrograms were produced; the saved file contains an empty dict.")

# === 2. Save the config as key/value rows ===
config_path = os.path.join(output_dir, "config.csv")
config_dict = {k: v for k, v in vars(config).items() if not k.startswith("__")}

with open(config_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["key", "value"])
    writer.writerows(config_dict.items())

print(f"📝 Config saved to: {config_path}")


# === 3. Save the (augmented) metadata as train.csv ===
train_csv_path = os.path.join(output_dir, "train.csv")
working_df_augmented.to_csv(train_csv_path, index=False)

print(f"📝 Augmented training metadata saved to: {train_csv_path}")
print(f"📊 Total rows: {len(working_df_augmented)}")