# **BirdCLEF 2025 Data Preprocessing Notebook**

In [41]:
import os
import cv2
import math
import time
import librosa
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import os
import numpy as np
from datetime import datetime
import pytz
import torch
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedGroupKFold

from joblib import Parallel, delayed


from module import config_lib, utils_lib

In [43]:
class DatasetConfig:
    def __init__(self, kaggle_notebook=False, debug=False):
        self.KAGGLE_NOTEBOOK = kaggle_notebook
        self.debug = debug

        # ===== Path Settings =====
        if self.KAGGLE_NOTEBOOK:
            self.OUTPUT_DIR = ''
            self.train_datadir = '/kaggle/input/birdclef-2025/train_audio'
            self.train_csv = '/kaggle/input/birdclef-2025/train.csv'
            self.test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
            self.submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
            self.taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
            self.model_path = '/kaggle/input/birdclef-2025-0330'
        else:
            self.OUTPUT_DIR = '../data/result/'
            self.train_datadir = '../data/raw/train_audio/'
            self.train_csv = '../data/raw/train.csv'
            self.test_soundscapes = '../data/raw/test_soundscapes/'
            self.submission_csv = '../data/raw/sample_submission.csv'
            self.taxonomy_csv = '../data/raw/taxonomy.csv'
            self.models_dir = "../models/" # 全modelの保存先
            self.model_path = self.models_dir # 各モデルの保存先．学習時に動的に変更．
            self.RAW_DIR = '../data/raw/'
            self.PROCESSED_DIR = '../data/processed/'


        # ===== Audio Settings =====
        self.FS = 32000
        self.WINDOW_SIZE = 5.0 # 推論時のウィンドウサイズ
        self.TARGET_DURATION = 5 # データセット作成時のウィンドウサイズ
        self.TARGET_SHAPE = (256, 256)
        self.N_FFT = 1024
        self.HOP_LENGTH = 16
        self.N_MELS = 148
        self.FMIN = 20
        self.FMAX = 16000
        self.N_MAX = 50 if self.debug else None        
        self.N_JOBS = 16  # 並列処理のスレッド数 16くらいでいい
        self.LOAD_ENGINE = 'torchaudio'  # librosa or torchaudio
        self.SKIP_RESIZE = False  # resizeしないならTrue
        self.seed = 42
        self.n_fold = 5
        self.num_rare_samples = 50 # これ以下のサンプル数のspeciesはrare speciesとして扱う
            

In [44]:
config = DatasetConfig(kaggle_notebook=False, debug=False)

In [45]:
utils_lib.set_seed(config.seed)

In [46]:
print(f"Debug mode: {'ON' if config.debug else 'OFF'}")
print(f"Max samples to process: {config.N_MAX if config.N_MAX is not None else 'ALL'}")

print("Loading taxonomy data...")
taxonomy_df = pd.read_csv(f'{config.RAW_DIR}/taxonomy.csv')
species_class_map = dict(zip(taxonomy_df['primary_label'], taxonomy_df['class_name']))

print("Loading training metadata...")
train_df = pd.read_csv(f'{config.RAW_DIR}/train.csv')

Debug mode: OFF
Max samples to process: ALL
Loading taxonomy data...
Loading training metadata...


In [47]:
label_list = sorted(train_df['primary_label'].unique())
label_id_list = list(range(len(label_list)))
label2id = dict(zip(label_list, label_id_list))
id2label = dict(zip(label_id_list, label_list))

print(f'Found {len(label_list)} unique species')
working_df = train_df.copy()
working_df['target'] = working_df.primary_label.map(label2id)
working_df['filepath'] = config.RAW_DIR + '/train_audio/' + working_df.filename
working_df['samplename'] = working_df.filename.map(lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0])
working_df['class'] = working_df.primary_label.map(lambda x: species_class_map.get(x, 'Unknown'))
working_df["crop_strategy"] = "center"
total_samples = min(len(working_df), config.N_MAX or len(working_df))
print(f'Total samples to process: {total_samples} out of {len(working_df)} available')
print(f'Samples by class:')
print(working_df['class'].value_counts())

Found 206 unique species
Total samples to process: 28564 out of 28564 available
Samples by class:
class
Aves        27648
Amphibia      583
Mammalia      178
Insecta       155
Name: count, dtype: int64


In [48]:
# Fabio A. Sarria-Sの音声は最初にInsectaの鳴き声があるため，最初をcropしたい
# crop_starategyを'head'に変更する

# 1. "Fabio A. Sarria-S" の filename を抽出
fabio_filenames = train_df.loc[
    train_df['author'] == "Fabio A. Sarria-S", 'filename'
].tolist()

# 2. working_df の crop_strategy を 'head' に更新
working_df['crop_strategy'] = working_df.get('crop_strategy', 'center')  # 初期化（なければcenter）

working_df.loc[
    working_df['filename'].isin(fabio_filenames),
    'crop_strategy'
] = 'head'

In [49]:
# 全音源のdurationを計算
def get_duration(filepath, sr):
    try:
        audio, _ = librosa.load(filepath, sr=sr)
        return len(audio) / sr
    except Exception as e:
        print(f"[ERROR] Could not load {filepath}: {e}")
        return np.nan

print("🔄 Calculating durations with parallel processing...")

# tqdm 対応
filepaths = working_df['filepath'].tolist()
durations = Parallel(n_jobs=32)(
    delayed(get_duration)(fp, config.FS) for fp in tqdm(filepaths)
)

working_df['duration_sec'] = durations
print("✅ Added 'duration_sec' column to working_df")

🔄 Calculating durations with parallel processing...


  0%|          | 0/28564 [00:00<?, ?it/s]

✅ Added 'duration_sec' column to working_df


In [50]:
# 1. 列を初期化
working_df["valid_start_sec"] = 0
working_df["valid_end_sec"] = None  # None or np.nan にしておく

In [None]:
# CSA コレクションの音源 → 後ろ7秒を無効に．スペイン語の除去
csa_mask = working_df['collection'] == 'CSA'
working_df.loc[csa_mask, 'valid_end_sec'] = working_df.loc[csa_mask, 'duration_sec'] - 7.0

# 負の値にならないように
working_df['valid_end_sec'] = working_df['valid_end_sec'].clip(lower=0.0)


#  一部ラベル（スペイン語が含まれない例外）には full duration を許可（valid_end_sec を None に）
# 1564122, 523060はCSAだが短すぎ．スペイン語の解説もない．
exception_labels = ['1564122', '523060']
working_df.loc[working_df['primary_label'].isin(exception_labels), 'valid_end_sec'] = None


# Fabio A. Sarria-S の音声は 0〜7秒 だけ使用可能に設定．後半はただの説明なので
working_df.loc[
    working_df['filename'].isin(fabio_filenames), 'valid_end_sec'
] = 7.0


In [214]:
i = 19
idx_list = working_df[working_df["collection"] == "CSA"]["primary_label"].unique()
df = working_df[working_df["primary_label"] == f"{idx_list[i]}"][["primary_label", "filename", "author", "valid_end_sec", "duration_sec"]]
df

Unnamed: 0,primary_label,filename,author,valid_end_sec,duration_sec
911,963335,963335/CSA36372.ogg,Fabio A. Sarria-S,7.0,108.2425
912,963335,963335/CSA36374.ogg,Fabio A. Sarria-S,7.0,103.571344
913,963335,963335/CSA36375.ogg,Fabio A. Sarria-S,7.0,106.700375
914,963335,963335/CSA36377.ogg,Fabio A. Sarria-S,7.0,107.488969
915,963335,963335/CSA36393.ogg,Fabio A. Sarria-S,7.0,97.343281


In [215]:
utils_lib.play_audio(filename="963335/CSA36374.ogg",base_path="../data/raw//train_audio/")

In [None]:
# CSAチェック
# 最後にスペイン語が含まれていないlabel
# 1564122, 50186/CSA28885.ogg, 523060
# 52884/CSA14875.ogg
# 548639
# 714022
# 868458



# スペイン語最初に含まれる 4secくらい
# 50186/CSA28885.ogg
# 52884/CSA14875.ogg



# 話してる人
# Eliana Barona- Cortés
# Alexandra Butrago-Cardona
# Fabio A. Sarria-S

# 人の声だけの箇所
# 24292/CSA34649.ogg 2min48移行
# 24292/CSA34651.ogg 1min34移行
# 476537/CSA35459.ogg 2min14移行
# 476537/CSA35461.ogg 4min19移行


In [205]:
working_df[working_df["primary_label"] == "714022"]

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,...,target,filepath,samplename,class,crop_strategy,duration_sec,valid_start_sec,valid_end_sec,n_augment,multi_crop
876,714022,[''],[''],714022/CSA34203.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,-3.8333,-70.3333,Panoploscelis specularis,...,54,../data/raw//train_audio/714022/CSA34203.ogg,714022-CSA34203,Insecta,center,27.169125,0,20.169125,4,True
877,714022,[''],[''],714022/CSA34204.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,-3.8333,-70.3333,Panoploscelis specularis,...,54,../data/raw//train_audio/714022/CSA34204.ogg,714022-CSA34204,Insecta,center,37.742344,0,30.742344,6,True
878,714022,[''],[''],714022/CSA34205.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,-3.8333,-70.3333,Panoploscelis specularis,...,54,../data/raw//train_audio/714022/CSA34205.ogg,714022-CSA34205,Insecta,center,40.458906,0,33.458906,6,True
879,714022,[''],[''],714022/CSA34206.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,-3.8333,-70.3333,Panoploscelis specularis,...,54,../data/raw//train_audio/714022/CSA34206.ogg,714022-CSA34206,Insecta,center,28.307656,0,21.307656,4,True
880,714022,[''],[''],714022/CSA34207.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,-3.8333,-70.3333,Panoploscelis specularis,...,54,../data/raw//train_audio/714022/CSA34207.ogg,714022-CSA34207,Insecta,center,42.226188,0,35.226188,7,True


In [98]:
working_df[(working_df["duration_sec"] < 15) & (working_df["collection"] == "CSA")]["primary_label"].value_counts()

primary_label
1564122    6
548639     5
523060     4
868458     4
48124      3
1194042    1
52884      1
Name: count, dtype: int64

In [52]:
# 初期化
working_df['n_augment'] = 0
working_df['multi_crop'] = False

target_samples = int(config.TARGET_DURATION * config.FS)

# valid_end_sec が None なら duration_sec に補完
working_df['valid_end_sec'] = working_df.apply(
    lambda row: row['duration_sec'] if pd.isna(row['valid_end_sec']) else row['valid_end_sec'],
    axis=1
)

# valid_start_sec が None なら 0 に補完（念のため）
working_df['valid_start_sec'] = working_df['valid_start_sec'].fillna(0)

# rareなラベルを抽出
label_counts = working_df['primary_label'].value_counts().rename_axis("label").reset_index(name="sample_count")
rare_labels = label_counts[label_counts['sample_count'] < config.num_rare_samples]['label'].tolist()

# ✅ rare種ごとに crop 数を割り当てる
for rare_label in rare_labels:
    base_rows = working_df[working_df['primary_label'] == rare_label]
    n_exist = len(base_rows)
    n_needed = config.num_rare_samples - n_exist
    n_aug_per_sample = math.ceil(n_needed / n_exist)

    for idx, row in base_rows.iterrows():
        usable_duration_sec = row['valid_end_sec'] - row['valid_start_sec']
        usable_samples = int(usable_duration_sec * config.FS)

        # 少なくとも2倍にする
        max_possible = usable_samples // target_samples
        n_actual = min(n_aug_per_sample, max_possible)

        if n_actual > 0:
            working_df.at[idx, 'multi_crop'] = True
            working_df.at[idx, 'n_augment'] = n_actual

In [53]:
# crop_strategyに基づいて音声データを切り出す
def crop_audio(audio_data: np.ndarray, target_samples: int, strategy='center'):
    total_samples = len(audio_data)

    if total_samples < target_samples:
        n_copy = math.ceil(target_samples / total_samples)
        audio_data = np.concatenate([audio_data] * n_copy)
        total_samples = len(audio_data)

    if strategy == 'head':
        # 1秒遅らせて開始（ただし収まらない場合は0から）
        buffer = int(1.0 * config.FS)
        start_idx = min(buffer, total_samples - target_samples)
    elif strategy == 'tail':
        start_idx = total_samples - target_samples
    elif strategy == 'center':
        start_idx = total_samples // 2 - target_samples // 2
    elif strategy == 'random':
        max_start = total_samples - target_samples
        start_idx = np.random.randint(0, max_start + 1)
    elif isinstance(strategy, (float, int)):
        start_idx = int(strategy * config.FS)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    start_idx = max(0, min(start_idx, total_samples - target_samples))
    end_idx = start_idx + target_samples
    return audio_data[start_idx:end_idx]

In [54]:
def audio2melspec(audio_data):
    if np.isnan(audio_data).any():
        mean_signal = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=mean_signal)

    mel_spec = librosa.feature.melspectrogram(
        y=audio_data,
        sr=config.FS,
        n_fft=config.N_FFT,
        hop_length=config.HOP_LENGTH,
        n_mels=config.N_MELS,
        fmin=config.FMIN,
        fmax=config.FMAX,
        power=2.0
    )

    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
    
    return mel_spec_norm

In [55]:
def process_row(row):
    try:
        audio_data, _ = librosa.load(row.filepath, sr=config.FS)
        target_samples = int(config.TARGET_DURATION * config.FS)

        mel_list = []
        name_list = []

        # === 有効範囲を秒 → サンプルに変換 ===
        valid_start_sec = row.get("valid_start_sec", 0)
        valid_end_sec = row.get("valid_end_sec", None)
        duration_sec = len(audio_data) / config.FS

        if pd.isna(valid_end_sec) or valid_end_sec is None:
            valid_end_sec = duration_sec

        valid_start_sample = int(valid_start_sec * config.FS)
        valid_end_sample = int(valid_end_sec * config.FS)

        usable_audio = audio_data[valid_start_sample:valid_end_sample]
        total_usable_samples = len(usable_audio)

        # === オリジナル clip ===
        strategy = row.crop_strategy
        try:
            strategy = float(strategy)
        except ValueError:
            pass

        clip = crop_audio(usable_audio, target_samples, strategy="center")  # strategyはcenter固定 or 任意でも可
        if len(clip) < target_samples:
            clip = np.pad(clip, (0, target_samples - len(clip)), mode='constant')
        mel = audio2melspec(clip)
        if mel.shape != config.TARGET_SHAPE:
            mel = cv2.resize(mel, config.TARGET_SHAPE, interpolation=cv2.INTER_LINEAR)
        mel_list.append(mel.astype(np.float32))
        name_list.append(row.samplename)

        # === n_augment に応じて crop ===
        n_aug = int(row.get("n_augment", 0))
        if n_aug <= 0:
            return list(zip(name_list, mel_list)), None

        interval = max((total_usable_samples - target_samples) // (n_aug + 1), 1)

        for i in range(n_aug):
            start_idx = min(i * interval, total_usable_samples - target_samples)
            clip = usable_audio[start_idx: start_idx + target_samples]
            if len(clip) < target_samples:
                clip = np.pad(clip, (0, target_samples - len(clip)), mode='constant')
            mel_crop = audio2melspec(clip)
            if mel_crop.shape != config.TARGET_SHAPE:
                mel_crop = cv2.resize(mel_crop, config.TARGET_SHAPE, interpolation=cv2.INTER_LINEAR)
            mel_list.append(mel_crop.astype(np.float32))
            name_list.append(f"{row.samplename}_crop{i}")

        return list(zip(name_list, mel_list)), None

    except Exception as e:
        return None, (row.filepath, str(e))

In [56]:
results = Parallel(n_jobs=config.N_JOBS)(
    delayed(process_row)(row) for _, row in working_df.iloc[:total_samples].iterrows()
)

# 結果の整理
all_bird_data = {}
errors = []

for result, err in results:
    if result is not None:
        for name, mel in result:
            all_bird_data[name] = mel
    if err is not None:
        errors.append(err)

In [69]:
errors

[('../data/raw//train_audio/1564122/CSA34195.ogg', 'division by zero'),
 ('../data/raw//train_audio/1564122/CSA34196.ogg', 'division by zero'),
 ('../data/raw//train_audio/1564122/CSA34197.ogg', 'division by zero'),
 ('../data/raw//train_audio/1564122/CSA34198.ogg', 'division by zero'),
 ('../data/raw//train_audio/1564122/CSA34199.ogg', 'division by zero'),
 ('../data/raw//train_audio/1564122/CSA34200.ogg', 'division by zero'),
 ('../data/raw//train_audio/523060/CSA34180.ogg', 'division by zero'),
 ('../data/raw//train_audio/523060/CSA34182.ogg', 'division by zero'),
 ('../data/raw//train_audio/548639/CSA34185.ogg', 'division by zero'),
 ('../data/raw//train_audio/548639/CSA34186.ogg', 'division by zero'),
 ('../data/raw//train_audio/548639/CSA34189.ogg', 'division by zero'),
 ('../data/raw//train_audio/868458/CSA34217.ogg', 'division by zero'),
 ('../data/raw//train_audio/868458/CSA34220.ogg', 'division by zero')]

In [57]:
# working_dfにaugmentしたデータ情報を追加
augmented_rows = []

for _, row in working_df.iterrows():
    n_aug = int(row.get('n_augment', 0))
    if n_aug > 0:
        for i in range(n_aug):
            new_row = row.copy()
            new_row['samplename'] = f"{row.samplename}_crop{i}"
            augmented_rows.append(new_row)

# DataFrameにまとめる
augmented_rows = pd.DataFrame(augmented_rows)
working_df_augmented = pd.concat([working_df, augmented_rows], ignore_index=True)
print(f"✅ working_df_augmented created with {len(augmented_rows)} augmented rows.")

✅ working_df_augmented created with 1912 augmented rows.


In [58]:
# foldを決めておく


working_df_augmented['group_id'] = working_df_augmented['samplename'].map(lambda x: x.split('_crop')[0])

# fold 列を初期化
working_df_augmented['fold'] = -1

# ✅ stratify + group 両立！
sgkf = StratifiedGroupKFold(n_splits=config.n_fold, shuffle=True, random_state=config.seed)
groups = working_df_augmented['group_id']
labels = working_df_augmented['primary_label']

for fold_id, (_, val_idx) in enumerate(sgkf.split(working_df_augmented, labels, groups=groups)):
    working_df_augmented.loc[val_idx, 'fold'] = fold_id

In [64]:
# 4mins
import os
import numpy as np
import pickle
import csv
from datetime import datetime
import pytz

# === JST時刻でディレクトリ作成 ===
jst = pytz.timezone('Asia/Tokyo')
now = datetime.now(jst)
timestamp = now.strftime("%Y%m%d_%H%M")

# ✅ 保存先フォルダを debug に応じて分岐
if config.debug:
    output_dir = os.path.join(config.PROCESSED_DIR, "data_debugs")
else:
    output_dir = os.path.join(config.PROCESSED_DIR, f"melspec_{timestamp}")
os.makedirs(output_dir, exist_ok=True)

# === 1. melスペクトログラムの保存 ===
output_path = os.path.join(output_dir, "birdclef2025_melspec_5sec_256_256.npy")
wrapped_array = np.array(all_bird_data, dtype=object)

with open(output_path, 'wb') as f:
    pickle.dump(wrapped_array, f, protocol=5)

print(f"\n✅ Mel-spectrograms saved to: {output_path}")
print(f"📦 File size: {os.path.getsize(output_path) / (1024 ** 2):.2f} MB")
print(f"📐 Example shape: {next(iter(all_bird_data.values())).shape}")

# === 2. configの保存 ===
config_path = os.path.join(output_dir, "config.csv")
config_dict = {k: v for k, v in vars(config).items() if not k.startswith("__")}

with open(config_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["key", "value"])
    for key, value in config_dict.items():
        writer.writerow([key, value])

print(f"📝 Config saved to: {config_path}")


# ✅ train.csv として保存
train_csv_path = os.path.join(output_dir, "train.csv")
working_df_augmented.to_csv(train_csv_path, index=False)

print(f"📝 Augmented training metadata saved to: {train_csv_path}")
print(f"📊 Total rows: {len(working_df_augmented)}")


✅ Mel-spectrograms saved to: ../data/processed/melspec_20250413_1922/birdclef2025_melspec_5sec_256_256.npy
📦 File size: 7618.41 MB
📐 Example shape: (256, 256)
📝 Config saved to: ../data/processed/melspec_20250413_1922/config.csv
📝 Augmented training metadata saved to: ../data/processed/melspec_20250413_1922/train.csv
📊 Total rows: 30476


In [102]:
spectrograms = np.load("../data/processed/mel_0411/birdclef2025_melspec_5sec_256_256.npy", allow_pickle=True).item()
