# **BirdCLEF 2025 Data Preprocessing Notebook**

In [108]:
import os
import cv2
import math
import time
import librosa
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from importlib import reload

import os
import numpy as np
from datetime import datetime
import pytz
import torch
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedGroupKFold

from joblib import Parallel, delayed


from module import config_lib, utils_lib

In [123]:
class DatasetConfig:
    def __init__(self, kaggle_notebook=False, debug=False):
        self.KAGGLE_NOTEBOOK = kaggle_notebook
        self.debug = debug

        # ===== Path Settings =====
        if self.KAGGLE_NOTEBOOK:
            self.OUTPUT_DIR = ''
            self.train_datadir = '/kaggle/input/birdclef-2025/train_audio'
            self.train_csv = '/kaggle/input/birdclef-2025/train.csv'
            self.test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
            self.submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
            self.taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
            self.model_path = '/kaggle/input/birdclef-2025-0330'
        else:
            self.OUTPUT_DIR = '../data/result/'
            self.train_datadir = '../data/raw/train_audio/'
            self.test_soundscapes = '../data/raw/test_soundscapes/'
            self.submission_csv = '../data/raw/sample_submission.csv'
            self.taxonomy_csv = '../data/raw/taxonomy.csv'
            self.models_dir = "../models/" # 全modelの保存先
            self.model_path = self.models_dir # 各モデルの保存先．学習時に動的に変更．
            self.RAW_DIR = '../data/raw/'
            self.PROCESSED_DIR = '../data/processed/'
            
            self.train_csv = '../data/processed/mel_CropAugment/train.csv'
            self.spectrogram_npy = '../data/processed/mel_CropAugment/birdclef2025_melspec_5sec_256_256.npy'


        # ===== Audio Settings =====
        self.FS = 32000
        self.WINDOW_SIZE = 5.0 # 推論時のウィンドウサイズ
        self.TARGET_DURATION = 5 # データセット作成時のウィンドウサイズ
        self.TARGET_SHAPE = (256, 256)
        self.N_FFT = 1024
        self.HOP_LENGTH = 16
        self.N_MELS = 148
        self.FMIN = 20
        self.FMAX = 16000
        self.N_MAX = 50 if self.debug else None        
        self.N_JOBS = 16  # 並列処理のスレッド数 16くらいでいい
        self.LOAD_ENGINE = 'torchaudio'  # librosa or torchaudio
        self.SKIP_RESIZE = False  # resizeしないならTrue
        self.seed = 42
        self.n_fold = 5
            

In [124]:
cfg = DatasetConfig(kaggle_notebook=False, debug=False)

In [125]:
utils_lib.set_seed(cfg.seed)

In [90]:
print(f"Debug mode: {'ON' if cfg.debug else 'OFF'}")
print(f"Max samples to process: {cfg.N_MAX if cfg.N_MAX is not None else 'ALL'}")

print("Loading taxonomy data...")
taxonomy_df = pd.read_csv(f'{cfg.RAW_DIR}/taxonomy.csv')
species_class_map = dict(zip(taxonomy_df['primary_label'], taxonomy_df['class_name']))

print("Loading training metadata...")
train_df = pd.read_csv(f'{cfg.RAW_DIR}/train.csv')

Debug mode: OFF
Max samples to process: ALL
Loading taxonomy data...
Loading training metadata...


In [91]:
working_df_aug = pd.read_csv(cfg.train_csv)
# mixupの有無を示すカラムを追加
if 'is_mixup' not in working_df_aug.columns:
    working_df_aug['is_mixup'] = False
if 'mix_partner' not in working_df_aug.columns:
    working_df_aug['mix_partner'] = None
if 'mixup_weight' not in working_df_aug.columns:
    working_df_aug['mixup_weight'] = None

# === 2. Rare label 抽出（例: <50件） ===
label_counts = working_df_aug['primary_label'].value_counts()
# ✅ レア種候補の primary_label を抽出
rare_labels = working_df_aug.loc[working_df_aug['n_augment'] > 0, 'primary_label'].unique().tolist()

# ✅ そのうちの crop だけを対象に
rare_crop_df = working_df_aug[
    (working_df_aug['primary_label'].isin(rare_labels)) &
    (working_df_aug['samplename'].str.contains('_crop')) &
    (~working_df_aug['samplename'].str.contains('_mix_'))  # 既存mixup除外
]

# === ラベル情報 ===
label2id = {label: idx for idx, label in enumerate(sorted(working_df_aug['primary_label'].unique()))}
num_classes = len(label2id)

# === レア種crop（augmentationされた）だけを抽出 ===
rare_crop_df = working_df_aug[
    (working_df_aug['n_augment'] > 0) &
    (working_df_aug['samplename'].str.contains('_crop')) &
    (~working_df_aug['samplename'].str.contains('_mix_'))
].copy()

# === コモン種抽出 ===
num_common_labels = 50 # 50位でデータ数180程度．
label_counts = working_df_aug['primary_label'].value_counts()
common_labels = label_counts.head(num_common_labels).index.tolist()

common_df = working_df_aug[working_df_aug['primary_label'].isin(common_labels)].copy()


In [92]:
# レア種にコモン種を混ぜて上書き．cropされているものはすべてmixup対象
all_bird_data = np.load(cfg.spectrogram_npy, allow_pickle=True).item()

for idx, row in rare_crop_df.iterrows():
    s_rare = row['samplename']
    label = row['primary_label']
    if s_rare not in all_bird_data:
        continue

    # === コモン種のcropを1つランダムに選択 乱数固定
    s_common_row = common_df.sample(1, random_state=cfg.seed + idx).iloc[0]
    s_common = s_common_row['samplename']
    common_label = s_common_row['primary_label']

    if s_common not in all_bird_data:
        continue

    # === Mixup実行 ===
    m1 = all_bird_data[s_rare]
    m2 = all_bird_data[s_common]

    lam = np.random.uniform(0.7, 0.9)
    mixed = np.clip(lam * m1 + (1 - lam) * m2, 0, 1)

    # === melを上書き ===
    all_bird_data[s_rare] = mixed

    # === working_dfの該当行を更新 ===
    working_df_aug.loc[idx, 'is_mixup'] = True
    working_df_aug.loc[idx, 'mix_partner'] = s_common
    working_df_aug.loc[idx, 'mixup_weight'] = round(float(lam), 4)

    # === secondary_labels を追加・更新 ===
    try:
        current_sec = row['secondary_labels']
        current_sec_list = eval(current_sec) if isinstance(current_sec, str) else []
    except:
        current_sec_list = []

    # --- 型・形式の安全処理 ---
    if not isinstance(current_sec_list, list):
        current_sec_list = []

    # ✅ 空文字やNoneを除去
    current_sec_list = [s for s in current_sec_list if s and s.strip() != '']

    # ✅ コモン種ラベルが入っていなければ追加
    if common_label not in current_sec_list:
        current_sec_list.append(common_label)

    working_df_aug.loc[idx, 'secondary_labels'] = str(current_sec_list)

In [93]:
import os
import numpy as np
import pickle
import csv
from datetime import datetime
import pytz

# === JST時刻でディレクトリ作成 ===
jst = pytz.timezone('Asia/Tokyo')
now = datetime.now(jst)
timestamp = now.strftime("%Y%m%d_%H%M")

# ✅ 保存先フォルダを debug に応じて分岐
if cfg.debug:
    output_dir = os.path.join(cfg.PROCESSED_DIR, "data_debugs")
else:
    output_dir = os.path.join(cfg.PROCESSED_DIR, f"melspec_{timestamp}")
os.makedirs(output_dir, exist_ok=True)

# === 1. melスペクトログラム（all_bird_data）の保存 ===
output_path = os.path.join(output_dir, "birdclef2025_melspec_5sec_256_256.npy")

wrapped_array = np.array(all_bird_data, dtype=object)
with open(output_path, 'wb') as f:
    pickle.dump(wrapped_array, f, protocol=5)

print(f"\n✅ Mel-spectrograms saved to: {output_path}")
print(f"📦 File size: {os.path.getsize(output_path) / (1024 ** 2):.2f} MB")
print(f"📐 Example shape: {next(iter(all_bird_data.values())).shape}")

# === 2. configの保存 ===
config_path = os.path.join(output_dir, "config.csv")
config_dict = {k: v for k, v in vars(cfg).items() if not k.startswith("__")}

with open(config_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["key", "value"])
    for key, value in config_dict.items():
        writer.writerow([key, value])

print(f"📝 Config saved to: {config_path}")

# === 3. augmented metadata の保存 ===
train_csv_path = os.path.join(output_dir, "train.csv")
working_df_aug.to_csv(train_csv_path, index=False)

print(f"📝 Augmented training metadata saved to: {train_csv_path}")
print(f"📊 Total rows: {len(working_df_aug)}")


✅ Mel-spectrograms saved to: ../data/processed/melspec_20250412_1803/birdclef2025_melspec_5sec_256_256.npy
📦 File size: 7980.93 MB
📐 Example shape: (256, 256)
📝 Config saved to: ../data/processed/melspec_20250412_1803/config.csv
📝 Augmented training metadata saved to: ../data/processed/melspec_20250412_1803/train.csv
📊 Total rows: 31913


In [110]:
utils_lib.play_audio("rebbla1/XC560068.ogg", cfg.train_datadir)

In [103]:
working_df_aug[working_df_aug["primary_label"] == "1139490"].head()

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,...,samplename,class,crop_strategy,n_augment,multi_crop,group_id,fold,is_mixup,mix_partner,mixup_weight
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,...,1139490-CSA36385,Insecta,head,24,True,1139490-CSA36385,0,False,,
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,...,1139490-CSA36389,Insecta,head,24,True,1139490-CSA36389,3,False,,
28564,1139490,['grekis'],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,...,1139490-CSA36385_crop0,Insecta,head,24,True,1139490-CSA36385,0,True,grekis-iNat826128,0.7749
28565,1139490,['orcpar'],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,...,1139490-CSA36385_crop1,Insecta,head,24,True,1139490-CSA36385,0,True,orcpar-XC822732,0.8901
28566,1139490,['banana'],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,...,1139490-CSA36385_crop2,Insecta,head,24,True,1139490-CSA36385,0,True,banana-iNat1083501,0.8464


In [113]:
mel = all_bird_data["1139490-CSA36385"]
utils_lib.inverse_melspec(mel, cfg)

In [120]:
melspec = np.load("../data/processed/mel_0411/birdclef2025_melspec_5sec_256_256.npy", allow_pickle=True).item()

In [121]:
mel = all_bird_data["1192948-CSA36373"]
utils_lib.inverse_melspec(mel, cfg)