In [5]:
import os
import logging
import random
import gc
import time
import cv2
import math
import warnings
from pathlib import Path
from datetime import datetime, timezone, timedelta

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import librosa

from sklearn.metrics import roc_auc_score, average_precision_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import json
import timm

from importlib import reload

logging.basicConfig(level=logging.ERROR)

from module import preprocess_lib, datasets_lib, utils_lib, models_lib, learning_lib, config_lib
reload(config_lib)

<module 'module.config_lib' from '/root/program/birdclef-2025/scripts/module/config_lib.py'>

In [6]:
def calculate_auc(targets, probs):
    """Macro-averaged ROC AUC over the classes that can actually be scored.

    Args:
        targets: 2-D array (rows = samples, columns = classes) of hard or soft
            labels; values ``>= 0.5`` are treated as positive.
        probs: 2-D array of scores laid out like ``targets``.

    Returns:
        Mean of the per-class ROC AUC values, or ``0.0`` when no class can be
        scored.
    """
    # ROC AUC needs binary labels, so soft labels are thresholded at 0.5.
    targets_bin = (targets >= 0.5).astype(int)
    n_samples = targets_bin.shape[0]
    positives = targets_bin.sum(axis=0)

    # A column is only scorable when it holds BOTH classes. All-negative
    # columns were already skipped; all-positive columns are skipped too,
    # because roc_auc_score cannot score them (it raises ValueError or yields
    # NaN, depending on the scikit-learn version).
    aucs = [roc_auc_score(targets_bin[:, i], probs[:, i])
            for i in range(targets.shape[1]) if 0 < positives[i] < n_samples]
    return np.mean(aucs) if aucs else 0.0

def calculate_map(targets, probs):
    """Macro-averaged average precision over classes with at least one positive.

    Args:
        targets: 2-D array (rows = samples, columns = classes) of hard or soft
            labels; values ``>= 0.5`` are treated as positive.
        probs: 2-D array of scores laid out like ``targets``.

    Returns:
        Mean of the per-class average-precision values, or ``0.0`` when no
        class has a positive sample.
    """
    binary_targets = (targets >= 0.5).astype(int)

    per_class_ap = []
    for col in range(targets.shape[1]):
        y_true = binary_targets[:, col]
        # Classes without any positive sample are left out of the average.
        if np.sum(y_true) > 0:
            per_class_ap.append(average_precision_score(y_true, probs[:, col]))

    if not per_class_ap:
        return 0.0
    return np.mean(per_class_ap)

class CFG:
    """Run configuration for the BirdCLEF notebooks.

    Which attributes exist depends on the constructor arguments:

    * Path attributes differ between the Kaggle and the local layout.
      ``RAW_DIR``, ``PROCESSED_DIR``, ``models_dir``, ``pseudo_label_csv`` and
      ``pseudo_melspec_npy`` are only defined for the local layout.
    * Training hyper-parameters (``seed``, ``epochs``, ``n_fold``, ...) are
      only defined when ``mode == "train"``.
    * Model, audio and ``device`` settings are always defined.

    Args:
        mode: ``"train"`` or ``"inference"``.
        kaggle_notebook: use the ``/kaggle/input`` paths instead of the local
            ``../data`` / ``../models`` layout.
        debug: in train mode, shrink the run (2 epochs, fold 0 only, batch
            size 4).

    Raises:
        AssertionError: if ``mode`` is not one of the two supported values.
    """

    def __init__(self, mode="train", kaggle_notebook=False, debug=False):
        assert mode in ["train", "inference"], "mode must be 'train' or 'inference'"
        self.mode = mode
        self.KAGGLE_NOTEBOOK = kaggle_notebook
        self.debug = debug

        # ===== Path Settings =====
        if self.KAGGLE_NOTEBOOK:
            # NOTE(review): `models_dir` is not defined for this layout, so code
            # that reads cfg.models_dir only works with the local layout.
            self.OUTPUT_DIR = ''
            self.train_datadir = '/kaggle/input/birdclef-2025/train_audio'
            self.train_csv = '/kaggle/input/birdclef-2025/train.csv'
            self.test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
            self.submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
            self.taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
            self.spectrogram_npy = '/kaggle/input/birdclef25-mel-spectrograms/birdclef2025_melspec_5sec_256_256.npy'
            self.model_path = '/kaggle/input/birdclef-2025-0330'
        else:
            self.OUTPUT_DIR = '../data/result/'
            self.RAW_DIR = '../data/raw/'
            self.PROCESSED_DIR = '../data/processed/'
            self.train_datadir = '../data/raw/train_audio/'
            self.train_csv = '../data/raw/train.csv'
            self.test_soundscapes = '../data/raw/test_soundscapes/'
            self.submission_csv = '../data/raw/sample_submission.csv'
            self.taxonomy_csv = '../data/raw/taxonomy.csv'
            self.models_dir = "../models/"  # root directory where all models are saved
            self.model_path = self.models_dir  # per-model save location; changed dynamically during training

            self.spectrogram_npy = '../data/processed/baseline/birdclef2025_melspec_5sec_256_256.npy'

            self.pseudo_label_csv = "../data/result/pseudo_labels_baseline_7sec.csv"
            self.pseudo_melspec_npy = "../data/processed/train_soundscapes_0407/train_soundscapes_melspecs.npy"

        # ===== Model Settings =====
        self.model_name = 'efficientnet_b0'
        # Pretrained weights are only requested when training.
        self.pretrained = mode == "train"
        self.in_channels = 1

        # The device is needed to load and run a model in either mode, so it is
        # defined unconditionally (it used to exist in train mode only).
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # ===== Audio Settings =====
        self.FS = 32000
        self.WINDOW_SIZE = 5.0  # window size used at inference time
        self.TARGET_DURATION = 5.0  # window size used when the dataset was created
        self.TARGET_SHAPE = (256, 256)
        self.N_FFT = 1024
        self.HOP_LENGTH = 512
        self.N_MELS = 128
        self.FMIN = 50
        self.FMAX = 14000

        # ===== Training Mode =====
        if mode == "train":
            self.seed = 42
            self.apex = False
            self.print_freq = 100
            self.num_workers = 2

            self.LOAD_DATA = True
            self.epochs = 10
            self.batch_size = 32
            self.criterion = 'BCEWithLogitsLoss'

            self.n_fold = 5
            self.selected_folds = [0, 1, 2, 3, 4]

            self.optimizer = 'AdamW'
            self.lr = 5e-4
            self.weight_decay = 1e-5
            self.scheduler = 'CosineAnnealingLR'
            self.min_lr = 1e-6
            # Captured before the debug override below, so T_max stays at the
            # full epoch count even when debug shortens the run.
            self.T_max = self.epochs

            self.aug_prob = 0.5
            self.mixup_alpha_real = 0.5
            self.mixup_alpha_pseudo = 0.5

            self.use_pseudo_mixup = False  # whether to apply mixup with pseudo labels
            self.pseudo_mix_prob = 0.4  # probability of using pseudo labels in mixup
            self.pseudo_conf_threshold = 0.5

            if self.debug:
                self.epochs = 2
                self.selected_folds = [0]
                self.batch_size = 4
                

In [7]:
# Original note: with debug=True the number of validation samples is fixed at 1000.
# NOTE(review): nothing in this notebook implements that cap — presumably it
# lives in the imported module code; confirm before relying on it.
cfg = CFG(mode="train", kaggle_notebook=False, debug=False)

In [8]:
# Seed via the project helper, using the seed from the config
# (presumably this seeds the RNGs used downstream — see utils_lib.set_seed).
utils_lib.set_seed(cfg.seed)

In [9]:

class BirdCLEFValidator:
    """Re-scores saved per-fold checkpoints on their validation split.

    Constructing the validator immediately reads the taxonomy CSV and the
    pre-computed spectrogram file named in ``cfg``. ``evaluate_model_dir``
    then, for every fold, rebuilds the validation split, loads
    ``model_fold{fold}.pth``, predicts, and writes predictions and targets as
    CSV files next to the checkpoint.

    Args:
        cfg: configuration object; reads ``taxonomy_csv``, ``spectrogram_npy``,
            ``n_fold``, ``seed``, ``batch_size``, ``num_workers``, ``device``
            and ``models_dir``, and gets ``num_classes`` written onto it.
        df: training metadata with a ``primary_label`` column (used for the
            stratified split).
        datasets_lib: module providing ``BirdCLEFDatasetFromNPY`` and
            ``collate_fn``.
        models_lib: module providing ``BirdCLEFModelForTrain``.
    """

    def __init__(self, cfg, df, datasets_lib, models_lib):
        self.cfg = cfg
        self.df = df
        self.datasets_lib = datasets_lib
        self.models_lib = models_lib
        self.label2index = {}
        self.index2label = {}
        self.spectrograms = None
        self.val_metrics = {}   # metrics of the most recent _predict() call
        self.val_loader = None  # loader of the most recently evaluated fold

        self._load_taxonomy()
        self._load_spectrograms()

    def _load_taxonomy(self):
        """Build the label <-> index maps and set ``cfg.num_classes``."""
        taxonomy_df = pd.read_csv(self.cfg.taxonomy_csv)
        species_ids = taxonomy_df['primary_label'].tolist()
        # Side effect: the shared cfg object learns the class count here.
        self.cfg.num_classes = len(species_ids)
        self.index2label = {i: label for i, label in enumerate(species_ids)}
        self.label2index = {label: i for i, label in enumerate(species_ids)}

    def _load_spectrograms(self):
        """Load the pickled spectrogram container stored in a ``.npy`` file."""
        print(f"Loading pre-computed mel spectrograms from: {self.cfg.spectrogram_npy}")
        # SECURITY: allow_pickle=True unpickles the file, which can execute
        # arbitrary code — only load spectrogram files you produced yourself.
        self.spectrograms = np.load(self.cfg.spectrogram_npy, allow_pickle=True).item()
        print(f"Loaded {len(self.spectrograms)} spectrograms")

    def _get_val_df(self, fold):
        """Return the validation rows of ``fold`` for a seeded stratified split."""
        skf = StratifiedKFold(n_splits=self.cfg.n_fold, shuffle=True, random_state=self.cfg.seed)
        _, val_idx = list(skf.split(self.df, self.df['primary_label']))[fold]
        return self.df.iloc[val_idx].reset_index(drop=True)

    def _get_val_loader(self, val_df):
        """Wrap ``val_df`` in the project dataset and an unshuffled DataLoader."""
        val_dataset = self.datasets_lib.BirdCLEFDatasetFromNPY(val_df, self.cfg, self.spectrograms, mode='valid')
        return DataLoader(val_dataset, batch_size=self.cfg.batch_size, shuffle=False,
                          num_workers=self.cfg.num_workers, pin_memory=True,
                          collate_fn=self.datasets_lib.collate_fn)

    def _load_model(self, model_path):
        """Instantiate the model, load ``model_state_dict`` and switch to eval mode."""
        model = self.models_lib.BirdCLEFModelForTrain(self.cfg).to(self.cfg.device)
        # SECURITY: torch.load unpickles the checkpoint — only load trusted files.
        state = torch.load(model_path, map_location=self.cfg.device)
        model.load_state_dict(state['model_state_dict'])
        return model.eval()

    @staticmethod
    def _sigmoid(outputs):
        """Map logits to probabilities with ``1 / (1 + exp(-x))``.

        exp() overflows to ``inf`` for very negative logits; the quotient is
        then exactly 0.0, which is the intended value, so only the overflow
        warning is silenced.
        """
        with np.errstate(over='ignore'):
            return 1 / (1 + np.exp(-outputs))

    @staticmethod
    def _mean_of_valid(classwise):
        """Mean of the non-NaN values of a ``{class_index: score}`` dict (0.0 if none)."""
        values = [v for v in classwise.values() if v is not None and not np.isnan(v)]
        return np.mean(values) if values else 0.0

    def _calculate_auc(self, targets, outputs):
        """Macro ROC AUC from logits; classes that cannot be scored are skipped.

        Derived from the class-wise scores, so it tolerates unscorable classes
        the same way ``_calculate_map`` does.
        """
        return self._mean_of_valid(self._calculate_classwise_auc(targets, outputs))

    def _calculate_classwise_auc(self, targets, outputs):
        """Per-class ROC AUC as ``{class_index: auc}``.

        Only classes with at least one positive are included; a class that
        ``roc_auc_score`` rejects with ValueError is recorded as NaN.
        """
        probs = self._sigmoid(outputs)

        # ROC AUC needs binary labels, so soft labels are thresholded at 0.5
        # (safe for both continuous and integer targets).
        targets_bin = (targets >= 0.5).astype(int)

        classwise_auc = {}
        for i in range(targets.shape[1]):
            if np.sum(targets_bin[:, i]) > 0:
                try:
                    classwise_auc[i] = roc_auc_score(targets_bin[:, i], probs[:, i])
                except ValueError:
                    classwise_auc[i] = np.nan
        return classwise_auc

    def _calculate_classwise_ap(self, targets, outputs):
        """Per-class average precision as ``{class_index: ap}``.

        Only classes with at least one positive are included; a ValueError from
        the metric is recorded as NaN.
        """
        probs = self._sigmoid(outputs)

        # Binarise the labels (supports soft labels).
        targets_bin = (targets >= 0.5).astype(int)

        classwise_ap = {}
        for i in range(targets.shape[1]):
            if np.sum(targets_bin[:, i]) > 0:
                try:
                    classwise_ap[i] = average_precision_score(targets_bin[:, i], probs[:, i])
                except ValueError:
                    classwise_ap[i] = np.nan
        return classwise_ap

    def _calculate_map(self, targets, outputs):
        """Macro average precision from logits; NaN classes are skipped."""
        return self._mean_of_valid(self._calculate_classwise_ap(targets, outputs))

    def _predict(self, model, loader):
        """Run ``model`` over ``loader`` and refresh ``self.val_metrics``.

        Batches may carry ``melspec`` either as one tensor or as a list of
        per-sample tensors (in which case samples are run one at a time).

        Returns:
            Tuple ``(outputs, targets, filenames)`` of NumPy arrays, where
            ``outputs`` holds raw model outputs (logits), one row per sample.
        """
        model.eval()
        all_outputs, all_targets, all_filenames = [], [], []
        with torch.no_grad():
            for batch in tqdm(loader, desc="Validation"):
                if isinstance(batch['melspec'], list):
                    for melspec, target, filename in zip(batch['melspec'], batch['target'], batch['filename']):
                        inputs = melspec.unsqueeze(0).to(self.cfg.device)
                        output = model(inputs)
                        output = output[0] if isinstance(output, tuple) else output
                        # The output still carries the batch axis added by
                        # unsqueeze(0). extend() stores its single row, exactly
                        # like the batched branch below; append() would stack
                        # to (N, 1, C) and break the per-class metrics.
                        all_outputs.extend(output.detach().cpu().numpy())
                        all_targets.append(target.numpy())
                        all_filenames.append(filename)
                else:
                    inputs = batch['melspec'].to(self.cfg.device)
                    outputs = model(inputs)
                    outputs = outputs[0] if isinstance(outputs, tuple) else outputs
                    outputs = outputs.detach().cpu().numpy()
                    targets = batch['target'].numpy()
                    all_outputs.extend(outputs)
                    all_targets.extend(targets)
                    all_filenames.extend(batch['filename'])

        all_outputs = np.array(all_outputs)
        all_targets = np.array(all_targets)
        all_filenames = np.array(all_filenames)

        # Score every class once and derive the macro averages from that.
        classwise_auc = self._calculate_classwise_auc(all_targets, all_outputs)
        classwise_ap = self._calculate_classwise_ap(all_targets, all_outputs)
        self.val_metrics = {
            'val_auc': self._mean_of_valid(classwise_auc),
            "val_map": self._mean_of_valid(classwise_ap),
            "val_classwise_auc": classwise_auc,
            "val_classwise_ap": classwise_ap,
        }

        return all_outputs, all_targets, all_filenames

    def evaluate_model_dir(self, model_dir):
        """Evaluate every fold checkpoint found in ``cfg.models_dir/model_dir``.

        For each fold whose ``model_fold{fold}.pth`` exists, writes
        ``predictions_fold{fold}.csv`` (sigmoid probabilities) and
        ``targets_fold{fold}.csv`` into the same directory and prints the
        fold's AUC / MAP. Missing checkpoints are reported and skipped.
        """
        full_dir = os.path.join(self.cfg.models_dir, model_dir)
        print(f"\n🔍 Evaluating model directory: {full_dir}")

        for fold in range(self.cfg.n_fold):
            model_path = os.path.join(full_dir, f"model_fold{fold}.pth")
            if not os.path.exists(model_path):
                print(f"⛔️ model_fold{fold}.pth not found in {model_dir}")
                continue

            val_df = self._get_val_df(fold)
            val_loader = self._get_val_loader(val_df)
            self.val_loader = val_loader

            model = self._load_model(model_path)
            outputs, targets, filenames = self._predict(model, val_loader)

            class_names = [self.index2label[i] for i in range(outputs.shape[1])]
            probs = self._sigmoid(outputs)
            df_preds = pd.DataFrame(probs, columns=class_names)
            df_preds.insert(0, "row_id", filenames)
            df_preds.to_csv(os.path.join(full_dir, f"predictions_fold{fold}.csv"), index=False)

            df_targets = pd.DataFrame(targets, columns=class_names)
            df_targets.insert(0, "row_id", filenames)
            df_targets.to_csv(os.path.join(full_dir, f"targets_fold{fold}.csv"), index=False)

            print("Val AUC:", f"{self.val_metrics['val_auc']:.4f}", "Val MAP:", f"{self.val_metrics['val_map']:.4f}")

        print(f"\n✅ Finished evaluation for model_dir: {model_dir}")


In [24]:
# Driver cell: re-evaluate every listed model directory on its validation folds.
# Relies on `cfg`, `datasets_lib` and `models_lib` from earlier cells.
if __name__ == "__main__":
    print("\nLoading training data...")
    train_df = pd.read_csv(cfg.train_csv)

    # Directory names are resolved relative to cfg.models_dir by the validator.
    model_dirs = [
    "baseline_pseudo_th0.5",
    ]
    # Building the validator loads the taxonomy and the spectrogram file once.
    validator = BirdCLEFValidator(cfg, train_df, datasets_lib, models_lib)
    # ====== Run ======
    for model_dir in model_dirs:
        validator.evaluate_model_dir(model_dir)
        



Loading training data...
Loading pre-computed mel spectrograms from: ../data/processed/baseline/birdclef2025_melspec_5sec_256_256.npy
Loaded 28564 spectrograms

🔍 Evaluating model directory: ../models/baseline_pseudo_th0.5
Found 5713 matching spectrograms for valid dataset out of 5713 samples




Validation:   0%|          | 0/179 [00:01<?, ?it/s]

Val AUC: 0.9424 Val MAP: 0.5033
Found 5713 matching spectrograms for valid dataset out of 5713 samples




Validation:   0%|          | 0/179 [00:01<?, ?it/s]

Val AUC: 0.9455 Val MAP: 0.5235
Found 5713 matching spectrograms for valid dataset out of 5713 samples




Validation:   0%|          | 0/179 [00:01<?, ?it/s]

Val AUC: 0.9484 Val MAP: 0.5218
Found 5713 matching spectrograms for valid dataset out of 5713 samples




Validation:   0%|          | 0/179 [00:01<?, ?it/s]

Val AUC: 0.9474 Val MAP: 0.5185
Found 5712 matching spectrograms for valid dataset out of 5712 samples




Validation:   0%|          | 0/179 [00:01<?, ?it/s]

Val AUC: 0.9469 Val MAP: 0.5294

✅ Finished evaluation for model_dir: baseline_pseudo_th0.5


In [None]:

def weighted_ensemble(prediction_dict):
    """Blend several prediction CSVs into one weighted average.

    The first column of every CSV is the row identifier (``row_id``) and the
    remaining columns are per-class scores. Class columns are matched by name
    against the first file; rows are matched by position, so every file must
    list the same rows in the same order.

    Args:
        prediction_dict (dict): keys are paths to prediction CSVs, values are
            the weights (float) of those predictions.

    Returns:
        pd.DataFrame: ensembled predictions (``row_id`` + class probabilities),
        normalised by the sum of the weights.

    Raises:
        AssertionError: if ``prediction_dict`` is empty.
        ValueError: if the weights sum to zero, or a file's rows do not line up
            with the first file (different length or different ``row_id``s).
    """
    assert len(prediction_dict) > 0, "No predictions provided."

    paths = list(prediction_dict)
    weights = [prediction_dict[path] for path in paths]
    dfs = [pd.read_csv(path) for path in paths]

    total_weight = np.sum(weights)
    if total_weight == 0:
        # Normalising would silently fill the result with NaN/inf.
        raise ValueError("Ensemble weights sum to zero; cannot normalize.")

    row_ids = dfs[0]['row_id']
    class_names = dfs[0].columns[1:]

    # Rows are combined by position, so misaligned files would be averaged
    # across different samples without any error — check before blending.
    for path, df in zip(paths[1:], dfs[1:]):
        if len(df) != len(row_ids):
            raise ValueError(
                f"{path} has {len(df)} rows but {paths[0]} has {len(row_ids)}."
            )
        if 'row_id' in df.columns and not np.array_equal(df['row_id'].to_numpy(), row_ids.to_numpy()):
            raise ValueError(f"row_id values of {path} do not match those of {paths[0]}.")

    # stack predictions and apply weights
    stacked = np.stack([df[class_names].values * w for df, w in zip(dfs, weights)], axis=0)
    ensemble_preds = np.sum(stacked, axis=0) / total_weight

    df_ensemble = pd.DataFrame(ensemble_preds, columns=class_names)
    df_ensemble.insert(0, "row_id", row_ids)
    return df_ensemble



# Score a fixed 0.6 / 0.4 blend of two models on every selected fold.
# Relies on `cfg`, `weighted_ensemble`, `calculate_auc` and `calculate_map`
# from earlier cells.
selected_folds = [0, 1, 2, 3, 4]

all_aucs = []
all_maps = []

for fold in selected_folds:
    print(f"Fold {fold} validation")

    # Targets are read from the first model's directory.
    # NOTE(review): assumes both models' prediction files cover the same rows
    # in the same order as these targets — confirm.
    targets = pd.read_csv(os.path.join(cfg.models_dir, "baseline_7sec", f"targets_fold{fold}.csv"))
    model_1_path = os.path.join(cfg.models_dir, "baseline_7sec", f"predictions_fold{fold}.csv")
    model_2_path = os.path.join(cfg.models_dir, "baseline_pseudo_th0.5", f"predictions_fold{fold}.csv")
    pred_dict = {
        model_1_path: 0.6,
        model_2_path: 0.4,
    }

    df_ens = weighted_ensemble(pred_dict)

    # Drop the leading row_id column before scoring.
    target_values = targets.iloc[:, 1:].values
    ensemble_values = df_ens.iloc[:, 1:].values

    auc = calculate_auc(target_values, ensemble_values)
    # Named `mean_ap` rather than `map`: assigning to `map` at cell level would
    # shadow the built-in map() for the rest of the kernel session.
    mean_ap = calculate_map(target_values, ensemble_values)

    all_aucs.append(auc)
    all_maps.append(mean_ap)

    print("Ensemble AUC:", f"{auc:.3f}")
    print("Ensemble MAP:", f"{mean_ap:.3f}")

print("Average AUC:", f"{np.mean(all_aucs):.3f}")
print("Average MAP:", f"{np.mean(all_maps):.3f}")



Fold 0 validation
Ensemble AUC: 0.950
Ensemble MAP: 0.529
Average AUC: 0.950
Average MAP: 0.529


In [42]:
def grid_search_weights(pred_paths, targets, resolution=11):
    """Grid-search the blend weight of two prediction files by macro AUC.

    Candidate weights ``(w1, 1 - w1)`` are spread evenly over ``[0, 1]`` with
    ``resolution`` points, so the default of 11 gives steps of 0.1.

    Args:
        pred_paths: sequence of prediction CSV paths; only the first two
            entries are blended.
        targets: DataFrame whose first column is the row id and whose
            remaining columns are the per-class targets.
        resolution: number of grid points (both endpoints included).

    Returns:
        Tuple ``(best_df, best_weights)``: the ensembled predictions and the
        ``(w1, w2)`` pair with the highest AUC. Both are ``None`` when the grid
        is empty (``resolution <= 0``).
    """
    best_auc = -1
    best_weights = None
    best_df = None

    # Divide by (resolution - 1) so the grid always spans 0..1. The step used
    # to be hard-coded to 1/10, which was only right for resolution == 11.
    # max(..., 1) keeps resolution == 1 yielding the single point (0.0, 1.0).
    steps = max(resolution - 1, 1)
    weights_grid = [(w / steps, 1 - w / steps) for w in range(resolution)]

    # Loop-invariant: targets without the leading row_id column.
    target_values = targets.iloc[:, 1:].values

    for w1, w2 in weights_grid:
        pred_dict = {
            pred_paths[0]: w1,
            pred_paths[1]: w2,
        }
        df_ens = weighted_ensemble(pred_dict)
        auc = calculate_auc(target_values, df_ens.iloc[:, 1:].values)

        # Strict '>' keeps the first of several equally good weightings.
        if auc > best_auc:
            best_auc = auc
            best_weights = (w1, w2)
            best_df = df_ens

    print(f"Best AUC: {best_auc:.4f} with weights: {best_weights}")
    return best_df, best_weights

# Hidden-state dependency: `model_1_path`, `model_2_path` and `targets` are the
# variables left over from the fold loop in the previous cell, i.e. they refer
# to the last fold that loop processed.
df_ens, best_weights = grid_search_weights([model_1_path, model_2_path], targets)

Best AUC: 0.9498 with weights: (0.6, 0.4)


In [34]:
# Original note: for the 7-sec model it is expected that these scores do not
# match the ones seen during training. The 7-sec model was trained on a 7-sec
# dataset, whereas this notebook uses the 5-sec dataset ("so this is the
# correct evaluation").
model_dir_name = "baseline_fold0_7sec"
model_path = os.path.join(cfg.models_dir, model_dir_name)
predictions = pd.read_csv(os.path.join(model_path, "predictions_fold0.csv"))
targets = pd.read_csv(os.path.join(model_path, "targets_fold0.csv"))
# Loaded but not used further in this cell.
log = pd.read_csv(os.path.join(model_path, "log_fold0.csv"))

# The first column is the row id, so it is dropped before scoring.
print("Val AUC:", f"{calculate_auc(targets.iloc[:, 1:].values, predictions.iloc[:, 1:].values):.3f}")
print("Val MAP:", f"{calculate_map(targets.iloc[:, 1:].values, predictions.iloc[:, 1:].values):.3f}")

Val AUC: 0.942
Val MAP: 0.466


In [25]:
# Original note: for the 7-sec model it is expected that these scores do not
# match the ones seen during training, because that model was trained on a
# 7-sec dataset while this notebook uses the 5-sec dataset.
# NOTE(review): this note looks copied from the 7-sec cell — confirm that it
# also applies to the model evaluated here.
model_dir_name = "baseline_pseudo_th0.5"
model_path = os.path.join(cfg.models_dir, model_dir_name)

all_aucs = []
all_maps = []

for fold in range(5):
    predictions = pd.read_csv(os.path.join(model_path, f"predictions_fold{fold}.csv"))
    targets = pd.read_csv(os.path.join(model_path, f"targets_fold{fold}.csv"))
    # Loaded but not used further in this cell.
    log = pd.read_csv(os.path.join(model_path, f"log_fold{fold}.csv"))

    # Drop the leading row_id column before scoring.
    target_values = targets.iloc[:, 1:].values
    prediction_values = predictions.iloc[:, 1:].values

    auc = calculate_auc(target_values, prediction_values)
    # Named `mean_ap` rather than `map`: assigning to `map` at cell level would
    # shadow the built-in map() for the rest of the kernel session.
    mean_ap = calculate_map(target_values, prediction_values)
    all_aucs.append(auc)
    all_maps.append(mean_ap)
    print(f"Fold {fold}: Val AUC: {auc:.3f}, Val MAP: {mean_ap:.3f}")
print(f"Mean AUC: {np.mean(all_aucs):.3f}, Mean MAP: {np.mean(all_maps):.3f}")

Fold 0: Val AUC: 0.942, Val MAP: 0.503
Fold 1: Val AUC: 0.946, Val MAP: 0.523
Fold 2: Val AUC: 0.948, Val MAP: 0.522
Fold 3: Val AUC: 0.947, Val MAP: 0.519
Fold 4: Val AUC: 0.947, Val MAP: 0.529
Mean AUC: 0.946, Mean MAP: 0.519
