### Some useful references:
1. **[Training]**: https://github.com/LIHANG-HONG/birdclef2023-2nd-place-solution
2. **[Inference]**: https://www.kaggle.com/code/kadircandrisolu/efficientnet-b0-pytorch-inference-birdclef-25

This model backbone is seresnext26t_32x4d

In [13]:
import os
import gc
import warnings
import logging
import time
import math
import cv2
from pathlib import Path
import joblib

import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from soundfile import SoundFile 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import timm
from tqdm.auto import tqdm
from glob import glob
import torchaudio
import random
import itertools
from typing import Union

import concurrent.futures

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

In [14]:
class CFG:
    
    seed = 42
    print_freq = 100
    num_workers = 4

    stage = 'train_bce'
    train_datadir = '/kaggle/input/birdclef-2025/train_audio'
    train_csv = '/kaggle/input/birdclef-2025/train.csv'
    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
    model_files = ['/kaggle/input/bird2025-sed-ckpt/sedmodel.pth'
                  ]
 
    model_name = 'seresnext26t_32x4d'  
    pretrained = False
    in_channels = 1

    
    SR = 32000
    target_duration = 5
    train_duration = 10
    
    
    device = 'cpu'

cfg = CFG()

In [15]:
class CFG:
    def __init__(self, mode="train", kaggle_notebook=True, debug=False):
        assert mode in ["train", "inference"], "mode must be 'train' or 'inference'"
        self.mode = mode
        self.KAGGLE_NOTEBOOK = kaggle_notebook
        self.debug = debug

        # ===== Common Settings =====
        self.seed = 42
        self.print_freq = 100
        self.num_workers = 4
        self.stage = 'train_bce'

        # ===== Path Settings =====
        if self.KAGGLE_NOTEBOOK:
            self.train_datadir = '/kaggle/input/birdclef-2025/train_audio'
            self.train_csv = '/kaggle/input/birdclef-2025/train.csv'
            self.test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
            self.submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
            self.taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
            self.model_files = ['/kaggle/input/bird2025-sed-ckpt/sedmodel.pth']
            self.device = "cpu"
        else:
            self.train_datadir = '../data/raw/train_audio'
            self.train_csv = '../data/raw/train.csv'
            self.test_soundscapes = '../data/raw/test_soundscapes_small/'
            self.submission_csv = '../data/raw/sample_submission.csv'
            self.taxonomy_csv = '../data/raw/taxonomy.csv'
            self.model_files = ['../models/SEDmodel/']
            self.model_path = '../models/SEDmodel/'
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # ===== Model Settings =====
        self.model_name = 'seresnext26t_32x4d'
        self.pretrained = False
        self.in_channels = 1
        self.use_specific_folds = False
        self.specific_folds = [0, 1, 2, 3, 4]
        

        # ===== Audio Settings =====
        self.SR = 32000
        self.target_duration = 5
        self.train_duration = 10

        # ===== Debug Settings =====
        if self.debug:
            self.print_freq = 10
            self.model_files = self.model_files[:1]  # limit to 1 model for debug
            
cfg = CFG(mode="inference", kaggle_notebook=False, debug=False)

In [16]:
print(f"Using device: {cfg.device}")
print(f"Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

Using device: cuda
Loading taxonomy data...
Number of classes: 206


In [17]:
def set_seed(seed=42):
    """
    Set seed for reproducibility
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(cfg.seed)


In [18]:
class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)


def init_layer(layer):
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.0)

def init_bn(bn):
    bn.bias.data.fill_(0.0)
    bn.weight.data.fill_(1.0)

In [19]:


class BirdCLEFModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        
        taxonomy_df = pd.read_csv('/kaggle/input/birdclef-2025/taxonomy.csv')
        self.num_classes = len(taxonomy_df)

        self.bn0 = nn.BatchNorm2d(cfg['n_mels'])
        
        self.backbone = timm.create_model(
            cfg['model_name'],
            pretrained=False,
            in_chans=cfg['in_channels'],
            drop_rate=0.2,
            drop_path_rate=0.2,
        )

        layers = list(self.backbone.children())[:-2]
        self.encoder = nn.Sequential(*layers)
        
        if "efficientnet" in self.cfg['model_name']:
            backbone_out = self.backbone.classifier.in_features
        elif "eca" in self.cfg['model_name']:
            backbone_out = self.backbone.head.fc.in_features
        elif "res" in self.cfg['model_name']:
            backbone_out = self.backbone.fc.in_features
        else:
            backbone_out = self.backbone.num_features
            
        
        self.fc1 = nn.Linear(backbone_out, backbone_out, bias=True)
        self.att_block = AttBlockV2(backbone_out, self.num_classes, activation="sigmoid")

        self.melspec_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.cfg['SR'],
            hop_length=self.cfg['hop_length'],
            n_mels=self.cfg['n_mels'],
            f_min=self.cfg['f_min'],
            f_max=self.cfg['f_max'],
            n_fft=self.cfg['n_fft'],
            pad_mode="constant",
            norm="slaney",
            onesided=True,
            mel_scale="htk",
        )
        if self.cfg['device'] == "cuda":
            self.melspec_transform = self.melspec_transform.cuda()
        else:
            self.melspec_transform = self.melspec_transform.cpu()

        self.db_transform = torchaudio.transforms.AmplitudeToDB(
            stype="power", top_db=80
        )


    def extract_feature(self,x):
        x = x.permute((0, 1, 3, 2))
        frames_num = x.shape[2]
        
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        # if self.training:
        #    x = self.spec_augmenter(x)
        
        x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)
        x = self.encoder(x)
        
        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)
        
        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        return x, frames_num
        
    @torch.cuda.amp.autocast(enabled=False)
    def transform_to_spec(self, audio):

        audio = audio.float()
        
        spec = self.melspec_transform(audio)
        spec = self.db_transform(spec)

        if self.cfg['normal'] == 80:
            spec = (spec + 80) / 80
        elif self.cfg['normal'] == 255:
            spec = spec / 255
        else:
            raise NotImplementedError
                
        if self.cfg['in_channels'] == 3:
            spec = image_delta(spec)
        
        return spec

    def forward(self, x):

        with torch.no_grad():
            x = self.transform_to_spec(x)

        x, frames_num = self.extract_feature(x)
        
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        return torch.logit(clipwise_output)

    def infer(self, x, tta_delta=2):
        with torch.no_grad():
            x = self.transform_to_spec(x)
        x,_ = self.extract_feature(x)
        time_att = torch.tanh(self.att_block.att(x))
        feat_time = x.size(-1)
        start = (
            feat_time / 2 - feat_time * (self.cfg['infer_duration'] / self.cfg['duration_train']) / 2
        )
        end = start + feat_time * (self.cfg['infer_duration'] / self.cfg['duration_train'])
        start = int(start)
        end = int(end)
        pred = self.attention_infer(start,end,x,time_att)

        start_minus = max(0, start-tta_delta)
        end_minus=end-tta_delta
        pred_minus = self.attention_infer(start_minus,end_minus,x,time_att)

        start_plus = start+tta_delta
        end_plus=min(feat_time, end+tta_delta)
        pred_plus = self.attention_infer(start_plus,end_plus,x,time_att)

        pred = 0.5*pred + 0.25*pred_minus + 0.25*pred_plus
        return pred
        
    def attention_infer(self,start,end,x,time_att):
        feat = x[:, :, start:end]
        # att = torch.softmax(time_att[:, :, start:end], dim=-1)
        #             print(feat_time, start, end)
        #             print(att_a.sum(), att.sum(), time_att.shape)
        framewise_pred = torch.sigmoid(self.att_block.cla(feat))
        framewise_pred_max = framewise_pred.max(dim=2)[0]
        # clipwise_output = torch.sum(framewise_pred * att, dim=-1)
        #logits = torch.sum(
        #    self.att_block.cla(feat) * att,
        #    dim=-1,
        #)

        # return clipwise_output
        return framewise_pred_max

In [20]:
def waveform_to_melspec(audio_waveform, cfg):
    """
    Converts raw waveform to normalized mel spectrogram.
    Returns shape: (1, 1, n_mels, time) e.g., (1, 1, 256, 256)
    """
    sr = cfg.SR
    n_mels = 256
    n_fft = 1024
    hop_length = 512

    mel_spec = librosa.feature.melspectrogram(
        y=audio_waveform,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=20,
        fmax=16000
    )
    mel_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_norm = (mel_db + 80) / 80  # normalize to [0,1]

    # Pad or crop to fixed size
    target_length = 256
    if mel_norm.shape[1] < target_length:
        pad_width = target_length - mel_norm.shape[1]
        mel_norm = np.pad(mel_norm, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mel_norm = mel_norm[:, :target_length]

    mel_norm = mel_norm[np.newaxis, np.newaxis, :, :]  # (1, 1, 256, 256)
    return mel_norm.astype(np.float32)



In [21]:
def load_sample(path, cfg):
    audio, orig_sr = sf.read(path, dtype="float32")
    seconds = []
    audio_length = cfg.SR * cfg.target_duration
    step = audio_length
    for i in range(audio_length, len(audio) + step, step):
        start = max(0, i - audio_length)
        end = start + audio_length
        if end > len(audio):
            pass
        else:
            seconds.append(int(end/cfg.SR))

    audio = np.concatenate([audio,audio,audio])
    audios = []
    for i,second in enumerate(seconds):
        end_seconds = int(second)
        start_seconds = int(end_seconds - cfg.target_duration)
    
        end_index = int(cfg.SR * (end_seconds + (cfg.train_duration - cfg.target_duration) / 2) ) + len(audio) // 3
        start_index = int(cfg.SR * (start_seconds - (cfg.train_duration - cfg.target_duration) / 2) ) + len(audio) // 3
        end_pad = int(cfg.SR * (cfg.train_duration - cfg.target_duration) / 2) 
        start_pad = int(cfg.SR * (cfg.train_duration - cfg.target_duration) / 2) 
        y = audio[start_index:end_index].astype(np.float32)
        if i==0:
            y[:start_pad] = 0
        elif i==(len(seconds)-1):
            y[-end_pad:] = 0
        audios.append(y)

    return audios

def sigmoid(x):
    s = 1 / (1 + np.exp(-x))
    return s

In [22]:
def find_model_files(cfg):
    """
    Find all .pth model files in the specified model directory
    """
    model_files = []
    
    model_dir = Path(cfg.model_path)
    
    for path in model_dir.glob('**/*.pth'):
        model_files.append(str(path))
    
    return model_files

from openvino.runtime import Core

def load_openvino_models(vino_dir, cfg):
    models = []
    vino_dir = Path(vino_dir)
    core = Core()

    if cfg.use_specific_folds:
        fold_ids = cfg.folds
        xml_files = [vino_dir / f"model_fold{f}.xml" for f in fold_ids]
    else:
        xml_files = sorted(vino_dir.glob("model_fold*.xml"))

    for xml_path in xml_files:
        bin_path = xml_path.with_suffix(".bin")

        if not xml_path.exists() or not bin_path.exists():
            print(f"⚠️ Warning: Missing files for {xml_path.stem}")
            continue

        model_ir = core.read_model(xml_path)
        compiled_model = core.compile_model(model_ir, device_name="CPU")

        input_layer = compiled_model.input(0)
        output_layer = compiled_model.output(0)

        models.append({
            "compiled_model": compiled_model,
            "input_layer": input_layer,
            "output_layer": output_layer,
            "name": xml_path.name
        })

        print(f"✅ Loaded model: {xml_path.name}")

    print(f"🎉 Total {len(models)} OpenVINO model(s) loaded from {vino_dir}")
    return models

def predict_openvino(audio_waveform, model_dict, cfg):
    mel_input = waveform_to_melspec(audio_waveform, cfg)  # 前処理を明示的に行う

    compiled_model = model_dict["compiled_model"]
    input_layer = model_dict["input_layer"]
    output_layer = model_dict["output_layer"]

    result = compiled_model({input_layer: mel_input})
    output = result[output_layer]
    return output.squeeze()

def predict_on_spectrogram_openvino(audio_path, models, cfg, species_ids):
    predictions = []
    row_ids = []
    soundscape_id = Path(audio_path).stem

    print(f"Processing {soundscape_id}")
    audio_data = load_sample(audio_path, cfg)

    for segment_idx, audio_input in enumerate(audio_data):
        row_id = f"{soundscape_id}_{(segment_idx + 1) * cfg.target_duration}"
        row_ids.append(row_id)

        # audio_input: waveform of shape (samples,)
        if len(models) == 1:
            pred = predict_openvino(audio_input, models[0], cfg)
        else:
            preds = [predict_openvino(audio_input, m, cfg) for m in models]
            pred = np.mean(preds, axis=0)

        predictions.append(pred)

    predictions = np.stack(predictions, axis=0)
    return row_ids, predictions

In [23]:
def run_inference_openvino(cfg, models, species_ids):
    test_files = list(Path(cfg.test_soundscapes).glob('*.ogg'))
    if len(test_files) == 0:
        test_files = sorted(glob(str(Path('/kaggle/input/birdclef-2025/train_soundscapes') / '*.ogg')))[:10]

    print(f"Found {len(test_files)} test soundscapes")

    all_row_ids = []
    all_predictions = []

    for test_file in test_files:
        rids, preds = predict_on_spectrogram_openvino(test_file, models, cfg, species_ids)
        all_row_ids.extend(rids)
        all_predictions.extend(preds)

    return all_row_ids, all_predictions

def create_submission(row_ids, predictions, species_ids, cfg):
    """Create submission dataframe"""
    print("Creating submission dataframe...")

    submission_dict = {'row_id': row_ids}
    
    for i, species in enumerate(species_ids):
        submission_dict[species] = [pred[i] for pred in predictions]

    submission_df = pd.DataFrame(submission_dict)

    submission_df.set_index('row_id', inplace=True)

    sample_sub = pd.read_csv(cfg.submission_csv, index_col='row_id')

    missing_cols = set(sample_sub.columns) - set(submission_df.columns)
    if missing_cols:
        print(f"Warning: Missing {len(missing_cols)} species columns in submission")
        for col in missing_cols:
            submission_df[col] = 0.0

    submission_df = submission_df[sample_sub.columns]

    submission_df = submission_df.reset_index()
    
    return submission_df


def smooth_submission(submission_path):
        """
        Post-process the submission CSV by smoothing predictions to enforce temporal consistency.
        
        For each soundscape (grouped by the file name part of 'row_id'), each row's predictions
        are averaged with those of its neighbors using defined weights.
        
        :param submission_path: Path to the submission CSV file.
        """
        print("Smoothing submission predictions...")
        sub = pd.read_csv(submission_path)
        cols = sub.columns[1:]
        # Extract group names by splitting row_id on the last underscore
        groups = sub['row_id'].str.rsplit('_', n=1).str[0].values
        unique_groups = np.unique(groups)
        
        for group in unique_groups:
            # Get indices for the current group
            idx = np.where(groups == group)[0]
            sub_group = sub.iloc[idx].copy()
            predictions = sub_group[cols].values
            new_predictions = predictions.copy()
            
            if predictions.shape[0] > 1:
                # Smooth the predictions using neighboring segments
                new_predictions[0] = (predictions[0] * 0.8) + (predictions[1] * 0.2)
                new_predictions[-1] = (predictions[-1] * 0.8) + (predictions[-2] * 0.2)
                for i in range(1, predictions.shape[0]-1):
                    new_predictions[i] = (predictions[i-1] * 0.2) + (predictions[i] * 0.6) + (predictions[i+1] * 0.2)
            # Replace the smoothed values in the submission dataframe
            sub.iloc[idx, 1:] = new_predictions
        
        sub.to_csv(submission_path, index=False)
        print(f"Smoothed submission saved to {submission_path}")

In [24]:

start_time = time.time()
print("Starting BirdCLEF-2025 OpenVINO inference...")

vino_model_dir = cfg.model_path  # OpenVINOのモデル保存ディレクトリ
models = load_openvino_models(vino_model_dir, cfg)

if not models:
    print("No models found! Please check OpenVINO model paths.")
    raise RuntimeError("No OpenVINO models loaded.")

row_ids, predictions = run_inference_openvino(cfg, models, species_ids)

submission_df = create_submission(row_ids, predictions, species_ids, cfg)
submission_path = 'submission.csv'
submission_df.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

smooth_submission(submission_path)

end_time = time.time()
print(f"Inference completed in {(end_time - start_time)/60:.2f} minutes")

Starting BirdCLEF-2025 OpenVINO inference...


✅ Loaded model: model_fold1.xml
🎉 Total 1 OpenVINO model(s) loaded from ../models/SEDmodel
Found 4 test soundscapes
Processing H02_20230502_080500
Processing H02_20230420_074000
Processing H02_20230420_112000
Processing H02_20230420_154500
Creating submission dataframe...
Submission saved to submission.csv
Smoothing submission predictions...
Smoothed submission saved to submission.csv
Inference completed in 0.03 minutes


In [25]:
submission_df

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar
0,H02_20230502_080500_5,-0.003912,-0.011743,-3.767084e-05,-0.018549,0.015625,-0.000934,0.007796808,-0.011708,0.015625,...,-0.007775,0.007818335,0.00985,-0.003904,0.0,-0.007812,0.021535,0.015625,-0.017576,0.023443
1,H02_20230502_080500_10,-0.007813,-0.007812,0.01171165,-0.023439,0.015626,-0.014648,-2.384186e-07,-0.015625,0.015625,...,-0.015626,0.0,0.015625,-0.007813,0.0,-0.007813,0.031253,0.015626,-0.029297,0.031252
2,H02_20230502_080500_15,-0.007812,-0.007812,0.01171212,-0.023439,0.015625,-0.015625,0.0,-0.015625,0.015626,...,-0.015625,0.0,0.015625,-0.007813,-5.960464e-08,-0.007812,0.031253,0.015625,-0.029297,0.031253
3,H02_20230502_080500_20,-0.007813,-0.007813,0.01171141,-0.023439,0.015626,-0.014648,-2.384186e-07,-0.015625,0.015625,...,-0.015625,-5.960464e-08,0.015625,-0.007813,0.0,-0.007813,0.031252,0.015625,-0.029297,0.031253
4,H02_20230502_080500_25,-0.007813,-0.007812,0.0117133,-0.022461,0.015626,-0.01367,2.384186e-07,-0.015625,0.015625,...,-0.015625,0.0,0.015625,-0.007813,0.0,-0.007813,0.027355,0.015625,-0.029297,0.031253
5,H02_20230502_080500_30,-0.007813,-0.007813,0.01171306,-0.022461,0.015625,-0.01367,0.0,-0.015625,0.015625,...,-0.015625,0.0,0.015625,-0.007813,0.0,-0.007812,0.027356,0.015625,-0.029297,0.031252
6,H02_20230502_080500_35,-0.007813,-0.007812,2.384186e-07,-0.021484,0.015626,-0.01367,0.0,-0.015625,0.015625,...,-0.015625,0.0,0.015625,-0.007813,0.0,-0.007812,0.027356,0.015625,-0.023439,0.031253
7,H02_20230502_080500_40,-0.007813,-0.007813,2.384186e-07,-0.019531,0.015625,-0.007813,0.0,-0.015625,0.015626,...,-0.01367,0.0,0.015625,-0.007813,0.0,-0.007813,0.015625,0.015625,-0.023439,0.031253
8,H02_20230502_080500_45,-0.007812,-0.007813,-5.960464e-08,-0.021484,0.015625,-0.007813,0.0,-0.015625,0.015625,...,-0.01367,0.0,0.015626,-0.007813,0.0,-0.007813,0.015625,0.015625,-0.023439,0.031253
9,H02_20230502_080500_50,-0.007813,-0.007813,0.0,-0.021484,0.015625,-0.01367,0.0,-0.015625,0.015625,...,-0.015625,0.0,0.015625,-0.007813,0.0,-0.007813,0.027355,0.015625,-0.023439,0.031253
