### Some useful references:
1. **[Training]**: https://github.com/LIHANG-HONG/birdclef2023-2nd-place-solution
2. **[Inference]**: https://www.kaggle.com/code/kadircandrisolu/efficientnet-b0-pytorch-inference-birdclef-25

This model backbone is seresnext26t_32x4d

In [113]:
import os
import gc
import warnings
import logging
import time
import math
import cv2
from pathlib import Path
import joblib

import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from soundfile import SoundFile 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import timm
from tqdm.auto import tqdm
from glob import glob
import torchaudio
import random
import itertools
from typing import Union

import concurrent.futures

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

In [114]:
class CFG:
    def __init__(self, mode="inference", kaggle_notebook=False):
        self.seed = 42
        self.print_freq = 100
        self.num_workers = 4
        self.stage = "train_bce"
        self.mode = mode  # "train" or "inference"

        if kaggle_notebook:
        # ===== Path Settings =====
            self.train_datadir = "/kaggle/input/birdclef-2025/train_audio"
            self.train_csv = "/kaggle/input/birdclef-2025/train.csv"
            self.test_soundscapes = "/kaggle/input/bc25-testsoundscapes-small"
            self.submission_csv = "/kaggle/input/birdclef-2025/sample_submission.csv"
            self.taxonomy_csv = "/kaggle/input/birdclef-2025/taxonomy.csv"
            self.model_files = ["/kaggle/input/bird2025-sed-ckpt/sedmodel.pth"]
            
        else:
            self.train_datadir = "/kaggle/input/birdclef-2025/train_audio"
            self.train_csv = "/kaggle/input/birdclef-2025/train.csv"
            self.test_soundscapes = "../data/raw/test_soundscapes_small/"
            self.submission_csv = "../data/raw/sample_submission.csv"
            self.taxonomy_csv = "../data/raw/taxonomy.csv"
            self.model_files = ["../models/sedmodel_0857/model_fold0.pth"]
            

        # ===== Model Settings =====
        self.model_name = "seresnext26t_32x4d"
        self.pretrained = False
        self.in_channels = 1

        # ===== Audio Settings =====
        self.SR = 32000
        self.target_duration = 5
        self.train_duration = 10
        
        self.n_mels = 128
        self.n_fft = 2048
        self.hop_length = 512
        self.f_min = 20
        self.f_max = 16000
        self.normal = 80  # or 255
        self.infer_duration = 5
        self.duration_train = 10

        # ===== Device =====
        self.device = "cpu"

cfg = CFG(mode="inference", kaggle_notebook=False)

In [115]:
print(f"Using device: {cfg.device}")
print(f"Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

Using device: cpu
Loading taxonomy data...
Number of classes: 206


In [116]:
def set_seed(seed=42):
    """
    Set seed for reproducibility
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(cfg.seed)


In [117]:
import os
import gc
import warnings
import logging
import time
import math
import cv2
from pathlib import Path
import joblib

import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from soundfile import SoundFile 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import timm
from tqdm.auto import tqdm
from glob import glob
import torchaudio
import random
import itertools
from typing import Union

import concurrent.futures

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

In [118]:
class CFG:
    def __init__(self, mode="inference", kaggle_notebook=False):
        self.seed = 42
        self.print_freq = 100
        self.num_workers = 4
        self.stage = "train_bce"
        self.mode = mode  # "train" or "inference"

        if kaggle_notebook:
        # ===== Path Settings =====
            self.train_datadir = "/kaggle/input/birdclef-2025/train_audio"
            self.train_csv = "/kaggle/input/birdclef-2025/train.csv"
            self.test_soundscapes = "/kaggle/input/bc25-testsoundscapes-small"
            self.submission_csv = "/kaggle/input/birdclef-2025/sample_submission.csv"
            self.taxonomy_csv = "/kaggle/input/birdclef-2025/taxonomy.csv"
            self.model_files = ["/kaggle/input/bird2025-sed-ckpt/sedmodel.pth"]
            
        else:
            self.train_datadir = "/kaggle/input/birdclef-2025/train_audio"
            self.train_csv = "/kaggle/input/birdclef-2025/train.csv"
            self.test_soundscapes = "../data/raw/test_soundscapes_small/"
            self.submission_csv = "../data/raw/sample_submission.csv"
            self.taxonomy_csv = "../data/raw/taxonomy.csv"
            self.model_files = ["../models/sedmodel_0857/model_fold0.pth"]
            

        # ===== Model Settings =====
        self.model_name = "seresnext26t_32x4d"
        self.pretrained = False
        self.in_channels = 1

        # ===== Audio Settings =====
        self.SR = 32000
        self.target_duration = 5
        self.train_duration = 10
        
        self.n_mels = 128
        self.n_fft = 2048
        self.hop_length = 512
        self.f_min = 20
        self.f_max = 16000
        self.normal = 80  # or 255
        self.infer_duration = 5
        self.duration_train = 10

        # ===== Device =====
        self.device = "cpu"

cfg = CFG(mode="inference", kaggle_notebook=False)

In [119]:
print(f"Using device: {cfg.device}")
print(f"Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

Using device: cpu
Loading taxonomy data...
Number of classes: 206


In [120]:
def set_seed(seed=42):
    """
    Set seed for reproducibility
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(cfg.seed)


In [None]:
class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)


def init_layer(layer):
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.0)

def init_bn(bn):
    bn.bias.data.fill_(0.0)
    bn.weight.data.fill_(1.0)

class BirdCLEFModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        
        taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
        self.num_classes = len(taxonomy_df)

        self.bn0 = nn.BatchNorm2d(cfg.n_mels)
        
        self.backbone = timm.create_model(
            cfg.model_name,
            pretrained=False,
            in_chans=cfg.in_channels,
            drop_rate=0.2,
            drop_path_rate=0.2,
        )

        layers = list(self.backbone.children())[:-2]
        self.encoder = nn.Sequential(*layers)
        
        if "efficientnet" in cfg.model_name:
            backbone_out = self.backbone.classifier.in_features
        elif "eca" in cfg.model_name:
            backbone_out = self.backbone.head.fc.in_features
        elif "res" in cfg.model_name:
            backbone_out = self.backbone.fc.in_features
        else:
            backbone_out = self.backbone.num_features
            
        self.fc1 = nn.Linear(backbone_out, backbone_out, bias=True)
        self.att_block = AttBlockV2(backbone_out, self.num_classes, activation="sigmoid")

        self.melspec_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=cfg.SR,
            hop_length=cfg.hop_length,
            n_mels=cfg.n_mels,
            f_min=cfg.f_min,
            f_max=cfg.f_max,
            n_fft=cfg.n_fft,
            pad_mode="constant",
            norm="slaney",
            onesided=True,
            mel_scale="htk",
        )
        if cfg.device == "cuda":
            self.melspec_transform = self.melspec_transform.cuda()
        else:
            self.melspec_transform = self.melspec_transform.cpu()

        self.db_transform = torchaudio.transforms.AmplitudeToDB(
            stype="power", top_db=80
        )

    def extract_feature(self, x):
        x = x.permute((0, 1, 3, 2))
        frames_num = x.shape[2]
        
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        x = x.transpose(2, 3)
        x = self.encoder(x)
        
        x = torch.mean(x, dim=2)
        
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        return x, frames_num
        
    @torch.cuda.amp.autocast(enabled=False)
    def transform_to_spec(self, audio):
        audio = audio.float()
        spec = self.melspec_transform(audio)
        spec = self.db_transform(spec)

        if self.cfg.normal == 80:
            spec = (spec + 80) / 80
        elif self.cfg.normal == 255:
            spec = spec / 255
        else:
            raise NotImplementedError
                
        if self.cfg.in_channels == 3:
            spec = image_delta(spec)
        
        return spec

    def forward(self, x):
        with torch.no_grad():
            x = self.transform_to_spec(x)

        x, frames_num = self.extract_feature(x)
        
        clipwise_output, norm_att, segmentwise_output = self.att_block(x)
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        return torch.logit(clipwise_output)

    def infer(self, x, tta_delta=2):
        with torch.no_grad():
            x = self.transform_to_spec(x)
        x, _ = self.extract_feature(x)
        time_att = torch.tanh(self.att_block.att(x))
        feat_time = x.size(-1)

        start = (
            feat_time / 2 - feat_time * (self.cfg.infer_duration / self.cfg.duration_train) / 2
        )
        end = start + feat_time * (self.cfg.infer_duration / self.cfg.duration_train)
        start = int(start)
        end = int(end)
        pred = self.attention_infer(start, end, x, time_att)

        start_minus = max(0, start - tta_delta)
        end_minus = end - tta_delta
        pred_minus = self.attention_infer(start_minus, end_minus, x, time_att)

        start_plus = start + tta_delta
        end_plus = min(feat_time, end + tta_delta)
        pred_plus = self.attention_infer(start_plus, end_plus, x, time_att)

        pred = 0.5 * pred + 0.25 * pred_minus + 0.25 * pred_plus
        return pred
        
    def attention_infer(self, start, end, x, time_att):
        feat = x[:, :, start:end]
        framewise_pred = torch.sigmoid(self.att_block.cla(feat))
        framewise_pred_max = framewise_pred.max(dim=2)[0]
        return framewise_pred_max
    
class BirdCLEFModelForOpenVINO(torch.nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.att_block = base_model.att_block
        self.fc1 = base_model.fc1
        self.bn0 = base_model.bn0
        self.encoder = base_model.encoder
        self.cfg = base_model.cfg

    def extract_feature(self, x):
        x = x.permute((0, 1, 3, 2))
        frames_num = x.shape[2]
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        x = x.transpose(2, 3)
        x = self.encoder(x)
        x = torch.mean(x, dim=2)
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        return x, frames_num

    def forward(self, mel):
        x, _ = self.extract_feature(mel)
        time_att = torch.tanh(self.att_block.att(x))
        feat_time = x.size(-1)

        start = int(feat_time / 2 - feat_time * (self.cfg.infer_duration / self.cfg.duration_train) / 2)
        end = int(start + feat_time * (self.cfg.infer_duration / self.cfg.duration_train))

        pred = self.attention_infer(start, end, x)
        pred_minus = self.attention_infer(max(0, start - 2), end - 2, x)
        pred_plus = self.attention_infer(start + 2, min(feat_time, end + 2), x)

        return 0.5 * pred + 0.25 * pred_minus + 0.25 * pred_plus

    def attention_infer(self, start, end, x):
        feat = x[:, :, start:end]
        framewise_pred = torch.sigmoid(self.att_block.cla(feat))
        return framewise_pred.max(dim=2)[0]

In [122]:
def load_models_for_openvino(cfg, num_classes):
    models = []

    for model_path in cfg.model_files:
        print(f"📦 Loading model: {model_path}")
        checkpoint = torch.load(model_path, map_location='cpu')

        checkpoint_cfg = CFG()
        for k, v in checkpoint["cfg"].items():
            setattr(checkpoint_cfg, k, v)
        checkpoint_cfg.device = "cpu"
        checkpoint_cfg.taxonomy_csv = cfg.taxonomy_csv

        model = BirdCLEFModel(checkpoint_cfg)
        model.load_state_dict(checkpoint["model_state_dict"])
        model.eval()

        # 👇 ラップして OpenVINO 向けモデルに変換
        model_vino = BirdCLEFModelForOpenVINO(model)
        models.append(model_vino)

    return models

In [123]:
import os
import json
import subprocess
from pathlib import Path
from datetime import datetime, timezone, timedelta
import torch
import pandas as pd

# === 初期設定 ===
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)

# 保存先ディレクトリ
model_dir = Path(cfg.model_files[0]).parent.resolve()
vino_dir = model_dir.parent / (model_dir.name + "_vino")
vino_dir.mkdir(parents=True, exist_ok=True)
print(f"📁 Saving ONNX & IR files to: {vino_dir}")

# === dataset-metadata.jsonの作成 ===
japan_time = datetime.now(timezone(timedelta(hours=9)))
current_time = japan_time.strftime('%Y%m%d_%H%M')

dataset_metadata = {
    "title": f"bc25-models-{current_time}",
    "id": f"ihiratch/bc25-models-{current_time}",
    "licenses": [{"name": "CC0-1.0"}]
}
metadata_path = os.path.join(vino_dir, "dataset-metadata.json")
with open(metadata_path, "w") as f:
    json.dump(dataset_metadata, f, indent=2)

# === モデル読み込み（全fold） ===
print("📦 Loading all fold models...")
models = load_models_for_openvino(cfg, num_classes)

# 推論専用のラッパー（melを入力として扱う）
class InferenceWrapper(torch.nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def forward(self, mel):  # mel: [B, 1, n_mels, time]
        x, _ = self.base_model.extract_feature(mel)
        time_att = torch.tanh(self.base_model.att_block.att(x))
        feat_time = x.size(-1)

        start = (
            feat_time / 2 - feat_time * (self.base_model.cfg.infer_duration / self.base_model.cfg.duration_train) / 2
        )
        end = start + feat_time * (self.base_model.cfg.infer_duration / self.base_model.cfg.duration_train)
        start = int(start)
        end = int(end)

        pred = self.attention_infer(x, time_att, start, end)
        pred_minus = self.attention_infer(x, time_att, max(0, start-2), end-2)
        pred_plus = self.attention_infer(x, time_att, start+2, min(feat_time, end+2))

        return 0.5 * pred + 0.25 * pred_minus + 0.25 * pred_plus

    def attention_infer(self, x, time_att, start, end):
        feat = x[:, :, start:end]
        framewise_pred = torch.sigmoid(self.base_model.att_block.cla(feat))
        return framewise_pred.max(dim=2)[0]  # [B, num_classes]

# === 各foldモデルをONNX→IR変換 ===
for fold, base_model in enumerate(models):
    print(f"\n🔁 [Fold {fold}] Converting model...")

    model = InferenceWrapper(base_model).to("cpu").eval()

    # 推論時の入力テンソル: melスペクトログラム（例: [1, 1, 128, 313]）
    dummy_input = torch.randn(1, 1, cfg.n_mels, 313)

    # ONNX書き出し
    onnx_path = vino_dir / f"model_fold{fold}.onnx"
    torch.onnx.export(
        model,
        dummy_input,
        onnx_path,
        input_names=["mel"],
        output_names=["logits"],
        dynamic_axes={"mel": {0: "batch_size"}, "logits": {0: "batch_size"}},
        opset_version=11
    )
    print(f"✅ Exported ONNX: {onnx_path.name}")

    # OpenVINO IRに変換（ovc使用）
    result = subprocess.run(
        ["ovc", str(onnx_path)],
        cwd=str(vino_dir),
        capture_output=True,
        text=True
    )

    if result.returncode != 0:
        print(f"❌ OpenVINO conversion failed for fold{fold}:")
        print("----- stderr -----")
        print(result.stderr)
        print("----- stdout -----")
        print(result.stdout)
    else:
        print(f"✅ Converted to OpenVINO IR:")
        print(f"   - {(vino_dir / f'model_fold{fold}.xml').resolve()}")
        print(f"   - {(vino_dir / f'model_fold{fold}.bin').resolve()}")

📁 Saving ONNX & IR files to: /root/program/birdclef-2025/models/sedmodel_0857_vino
📦 Loading all fold models...
📦 Loading model: ../models/sedmodel_0857/model_fold0.pth



🔁 [Fold 0] Converting model...


RuntimeError: running_mean should contain 128 elements not 256