In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
plt.rcParams['font.family'] = 'Noto Sans CJK JP'

from IPython.display import Audio

from module import config_lib, utils_lib

In [130]:
class DatasetConfig:
    def __init__(self, kaggle_notebook=False, debug=False):
        self.KAGGLE_NOTEBOOK = kaggle_notebook
        self.debug = debug

        # ===== Path Settings =====
        if self.KAGGLE_NOTEBOOK:
            self.OUTPUT_DIR = ''
            self.train_datadir = '/kaggle/input/birdclef-2025/train_audio'
            self.train_csv = '/kaggle/input/birdclef-2025/train.csv'
            self.test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
            self.submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
            self.taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
            self.model_path = '/kaggle/input/birdclef-2025-0330'
        else:
            self.OUTPUT_DIR = '../data/result/'
            self.train_datadir = '../data/raw/train_audio/'
            self.train_csv = '../data/raw/train.csv'
            self.test_soundscapes = '../data/raw/test_soundscapes/'
            self.submission_csv = '../data/raw/sample_submission.csv'
            self.taxonomy_csv = '../data/raw/taxonomy.csv'
            self.models_dir = "../models/" # 全modelの保存先
            self.model_path = self.models_dir # 各モデルの保存先．学習時に動的に変更．
            self.RAW_DIR = '../data/raw/'
            self.PROCESSED_DIR = '../data/processed/'
            
            self.spectrogram_npy = '../data/processed/baseline/birdclef2025_melspec_5sec_256_256.npy'
            self.pseudo_label_csv = "../data/processed/pseudo_labels/ensemble_7sec_pseudoth0.5/pseudo_label.csv"
            self.pseudo_melspec_npy = "../data/processed/train_soundscapes_0407/train_soundscapes_melspecs.npy"


        # ===== Audio Settings =====
        self.FS = 32000
        self.WINDOW_SIZE = 5.0 # 推論時のウィンドウサイズ
        self.TARGET_DURATION = 5 # データセット作成時のウィンドウサイズ
        self.TARGET_SHAPE = (256, 256)
        self.N_FFT = 1024
        self.HOP_LENGTH = 512
        self.N_MELS = 148
        self.FMIN = 20
        self.FMAX = 16000
        self.N_MAX = 50 if self.debug else None        
        self.N_JOBS = 16  # 並列処理のスレッド数 16くらいでいい
        self.LOAD_ENGINE = 'torchaudio'  # librosa or torchaudio
        self.SKIP_RESIZE = False  # resizeしないならTrue
        self.seed = 42
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        self.use_tta = False
        self.tta_count = 3
        self.threshold = 0.5

        self.use_specific_folds = False
        self.folds = [0, 1, 2, 3, 4]  # Used only if use_specific_folds is True

        self.debug_count = 3
            

In [132]:
config = DatasetConfig()
utils_lib.set_seed(config.seed)

In [27]:
train_2025 = pd.read_csv("../data/raw/train.csv")
taxonomy_2025 = pd.read_csv("../data/raw/taxonomy.csv")

In [72]:
train_2024 = pd.read_csv("../data/raw/bc2024/train_metadata.csv")
taxonomy_2024 = pd.read_csv("../data/raw/bc2024/eBird_Taxonomy_v2021.csv")

In [120]:
train_2024_add = pd.read_csv("../data/raw/bc2024-additional/BirdClef2024_additional.csv")
# 学名を作る
train_2024_add["scientific_name"] = train_2024_add["gen"].str.strip() + " " + train_2024_add["sp"].str.strip()
train_2024_add["filename"] = train_2024_add["primary_label"] + "/" + train_2024_add["file"] + ".mp3"

  train_2024_add = pd.read_csv("../data/raw/bc2024-additional/BirdClef2024_additional.csv")


In [137]:
train_2023 = pd.read_csv("../data/raw/bc2023/train_metadata.csv")
taxonomy_2023 = pd.read_csv("../data/raw/bc2023/eBird_Taxonomy_v2021.csv")

In [138]:
train_2022 = pd.read_csv("../data/raw/bc2022/train_metadata.csv")
taxonomy_2022 = pd.read_csv("../data/raw/bc2022/eBird_Taxonomy_v2021.csv")

In [139]:
import pandas as pd

# 2025のユニークなprimary_label
species_2025 = set(train_2025["primary_label"].unique())

# データセットと対応するtrainデータ
datasets = {
    "2024": train_2024,
    "2024_add": train_2024_add,
    "2023": train_2023,
    "2022": train_2022
}

# 結果格納
results = []

for year, df in datasets.items():
    all_species = set(df["primary_label"].unique())
    matched_species = species_2025 & all_species
    matched_df = df[df["primary_label"].isin(matched_species)]

    results.append({
        "year": year,
        "matched_species_count": len(matched_species),
        "total_species_in_dataset": len(all_species),
        "matched_sample_count": len(matched_df),
        "total_sample_count": len(df)
    })

# DataFrame化して表示
summary_df = pd.DataFrame(results)

summary_df

Unnamed: 0,year,matched_species_count,total_species_in_dataset,matched_sample_count,total_sample_count
0,2024,1,182,276,24459
1,2024_add,1,182,96,24279
2,2023,2,264,371,16941
3,2022,1,152,151,14852


In [140]:
# 学名を使っても結果は一緒

import pandas as pd
# 2025年の学名セット
species_2025_sci = set(train_2025["scientific_name"].unique())

# 各年のデータ（scientific_name列を持っていることが前提）
datasets = {
    "2024": train_2024,
    "2024_add": train_2024_add,
    "2023": train_2023,
    "2022": train_2022
}

results = []

for year, df in datasets.items():
    # nullを除いた学名一覧を取得
    all_species = set(df["scientific_name"].dropna().unique())
    matched_species = species_2025_sci & all_species
    matched_df = df[df["scientific_name"].isin(matched_species)]

    results.append({
        "year": year,
        "matched_species_count": len(matched_species),
        "total_species_in_dataset": len(all_species),
        "matched_sample_count": len(matched_df),
        "total_sample_count": len(df)
    })

# 表示
summary_df = pd.DataFrame(results)

In [141]:
summary_df

Unnamed: 0,year,matched_species_count,total_species_in_dataset,matched_sample_count,total_sample_count
0,2024,1,182,276,24459
1,2024_add,1,182,96,24279
2,2023,2,264,371,16941
3,2022,1,152,151,14852


In [142]:
# 2025の primary_label セットを取得
species_2025_labels = set(train_2025["primary_label"].dropna().unique())

# 過去データセット（pretrain候補）
past_datasets = {
    "2024": train_2024,
    "2024_add": train_2024_add,
    "2023": train_2023,
    "2022": train_2022,
}

# 各年から 2025に含まれない種のデータのみ抽出
pretrain_dfs = []
for year, df in past_datasets.items():
    df = df.copy()
    df["source_year"] = year
    df = df[~df["primary_label"].isin(species_2025_labels)]
    pretrain_dfs.append(df)

# pretrain用train_dfを結合
pretrain_df = pd.concat(pretrain_dfs, ignore_index=True)
print(f"✅ Pretrain用データ数: {len(pretrain_df)}")
print(f"🧬 含まれるprimary_label数: {pretrain_df['primary_label'].nunique()}")

✅ Pretrain用データ数: 79637
🧬 含まれるprimary_label数: 569


In [143]:
# filepathとsamplenameを追加
audio_dirs = {
    "2024": os.path.join(config.RAW_DIR, "bc2024/train_audio"),
    "2024_add": os.path.join(config.RAW_DIR, "bc2024-additional/additional_audio"),
    "2023": os.path.join(config.RAW_DIR, "bc2023/train_audio"),
    "2022": os.path.join(config.RAW_DIR, "bc2022/train_audio"),
}

# working_df をコピーして作成
working_df = pretrain_df.copy()

# ファイルパスを source_year に応じて割り当てる
def resolve_filepath(row):
    root_dir = audio_dirs.get(row["source_year"])
    return os.path.join(root_dir, row["filename"])

working_df['filepath'] = working_df.apply(resolve_filepath, axis=1)

# samplename などは同じでOK
working_df['samplename'] = working_df['filename'].apply(
    lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0]
)