# **BirdCLEF 2025 Data Preprocessing Notebook**

In [36]:
import os
import cv2
import math
import time
import librosa
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import os
import numpy as np
from datetime import datetime
import pytz
import torch
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedGroupKFold

from joblib import Parallel, delayed


from module import config_lib, utils_lib

In [None]:
class DatasetConfig:
    """Paths and preprocessing hyper-parameters for building the mel dataset.

    Args:
        kaggle_notebook: When True, use the Kaggle input paths; otherwise use
            the local ``../data`` / ``../models`` layout.
        debug: When True, ``N_MAX`` is set to 50 instead of None.

    Both path layouts define the same set of attributes, so code such as
    ``config.RAW_DIR`` works regardless of where the notebook runs.
    """

    def __init__(self, kaggle_notebook=False, debug=False):
        self.KAGGLE_NOTEBOOK = kaggle_notebook
        self.debug = debug

        # ===== Path Settings =====
        if self.KAGGLE_NOTEBOOK:
            self.OUTPUT_DIR = ''
            self.PROCESSED_DIR = ""
            self.train_datadir = '/kaggle/input/birdclef-2025/train_audio'
            self.train_csv = '/kaggle/input/birdclef-2025/train.csv'
            self.test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
            self.submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
            self.taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
            self.model_path = '/kaggle/input/birdclef-2025-0330'
            # These two were only defined in the local branch, so reading
            # config.RAW_DIR / config.models_dir on Kaggle raised AttributeError.
            self.models_dir = self.model_path
            # NOTE(review): assumed to be the competition root; adjust if the
            # older BirdCLEF datasets are mounted somewhere else on Kaggle.
            self.RAW_DIR = '/kaggle/input/birdclef-2025/'
        else:
            self.OUTPUT_DIR = '../data/result/'
            self.train_datadir = '../data/raw/train_audio/'
            self.train_csv = '../data/raw/train.csv'
            self.test_soundscapes = '../data/raw/test_soundscapes/'
            self.submission_csv = '../data/raw/sample_submission.csv'
            self.taxonomy_csv = '../data/raw/taxonomy.csv'
            self.models_dir = "../models/"  # root directory for all saved models
            self.model_path = self.models_dir  # per-model directory; changed dynamically at training time
            self.RAW_DIR = '../data/raw/'
            self.PROCESSED_DIR = '../data/processed/'

        # ===== Audio Settings =====
        self.FS = 32000
        self.WINDOW_SIZE = 5.0  # window size at inference time
        self.TARGET_DURATION = 5  # clip length (seconds) used when building the dataset
        self.TARGET_SHAPE = (256, 256)
        self.N_FFT = 1024
        self.HOP_LENGTH = 512
        self.N_MELS = 148
        self.FMIN = 20
        self.FMAX = 16000
        self.N_MAX = 50 if self.debug else None
        self.N_JOBS = 16  # number of parallel workers; around 16 is enough
        self.N_JOBS_DURATION = 47

        self.LOAD_ENGINE = 'torchaudio'  # librosa or torchaudio
        self.SKIP_RESIZE = False  # set True to skip resizing
        self.seed = 42
        self.n_fold = 5
        self.num_rare_samples = 50  # species with at most this many samples are treated as rare
        self.is_crop_aug = False

        self.crop_strategy = "head"

In [38]:
# Run configuration for this notebook: local (non-Kaggle) paths, debug off.
config = DatasetConfig(kaggle_notebook=False, debug=False)

In [39]:
# Seed via the project helper so the run is repeatable.
# NOTE(review): which libraries set_seed covers is defined in utils_lib, not visible here.
utils_lib.set_seed(config.seed)

In [40]:
# Load the 2025 metadata plus the earlier BirdCLEF editions. Paths are taken
# from config (same locations as before) instead of being hard-coded again.
train_2025 = pd.read_csv(config.train_csv)
taxonomy_2025 = pd.read_csv(config.taxonomy_csv)
train_2024 = pd.read_csv(os.path.join(config.RAW_DIR, "bc2024/train_metadata.csv"))
taxonomy_2024 = pd.read_csv(os.path.join(config.RAW_DIR, "bc2024/eBird_Taxonomy_v2021.csv"))
train_2024_add = pd.read_csv(os.path.join(config.RAW_DIR, "bc2024-additional/BirdClef2024_additional.csv"))

# Build the scientific name ("genus species") and the relative audio filename.
train_2024_add["scientific_name"] = train_2024_add["gen"].str.strip() + " " + train_2024_add["sp"].str.strip()
train_2024_add["filename"] = train_2024_add["primary_label"] + "/" + train_2024_add["file"] + ".mp3"

# Columns of the additional 2024 table that are dropped here.
unneeded_columns = [
    "regnr", "auto", "dvc", "mic", "smp", "numRecordings", "also", "rmk",
    "bird-seen", "animal-seen", "playback-used", "temp",
    "lic", "q", "length", "date", "uploaded",
    "method", "file", "file-name", "sono", "osci",
    "gen", "sp",
    "lat", "lng", "alt", "sex", "stage",
    "group", "en", "rec", "cnt", "loc", "id", "ssp",
]
train_2024_add = train_2024_add.drop(columns=unneeded_columns)

train_2023 = pd.read_csv(os.path.join(config.RAW_DIR, "bc2023/train_metadata.csv"))
taxonomy_2023 = pd.read_csv(os.path.join(config.RAW_DIR, "bc2023/eBird_Taxonomy_v2021.csv"))
train_2022 = pd.read_csv(os.path.join(config.RAW_DIR, "bc2022/train_metadata.csv"))
taxonomy_2022 = pd.read_csv(os.path.join(config.RAW_DIR, "bc2022/eBird_Taxonomy_v2021.csv"))

# Unique primary_label values of the 2025 training set.
species_2025 = set(train_2025["primary_label"].unique())

# Earlier editions, keyed by the tag used for them throughout the notebook.
datasets = {
    "2024": train_2024,
    "2024_add": train_2024_add,
    "2023": train_2023,
    "2022": train_2022
}

# Per edition: how many species / samples overlap with the 2025 label set.
results = []

for year, df in datasets.items():
    all_species = set(df["primary_label"].unique())
    matched_species = species_2025 & all_species
    matched_df = df[df["primary_label"].isin(matched_species)]

    results.append({
        "year": year,
        "matched_species_count": len(matched_species),
        "total_species_in_dataset": len(all_species),
        "matched_sample_count": len(matched_df),
        "total_sample_count": len(df)
    })

# One summary row per edition.
summary_df = pd.DataFrame(results)


In [41]:
# Show, per past edition, how many species and samples overlap with the 2025 labels.
summary_df

Unnamed: 0,year,matched_species_count,total_species_in_dataset,matched_sample_count,total_sample_count
0,2024,1,182,276,24459
1,2024_add,1,182,96,24279
2,2023,2,264,371,16941
3,2022,1,152,151,14852


In [42]:


# primary_label values that occur in the 2025 training set (NaN excluded).
species_2025_labels = set(train_2025["primary_label"].dropna().unique())

# Past editions that are candidates for pre-training data.
past_datasets = {
    "2024": train_2024,
    "2024_add": train_2024_add,
    "2023": train_2023,
    "2022": train_2022,
}

# From each edition keep only the rows whose species is NOT in the 2025 label
# set, and tag every kept row with the edition it came from.
pretrain_dfs = []
for year, df in past_datasets.items():
    not_in_2025 = ~df["primary_label"].isin(species_2025_labels)
    df = df.loc[not_in_2025].assign(source_year=year)
    pretrain_dfs.append(df)

# Stack all editions into a single pre-training metadata table.
pretrain_df = pd.concat(pretrain_dfs, ignore_index=True)
print(f"✅ Pretrain用データ数: {len(pretrain_df)}")
print(f"🧬 含まれるprimary_label数: {pretrain_df['primary_label'].nunique()}")


✅ Pretrain用データ数: 79637
🧬 含まれるprimary_label数: 569


In [43]:
# Add filepath and samplename, drop rows whose audio is missing, attach durations.

# Audio root directory of each past edition.
audio_dirs = {
    "2024": os.path.join(config.RAW_DIR, "bc2024/train_audio"),
    "2024_add": os.path.join(config.RAW_DIR, "bc2024-additional/additional_audio"),
    "2023": os.path.join(config.RAW_DIR, "bc2023/train_audio"),
    "2022": os.path.join(config.RAW_DIR, "bc2022/train_audio"),
}

# Work on a copy so pretrain_df stays untouched.
working_df = pretrain_df.copy()


def resolve_filepath(row):
    """Return the audio path of ``row``: the directory of its edition joined with its filename.

    Raises:
        KeyError: if ``row["source_year"]`` has no entry in ``audio_dirs``
            (previously this surfaced as a TypeError from ``os.path.join``).
    """
    source_year = row["source_year"]
    if source_year not in audio_dirs:
        raise KeyError(f"No audio directory configured for source_year={source_year!r}")
    return os.path.join(audio_dirs[source_year], row["filename"])


working_df['filepath'] = working_df.apply(resolve_filepath, axis=1)

# samplename = "<first path component>-<file stem up to the first dot>"
working_df['samplename'] = working_df['filename'].apply(
    lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0]
)


# Remove rows whose audio file does not exist on disk.
from pathlib import Path
working_df["file_exists"] = working_df["filepath"].map(lambda x: Path(x).exists())

# Row count before the removal.
before = len(working_df)

working_df = working_df[working_df["file_exists"]].reset_index(drop=True)

# Report how many rows were dropped.
after = len(working_df)
print(f"✅ Removed {before - after} rows with missing audio files.")
working_df = working_df.drop(columns=["file_exists"])


# Attach the pre-computed durations.
duration_df = pd.read_csv(os.path.join(config.PROCESSED_DIR, "pretrain_duration.csv"))
# A left merge repeats a left row once per matching right row, so make the
# right-hand key unique first; validate= then guarantees the row count of
# working_df cannot grow.
# NOTE(review): if one filename has several differing durations, the first is kept — confirm.
duration_df = duration_df.drop_duplicates(subset="filename")
working_df = working_df.merge(duration_df, on="filename", how="left", validate="many_to_one")
# Number of rows that did not get a duration.
print(f"✅ NaN in duration_sec: {working_df['duration_sec'].isna().sum()}")

✅ Removed 11 rows with missing audio files.
✅ NaN in duration_sec: 0


In [44]:
# Remove duplicated samplenames, preferring the newest edition.

# Priority per edition (a larger number means a newer edition).
year_priority = {"2024_add": 4, "2024": 3, "2023": 2, "2022": 1}

# Rank rows by edition, keep the first (highest-priority) row of every
# samplename, then discard the temporary ranking column again.
working_df = (
    working_df
    .assign(year_priority=working_df["source_year"].map(year_priority))
    .sort_values("year_priority", ascending=False)
    .drop_duplicates(subset="samplename", keep="first")
    .drop(columns=["year_priority"])
)

# Sanity check: both numbers should be equal.
print(f"🧼 重複削除後の件数: {len(working_df)}")
print(f"🧬 一意なsamplename数: {working_df['samplename'].nunique()}")

🧼 重複削除後の件数: 73277
🧬 一意なsamplename数: 73277


In [45]:
# duration計算


# # === 音声長さ（秒）を取得する関数 ===
# def get_duration(filepath, sr):
#     try:
#         audio, _ = librosa.load(filepath, sr=sr)
#         return len(audio) / sr
#     except Exception as e:
#         print(f"[ERROR] Could not load {filepath}: {e}")
#         return np.nan

# print("🔄 Calculating durations with parallel processing...")

# # === 並列で実行 ===
# filepaths = working_df['filepath'].tolist()
# durations = Parallel(n_jobs=config.N_JOBS_DURATION)(
#     delayed(get_duration)(fp, config.FS) for fp in tqdm(filepaths)
# )

# # 結果を working_df に追加
# working_df['duration_sec'] = durations
# print("✅ Added 'duration_sec' column to working_df")

# # 欠損チェック
# missing = working_df['duration_sec'].isna().sum()
# print(f"⚠️ Missing duration values: {missing}")

In [46]:
# Add the integer class id ("target") and optionally shrink the data for debugging.

# Unique labels of working_df, sorted so that the id assignment is deterministic.
label_list = sorted(working_df['primary_label'].unique())
label_id_list = list(range(len(label_list)))

# Label <-> id lookup tables.
label2id = dict(zip(label_list, label_id_list))
id2label = dict(zip(label_id_list, label_list))

print(f'Found {len(label_list)} unique species')
total_samples = len(working_df)
print(f'Total samples: {total_samples}')

# Class id of every row.
working_df['target'] = working_df['primary_label'].map(label2id)

print(f"Debug mode: {'ON' if config.debug else 'OFF'}")

# =============== Sample limit in DEBUG mode ===============
if config.debug:
    # DataFrame.sample raises when n exceeds the number of rows (replace=False),
    # so cap the request at what is available.
    n_debug = config.N_MAX if config.N_MAX is None else min(config.N_MAX, len(working_df))
    working_df = working_df.sample(n=n_debug, random_state=config.seed).reset_index(drop=True)
    config.n_fold = 2
    # Keep total_samples in step with the reduced frame; later cells slice with it.
    total_samples = len(working_df)
    print(f"🐛 DEBUG MODE: Using only {len(working_df)} samples")
else:
    print(f"✅ FULL MODE: Using all {len(working_df)} samples")



Found 569 unique species
Total samples: 73277
Debug mode: OFF
✅ FULL MODE: Using all 73277 samples


In [None]:
# Usable time range of every recording, in seconds.
working_df["valid_start_sec"] = 0
working_df["valid_end_sec"] = np.nan  # NaN means "not decided yet"

# Decide the usable range here. Currently the whole recording is used.


# Any end that is still undecided defaults to the full duration. This is a
# vectorized fill, replacing the previous row-by-row apply.
working_df["valid_end_sec"] = working_df["valid_end_sec"].fillna(working_df["duration_sec"])
# A missing start defaults to 0 (just in case).
working_df["valid_start_sec"] = working_df["valid_start_sec"].fillna(0)


# Taken from the config (currently "head") instead of repeating the literal.
working_df["crop_strategy"] = config.crop_strategy

working_df["is_valid_audio"] = True
working_df["apply_denoise"] = False

In [48]:
# Decide the augmentation plan. Currently there is no augmentation.
# n_augment is read by process_row as the number of extra crops per recording.
# NOTE(review): multi_crop is not read by any cell visible in this notebook — confirm its consumer.

working_df['n_augment'] = 0
working_df['multi_crop'] = False

In [None]:
# Cut a fixed-length clip out of a waveform according to a crop strategy.
def crop_audio(audio_data: np.ndarray, target_samples: int, strategy='center', fs=None):
    """Return a ``target_samples``-long slice of ``audio_data``.

    Audio shorter than ``target_samples`` is first repeated end-to-end until
    it is long enough; the start position is then chosen by ``strategy`` and
    clamped so the slice always fits.

    Args:
        audio_data: Waveform, sliced along its first axis.
        target_samples: Length of the returned clip, in samples.
        strategy: ``'head'`` (start at 0), ``'tail'``, ``'center'``,
            ``'random'`` (drawn with ``np.random.randint``), or a number
            giving the start offset in seconds.
        fs: Sample rate used to convert a numeric ``strategy`` to samples.
            Defaults to ``config.FS`` when omitted.

    Returns:
        The selected slice of the (possibly repeated) audio.

    Raises:
        ValueError: If ``audio_data`` is empty (previously an opaque
            ZeroDivisionError) or ``strategy`` is not recognised.
    """
    total_samples = len(audio_data)
    if total_samples == 0:
        raise ValueError("Cannot crop empty audio_data")

    if total_samples < target_samples:
        # Repeat the recording until it covers the requested length.
        n_copy = math.ceil(target_samples / total_samples)
        audio_data = np.concatenate([audio_data] * n_copy)
        total_samples = len(audio_data)

    # Last start index at which a full clip still fits.
    max_start = total_samples - target_samples

    if strategy == 'head':
        # No lead-in delay is applied: the clip starts at the first sample.
        start_idx = 0
    elif strategy == 'tail':
        start_idx = max_start
    elif strategy == 'center':
        start_idx = total_samples // 2 - target_samples // 2
    elif strategy == 'random':
        start_idx = np.random.randint(0, max_start + 1)
    elif isinstance(strategy, (float, int)):
        sample_rate = config.FS if fs is None else fs
        start_idx = int(strategy * sample_rate)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    # Clamp so that the slice never runs past either end.
    start_idx = max(0, min(start_idx, max_start))
    return audio_data[start_idx:start_idx + target_samples]

In [50]:
# Convert a waveform into a min-max normalised log-mel spectrogram.
def audio2melspec(audio_data):
    """Compute a dB-scaled mel spectrogram of ``audio_data`` rescaled by its own min and max.

    NaN samples are replaced with the mean of the non-NaN samples before the
    transform. All spectrogram parameters are read from the global ``config``.

    Returns:
        ``(db - db.min()) / (db.max() - db.min() + 1e-8)`` of the dB spectrogram.
    """
    nan_mask = np.isnan(audio_data)
    if nan_mask.any():
        fill_value = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=fill_value)

    mel_kwargs = dict(
        sr=config.FS,
        n_fft=config.N_FFT,
        hop_length=config.HOP_LENGTH,
        n_mels=config.N_MELS,
        fmin=config.FMIN,
        fmax=config.FMAX,
        power=2.0,
    )
    power_spec = librosa.feature.melspectrogram(y=audio_data, **mel_kwargs)

    # dB relative to the loudest bin of this clip.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)

    # The epsilon keeps the division finite when the spectrogram is constant.
    db_floor = db_spec.min()
    db_span = db_spec.max() - db_floor + 1e-8
    return (db_spec - db_floor) / db_span

In [None]:
# Convert one recording into mel-spectrogram(s); written so it can run in parallel.
def _clip_to_mel(clip, target_samples):
    """Zero-pad ``clip`` to ``target_samples``, convert it to a mel image and resize it.

    The result is cast to float32. Resizing to ``config.TARGET_SHAPE`` is
    skipped when ``config.SKIP_RESIZE`` is set.
    """
    if len(clip) < target_samples:
        clip = np.pad(clip, (0, target_samples - len(clip)), mode='constant')
    mel = audio2melspec(clip)

    target_rows, target_cols = config.TARGET_SHAPE
    if not getattr(config, "SKIP_RESIZE", False) and mel.shape != (target_rows, target_cols):
        # cv2.resize expects dsize as (width, height), i.e. (cols, rows).
        mel = cv2.resize(mel, (target_cols, target_rows), interpolation=cv2.INTER_LINEAR)
    return mel.astype(np.float32)


def process_row(row):
    """Load the audio of one metadata row and turn it into mel-spectrogram(s).

    Args:
        row: A row of ``working_df`` (pandas Series). Reads ``filepath``,
            ``samplename`` and ``crop_strategy``, plus the optional
            ``valid_start_sec``, ``valid_end_sec`` and ``n_augment``.

    Returns:
        ``(pairs, None)`` on success, where ``pairs`` is a list of
        ``(name, mel)`` tuples: the main clip first, then one
        ``"<samplename>_crop<i>"`` entry per augmentation crop.
        ``(None, (filepath, message))`` if anything raised, so a single bad
        file does not abort the whole parallel run.
    """
    try:
        audio_data, _ = librosa.load(row.filepath, sr=config.FS)
        target_samples = int(config.TARGET_DURATION * config.FS)

        # === Valid range: seconds -> samples ===
        valid_start_sec = row.get("valid_start_sec", 0)
        valid_end_sec = row.get("valid_end_sec", None)
        if valid_end_sec is None or pd.isna(valid_end_sec):
            valid_end_sec = len(audio_data) / config.FS

        valid_start_sample = int(valid_start_sec * config.FS)
        valid_end_sample = int(valid_end_sec * config.FS)
        usable_audio = audio_data[valid_start_sample:valid_end_sample]
        total_usable_samples = len(usable_audio)

        # === Main clip ===
        # crop_strategy is either a keyword or a start offset in seconds that
        # may be stored as a string; only a successful conversion replaces it.
        strategy = row.crop_strategy
        try:
            strategy = float(strategy)
        except (TypeError, ValueError):
            pass

        clip = crop_audio(usable_audio, target_samples, strategy=strategy)
        mel_list = [_clip_to_mel(clip, target_samples)]
        name_list = [row.samplename]

        # === Extra crops, n_augment of them ===
        n_aug = int(row.get("n_augment", 0))
        if n_aug > 0:
            # Last start position at which a full clip fits; never negative, so
            # short recordings are cropped from 0 and zero-padded instead of
            # being sliced with a negative index.
            last_start = max(total_usable_samples - target_samples, 0)
            interval = max(last_start // (n_aug + 1), 1)

            for i in range(n_aug):
                start_idx = min(i * interval, last_start)
                crop = usable_audio[start_idx: start_idx + target_samples]
                mel_list.append(_clip_to_mel(crop, target_samples))
                name_list.append(f"{row.samplename}_crop{i}")

        return list(zip(name_list, mel_list)), None

    except Exception as e:
        return None, (row.filepath, str(e))

In [52]:
# Run the mel conversion in parallel, one job per row.
mel_jobs = (
    delayed(process_row)(row)
    for _, row in working_df.iloc[:total_samples].iterrows()
)
results = Parallel(n_jobs=config.N_JOBS)(mel_jobs)

# Split the worker outputs into spectrograms (by sample name) and errors.
all_bird_data = {}
errors = []

for result, err in results:
    if result is not None:
        # result is a list of (name, mel) pairs
        all_bird_data.update(result)
    if err is not None:
        errors.append(err)

print(f"Total errors: {len(errors)}")
if errors:
    print("Errors:")
    print("\n".join(f"  {filepath}: {error}" for filepath, error in errors))

Total errors: 0


In [53]:
# Add one metadata row per augmentation crop to working_df.
def _crop_variants(row):
    """Yield a copy of ``row`` for each of its ``n_augment`` crops, renamed ``<samplename>_crop<i>``."""
    for crop_no in range(int(row.get('n_augment', 0))):
        variant = row.copy()
        variant['samplename'] = f"{row.samplename}_crop{crop_no}"
        yield variant


augmented_rows = [
    variant
    for _, row in working_df.iterrows()
    for variant in _crop_variants(row)
]

# Collect the crop rows into a frame and append them after the originals.
augmented_rows = pd.DataFrame(augmented_rows)
working_df_augmented = pd.concat([working_df, augmented_rows], ignore_index=True)
print(f"✅ working_df_augmented created with {len(augmented_rows)} augmented rows.")

✅ working_df_augmented created with 0 augmented rows.


In [54]:
# Decide the folds up front (config.n_fold of them).


# All crops of one recording share a group so that they land in the same fold.
working_df_augmented['group_id'] = working_df_augmented['samplename'].map(lambda x: x.split('_crop')[0])

# Stratified by label and grouped by recording at the same time.
sgkf = StratifiedGroupKFold(n_splits=config.n_fold, shuffle=True, random_state=config.seed)
groups = working_df_augmented['group_id']
labels = working_df_augmented['primary_label']

# split() yields POSITIONAL indices, so collect the folds in a positional
# array rather than through .loc, which would treat them as index labels and
# only works while the index is a fresh RangeIndex. -1 marks "not assigned".
fold_of_row = np.full(len(working_df_augmented), -1, dtype=np.int64)
for fold_id, (_, val_idx) in enumerate(sgkf.split(working_df_augmented, labels, groups=groups)):
    fold_of_row[val_idx] = fold_id

working_df_augmented['fold'] = fold_of_row

In [55]:
# === Drop rows flagged as invalid audio (done after the folds were assigned) ===
n_rows_with_invalid = len(working_df_augmented)
is_valid_row = working_df_augmented["is_valid_audio"]
working_df_filtered = working_df_augmented.loc[is_valid_row].reset_index(drop=True)

print(f"📉 Removed {n_rows_with_invalid - len(working_df_filtered)} rows marked as invalid audio.")
print(f"✅ Final training set size: {len(working_df_filtered)}")

# From here on working_df_augmented refers to the filtered data.
working_df_augmented = working_df_filtered.copy()

📉 Removed 0 rows marked as invalid audio.
✅ Final training set size: 73277


In [56]:
# Save the mel-spectrograms, the config and working_df (the latter as train.csv).

# Author's timing note: about 4 minutes.
import os
import numpy as np
import pickle
import csv
from datetime import datetime
import pytz

# === Output directory named after the current time in JST ===
jst = pytz.timezone('Asia/Tokyo')
now = datetime.now(jst)
timestamp = now.strftime("%Y%m%d_%H%M")

# Debug runs share one fixed folder; full runs get a timestamped folder.
output_subdir = "data_debugs" if config.debug else f"melspec_{timestamp}"
output_dir = os.path.join(config.PROCESSED_DIR, output_subdir)
os.makedirs(output_dir, exist_ok=True)

# === 1. Mel-spectrograms ===
# The name -> mel dict is wrapped in an object array and written with pickle;
# note that the file carries a .npy suffix but is not written by np.save.
output_path = os.path.join(output_dir, "birdclef2025_melspec_5sec_256_256.npy")
wrapped_array = np.array(all_bird_data, dtype=object)

with open(output_path, 'wb') as f:
    pickle.dump(wrapped_array, f, protocol=5)

size_mb = os.path.getsize(output_path) / (1024 ** 2)
example_mel = next(iter(all_bird_data.values()))
print(f"\n✅ Mel-spectrograms saved to: {output_path}")
print(f"📦 File size: {size_mb:.2f} MB")
print(f"📐 Example shape: {example_mel.shape}")

# === 2. Config ===
# Every instance attribute of config, one "key,value" line each.
config_path = os.path.join(output_dir, "config.csv")
config_dict = {k: v for k, v in vars(config).items() if not k.startswith("__")}

with open(config_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["key", "value"])
    writer.writerows([key, value] for key, value in config_dict.items())

print(f"📝 Config saved to: {config_path}")


# === 3. Metadata, saved as train.csv ===
train_csv_path = os.path.join(output_dir, "train.csv")
working_df_augmented.to_csv(train_csv_path, index=False)

print(f"📝 Augmented training metadata saved to: {train_csv_path}")
print(f"📊 Total rows: {len(working_df_augmented)}")


✅ Mel-spectrograms saved to: ../data/processed/melspec_20250422_2143/birdclef2025_melspec_5sec_256_256.npy
📦 File size: 18325.57 MB
📐 Example shape: (256, 256)
📝 Config saved to: ../data/processed/melspec_20250422_2143/config.csv
📝 Augmented training metadata saved to: ../data/processed/melspec_20250422_2143/train.csv
📊 Total rows: 73277


In [218]:
# Check that every row of the saved metadata has a spectrogram.
train_df = working_df_augmented.copy()
spectrograms = all_bird_data

# Sample names listed in the CSV metadata.
csv_sample_names = set(train_df['samplename'])

# Keys of the spectrogram dictionary.
npy_sample_names = set(spectrograms)

# Names that appear in the metadata but have no spectrogram.
missing = csv_sample_names.difference(npy_sample_names)
print(f"🛑 Missing {len(missing)} spectrograms")
print("例:", list(missing)[:5])

🛑 Missing 0 spectrograms
例: []


In [None]:
# fabioの解説　必ずしも7secではない
# 48124/CSA36346.ogg 24sec以降
# 52884/CSA36344.ogg 55sec以降
# 52884/CSA36342.ogg 14sec以降

In [None]:
# Inspect the recordings of the i-th label that has rows from the "CSA" collection.
i = 19
idx_list = working_df.loc[working_df["collection"] == "CSA", "primary_label"].unique()
df = working_df.loc[
    working_df["primary_label"] == f"{idx_list[i]}",
    ["primary_label", "filename", "author", "valid_end_sec", "duration_sec"],
]
df

In [264]:
# All rows of the label "48124".
working_df.loc[working_df["primary_label"] == "48124"]

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,...,target,filepath,samplename,class,crop_strategy,duration_sec,valid_start_sec,valid_end_sec,n_augment,multi_crop
458,48124,[''],[''],48124/CSA03598.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,5.6521,73.5077,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA03598.ogg,48124-CSA03598,Insecta,center,54.190812,0,47.190812,0,False
459,48124,[''],[''],48124/CSA18785.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,5.7892,-73.5504,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA18785.ogg,48124-CSA18785,Insecta,center,66.180656,0,59.180656,0,False
460,48124,[''],[''],48124/CSA18795.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,5.7892,-73.5504,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA18795.ogg,48124-CSA18795,Insecta,center,132.135031,0,125.135031,0,False
461,48124,[''],[''],48124/CSA18798.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,5.7892,-73.5504,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA18798.ogg,48124-CSA18798,Insecta,center,199.647531,0,192.647531,0,False
462,48124,[''],[''],48124/CSA34485.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,3.5732,-76.5809,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA34485.ogg,48124-CSA34485,Insecta,center,114.966031,0,107.966031,0,False
463,48124,[''],[''],48124/CSA35111.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.314,-73.9001,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA35111.ogg,48124-CSA35111,Insecta,center,130.85325,0,123.85325,0,False
464,48124,[''],[''],48124/CSA35116.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3515,-73.8261,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA35116.ogg,48124-CSA35116,Insecta,center,161.988531,0,154.988531,0,False
465,48124,[''],[''],48124/CSA35118.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3515,-73.8261,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA35118.ogg,48124-CSA35118,Insecta,center,118.004281,0,111.004281,0,False
466,48124,[''],[''],48124/CSA35157.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.247,-73.8732,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA35157.ogg,48124-CSA35157,Insecta,center,233.066031,0,226.066031,0,False
467,48124,[''],[''],48124/CSA35160.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3406,-73.8528,Tettigoniidae,...,29,../data/raw//train_audio/48124/CSA35160.ogg,48124-CSA35160,Insecta,center,260.902531,0,253.902531,0,False


In [217]:
# CSAチェック
# 最後にスペイン語が含まれていないlabel
# 1564122, 50186/CSA28885.ogg, 523060
# 52884/CSA14875.ogg
# 548639
# 714022
# 868458



# スペイン語最初に含まれる 4secくらい
# 50186/CSA28885.ogg
# 52884/CSA14875.ogg



# 話してる人
# Eliana Barona- Cortés
# Alexandra Butrago-Cardona
# Fabio A. Sarria-S

# 人の声だけの箇所
# 24292/CSA34649.ogg 2min48以降
# 24292/CSA34651.ogg 1min34以降
# 476537/CSA35459.ogg 2min14以降
# 476537/CSA35461.ogg 4min19以降

In [None]:
# Number of samples per primary_label, shown as a count table (no bar chart is drawn here).

working_df_augmented["primary_label"].value_counts()

primary_label
grekis     990
compau     808
trokin     787
roahaw     709
banana     610
          ... 
1564122      6
42087        5
528041       4
1139490      4
21116        3
Name: count, Length: 206, dtype: int64