In [None]:
import numpy as np
import pandas as pd

In [None]:
# Columns every per-source frame is reduced to before the sources are combined.
cols = [
    'primary_label',
    'secondary_labels',
    'record_name',
    'source',
]


In [None]:
# Metadata of the birdclef-2024 training set; the other sources are compared against it.
train = pd.read_csv('../input/birdclef-2024/train_metadata.csv')

# `record` is the second '/'-separated piece of `filename`;
# `record_name` is that piece up to its first '.'.
train['record'] = [path.split('/')[1] for path in train['filename']]
train['record_name'] = [name.partition('.')[0] for name in train['record']]

# NOTE(review): eval() executes whatever expression the CSV cell holds, so this is
# only safe while the metadata file is trusted.
train['secondary_labels'] = [eval(raw) for raw in train['secondary_labels']]
train['source'] = 'bc24'
train[cols]

In [None]:
# Rows of train_additional.csv, tagged with source 'bc00'.
train_additional = pd.read_csv('../input/birdclef-2024/train_additional.csv')
train_additional['source'] = 'bc00'
train_additional['record_name'] = [name.partition('.')[0] for name in train_additional['record']]
train_additional['primary_label'] = train_additional['species']
# No secondary labels are taken from this file: every row gets its own empty list.
train_additional['secondary_labels'] = [[] for _ in range(len(train_additional))]

# Keep only the rows whose record_name does not already occur in `train`.
train_additional = (
    train_additional
    .loc[lambda frame: ~frame['record_name'].isin(train['record_name'].unique())]
    .reset_index(drop=True)
)
train_additional[cols]

In [None]:
# Rows of the Birdclef20 train.csv, tagged with source 'bc20'.
train_20 = pd.read_csv('../input/Birdclef20/train.csv')

train_20['record_name'] = [name.partition('.')[0] for name in train_20['filename']]
train_20['primary_label'] = train_20['ebird_code']
train_20['source'] = 'bc20'

# First drop the rows whose record_name already occurs in `train`, then keep only
# the rows whose primary_label also occurs in `train`.
train_20 = (
    train_20
    .loc[lambda frame: ~frame['record_name'].isin(train['record_name'].unique())]
    .reset_index(drop=True)
    .loc[lambda frame: frame['primary_label'].isin(train['primary_label'].unique())]
    .reset_index(drop=True)
)
train_20[cols]

In [None]:
# One row per distinct (species, ebird_code) pair found in train_20, ordered by species.
df = (
    train_20[['species', 'ebird_code']]
    .drop_duplicates()
    .sort_values('species')
    .reset_index(drop=True)
)

# Lookup from a `species` value to its ebird_code.
species2code = dict(zip(df['species'], df['ebird_code']))

def process_secondary_labels(secondary_labels, species2code):
    """Convert one raw secondary-label cell into a list of label codes.

    Args:
        secondary_labels: String holding a Python list literal. Each entry is
            split on '_' and its second piece is looked up in `species2code`.
        species2code: Mapping from that second piece to a label code.

    Returns:
        The codes of the entries that could be mapped, in their original order.
        Entries that contain no '_' separator, whose second piece is missing
        from `species2code`, or that map to '' are skipped.
    """
    # NOTE(review): eval() executes arbitrary expressions; this is only safe while
    # the CSV the strings come from is trusted.
    entries = eval(secondary_labels)
    labels = []
    for entry in entries:
        pieces = entry.split('_')
        if len(pieces) < 2:
            # Without this guard a single entry lacking '_' raised IndexError and
            # aborted the conversion of the whole column.
            continue
        label = species2code.get(pieces[1], '')
        if label != '':
            labels.append(label)
    return labels

# Replace every raw secondary-label string in train_20 with its list of mapped codes.
train_20['secondary_labels'] = list(
    map(lambda raw: process_secondary_labels(raw, species2code), train_20['secondary_labels'])
)

In [None]:
# Rows of the Birdclef21 train_metadata.csv, tagged with source 'bc21'.
train_21 = pd.read_csv('../input/Birdclef21/train_metadata.csv')
# NOTE(review): eval() executes whatever expression the CSV cell holds, so this is
# only safe while the metadata file is trusted.
train_21['secondary_labels'] = [eval(raw) for raw in train_21['secondary_labels']]
train_21['record_name'] = [name.partition('.')[0] for name in train_21['filename']]
train_21['source'] = 'bc21'

# First drop the rows whose record_name already occurs in `train`, then keep only
# the rows whose primary_label also occurs in `train`.
train_21 = (
    train_21
    .loc[lambda frame: ~frame['record_name'].isin(train['record_name'].unique())]
    .reset_index(drop=True)
    .loc[lambda frame: frame['primary_label'].isin(train['primary_label'].unique())]
    .reset_index(drop=True)
)
train_21[cols]

In [None]:
# Rows of the Birdclef22 train_metadata.csv, tagged with source 'bc22'.
train_22 = pd.read_csv('../input/Birdclef22/train_metadata.csv')
# NOTE(review): eval() executes whatever expression the CSV cell holds, so this is
# only safe while the metadata file is trusted.
train_22['secondary_labels'] = [eval(raw) for raw in train_22['secondary_labels']]
# record_name: second '/'-separated piece of `filename`, cut at its first '.'.
train_22['record_name'] = [path.split('/')[1].partition('.')[0] for path in train_22['filename']]
train_22['source'] = 'bc22'

# First drop the rows whose record_name already occurs in `train`, then keep only
# the rows whose primary_label also occurs in `train`.
train_22 = (
    train_22
    .loc[lambda frame: ~frame['record_name'].isin(train['record_name'].unique())]
    .reset_index(drop=True)
    .loc[lambda frame: frame['primary_label'].isin(train['primary_label'].unique())]
    .reset_index(drop=True)
)
train_22[cols]

In [None]:
# Rows of the Birdclef23 train_metadata.csv, tagged with source 'bc23'.
train_23 = pd.read_csv('../input/Birdclef23/train_metadata.csv')
# NOTE(review): eval() executes whatever expression the CSV cell holds, so this is
# only safe while the metadata file is trusted.
train_23['secondary_labels'] = [eval(raw) for raw in train_23['secondary_labels']]
# record_name: second '/'-separated piece of `filename`, cut at its first '.'.
train_23['record_name'] = [path.split('/')[1].partition('.')[0] for path in train_23['filename']]
train_23['source'] = 'bc23'

# First drop the rows whose record_name already occurs in `train`, then keep only
# the rows whose primary_label also occurs in `train`.
train_23 = (
    train_23
    .loc[lambda frame: ~frame['record_name'].isin(train['record_name'].unique())]
    .reset_index(drop=True)
    .loc[lambda frame: frame['primary_label'].isin(train['primary_label'].unique())]
    .reset_index(drop=True)
)
train_23[cols]

In [None]:
# Stack the shared columns of every source into a single frame with a fresh 0..n-1 index.
all_train = pd.concat(
    [
        frame[cols]
        for frame in (train_20, train_21, train_22, train_23, train_additional, train)
    ],
    ignore_index=True,
)
all_train

In [None]:
# Sort by primary_label, then source, both descending; drop_duplicates then keeps,
# for every repeated record_name, the row that sorts first.
all_train = (
    all_train
    .sort_values(['primary_label', 'source'], ascending=False)
    .reset_index(drop=True)
    .drop_duplicates('record_name')
)
# Position of each row inside its primary_label group, counted in descending
# `source` order; ties are broken by row order (method='first').
all_train['rank'] = (
    all_train.groupby('primary_label')['source'].rank(method='first', ascending=False)
)
all_train

In [None]:
# Number of remaining rows per source tag.
all_train['source'].value_counts()

In [None]:
# Remove the 'bc24' rows, then keep only the rows whose rank is at most 500.
all_train = (
    all_train
    .loc[lambda frame: frame['source'] != 'bc24']
    .reset_index(drop=True)
    .loc[lambda frame: frame['rank'] <= 500]
    .reset_index(drop=True)
)
all_train



In [None]:
# Write the selected rows to disk without the index column.
all_train.to_csv(
    path_or_buf='../input/birdclef-2024/all_train.csv',
    index=False,
)

In [None]:
from pathlib import Path
from tqdm import tqdm
import librosa
# Samples per second; same value as the sr=32000 passed to librosa.load in load_audio.
# The export loop slices `10 * sr` samples from each end of a recording.
sr = 32000

def load_audio(record_name, primary_label, source):
    """Load one recording with librosa at a 32000 Hz sample rate, as float32.

    Args:
        record_name: File name of the recording without its extension.
        primary_label: Name of the sub-directory that holds the file.
        source: Dataset tag, one of 'bc00', 'bc20', 'bc21', 'bc22', 'bc23'.

    Returns:
        The first element returned by ``librosa.load`` (the audio samples),
        cast to ``np.float32``.

    Raises:
        ValueError: If `source` is not one of the known dataset tags.
    """
    # source tag -> (directory that holds the per-label folders, file extension)
    locations = {
        'bc20': ('../input/Birdclef20/train_audio/', '.mp3'),
        'bc21': ('../input/Birdclef21/train_short_audio/', '.ogg'),
        'bc22': ('../input/Birdclef22/train_audio/', '.ogg'),
        'bc23': ('../input/Birdclef23/train_audio/', '.ogg'),
        'bc00': ('../input/birdclef2024-additional-mp3/additional_audio/', '.mp3'),
    }
    if source not in locations:
        # An unknown tag used to fall through the if/elif chain and fail later
        # with an UnboundLocalError on `pathname`; report it explicitly instead.
        raise ValueError(
            f"unknown audio source {source!r}; expected one of {sorted(locations)}"
        )
    root, extension = locations[source]
    pathname = Path(root) / primary_label / (record_name + extension)
    audio = librosa.load(pathname, sr=32000)[0].astype(np.float32)
    return audio

In [None]:
# Save the first and the last `10 * sr` samples of every listed recording as .npy files.
lengths = []  # number of samples of each recording that was loaded
for record_name, primary_label, source in zip(tqdm(all_train.record_name), all_train.primary_label, all_train.source):
    if source in ['bc00', 'bc20', 'bc21', 'bc22', 'bc23']:
        audio = load_audio(record_name, primary_label, source)
        lengths.append(len(audio))
        save_path = Path('../input') / 'birdclef_data' / primary_label
        # np.save does not create missing directories, so make sure the per-label
        # folder exists; otherwise a fresh run fails with FileNotFoundError.
        save_path.mkdir(parents=True, exist_ok=True)
        # For recordings shorter than `10 * sr` samples both slices are the whole signal.
        np.save(save_path / ('first10_' + record_name), audio[: 10 * sr])
        np.save(save_path / ('last10_' + record_name), audio[-10 * sr :])

In [None]:
# Number of recordings whose length was recorded in `lengths`.
len(lengths)