In [1]:
import pandas as pd
import librosa

## Read 2024 Train DF

In [2]:
# Root folder of the competition data; shared by the metadata CSV and the audio files.
data_root = '../input/birdclef-2024'
df = pd.read_csv(f'{data_root}/train_metadata.csv')

# Create Target: integer id of each primary label, numbered in sorted label order.
label_list = sorted(df['primary_label'].unique())
label_id_list = list(range(len(label_list)))
label2id = dict(zip(label_list, label_id_list))
df['target'] = df['primary_label'].map(label2id)

# Create Filepath: location of each recording under the train_audio folder.
df['filepath'] = data_root + '/train_audio/' + df['filename']


# Create Name
def _clip_name(filename):
    """Join the first '/'-separated part of `filename` and the last part's stem with '-'.

    The stem is the text before the first '.' of the last path component.
    """
    parts = filename.split('/')
    return parts[0] + '-' + parts[-1].split('.')[0]


df['name'] = df['filename'].map(_clip_name)

# Durations: librosa is called once per file, so this line dominates the cell's
# runtime. Only the 'filepath' column is needed, hence a column-wise map rather
# than a row-wise DataFrame.apply.
df['Duration'] = df['filepath'].map(lambda path: librosa.get_duration(path=path))

# Preview as the last expression of the cell so that it is actually rendered.
df.head()

## Holdout = approx. X% of the total audio duration of each of the K most underrepresented species

In [32]:
# Holdout configuration: for each of the `k` species with the least total audio
# duration, hold out roughly `percent` (a fraction, not a percentage) of that
# species' total duration.
k = 182
percent = 0.001

# Total recorded duration per species, ascending, and the per-species holdout target.
total_duration_per_species = df.groupby('primary_label')['Duration'].sum().sort_values()
bottom_k_species = total_duration_per_species.head(k)
holdout_bottom_kspecies_vol = bottom_k_species * percent

# For every species, take its shortest recordings first and keep taking them
# while the duration accumulated so far is still below the target. Because the
# check happens before each recording is added, at least one recording is taken
# whenever the target is positive, and the taken duration can overshoot it.
holdout_parts = []
n_audios = []
n_audios_taken = []
for species, target_duration in holdout_bottom_kspecies_vol.items():
    species_audios = df[df['primary_label'] == species].sort_values(by='Duration')
    # Duration accumulated *before* each recording. This reuses the 'Duration'
    # column instead of re-opening every audio file with librosa.
    duration_before = species_audios['Duration'].cumsum().shift(fill_value=0.0)
    taken = species_audios[duration_before < target_duration]
    holdout_parts.append(taken)
    n_audios.append(len(species_audios))
    n_audios_taken.append(len(taken))

# A single concat keeps the column dtypes and avoids quadratic row-by-row
# growth; fall back to an empty frame with df's columns when nothing was selected.
if holdout_parts:
    holdout_df = pd.concat(holdout_parts, ignore_index=True)
else:
    holdout_df = df.iloc[0:0].copy()

# Label of the target column derived from `percent` ('0.1%' for percent = 0.001).
percent_label = f'{percent * 100:g}%'
results_diff = pd.DataFrame({
    'Specie Volumetry': bottom_k_species,
    percent_label: holdout_bottom_kspecies_vol,
    'Volumetry Taken': holdout_df.groupby('primary_label')['Duration'].sum(),
})
results_diff = results_diff.sort_values(by='Specie Volumetry')
results_diff['Remaining'] = results_diff['Specie Volumetry'] - results_diff['Volumetry Taken']
# Attach the counts by species label rather than by position, so they stay
# correct regardless of how the rows of results_diff are ordered.
species_index = holdout_bottom_kspecies_vol.index
results_diff['# Audios'] = pd.Series(n_audios, index=species_index, dtype='int64')
results_diff['# Audios Taken'] = pd.Series(n_audios_taken, index=species_index, dtype='int64')
results_diff['Audios Remaining for Train'] = results_diff['# Audios'] - results_diff['# Audios Taken']
results_diff
# NOTE(review): the "Unnamed: N_level_M" header cells seen in the exported output
# presumably come from the named index ('primary_label') being rendered as a
# second header row, not from the data itself -- confirm.

Unnamed: 0_level_0,Specie Volumetry,0.1%,Volumetry Taken,Remaining,# Audios,# Audios Taken,Audios Remaining for Train
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
niwpig1,49.271844,0.049272,5.0155,44.256344,5,1,4
asiope1,66.758813,0.066759,3.056313,63.7025,5,1,4
integr,75.316719,0.075317,1.854688,73.462031,5,1,4
darter2,89.751500,0.089751,1.933062,87.818437,7,1,6
inpher1,128.041500,0.128042,1.056,126.9855,13,1,12
...,...,...,...,...,...,...,...
hoopoe,28376.377875,28.376378,32.889031,28343.488844,500,6,494
commyn,31232.709937,31.232710,35.304625,31197.405312,237,11,226
grewar3,32810.691094,32.810691,33.717,32776.974094,500,6,494
houspa,44403.307031,44.403307,48.739625,44354.567406,500,9,491


In [33]:
# Reduce the holdout to the two columns that are exported, then write it next
# to the competition data. The file name encodes k and the percentage.
holdout_df = holdout_df.loc[:, ['name', 'filepath']]
holdout_csv_path = f'../input/birdclef-2024/holdout_{k}_under_{percent*100}%_.csv'
holdout_df.to_csv(path_or_buf=holdout_csv_path)

In [5]:
# Recordings inside the bounding box 7 <= latitude <= 21, 65 <= longitude <= 80
# (bounds inclusive). NOTE(review): "wg" presumably stands for the Western
# Ghats region -- confirm.
lat_in_box = df['latitude'].between(7, 21)
lon_in_box = df['longitude'].between(65, 80)

# Species with at least one recording inside the box, and all remaining species.
species_seen_in_wg = list(df.loc[lat_in_box & lon_in_box, 'primary_label'].unique())
is_seen_species = df['primary_label'].isin(species_seen_in_wg)
species_unseen_in_wg = list(df.loc[~is_seen_species, 'primary_label'].unique())

print(len(species_seen_in_wg))
print(len(species_unseen_in_wg))

# Split the recordings by species group. df_wg holds every recording of a
# "seen" species, including those made outside the box.
df_wg = df.loc[is_seen_species].reset_index(drop=True)
df_notwg = df.loc[df['primary_label'].isin(species_unseen_in_wg)].reset_index(drop=True)

165
17
