In [12]:
import pandas as pd
import librosa

## Read 2024 Train DF

In [13]:
# Root folder of the BirdCLEF 2024 dataset; the metadata CSV and the audio
# file paths below are both built from it.
DATA_DIR = '../input/birdclef-2024'


def _audio_name(filename):
    """Build a flat clip name from a metadata filename.

    Joins the first '/'-separated component and the last component's text
    before its first '.', with a '-' (e.g. 'folder/clip.ogg' -> 'folder-clip').
    """
    parts = filename.split('/')
    return parts[0] + '-' + parts[-1].split('.')[0]


df = pd.read_csv(DATA_DIR + '/train_metadata.csv')

# Create Target: one integer id per species, assigned in sorted label order
label_list = sorted(df['primary_label'].unique())
label_id_list = list(range(len(label_list)))
label2id = dict(zip(label_list, label_id_list))
df['target'] = df.primary_label.map(label2id)

# Create Filepath
df['filepath'] = DATA_DIR + '/train_audio/' + df.filename

# Create Name
df['name'] = df.filename.map(_audio_name)

# Durations: one librosa.get_duration call per file, so this line dominates
# the cell's runtime. Only the path is needed, hence Series.map rather than
# a row-wise DataFrame.apply.
df['Duration'] = df['filepath'].map(lambda path: librosa.get_duration(path=path))

## Holdout: approximately `percent` of the total audio duration of each of the `k` most underrepresented species

In [14]:
# Number of least-represented species to sample from, and the fraction of each
# species' total duration to move into the holdout set.
k = 61
percent = 0.075

# Total recorded duration per species, ascending, so head(k) yields the k
# species with the least audio.
total_duration_per_species = df.groupby('primary_label')['Duration'].sum().sort_values()
bottom_k_species = total_duration_per_species.head(k)
# Target holdout duration per species.
holdout_bottom_kspecies_vol = bottom_k_species * percent

# Greedy selection: for each species take its shortest clips first, and keep
# taking clips while the duration accumulated so far is below the target.
# The clip that crosses the target is still taken, so the target is overshot.
holdout_parts = []
n_audios = []
n_audios_taken = []
for species, target_duration in holdout_bottom_kspecies_vol.items():
    species_audios = df[df['primary_label'] == species].sort_values(by='Duration')
    # Duration accumulated *before* each clip. This reuses the stored
    # 'Duration' column instead of decoding every audio file a second time.
    duration_before = species_audios['Duration'].cumsum().shift(fill_value=0.0)
    species_holdout = species_audios[duration_before < target_duration]
    holdout_parts.append(species_holdout)
    n_audios.append(len(species_audios))
    n_audios_taken.append(len(species_holdout))

# Concatenate once (instead of once per row) so the cost stays linear and the
# original column dtypes are preserved.
if holdout_parts:
    holdout_df = pd.concat(holdout_parts, ignore_index=True)
else:
    holdout_df = pd.DataFrame(columns=df.columns)

# Summary table. Every column is a Series indexed by species so values are
# aligned by label rather than by position.
species_index = bottom_k_species.index
volumetry_taken = (
    holdout_df.groupby('primary_label')['Duration'].sum()
    .reindex(species_index, fill_value=0.0)
)
# Column label derived from `percent` (e.g. '7.5%') so it cannot drift from the value.
percent_label = f'{percent * 100:g}%'
results_diff = pd.DataFrame({
    'Specie Volumetry': bottom_k_species,
    percent_label: holdout_bottom_kspecies_vol,
    'Volumetry Taken': volumetry_taken,
})
results_diff['Remaining'] = results_diff['Specie Volumetry'] - results_diff['Volumetry Taken']
results_diff['# Audios'] = pd.Series(n_audios, index=species_index)
results_diff['# Audios Taken'] = pd.Series(n_audios_taken, index=species_index)
results_diff['Audios Remaining for Train'] = results_diff['# Audios'] - results_diff['# Audios Taken']
results_diff = results_diff.sort_values(by='Specie Volumetry')
results_diff
# NOTE(review): the "Unnamed: N_level_M" header cells in the saved output are
# presumably the empty cells of the extra header row produced by the index
# name ('primary_label'); results_diff.rename_axis(None) should remove that row - confirm.

Unnamed: 0_level_0,Specie Volumetry,0.1%,Volumetry Taken,Remaining,# Audios,# Audios Taken,Audios Remaining for Train
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
niwpig1,49.271844,3.695388,5.0155,44.256344,5,1,4
asiope1,66.758813,5.006911,13.505281,53.253531,5,2,3
integr,75.316719,5.648754,5.838687,69.478031,5,2,3
darter2,89.751500,6.731362,11.4155,78.336,7,4,3
inpher1,128.041500,9.603112,10.773563,117.267938,13,4,9
...,...,...,...,...,...,...,...
brakit1,1048.545750,78.640931,84.390875,964.154875,44,14,30
lesyel1,1093.075969,81.980698,96.54125,996.534719,22,10,12
gryfra,1158.466906,86.885018,88.398719,1070.068188,56,16,40
yebbul3,1166.743062,87.505730,102.362875,1064.380187,30,8,22


In [15]:
# Keep only the identifying columns of the holdout clips, then write them to
# a CSV whose file name records k and the holdout percentage.
holdout_df = holdout_df.loc[:, ['name', 'filepath']]
holdout_csv_path = '../input/birdclef-2024/holdout_' + str(k) + '_under_' + str(percent*100) + '%_.csv'
holdout_df.to_csv(holdout_csv_path)

In [16]:
# Recordings whose coordinates fall inside the bounding box
# latitude 7..21, longitude 65..80 (bounds inclusive).
# NOTE(review): "wg" presumably stands for Western Ghats - confirm.
recorded_in_box = df['latitude'].between(7, 21) & df['longitude'].between(65, 80)
species_seen_in_wg = list(df.loc[recorded_in_box, 'primary_label'].unique())

# A species counts as "seen" if at least one of its recordings is in the box;
# every other species is "unseen".
has_seen_species = df['primary_label'].isin(species_seen_in_wg)
species_unseen_in_wg = list(df.loc[~has_seen_species, 'primary_label'].unique())

for species_group in (species_seen_in_wg, species_unseen_in_wg):
    print(len(species_group))

# Split the full metadata by species group (all recordings of a seen species
# are kept, including those recorded outside the box).
df_wg = df[has_seen_species].reset_index(drop=True)
df_notwg = df[~has_seen_species].reset_index(drop=True)

165
17


In [17]:
# Display the labels of every species with at least one recording inside the
# latitude 7..21 / longitude 65..80 bounding box.
species_seen_in_wg

['asbfly',
 'ashdro1',
 'ashpri1',
 'ashwoo2',
 'asikoe2',
 'aspfly1',
 'aspswi1',
 'barfly1',
 'bcnher',
 'bkcbul1',
 'bkrfla1',
 'bkskit1',
 'bkwsti',
 'bladro1',
 'blaeag1',
 'blakit1',
 'blhori1',
 'blnmon1',
 'blrwar1',
 'bncwoo3',
 'brakit1',
 'brcful1',
 'brfowl1',
 'brnhao1',
 'brnshr',
 'brodro1',
 'brwjac1',
 'brwowl1',
 'btbeat1',
 'bwfshr1',
 'categr',
 'chbeat1',
 'cohcuc1',
 'comfla1',
 'comior1',
 'comkin1',
 'commyn',
 'compea',
 'comros',
 'comsan',
 'comtai1',
 'copbar1',
 'crbsun2',
 'cregos1',
 'crfbar1',
 'crseag1',
 'dafbab1',
 'emedov2',
 'eucdov',
 'eurbla2',
 'forwag1',
 'goflea1',
 'grbeat1',
 'grecou1',
 'greegr',
 'grefla1',
 'grehor1',
 'grejun2',
 'grenig1',
 'grewar3',
 'grnsan',
 'grnwar1',
 'grtdro1',
 'gryfra',
 'grynig2',
 'grywag',
 'gybpri1',
 'gyhcaf1',
 'heswoo1',
 'hoopoe',
 'houcro1',
 'houspa',
 'inbrob1',
 'indpit1',
 'indrob1',
 'indrol2',
 'indtit1',
 'ingori1',
 'inpher1',
 'insbab1',
 'insowl1',
 'isbduc1',
 'jerbus2',
 'junbab2',
 'junmyn