In [9]:
import pandas as pd
import librosa

## Read 2024 Train DF

In [10]:
df = pd.read_csv('../input/birdclef-2024/train_metadata.csv')
df.head()

# Create Target
label_list = sorted(df['primary_label'].unique())
label_id_list = list(range(len(label_list)))
label2id = dict(zip(label_list, label_id_list))
df['target'] = df.primary_label.map(label2id)

# Create Filepath
df['filepath'] = '../input/birdclef-2024' + '/train_audio/' + df.filename

# Create Name
df['filename'] = df['filename'].apply(lambda x: x.split('.')[0].replace('/', '-'))


# Durations
df['Duration'] = df.apply(lambda x: librosa.get_duration(path=x['filepath']), axis=1)

## Holdout

1) Takes the k less represented species
2) Gets percent of each of the k species
3) Removes all the audios with ratings leq than rating_leq_than

The output name file is adjusted to show many species remained after the rating removal

In [11]:
k = 55
percent = 0.075
rating_leq_than = 3

In [12]:
total_duration_per_species = df.groupby('primary_label')['Duration'].sum().sort_values()
bottom_k_species = total_duration_per_species.sort_values().head(k)
holdout_bottom_kspecies_vol = bottom_k_species * percent

holdout_df = pd.DataFrame(columns=df.columns)
n_audios = []
n_audios_taken = []
for species, total_duration in holdout_bottom_kspecies_vol.items():
    species_audios = df[df['primary_label'] == species].sort_values(by='Duration')
    n_audios.append(len(species_audios))
    specie_dur = 0
    n = 0
    for idx, row in species_audios.iterrows():
        if specie_dur >= total_duration:
            break
        else:
            if row['rating'] <= rating_leq_than:
                holdout_df = pd.concat([holdout_df, row.to_frame().transpose()])
                specie_dur += librosa.get_duration(path=row.filepath)
                n+=1
            else:
                pass
    n_audios_taken.append(n)
holdout_df.reset_index(drop=True, inplace=True)
results_diff = pd.DataFrame({'Specie Volumetry':bottom_k_species, '0.1%': holdout_bottom_kspecies_vol, 'Volumetry Taken': holdout_df.groupby('primary_label')['Duration'].sum().sort_values()})
results_diff.sort_values(by='Specie Volumetry', inplace=True)
results_diff['Remaining'] = (results_diff['Specie Volumetry'] - results_diff['Volumetry Taken'])
results_diff['# Audios'] = n_audios
results_diff['# Audios Taken'] = n_audios_taken
results_diff['Audios Remaining for Train'] = results_diff['# Audios'] - results_diff['# Audios Taken']
results_diff
# UNNAMED COLUMN WEIRD

Unnamed: 0_level_0,Specie Volumetry,0.1%,Volumetry Taken,Remaining,# Audios,# Audios Taken,Audios Remaining for Train
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
niwpig1,49.271844,3.695388,5.0155,44.256344,5,1,4
asiope1,66.758813,5.006911,3.056313,63.7025,5,1,4
integr,75.316719,5.648754,16.702031,58.614688,5,2,3
darter2,89.7515,6.731362,60.72,29.0315,7,1,6
inpher1,128.0415,9.603112,13.855844,114.185656,13,3,10
blaeag1,128.845219,9.663391,0.940406,127.904812,6,1,5
pomgrp2,141.862063,10.639655,15.768,126.094063,9,1,8
wbbfly1,162.641938,12.198145,,,7,0,7
bncwoo3,215.660031,16.174502,,,7,0,7
scamin3,226.748063,17.006105,2.951,223.797063,12,1,11


In [13]:
# Species removed due to the rating constraint
results_diff[results_diff['# Audios Taken'] == 0]

Unnamed: 0_level_0,Specie Volumetry,0.1%,Volumetry Taken,Remaining,# Audios,# Audios Taken,Audios Remaining for Train
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
wbbfly1,162.641938,12.198145,,,7,0,7
bncwoo3,215.660031,16.174502,,,7,0,7
nilfly2,248.737969,18.655348,,,8,0,8
kerlau2,433.521125,32.514084,,,14,0,14
jerbus2,676.55,50.74125,,,19,0,19
sttwoo1,706.081188,52.956089,,,20,0,20


In [14]:
# New Number of species
act_len = len(results_diff[results_diff['# Audios Taken'] != 0])
print(act_len)

# How many audios are in holdout
print(len(holdout_df))

49
169


In [15]:
holdout_df = holdout_df[['filename']]
holdout_df.to_csv(path_or_buf='../input/birdclef-2024/holdout_'+str(k)+'bottom_'+str(percent*100)+'%_' + 'ratleq' + str(rating_leq_than) + '_.csv')

In [16]:
# Unused, just a way to separe species never in estern ghats
species_seen_in_wg = (list(df[(df['latitude'] >= 7) & (df['latitude'] <= 21) & (df['longitude'] >= 65) & (df['longitude'] <= 80)]['primary_label'].unique()))
species_unseen_in_wg = list(df[~df['primary_label'].isin(species_seen_in_wg)]['primary_label'].unique())

print(len(species_seen_in_wg))
print(len(species_unseen_in_wg))

df_wg = df[df['primary_label'].isin(species_seen_in_wg)].reset_index(drop=True)
df_notwg = df[df['primary_label'].isin(species_unseen_in_wg)].reset_index(drop=True)

165
17
