In [1]:
from google.colab import drive
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split as tts
import numpy as np
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
import sys
# Make the scripts folder on Drive importable; this must run after the mount
# and before the import below (birdclef_utils presumably lives in that folder).
sys.path.append('/content/drive/MyDrive/Main_Birdclef/scripts')
import birdclef_utils


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Fetch and unpack the dataset archive via the project helper. Per the captured
# cell output, it extracts ColabUploads.zip into /content/data.
birdclef_utils.retrieve_and_process_birdclef_data(zip_filename='ColabUploads.zip')

# Directory layout: extracted data lives under main_dir, while drive_dir is the
# root of the mounted Google Drive.
main_dir = '/content/data/'
main_processed_dir = os.path.join(main_dir, 'ColabUploads')
processed_dir = os.path.join(main_processed_dir, 'KaggleUploads')
drive_dir = '/content/drive/MyDrive'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully extracted all files from ColabUploads.zip to /content/data


In [3]:
# Load the recordings table; this cell relies on its 'primary_label' and
# 'cleaned_filename' columns.
df = pd.read_csv(os.path.join(processed_dir, 'speech_cleaned_audio_with_duration.csv'))

# Classes with at most 4 rows are kept out of the stratified split and are
# distributed across train/val/test by hand further down.
value_counts = df['primary_label'].value_counts()
rare_counts = value_counts[value_counts <= 4]
for bird, count in rare_counts.items():
    print(f'{bird}:{count}')
under_represented = list(rare_counts.index)

# One mask drives both views; .copy() yields independent frames so later
# edits do not trigger SettingWithCopyWarning.
is_rare = df['primary_label'].isin(under_represented)
df_under_represented = df[is_rare].copy()
df_filtered = df.loc[~is_rare, ['primary_label', 'cleaned_filename']].copy()

# Stratified split of the well-represented classes.
# Step 1 holds out 30% of the rows as the test set; step 2 takes 15% of the
# remaining 70% as validation, i.e. roughly 59.5% train / 10.5% val / 30% test.
training_fnames_filtered, test_fnames_filtered = tts(
    df_filtered,
    test_size=0.3,
    stratify=df_filtered['primary_label'],
    random_state=42,
)
train_fnames_filtered, val_fnames_filtered = tts(
    training_fnames_filtered,
    test_size=0.15,
    stratify=training_fnames_filtered['primary_label'],
    random_state=42,
)

# Only the filenames are carried forward, as plain Python lists so that the
# rare-class files can be appended to them afterwards.
train_filenames = train_fnames_filtered['cleaned_filename'].tolist()
val_filenames = val_fnames_filtered['cleaned_filename'].tolist()
test_filenames = test_fnames_filtered['cleaned_filename'].tolist()

under_represented_grouped = df_under_represented.groupby('primary_label')

# For a class with n files: which file indices go to (train, val, test).
# NOTE: small classes are deliberately duplicated so every label shows up in
# every split -- with 2 files the second one is shared by val and test, and
# with 1 file the same recording is placed in train, val AND test. Metrics on
# those classes are therefore not measured on unseen data.
rare_split_plan = {
    4: ((0, 1), (2,), (3,)),
    3: ((0,), (1,), (2,)),
    2: ((0,), (1,), (1,)),
    1: ((0,), (0,), (0,)),
}

for label in under_represented:
    try:
        group = under_represented_grouped.get_group(label)
    except KeyError:
        print(f"Label '{label}' not found in under_represented_grouped (shouldn't happen).")
        continue

    filenames = list(group['cleaned_filename'].values)
    num_files = len(filenames)

    plan = rare_split_plan.get(num_files)
    if plan is None:
        print(f"Unexpected number of files ({num_files}) for underrepresented bird: {label}")
        continue

    # Append the planned files to each split, preserving the group's row order.
    for split_list, indices in zip((train_filenames, val_filenames, test_filenames), plan):
        split_list.extend(filenames[i] for i in indices)

# Sanity check: every label in the full table must occur in each split.
def _labels_covered_by(filenames):
    """Return the unique primary_label values of df rows whose cleaned_filename is in `filenames`."""
    in_split = df['cleaned_filename'].isin(filenames)
    return df.loc[in_split, 'primary_label'].unique()


all_labels = df['primary_label'].unique()
train_labels = _labels_covered_by(train_filenames)
val_labels = _labels_covered_by(val_filenames)
test_labels = _labels_covered_by(test_filenames)

print(f"\nNumber of unique birds in full dataset: {len(all_labels)}")
print(f"Number of unique birds in training set: {len(train_labels)}")
print(f"Number of unique birds in validation set: {len(val_labels)}")
print(f"Number of unique birds in test set: {len(test_labels)}")

# Compare as sets so ordering differences between the arrays are irrelevant.
expected_labels = set(all_labels)
assert expected_labels == set(train_labels)
assert expected_labels == set(val_labels)
assert expected_labels == set(test_labels)
# Persist the three filename lists as .npy files under the Drive project folder.
output_dir = os.path.join(drive_dir, 'Main_Birdclef', 'supplemental_files')

# np.save does not create missing directories, so make sure the target exists.
# The Drive root is checked first: creating it ourselves would hide an
# unmounted Drive and silently write the files to ephemeral local disk.
if not os.path.isdir(drive_dir):
    raise FileNotFoundError(
        f"Drive directory not found (is Google Drive mounted?): {drive_dir}"
    )
os.makedirs(output_dir, exist_ok=True)

created_paths = []
for split_name, split_filenames in (
    ('train', train_filenames),
    ('val', val_filenames),
    ('test', test_filenames),
):
    split_path = os.path.join(output_dir, f'{split_name}_filenames.npy')
    np.save(split_path, split_filenames)
    created_paths.append(split_path)

# Same report as before: a header line followed by one tab-indented path per file.
print("Created Files:\n" + "\n".join(f"\t{path}" for path in created_paths))


523060:4
868458:4
134933:4
1192948:4
1462711:3
1194042:3
24292:3
65419:3
67082:2
66016:2
66578:2
81930:2
476537:2
528041:2
66531:2
41778:2
47067:2
42087:2
42113:2
21116:2
1139490:2
21038:2
64862:2

Number of unique birds in full dataset: 206
Number of unique birds in training set: 206
Number of unique birds in validation set: 206
Number of unique birds in test set: 206
Created Files:
	/content/drive/MyDrive/Main_Birdclef/supplemental_files/train_filenames.npy
	/content/drive/MyDrive/Main_Birdclef/supplemental_files/val_filenames.npy
	/content/drive/MyDrive/Main_Birdclef/supplemental_files/test_filenames.npy
