In [42]:
## Importing Libraries
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # "jax" or "tensorflow" or "torch" 

import keras_cv
import keras
import keras.backend as K
import tensorflow as tf
import numpy as np 
import pandas as pd

from glob import glob
from tqdm import tqdm

import librosa
import IPython.display as ipd
import librosa.display as lid

import matplotlib.pyplot as plt
import matplotlib as mpl

cmap = mpl.cm.get_cmap('coolwarm')

  cmap = mpl.cm.get_cmap('coolwarm')


In [43]:
## Set the test data path
DATASET_PATH = 'content/birdclef-2024'
#`unlabeled_soundscapes` is the set of data downloaded together with the training data.
#`test_soundscapes` will be populated on the site when submission of the results from `unlabbed_soundscapes` is done.
test_paths = glob(f'{DATASET_PATH}/test_soundscapes/*ogg')
if len(test_paths)==0:
    # current_paths = glob(f'{DATASET_PATH}/unlabeled_soundscapes/*ogg')
    test_paths = glob(f'{DATASET_PATH}/unlabeled_soundscapes/*ogg')
    print(len(test_paths))
test_df = pd.DataFrame(test_paths, columns=['filepath'])
test_df.head()

8444


Unnamed: 0,filepath
0,content/birdclef-2024/unlabeled_soundscapes/18...
1,content/birdclef-2024/unlabeled_soundscapes/26...
2,content/birdclef-2024/unlabeled_soundscapes/66...
3,content/birdclef-2024/unlabeled_soundscapes/12...
4,content/birdclef-2024/unlabeled_soundscapes/16...


In [44]:
## To handle our settings  and configurations, let's create a class
class Config:
    seed = 42
    # Input image size and batch size
    img_size = [128, 384]
    
    # Audio duration, sample rate, and length
    duration = 15 # second
    sample_rate = 32000
    audio_len = duration*sample_rate
    
    # STFT parameters
    nfft = 2028
    window = 2048
    hop_length = audio_len // (img_size[1] - 1)
    fmin = 20
    fmax = 16000
    
    #model name
    preset = 'efficientnetv2_b2_imagenet'
    class_names = sorted(os.listdir(f'{DATASET_PATH}/train_audio/'))
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    label2name = dict(zip(class_labels, class_names))
    name2label = {v:k for k,v in label2name.items()} 

In [45]:
# print(Config.class_names)
# for file in Config.class_names:
#     print(file)

tf.keras.utils.set_random_seed(Config.seed)
print(test_df.shape)


(8444, 1)


In [46]:

## Load the model

# Create an input layer for the model
inp = keras.layers.Input(shape=(Config.img_size[0], Config.img_size[1], 3))
# Pretrained backbone
backbone = keras_cv.models.EfficientNetV2Backbone.from_preset(
    Config.preset,
)
out = keras_cv.models.ImageClassifier(
    backbone=backbone,
    num_classes=Config.num_classes,
    name="classifier"
)(inp)
# Build model
model = keras.models.Model(inputs=inp, outputs=out)
# Load weights of trained model
model.load_weights("best_model.weights.h5")

In [47]:
def build_decoder(with_labels=True, dim=1024):
    def get_audio(filepath):
        def _load_audio(filepath):
            audio, _ = librosa.load(filepath.numpy().decode('utf-8'), sr=Config.sample_rate, mono=True)
            # Pad or truncate the audio to a fixed length
            if len(audio) < Config.audio_len:
                audio = np.pad(audio, (0, Config.audio_len - len(audio)), mode='constant')
            else:
                audio = audio[:Config.audio_len]
            return audio.astype(np.float32)

        audio = tf.py_function(_load_audio, [filepath], tf.float32)
        audio.set_shape([Config.audio_len])  # Ensure fixed shape
        return audio

    def create_frames(audio, duration=5, sr=32000):
        frame_size = int(duration * sr)
        # Pad the audio to ensure it can be divided evenly into frames
        padding_size = (frame_size - (len(audio) % frame_size)) % frame_size
        audio = tf.pad(audio, [[0, padding_size]])
        frames = tf.reshape(audio, [-1, frame_size])
        return frames

    def apply_preproc(spec):
        mean = tf.math.reduce_mean(spec)
        std = tf.math.reduce_std(spec)
        spec = tf.where(tf.math.equal(std, 0), spec - mean, (spec - mean) / std)
        min_val = tf.math.reduce_min(spec)
        max_val = tf.math.reduce_max(spec)
        spec = tf.where(tf.math.equal(max_val - min_val, 0), spec - min_val, (spec - min_val) / (max_val - min_val))
        return spec

    def decode(path):
        audio = get_audio(path)
        print(f"Audio shape: {audio.shape}")
        
        audio = create_frames(audio)
        print(f"Frames shape: {audio.shape}")
        
        # Compute the spectrogram
        spec = keras.layers.MelSpectrogram(
            num_mel_bins=Config.img_size[0],
            fft_length=Config.nfft,
            sequence_stride=Config.hop_length,
            sampling_rate=Config.sample_rate
        )(audio)
        print(f"Spectrogram shape before adding channel: {spec.shape}")
        
        # Add a channel dimension
        spec = tf.expand_dims(spec, axis=-1)  # Shape: [batch_size, height, width, 1]
        print(f"Spectrogram shape after adding channel: {spec.shape}")
        
        # Transpose the spectrogram to (batch_size, width, height, channels)
        spec = tf.transpose(spec, perm=[0, 2, 1, 3])  # Swap height and width
        print(f"Spectrogram shape after transpose: {spec.shape}")
        
        # Resize the spectrogram to the expected width (384)
        if spec.shape[2] != Config.img_size[1]:
            spec = tf.image.resize(spec, [Config.img_size[0], Config.img_size[1]])
        print(f"Spectrogram shape after resize: {spec.shape}")
        
        # Apply preprocessing
        spec = apply_preproc(spec)
        
        # Convert to 3-channel image
        spec = tf.tile(spec, [1, 1, 1, 3])
        print(f"Final spec shape: {spec.shape}")
        
        return spec
    return decode

In [48]:
# Build data loader
def build_dataset(paths, batch_size=1, decode_fn=None, cache=False):
    if decode_fn is None:
        decode_fn = build_decoder(dim=Config.audio_len) # decoder
    AUTO = tf.data.experimental.AUTOTUNE
    slices = (paths,)
    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.map(decode_fn, num_parallel_calls=AUTO) # decode audio to spectrograms then create frames
    ds = ds.cache() if cache else ds # cache files
    ds = ds.batch(batch_size, drop_remainder=False) # create batches
    ds = ds.prefetch(AUTO)
    return ds

In [49]:
def kaggleSubmission():
        # Initialize empty list to store ids
    ids = []

    # Initialize empty array to store predictions
    preds = np.empty(shape=(0, Config.num_classes), dtype='float32')

    # Build test dataset
    test_paths = test_df.filepath.tolist()
    test_ds = build_dataset(paths=test_paths, batch_size=1)

    # Iterate over each audio file in the test dataset
    for idx, specs in enumerate(tqdm(iter(test_ds), desc='test ', total=len(test_df))):
        # Extract the filename without the extension
        filename = test_paths[idx].split('/')[-1].replace('.ogg','')
        
        # Convert to backend-specific tensor while excluding extra dimension
        specs = keras.ops.convert_to_tensor(specs[0])
        
        # Predict bird species for all frames in a recording using all trained models
        frame_preds = model.predict(specs, verbose=0)
        
        # Create a ID for each frame in a recording using the filename and frame number
        frame_ids = [f'{filename}_{(frame_id+1)*5}' for frame_id in range(len(frame_preds))]
        
        # Concatenate the ids
        ids += frame_ids
        # Concatenate the predictions
        preds = np.concatenate([preds, frame_preds], axis=0)


        # Submit prediction
    pred_df = pd.DataFrame(ids, columns=['row_id'])
    pred_df.loc[:, Config.class_names] = preds
    pred_df.to_csv('submission.csv',index=False)

In [50]:

def simple_prediction():
    # Initialize lists to store file names and predicted classes
    file_names = []
    predicted_classes = []

    # Build test dataset
    test_paths = test_df.filepath.tolist()
    test_ds = build_dataset(paths=test_paths, batch_size=1)

    # Iterate over each audio file in the test dataset
    for idx, specs in enumerate(tqdm(iter(test_ds), desc='Predicting', total=len(test_df))):
        # Extract the filename
        filename = test_paths[idx].split('/')[-1]
        
        # Convert to backend-specific tensor while excluding extra dimension
        specs = keras.ops.convert_to_tensor(specs[0])
        
        # Predict bird species for all frames in a recording
        frame_preds = model.predict(specs, verbose=0)
        
        # Get the predicted class for each frame (average predictions across frames)
        avg_preds = np.mean(frame_preds, axis=0)
        predicted_class_idx = np.argmax(avg_preds)
        predicted_class = Config.label2name[predicted_class_idx]
        
        # Append results to lists
        file_names.append(filename)
        predicted_classes.append(predicted_class)

    # Create a DataFrame with the results
    results_df = pd.DataFrame({
        'File': file_names,
        'Predicted Class': predicted_classes
    })

    # Save the results to a CSV file
    results_df.to_csv('predictions.csv', index=False)
    print("Predictions saved to 'predictions.csv'")

In [52]:
# simple_prediction()
kaggleSubmission()

Audio shape: (480000,)
Frames shape: (3, 160000)
Spectrogram shape before adding channel: (3, 128, 128)
Spectrogram shape after adding channel: (3, 128, 128, 1)
Spectrogram shape after transpose: (3, 128, 128, 1)
Spectrogram shape after resize: (3, 128, 384, 1)
Final spec shape: (3, 128, 384, 3)


test : 100%|██████████| 8444/8444 [11:39<00:00, 12.07it/s]
