# Import Libraries

In [1]:
# -*- coding: utf-8 -*-
# libraries for setting up the environment
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['KERAS_BACKEND'] = 'jax'
os.path.join('./')
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from typing import Tuple
# libraries for scientific computing
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
cmap = mpl.colormaps.get_cmap('coolwarm')
# libraries for audio processing
import librosa
import soundfile as sf
# from scipy import signal
from IPython.display import Audio
# libraries for machine learning
from sklearn.model_selection import train_test_split

# import keras_cv
# import keras
# import keras.backend as K
import tensorflow as tf
import tensorflow_io as tfio

# from glob import glob
# from tqdm import tqdm

# Load the audio processing functions from the provided files
from audio import process, extraction, reduction, augmentation

In [2]:
# Report the runtime environment: GPU devices TensorFlow can see and the TensorFlow build.
gpu_devices = tf.config.list_physical_devices('GPU')
tf_version = tf.__version__
print("Is GPU available:", gpu_devices)
print("Tensorflow version:", tf_version)

Is GPU available: []
Tensorflow version: 2.15.1


In [3]:
class Configure:
    """Notebook-wide settings: dataset locations, audio parameters and label tables.

    Every value is a class attribute read as ``Configure.<name>``. The label
    tables at the bottom are built while the class body executes, so
    ``audio_path`` must already exist on disk when this cell runs.
    """

    # --- Dataset locations ---
    data_path = '/home/max/Desktop/birdclef/data'
    # Derived from data_path so the two locations cannot drift apart.
    audio_path = os.path.join(data_path, 'train_audio')
    file_name = 'train_metadata.csv'

    # --- Reproducibility ---
    seed = 42

    # --- Model input / batching ---
    img_size = [128, 384]
    batch_size = 16

    # --- Clip length ---
    duration = 5            # seconds
    sample_rate = 32000     # samples per second
    audio_length = duration * sample_rate  # samples per clip

    # --- Spectrogram parameters ---
    # NOTE(review): frame_length/frame_step carry the same values as
    # window/hop_length; the visible cells only read nfft, window, hop_length,
    # n_mels, fmin and fmax — confirm whether both naming schemes are needed.
    nfft = 1024
    frame_length = 1024
    frame_step = 512
    n_mels = 128
    window = 1024
    hop_length = 512
    fmin = 20
    fmax = 12000

    # --- Training ---
    epochs = 10
    preset = 'efficientnetv2_b2_imagenet'

    augment = True

    # --- Label tables ---
    # One class per sub-directory of audio_path. Plain files (e.g. OS metadata
    # files) are skipped so they cannot become bogus classes and shift the
    # integer labels. Only the outermost iterable may reference class-level
    # names inside a generator expression, hence os.scandir(audio_path) here.
    class_names = sorted(entry.name for entry in os.scandir(audio_path) if entry.is_dir())
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    label2name = dict(zip(class_labels, class_names))
    name2label = dict(zip(class_names, class_labels))


In [4]:
# Seed NumPy's global RNG as well as TensorFlow's: scikit-learn falls back to
# the NumPy global RNG whenever no explicit random_state is passed.
np.random.seed(Configure.seed)
tf.random.set_seed(Configure.seed)

In [5]:
def process_audio_pipeline(file_path: str) -> Tuple[np.ndarray, int, np.ndarray]:
    """Run one recording through load -> denoise -> feature extraction -> augmentation.

    Args:
        file_path: Path of the audio file to process.

    Returns:
        A tuple ``(augmented_audio, sample_rate, features)``. ``features`` is
        the five extracted feature arrays joined along axis 1; it is computed
        from the denoised signal, before augmentation is applied.
    """
    waveform, rate = process.process_audio(
        file_path, Configure.sample_rate, Configure.audio_length
    )
    denoised = reduction.clean_audio(waveform, rate)

    # NOTE(review): joining along axis=1 requires all five arrays to agree on
    # every other axis — confirm against extraction.extract_audio.
    magnitude, phase, mfccs, chroma, mel = extraction.extract_audio(denoised, rate)
    stacked_features = np.concatenate([magnitude, phase, mfccs, chroma, mel], axis=1)

    return augmentation.augment_audio(denoised, rate), rate, stacked_features

def save_spectrogram(audio: np.ndarray, sr: int, output_path: str):
    """Render a mel spectrogram of ``audio`` and write it to ``output_path``.

    The image is saved without axes or padding. The number of mel bands and
    the upper frequency limit come from ``Configure`` (same values as the
    previously hard-coded 128 / 12000).

    Args:
        audio: Waveform samples.
        sr: Sample rate of ``audio``.
        output_path: Destination image path; the format follows the extension.
    """
    # Older librosa releases do not expose the display submodule through a
    # bare `import librosa`, so import it explicitly.
    import librosa.display

    # Compute the data first so a failure here cannot leave a figure open.
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=Configure.n_mels, fmax=Configure.fmax)
    S_dB = librosa.power_to_db(S, ref=np.max)

    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap=cmap, ax=ax)
        ax.axis('off')
        fig.savefig(output_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure: this runs once per file in a long loop.
        plt.close(fig)
    
def process_and_save_spectrograms(df: pd.DataFrame, output_dir: str) -> pd.DataFrame:
    """Process every file listed in ``df`` and save one spectrogram image per row.

    Args:
        df: Frame with at least the columns ``file_path`` and ``file_name``.
        output_dir: Directory for the images; created when missing.

    Returns:
        ``df`` with a fresh 0..n-1 index plus a ``features`` column that holds,
        per row, the feature array returned by ``process_audio_pipeline``.
    """
    os.makedirs(output_dir, exist_ok=True)

    feature_list = []
    for _, row in df.iterrows():
        processed_audio, sample_rate, features = process_audio_pipeline(row['file_path'])

        # Save spectrogram
        output_path = os.path.join(output_dir, f"processed_{row['file_name']}.png")
        save_spectrogram(processed_audio, sample_rate, output_path)
        print(f"Processed spectrogram saved to: {output_path}")

        feature_list.append(features)

    # process_audio_pipeline returns ONE concatenated array per file, so the
    # features cannot be spread over five named columns (building such a frame
    # from a list of multi-dimensional arrays raises). Keep each array intact
    # as a single object cell; filling element by element stops NumPy/pandas
    # from trying to stack the arrays into a higher-dimensional block.
    feature_column = np.empty(len(feature_list), dtype=object)
    for position, feature_array in enumerate(feature_list):
        feature_column[position] = feature_array
    feature_df = pd.DataFrame({'features': feature_column})

    return pd.concat([df.reset_index(drop=True), feature_df], axis=1)

In [14]:
# Process the metadata
metadata = pd.read_csv(f'{Configure.data_path}/{Configure.file_name}')
metadata['file_path'] = Configure.audio_path + '/' + metadata['filename']
metadata['label'] = metadata['primary_label'].map(Configure.name2label)
# File stem: last path component up to its first dot.
metadata['file_name'] = metadata['file_path'].map(lambda x: x.split('/')[-1].split('.')[0])
metadata = metadata[['file_name', 'latitude', 'longitude', 'rating', 'label', 'file_path']]

# Only the first rows are processed as a preview. `.iloc[:, :100]` sliced
# columns, which kept every row of this six-column frame.
num_preview_files = 100
processed_metadata = process_and_save_spectrograms(
    metadata.iloc[:num_preview_files],
    f'{Configure.data_path}/processed_spectrograms',
)

In [None]:
# A fixed random_state keeps the train/validation membership identical across runs.
train_df, valid_df = train_test_split(metadata, test_size=0.2, random_state=Configure.seed)
print(f"Num Train: {len(train_df)} | Num Valid: {len(valid_df)}")

# Data Loader

In [None]:
"""
TODO List:
1. Load audio
2. Data Exploration
3. Resample audio
4. Crop or Pad audio
5. Standardize and Normalize audio
6. Frame and Window audio
7. Trim audio (select frequency range)
8. Extract features (spectrogram, mel spectrogram, MFCC)
9. Augment audio (time shift, pitch shift, speed tuning, noise injection, mixup, mixmatch, cutmix, specaugment, time masking, frequency masking)
10. Split audio data into training and validation sets
11. Build data pipeline
12. Finetune model (efficientnetv2_b2_imagenet, or bird_vocalization_classifier) and Hyperparameter tuning
13. Model evaluation
"""

def build_decoder(with_labels=True, dim=1024):
    """Unfinished decoder factory.

    Only the nested ``get_audio`` helper is defined so far; neither
    ``with_labels`` nor ``dim`` is used yet and the function returns ``None``.
    """
    def get_audio(filepath):
        # Opens the file and reads its sample rate; nothing is returned yet.
        audio_io = tfio.audio.AudioIOTensor(filepath)
        rate = audio_io.rate.numpy()
        

In [None]:
# Load one example recording, built from Configure.audio_path instead of a
# second hard-coded absolute path.
example_ogg_path = os.path.join(Configure.audio_path, 'asbfly', 'XC49755.ogg')
audio = tfio.audio.AudioIOTensor(example_ogg_path)

# Keep at most the first Configure.audio_length samples.
audio_slice = audio[:Configure.audio_length]

# Drop the last axis; tf.squeeze raises unless that axis has size 1.
audio_tensor = tf.squeeze(audio_slice, axis=[-1])

display(Audio(audio_tensor.numpy(), rate=audio.rate.numpy()))
display(plt.plot(audio_tensor.numpy()))


In [None]:
# Find the span to keep; the result is indexed below as [start, stop].
position = tfio.audio.trim(audio_tensor, axis=0, epsilon=0.1, name='trim_silence')
print(position)

start, stop = position[0], position[1]
print(start, stop)

# Play and plot only the retained span.
clip_rate = audio.rate.numpy()
processed = audio_tensor[start:stop]
display(Audio(processed.numpy(), rate=clip_rate))
print(clip_rate)

display(plt.plot(processed.numpy()))

In [None]:
# Apply a fade-in / fade-out to the trimmed clip, then plot and play the result.
fade = tfio.audio.fade(
    processed,
    fade_in=1000,
    fade_out=2000,
    mode='logarithmic',
)
fade_samples = fade.numpy()
plt.plot(fade_samples)
display(Audio(fade_samples, rate=audio.rate.numpy()))

In [None]:
# Spectrogram of the faded clip, shown on a natural-log scale.
spectrogram = tfio.audio.spectrogram(
    fade,
    nfft=Configure.nfft,
    window=Configure.window,
    stride=Configure.hop_length,
)
log_spectrogram = tf.math.log(spectrogram)
plt.imshow(log_spectrogram.numpy(), cmap='coolwarm')

In [None]:
# Map the spectrogram onto the mel scale, shown on a natural-log scale.
mel_spectrogram = tfio.audio.melscale(
    spectrogram,
    rate=Configure.sample_rate,
    mels=Configure.n_mels,
    fmin=Configure.fmin,
    fmax=Configure.fmax,
)
log_mel_spectrogram = tf.math.log(mel_spectrogram)
plt.imshow(log_mel_spectrogram.numpy(), cmap='coolwarm')

In [None]:
# Convert the mel spectrogram to a dB scale (top_db=80) and display it.
dbScale_mel_spectrogram = tfio.audio.dbscale(mel_spectrogram, top_db=80)
db_image = dbScale_mel_spectrogram.numpy()
plt.imshow(db_image, cmap='coolwarm')

In [None]:
# Frequency-masking augmentation applied to the dB-scaled mel spectrogram.
freq_mask = tfio.audio.freq_mask(dbScale_mel_spectrogram, param=50)
freq_masked_image = freq_mask.numpy()
plt.imshow(freq_masked_image, cmap='coolwarm')

In [None]:
# Time-masking augmentation applied to the dB-scaled mel spectrogram.
time_mask = tfio.audio.time_mask(dbScale_mel_spectrogram, param=40)
time_masked_image = time_mask.numpy()
plt.imshow(time_masked_image, cmap='coolwarm')

In [None]:
def load_audio(file_path, duration):
    """Read the leading part of an audio file as a NumPy array.

    Args:
        file_path: Path of the audio file.
        duration: Slice end applied to the audio, i.e. a number of samples
            (the notebook passes ``Configure.audio_length``), not seconds.

    Returns:
        ``(samples, sample_rate)`` with the last axis of the audio squeezed away.
    """
    source = tfio.audio.AudioIOTensor(file_path)
    samples = tf.squeeze(source[:duration], axis=[-1])
    return samples.numpy(), source.rate.numpy()

def resample_audio(data, sample_rate, target_sample_rate):
    """Resample ``data`` to ``target_sample_rate`` when the rates differ.

    Args:
        data: Audio samples.
        sample_rate: Current sample rate of ``data``.
        target_sample_rate: Desired sample rate.

    Returns:
        ``(data, sample_rate)`` unchanged when the rates already match,
        otherwise ``(resampled NumPy array, target_sample_rate)``.
    """
    # Guard clause: nothing to do when the rates already agree.
    if sample_rate == target_sample_rate:
        return data, sample_rate

    resampled = tfio.audio.resample(data, sample_rate, target_sample_rate, name='resampled_audio')
    return resampled.numpy(), target_sample_rate
    

In [None]:
# Demo: load a clip and resample it to the configured rate. The path is built
# from Configure.audio_path instead of a second hard-coded absolute path.
# NOTE(review): this call uses the two-argument load_audio(file_path, duration);
# a later cell redefines load_audio with a different signature, so this cell
# only works before that redefinition has been executed.
resample_example_path = os.path.join(Configure.audio_path, 'asbfly', 'XC164848.ogg')
audio, sr = load_audio(resample_example_path, Configure.audio_length)
print(audio, sr)
audio, sr = resample_audio(audio, sr, Configure.sample_rate)
print(audio, sr)

In [None]:
def load_audio(file_path, target_sample_rate=16000, audio_length=16000):
    """Load a recording as a fixed-length, min-max normalized mono waveform.

    Note: this definition re-uses the name of the earlier two-argument
    ``load_audio`` helper in this notebook and replaces it once this cell runs.

    Pipeline: decode -> mono float32 -> resample (if needed) -> crop / zero-pad
    to ``audio_length`` -> standardize (zero mean, unit std) -> min-max scale.

    Args:
        file_path: Path of the audio file.
        target_sample_rate: Sample rate of the returned waveform.
        audio_length: Number of samples in the returned waveform.

    Returns:
        ``(waveform, target_sample_rate)`` where ``waveform`` is a 1-D tensor of
        ``audio_length`` values in [0, 1] (all zeros for a constant signal).
    """
    audio = tfio.audio.AudioIOTensor(file_path)
    source_rate = int(audio.rate.numpy())

    # Average over the channel axis instead of squeezing it: identical for
    # single-channel input, but also accepts multi-channel files. The float
    # cast lets integer-encoded audio reach the statistics below.
    waveform = tf.reduce_mean(tf.cast(audio.to_tensor(), tf.float32), axis=-1)

    # Resample audio if necessary
    if source_rate != target_sample_rate:
        waveform = tfio.audio.resample(
            waveform,
            rate_in=tf.cast(source_rate, tf.int64),
            rate_out=target_sample_rate,
            name='resampled_audio',
        )

    # Crop to at most audio_length samples, then zero-pad whatever is missing
    # (the pad amount is 0 when the clip was long enough).
    waveform = waveform[:audio_length]
    missing = audio_length - tf.shape(waveform)[0]
    waveform = tf.pad(waveform, [[0, missing]], "CONSTANT")

    # Standardize; divide_no_nan yields 0 where the divisor is 0, which is the
    # same result the explicit zero-std branch produced for constant input.
    centered = waveform - tf.math.reduce_mean(waveform)
    standardized = tf.math.divide_no_nan(centered, tf.math.reduce_std(waveform))

    # Normalize using Min-Max scaling
    min_val = tf.math.reduce_min(standardized)
    max_val = tf.math.reduce_max(standardized)
    normalized = tf.math.divide_no_nan(standardized - min_val, max_val - min_val)

    return normalized, target_sample_rate

In [None]:
# Load the example clip with the configured rate and length; the path is built
# from Configure.audio_path instead of a second hard-coded absolute path.
normalized_example_path = os.path.join(Configure.audio_path, 'asbfly', 'XC49755.ogg')
audio, sr = load_audio(normalized_example_path, Configure.sample_rate, Configure.audio_length)