## Import Libraries 📚

In [None]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # "jax" or "tensorflow" or "torch"

import keras_cv
import keras
import keras.backend as K
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm

import librosa
import IPython.display as ipd
import librosa.display as lid

import matplotlib.pyplot as plt
import matplotlib as mpl

# pyplot.get_cmap is available on every matplotlib release, whereas
# mpl.cm.get_cmap was deprecated in 3.7 and removed in 3.9.
cmap = plt.get_cmap('coolwarm')

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Set random seeds for reproducibility.
# torch's RNG drives weight initialisation and DataLoader shuffling; NumPy's
# global RNG is what scikit-learn falls back to when no random_state is given,
# so both have to be seeded for a run to be repeatable.
torch.manual_seed(42)
np.random.seed(42)

# Configuration
class CFG:
    """Global hyper-parameters for audio preprocessing and training.

    Every value is a class attribute evaluated once, when the class body runs.
    ``num_classes`` lists the training-audio directory at that moment, so the
    directory must exist before this class is defined.
    """

    seed = 42
    # Target spectrogram size; index 1 is the number of time frames used to
    # derive hop_length below (index 0 is presumably the mel-bin count).
    img_size = [128, 384]
    batch_size = 64
    duration = 15  # seconds
    sample_rate = 32000
    audio_len = duration * sample_rate  # clip length in samples
    # FFT size. Kept equal to `window`: librosa-style STFTs require the window
    # to be no longer than the FFT, which the previous value (2028) violated.
    nfft = 2048
    window = 2048
    # Hop that spreads audio_len samples over img_size[1] frames.
    hop_length = audio_len // (img_size[1] - 1)
    fmin = 20
    fmax = 16000
    epochs = 10
    # One class per entry in the training-audio directory.
    num_classes = len(os.listdir("content/birdclef-2024/train_audio/"))
    augment = True

# Load dataset
DATASET_PATH = 'content/birdclef-2024'
df = pd.read_csv(f'{DATASET_PATH}/train_metadata.csv')
df['filepath'] = DATASET_PATH + '/train_audio/' + df.filename

# A label's class index is the position of its folder name in sorted order.
# Labels without a matching folder map to NaN.
class_names = sorted(os.listdir(f"{DATASET_PATH}/train_audio/"))
df['target'] = df.primary_label.map({name: idx for idx, name in enumerate(class_names)})
df['filename'] = df.filepath.map(lambda x: x.split('/')[-1])
df['xc_id'] = df.filepath.map(lambda x: x.split('/')[-1].split('.')[0])

# Split dataset
# Pin the shuffle to CFG.seed so the train/validation partition is identical
# across runs; without random_state every run validates on different clips.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=CFG.seed)

# Audio processing functions
def load_audio(filepath):
    """Load a clip at ``CFG.sample_rate`` and force it to ``CFG.audio_len`` samples.

    Args:
        filepath: Path to an audio file readable by ``librosa.load``.

    Returns:
        Tuple ``(audio, sr)``. ``audio`` holds exactly ``CFG.audio_len``
        samples: recordings that are too short are zero-padded at the end,
        longer ones are cut. ``sr`` is the sample rate reported by librosa.
    """
    # Decode only the first CFG.duration seconds; anything beyond that was
    # discarded anyway, and full recordings can be many times longer.
    audio, sr = librosa.load(filepath, sr=CFG.sample_rate, duration=CFG.duration)

    missing = CFG.audio_len - len(audio)
    if missing > 0:
        audio = np.pad(audio, (0, missing), 'constant')
    else:
        # Guards against an off-by-a-few length after resampling.
        audio = audio[:CFG.audio_len]
    return audio, sr

def get_spectrogram(audio, n_mels=256, n_fft=2048, hop_length=512):
    """Convert a waveform into a min-max normalised log-mel spectrogram.

    Args:
        audio: Waveform samples at ``CFG.sample_rate``.
        n_mels: Number of mel bands.
        n_fft: FFT size passed to librosa.
        hop_length: Hop between successive frames, in samples.

    Returns:
        Array of dB-scaled mel energies rescaled to ``[0, 1]``. A spectrogram
        with no dynamic range (e.g. a silent clip) yields all zeros.
    """
    spec = librosa.feature.melspectrogram(
        y=audio,
        sr=CFG.sample_rate,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
        fmin=CFG.fmin,
        fmax=CFG.fmax,
    )
    spec = librosa.power_to_db(spec, ref=1.0)

    lowest = spec.min()
    span = spec.max() - lowest
    if span == 0:
        # Constant input: dividing would give 0/0 = NaN and poison the loss.
        return np.zeros_like(spec)
    return (spec - lowest) / span

# Custom Dataset
class BirdCLEFDataset(Dataset):
    """Map-style dataset yielding ``(spectrogram image, class index)`` pairs.

    Args:
        df: DataFrame with a ``filepath`` and a ``target`` column, one row per
            clip.
        transform: Optional callable applied to the ``(3, H, W)`` spectrogram
            tensor before it is returned.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # load_audio already returns exactly CFG.audio_len samples, so no
        # further truncation is needed here.
        audio, _ = load_audio(row.filepath)
        spec = get_spectrogram(audio)

        # Replicate the single-channel spectrogram into three identical
        # channels, laid out channel-first.
        image = torch.as_tensor(spec, dtype=torch.float32).unsqueeze(0).repeat(3, 1, 1)
        if self.transform is not None:
            image = self.transform(image)

        label = torch.tensor(row.target, dtype=torch.long)
        return image, label

# DataLoader
# One dataset per split; only the training loader reshuffles every epoch so
# that validation batches stay in a fixed order.
train_dataset, valid_dataset = BirdCLEFDataset(train_df), BirdCLEFDataset(valid_df)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=CFG.batch_size)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=CFG.batch_size)

# Model
class EfficientNetV2(nn.Module):
    """torchvision EfficientNetV2-S with its final linear layer replaced.

    Args:
        num_classes: Number of output logits produced by the new head.
    """

    def __init__(self, num_classes):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision; the weights enum
        # requests the same ImageNet checkpoint explicitly.
        self.backbone = models.efficientnet_v2_s(
            weights=models.EfficientNet_V2_S_Weights.IMAGENET1K_V1
        )
        # Swap the stock classifier layer for one sized to this task.
        head_in = self.backbone.classifier[1].in_features
        self.backbone.classifier[1] = nn.Linear(head_in, num_classes)

    def forward(self, x):
        """Run ``x`` through the backbone and return its raw logits."""
        return self.backbone(x)

# Train on the GPU when one is available and fall back to the CPU otherwise.
# The model and every batch must live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EfficientNetV2(CFG.num_classes).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
for epoch in range(CFG.epochs):
    # --- training pass: gradients enabled, weights updated per batch ---
    model.train()
    train_loss = 0.0
    for inputs, labels in tqdm(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Reported value is the mean of the per-batch losses.
    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader)}")

    # --- validation pass: eval mode, no gradient tracking ---
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for inputs, labels in tqdm(valid_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
    print(f"Epoch {epoch+1}, Valid Loss: {valid_loss/len(valid_loader)}")