#### Import Necessary Libraries

In [1]:
import os
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pylab as ply
import seaborn as sns
from itertools import cycle
# Set interactive backend
%matplotlib inline


cmap = mpl.cm.get_cmap('coolwarm')
sns.set_theme(style="white", palette=None)
color_pal = ply.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(ply.rcParams["axes.prop_cycle"].by_key()["color"])


  cmap = mpl.cm.get_cmap('coolwarm')


#### Hugging Face Transformer
Using a Hugging Face Transformer model for audio embedding extraction is a great alternative, especially with models like Wav2Vec 2.0 or HuBERT, which are pre-trained on large-scale audio datasets and can be fine-tuned for specific tasks like bird sound classification.

Below, I'll show you how to use the Wav2Vec 2.0 model from Hugging Face's transformers library to extract embeddings from audio files.



In [2]:
### Load Dataset
DATASET_PATH = 'content/birdclef-2024'
## To handle our settings  and configurations, let's create a class
class Config:    
    #Yamnet Model
    sample_rate = 16000
    preset = 'facebook/wav2vec2-base-960h'
    class_names = sorted(os.listdir(f'{DATASET_PATH}/train_audio/'))
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    label2name = dict(zip(class_labels, class_names))
    name2label = {v:k for k,v in label2name.items()}
    # Define split ratios
    train_ratio = 0.8  # 80% for training
    val_ratio = 0.1    # 10% for validation
    test_ratio = 0.1   # 10% for testing 

### Load and Explore the dataset

In [3]:
df = pd.read_csv(f'{DATASET_PATH}/train_metadata.csv')
df['filepath'] = DATASET_PATH + '/train_audio/' + df.filename
df['target'] = df.primary_label.map(Config.name2label)
df['filename'] = df.filepath.map(lambda x: x.split('/')[-1])
df['xc_id'] = df.filepath.map(lambda x: x.split('/')[-1].split('.')[0])

## display a few rows of the dataframe
df = df.sample(frac=1, random_state=42)
df.head(5)
for row in df.head(5).iterrows():
    print(row[1].filepath)

content/birdclef-2024/train_audio/blrwar1/XC184748.ogg
content/birdclef-2024/train_audio/whtkin2/XC797017.ogg
content/birdclef-2024/train_audio/hoopoe/XC349675.ogg
content/birdclef-2024/train_audio/grnsan/XC478932.ogg
content/birdclef-2024/train_audio/tibfly3/XC645726.ogg


#### Function to load audio files, which will also be used later when working with the training data.

In [4]:
# Load the dataset
df['filepath'] = df.filepath.map(lambda x: x.replace('train_audio', 'train_wav_audio').replace('.ogg', '.wav')) 

In [5]:
for x in df.head(5).iterrows():
    print(x[1].filepath)

content/birdclef-2024/train_wav_audio/blrwar1/XC184748.wav
content/birdclef-2024/train_wav_audio/whtkin2/XC797017.wav
content/birdclef-2024/train_wav_audio/hoopoe/XC349675.wav
content/birdclef-2024/train_wav_audio/grnsan/XC478932.wav
content/birdclef-2024/train_wav_audio/tibfly3/XC645726.wav


####  Prepare the Dataset
We'll create a custom dataset class to load the audio files and extract embeddings.

In [6]:
import torch
from torch.utils.data import Dataset
import librosa
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

class BirdSoundDataset(Dataset):
    def __init__(self, filepaths, labels, feature_extractor, target_length=160000):
        self.filepaths = filepaths
        self.labels = labels
        self.feature_extractor = feature_extractor
        self.target_length = target_length

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        # Load and preprocess the audio file
        waveform, _ = librosa.load(self.filepaths[idx], sr=16000, mono=True)
        
        # Pad or truncate the waveform to the target length
        if len(waveform) < self.target_length:
            padding = np.zeros(self.target_length - len(waveform))
            waveform = np.concatenate([waveform, padding])
        else:
            waveform = waveform[:self.target_length]
        
        # Extract features using the Wav2Vec 2.0 feature extractor
        inputs = self.feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
        
        # Return the input values and label
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            "input_values": inputs.input_values.squeeze(0),  # Remove batch dimension
            "labels": label.unsqueeze(0)
        }



In [7]:
# Load the feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

In [8]:

# Prepare the dataset
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['primary_label'])

In [9]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
train_filepaths, test_filepaths, train_labels, test_labels = train_test_split(
    df['filepath'], df['target'], test_size=0.2, random_state=42
)

# Create datasets
train_dataset = BirdSoundDataset(train_filepaths.tolist(), train_labels.tolist(), feature_extractor)
test_dataset = BirdSoundDataset(test_filepaths.tolist(), test_labels.tolist(), feature_extractor)

#### Define the Model
We'll define a simple neural network on top of the Wav2Vec 2.0 embeddings.

In [10]:
from torch import nn
from transformers import Wav2Vec2Model

class BirdSoundClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),  # Wav2Vec 2.0 embeddings are 768-dimensional
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_values):
        # Get embeddings from Wav2Vec 2.0
        outputs = self.wav2vec2(input_values)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Average over time dimension
        
        # Pass through the classifier
        logits = self.classifier(embeddings)
        print("Model output shape:", logits.shape)  # Debug: Check output shape
        return logits

# Initialize the model
num_classes = len(label_encoder.classes_)
model = BirdSoundClassifier(num_classes)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Use a DataLoader with Smaller Batch Size
Reduce the batch size to process fewer samples at a time, which will lower memory usage.

In [11]:
from torch.utils.data import DataLoader

# Create datasets
train_dataset = BirdSoundDataset(train_filepaths.tolist(), train_labels.tolist(), feature_extractor, target_length=160000)
test_dataset = BirdSoundDataset(test_filepaths.tolist(), test_labels.tolist(), feature_extractor, target_length=160000)

def collate_fn(batch):
    input_values = [item['input_values'] for item in batch]
    labels = torch.tensor([item['labels'].item() for item in batch])  # Extract scalar labels
    input_values = torch.stack(input_values)  # Stack into a batch tensor
    return {"input_values": input_values, "labels": labels}

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)


In [12]:
for batch in train_loader:
    print("Input Values Shape:", batch['input_values'].shape)
    print("Labels Shape:", batch['labels'].shape)
    break

Input Values Shape: torch.Size([4, 160000])
Labels Shape: torch.Size([4])


#### Set Up Training
We'll use Hugging Face's Trainer and TrainingArguments for training.

In [13]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
import numpy as np
# Define a function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=4,   # Reduce batch size
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    logging_dir="./logs",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()



Model output shape: torch.Size([4, 182])


RuntimeError: grad can be implicitly created only for scalar outputs

#### Evaluate the Model
After training, evaluate the model on the test set:

In [None]:
# Evaluate the model
results = trainer.evaluate()
print("Test Accuracy:", results["eval_accuracy"])

#### Save the trained model and label encoder for future use.

In [None]:
import joblib

## make a directory to save the model
os.makedirs("transformers", exist_ok=True)
# Save the model
trainer.save_model("transformers/bird_sound_transformer_classifier")

# Save the label encoder
joblib.dump(label_encoder, "transformers/label_encoder.pkl")