In [None]:
!pip install fiftyone huggingface-hub

In [4]:
import os 
import numpy as np
from scipy.io import wavfile
from datasets import load_dataset

# Load the "train" split of the `ashraq/esc50` dataset, using the current directory as the cache.
esc_fifty = load_dataset(
    "ashraq/esc50", 
    split="train",
    cache_dir='.')

def organize_esc10_dataset(dataset, base_output_dir="esc10_organized"):
    """Write the ESC-10 subset of ``dataset`` to disk as 16-bit PCM WAV files.

    Files are laid out as ``<base_output_dir>/<category>/<filename>``.

    Args:
        dataset: Object exposing ``filter(fn)`` whose rows are mappings with
            ``esc10``, ``category``, ``filename`` and ``audio`` keys, where
            ``audio`` holds ``array`` (the waveform) and ``sampling_rate``.
        base_output_dir: Root directory of the output tree; created if missing.

    Returns:
        The filtered ESC-10 subset, as returned by ``dataset.filter``.
    """
    os.makedirs(base_output_dir, exist_ok=True)

    # Keep only the rows flagged as belonging to the ESC-10 subset
    esc10_samples = dataset.filter(lambda x: x['esc10'] == True)

    for sample in esc10_samples:
        category_dir = os.path.join(base_output_dir, sample['category'])
        os.makedirs(category_dir, exist_ok=True)

        wav_path = os.path.join(category_dir, sample['filename'])

        audio_array = np.asarray(sample['audio']['array'])

        # Peak-normalize to [-1, 1]. A silent clip has a peak of 0; dividing by it
        # would fill the array with NaNs, so such clips are left unscaled.
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            audio_array = audio_array / peak

        # Scale to the int16 range expected for 16-bit PCM
        audio_array = (audio_array * 32767).astype(np.int16)

        wavfile.write(
            wav_path,
            sample['audio']['sampling_rate'],
            audio_array
        )

    print(f"Dataset organized in {base_output_dir}")
    return esc10_samples

# Export the ESC-10 clips as WAV files under the default "esc10_organized" directory.
organize_esc10_dataset(esc_fifty)

Now, let's [download a plugin](https://github.com/danielgural/audio_loader/tree/main) that will create spectrograms from the audio files.

FiftyOne's plugin framework lets you extend and customize the functionality of FiftyOne to suit your needs. If you’re interested in learning more about plugins, you might be interested in attending one of our monthly workshops. You can [see the full schedule here](https://voxel51.com/computer-vision-events/) and look for the *Advanced Computer Vision Data Curation and Model Evaluation* workshop.

In [None]:
# Download the audio_loader plugin (provides the "Load Audio" operator used below).
!fiftyone plugins download https://github.com/danielgural/audio_loader

Once the plugin is downloaded there are two ways you can use it.

1. You can launch the FiftyOne app in your local browser by opening the terminal and running: `fiftyone app launch`. Once the app has launched hit the backtick (\`\) button on your keyboard, this will open the Operator browser. Type in "Load Audio" and click on the operator. This will open up the form for the Load Audio plugin which you can fill in (each element of the form will appear once you populate each one). You can choose to kick off a [delegated service](https://docs.voxel51.com/plugins/developing_plugins.html#delegated-execution) if you'd like. 

Below is an example of the form:

<img src="load_audio_form.png" width="50%"/>

The plugin will take some moments to run, depending on the size of your dataset. In this case, it should take no more than 1 minute.

2. Alternatively, instead of launching the app via terminal, you can launch the app in the cell of a Jupyter Notebook. To do that you must first create a dummy dataset and then launch the app in the cell. The pattern for this is as follows:

```python
import fiftyone as fo

dummy_dataset = fo.Dataset()

fo.launch_app(dummy_dataset)
```
Once the app has launched, press the backtick key (\`\) to open the Operator browser, then follow the instructions outlined above.

In both cases, you can then load the dataset once it has been created. Depending on what you named your dataset, you can load it as follows:

In [6]:
import fiftyone as fo

# Assumes the dataset was named "esc10" in the Load Audio form; change this if you chose another name.
audio_dataset = fo.load_dataset("esc10")

Now let’s install a plugin that allows us to create custom dashboards and glean more insight into our dataset:

In [None]:
# Download only the @voxel51/dashboard plugin from the fiftyone-plugins repository.
!fiftyone plugins download \
    https://github.com/voxel51/fiftyone-plugins \
    --plugin-names @voxel51/dashboard

In [None]:
# Open the FiftyOne App on the loaded audio dataset.
fo.launch_app(audio_dataset)

We'll need the labels, so we can get them like so:

In [8]:
# Distinct values of the `ground_truth.label` field; reused later as candidate labels and in the VLM prompts.
audio_classes = audio_dataset.distinct("ground_truth.label")

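Next, we'll embed each clip with [Music2Latent](https://github.com/SonyCSLParis/music2latent), an audio autoencoder that compresses raw waveforms into a compact latent representation. Averaging its latent features over time gives us one embedding vector per audio clip.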

**Note:** you should be on `torch<2.6` and `torchvision<0.21.0` when running this section.

In [None]:
# music2latent provides the audio encoder; librosa is used to load and resample the WAV files.
!pip install music2latent librosa

In [None]:
import librosa
from torch.nn.functional import normalize

from music2latent import EncoderDecoder

# Sampling rate (Hz) that every clip is resampled to before being encoded.
MUSIC2LATENT_SAMPLE_RATE = 44100

music_to_latent_model = EncoderDecoder()

# autosave=True persists the new field on each sample as the loop advances
for sample in audio_dataset.iter_samples(autosave=True):
    # librosa resamples the file to the requested rate while loading
    loaded_wave, _ = librosa.load(sample["wav_path"], sr=MUSIC2LATENT_SAMPLE_RATE)
    latents = music_to_latent_model.encode(loaded_wave, extract_features=True)
    # Average over the last axis (presumably time) to get one fixed-size vector per clip
    embedding = latents.mean(dim=-1).squeeze(0)
    # L2-normalize the embedding
    normalized_embedding = normalize(embedding, p=2, dim=0)
    sample["wav_embedding"] = normalized_embedding.detach().cpu().numpy()  # shape (8192,)

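Next up is [CLAP](https://huggingface.co/laion/clap-htsat-unfused) (Contrastive Language-Audio Pretraining) from LAION. CLAP was trained on audio-text pairs, so it can embed both audio clips and text descriptions.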



We'll also use this model below for zero-shot audio classification.

In [13]:
import torch
from torch.nn.functional import normalize

import librosa

from transformers import ClapModel, ClapProcessor

# Sampling rate (Hz) used both for loading the audio and for the CLAP processor.
CLAP_SAMPLE_RATE = 48000

device = "cuda" if torch.cuda.is_available() else "cpu"

clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused").to(device)

clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")

# autosave=True persists the new field on each sample as the loop advances
for sample in audio_dataset.iter_samples(autosave=True):
    loaded_wave, _ = librosa.load(sample["wav_path"], sr=CLAP_SAMPLE_RATE)
    clap_inputs = clap_processor(
        audios=loaded_wave,
        return_tensors="pt",
        sampling_rate=CLAP_SAMPLE_RATE,
    ).to(device)
    # Inference only: disable autograd so no computation graph is built per sample
    with torch.no_grad():
        audio_embed = clap_model.get_audio_features(**clap_inputs).squeeze(0)
    # L2-normalize the embedding
    normalized_embedding = normalize(audio_embed, p=2, dim=0)
    sample["clap_embeddings"] = normalized_embedding.detach().cpu().numpy()  # shape (512,)

I'll also compute embeddings using AIMv2, which is a vision encoder. [Read this blog](https://medium.com/voxel51/visual-understanding-with-aimv2-76c58dcd68f9) for a deep dive into the AIMv2 family of models.

This, dare I say, "multimodal" approach to analyzing embeddings provides different ways of exploring and understanding audio content, ultimately leading to an experiment with vision-language models (VLMs). Models like Music2Latent and CLAP operate directly on the raw audio waveforms, capturing temporal patterns, frequency relationships, and acoustic features in their native form. 

In parallel, we can compute embeddings using AIMv2 on the spectrograms - visual representations that encode time-frequency relationships in a 2D format.  This sets up (at least what I think is) a fascinating comparison: while the audio-specific models represent our 'traditional' approach to audio understanding, the spectrogram-based analysis might hint at the suitability of a vision-language model to perform audio classification. 

By converting audio into spectrograms, we can potentially tap into the sophisticated visual pattern recognition and semantic understanding capabilities of VLMs, even though they weren't specifically trained on audio data.

In [None]:
# Download the AIMv2 embeddings plugin.
!fiftyone plugins download https://github.com/harpreetsahota204/aim-embeddings-plugin

In [None]:
import fiftyone.operators as foo

# Look up the AIMv2 embedding operator by its URI so it can be called like a function.
embedding_operator = foo.get_operator("@harpreetsahota/aimv2_embeddings/compute_aimv2_embeddings")

In [None]:
# Run the AIMv2 operator on the dataset; results are written to the "aimv2_embeddings" field.
embedding_operator(
    audio_dataset,
    model_name="apple/aimv2-large-patch14-224",  # Choose any supported model
    embedding_types="mean",  # Either "cls" or "mean"
    emb_field="aimv2_embeddings",  # Name for the embeddings field
)

Let's visualize our embeddings to better understand how our different models are grouping similar audio classes. 

Since our embeddings are high-dimensional, we'll use UMAP to reduce them to 2D for visualization. This will help us see if the models are clustering similar sound classes together.

In [None]:
import fiftyone.brain as fob

embedding_fields = ["aimv2_embeddings", "wav_embedding", "clap_embeddings"]

for embedding_field in embedding_fields:
    # Strip the trailing "_embedding"/"_embeddings" so every brain key has the same shape:
    # "aimv2_viz", "wav_viz", "clap_viz".
    model_prefix = embedding_field.rsplit("_embedding", 1)[0]
    # Project the stored embeddings to 2D with UMAP and register the result under the brain key
    fob.compute_visualization(
        audio_dataset,
        embeddings=embedding_field,
        method="umap",
        brain_key=f"{model_prefix}_viz",
        num_dims=2,
    )

In [None]:
# Reopen the App to explore the embedding visualizations computed above.
fo.launch_app(audio_dataset)

Before testing our VLM approach on spectrograms, we'll establish a baseline using a specialized audio model. 

We'll use LAION's CLAP model with a zero-shot audio classification pipeline. This model was specifically trained on audio-text pairs and can classify audio into arbitrary categories without needing to be fine-tuned on our specific sound class labels.

This will give us a reference point for how well a dedicated audio model performs on our sound classification task, which we can later compare against our VLM-based approach using spectrograms.

In [None]:
from transformers import pipeline

# Build a zero-shot audio classification pipeline backed by the same CLAP checkpoint used for the embeddings.
zsc_audio_classifier = pipeline(
    task="zero-shot-audio-classification", 
    model="laion/clap-htsat-unfused"
    )

In [None]:
# Classify every clip against the dataset's own class names and store the result per sample.
for sample in audio_dataset.iter_samples(autosave=True):
    ranked_preds = zsc_audio_classifier(sample["wav_path"], candidate_labels=audio_classes)
    # Only the first entry is kept (assumes the pipeline returns its best match first)
    top_pred = ranked_preds[0]
    sample["zsc_audio_preds"] = fo.Classification(
        label=top_pred["label"],
        confidence=top_pred["score"],
    )


#### Model evaluation in FiftyOne

You can use the [`evaluate_classifications`](https://docs.voxel51.com/tutorials/evaluate_classifications.html?highlight=evaluate%20classification) method to evaluate the predictions of the zero-shot classifiers. This will return a `ClassificationResults` instance that provides various methods for generating aggregate evaluation reports about your model.

By default, the classifications will be treated as a generic multiclass classification task, and for illustration purposes, I am explicitly requesting that simple evaluation be used by setting the method argument to `simple`; but you can specify other evaluation strategies such as `top-k` accuracy or `binary` evaluation via the method parameter.



In [None]:
# Compare the CLAP zero-shot predictions with the ground truth; results are stored under `eval_key`.
audio_dataset.evaluate_classifications(
    pred_field="zsc_audio_preds",
    gt_field="ground_truth",
    method="simple",
    eval_key="clap_simple_eval",  # plain string: there is nothing to interpolate
)

In [None]:
# Reopen the App to inspect the zero-shot predictions and evaluation results.
fo.launch_app(audio_dataset)

# VLMs

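First, let's define the prompts we'll send to the vision-language models along with each spectrogram. The class names are joined into a single comma-separated string so they can be interpolated into the prompt text.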

In [None]:
# Define the comma-separated class list before displaying it, so this cell works on a fresh kernel.
string_audio_classes = ', '.join(audio_classes)
string_audio_classes

In [38]:
# Comma-separated class names, interpolated into the prompts below.
string_audio_classes = ', '.join(audio_classes)

# Detailed prompt: explains what a spectrogram is and restricts the answer to a single class name.
vlm_query_prompt_long = f"""Your task is to analyze a spectrogram, which is a visual representation of the frequency spectrum 
of sound over time, and determine the most likely sound class from a given list of possibilities. 

Analyze the spectrogram image, considering factors such as frequency patterns, intensity, and time variations. 

Focus solely on the patterns presented in the spectrogram. Do not let any assumptions about common sounds or 
environmental settings influence your decision.  

Here are the classes: {string_audio_classes}. 

Your response must always contain the exact name of the class only. 

For example, if you believe the spectrogram matches best with rain, your response would be rain. 

Here is the spectrogram:
"""

In [43]:
# Compact prompt variant; this is the one passed to the Moondream and Janus operators below.
vlm_query_prompt_short = f"""Which of the following classes {string_audio_classes} does this spectrogram best represent? 
Respond only with the name of the class, nothing more.
"""


### Moondream

In [None]:
# Download the Moondream2 plugin.
!fiftyone plugins download https://github.com/harpreetsahota204/moondream2-plugin

In [None]:
# Install the Python packages the Moondream2 plugin declares as requirements.
!fiftyone plugins requirements @harpreetsahota/moondream2 --install

In [None]:
import fiftyone.operators as foo

# Look up the Moondream operator by its URI.
moondream = foo.get_operator("@harpreetsahota/moondream2/moondream")

# Ask Moondream the short prompt for every sample; answers go to "moondream_classification".
# NOTE(review): delegate=True presumably queues this as a delegated operation, which
# needs a delegated service to be running — confirm against the plugin docs.
await moondream(
    audio_dataset,
    revision="2025-01-09",
    operation="query",
    output_field="moondream_classification",
    query_text=vlm_query_prompt_short,
    delegate=True
)

# Janus classification on spectrogram

Janus-Pro is an advanced multimodal model designed for both multimodal understanding and visual generation, emphasizing improvements in understanding tasks. The model's architecture is built upon decoupled visual encoding, which allows it to handle the differing representation needs of these two types of tasks more effectively.

NOTE: This plugin only supports multimodal understanding tasks. 

In [None]:
# Download the Janus VQA plugin.
!fiftyone plugins download https://github.com/harpreetsahota204/janus-vqa-fiftyone

In [None]:
# Install the Python packages the Janus VQA plugin declares as requirements.
!fiftyone plugins requirements @harpreetsahota/janus_vqa --install

In [25]:
import fiftyone.operators as foo

# Look up the Janus VQA operator by its URI.
janus_vqa = foo.get_operator("@harpreetsahota/janus_vqa/janus_vqa")

In [None]:
# Ask Janus-Pro-1B the short prompt for every sample; answers go to "janus_classification".
# NOTE(review): delegate=True presumably queues this as a delegated operation, which
# needs a delegated service to be running — confirm against the plugin docs.
await janus_vqa(
    audio_dataset,
    model_path="deepseek-ai/Janus-Pro-1B",
    question=vlm_query_prompt_short,
    question_field="query",
    answer_field="janus_classification",
    delegate=True
)

In [None]:
# Reopen the App to compare the VLM answers with the ground truth and the CLAP baseline.
fo.launch_app(audio_dataset)