# Confidentiality Statement

As the organizers of this contest, we assure all participants that their submitted models and code will be treated with strict confidentiality. 

Submissions will only be accessed by the designated review team for evaluation purposes and will not be shared, distributed, or used beyond the scope of this challenge.

Participants retain full ownership of their work. We will not claim any rights over the submitted materials, nor will we use them for any purpose outside of the challenge evaluation process.

We appreciate your participation in this challenge.

# Query by Vocal Imitation: Submission Template


This is the submission template for the Query by Vocal Imitation challenge at the 2025 AES International Conference on Artificial Intelligence and Machine Learning for Audio.

Instructions are contained in the code and comments, so please read them carefully. You should only modify a single code block (the one marked `CHANGE THIS BLOCK`) and leave the others untouched.

In [None]:
"""
DO NOT MODIFY THIS BLOCK.
"""

!pip install -q "numpy<2" tqdm soundfile resampy

The following block contains the abstract base class `QBVModel`, which you should subclass to wrap your model.

In [None]:
"""
DO NOT MODIFY THIS BLOCK.
"""

from abc import ABC, abstractmethod
import numpy as np

# Target sampling rate: the test block resamples every item and query
# recording to this rate before passing it to the model.
SAMPLE_RATE = 16000

class QBVModel(ABC):
    """Interface a model has to implement to take part in the evaluation.

    A concrete subclass supplies two operations: embedding a single recording,
    and scoring previously stored item embeddings against a set of queries.
    """

    @abstractmethod
    def embed_item(self, audio: np.ndarray) -> np.ndarray:
        """Produce the embedding of one audio recording.

        Args:
            audio (np.ndarray): The recording to embed, given as a one-dimensional numpy array.

        Returns:
            embedding (np.ndarray): The embedding of the recording, as a one-dimensional numpy array.
        """

    @abstractmethod
    def compute_scores(
        self,
        embedding_files: dict[str, str],
        queries: dict[str, np.ndarray],
    ) -> dict[str, dict[str, float]]:
        """Score every stored item embedding against every query.

        Exactly one floating point score is expected per <embedding, query> pair;
        a higher score means the pair is more similar.

        Args:
            embedding_files (dict[str, str]): Maps each item id to the path of the file that
                holds the embedding previously computed for that item.
            queries (dict[str, np.ndarray]): Maps each query id to the audio of that query.

        Returns:
            scores (dict[str, dict[str, float]]): Maps each query id to a dictionary that in
                turn maps item ids to the similarity score of that item for the query. E.g:
                {
                    "query_1": {
                        "item_1": 0.8,
                        "item_2": 0.6,
                        ...
                    },
                    "query_2": {
                        "item_1": 0.4,
                        "item_2": 0.9,
                        ...
                    },
                    ...
                }
        """

The following block contains a baseline implementation of a MobileNetV3 model as used in [1] to illustrate the expected structure of the model class.

[1] https://dcase.community/documents/workshop2024/proceedings/DCASE2024Workshop_Greif_36.pdf

In [None]:
"""
DO NOT MODIFY THIS BLOCK.
"""

!git clone https://github.com/jonathan-greif/qbv.git
!pip install pandas torchaudio torch torchvision pytorch-lightning==1.9.4


from qbv.helpers.get_module import get_module
from qbv.helpers.utils_test import get_single_emb, padding
import torch
import torch.nn as nn
from tqdm import tqdm
from numpy.linalg import norm

class MobileNetV3(nn.Module, QBVModel):
    """Baseline `QBVModel` built on the "MN" network loaded through the `qbv` repository.

    Embeddings come from the `qbv` helpers `padding` and `get_single_emb`;
    an item and a query are compared by the cosine similarity of their embeddings.
    """

    def __init__(self):
        super().__init__()
        # get_module returns a 3-tuple; only its first element (the network) is kept.
        self.model, _, _ = get_module("MN", False, False, "", (None, None))
        self.sr = SAMPLE_RATE
        # NOTE(review): this value is only stored; nothing in this class moves the
        # model or its inputs to the selected device -- confirm whether the qbv
        # helpers take care of device placement.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def forward(self, x):
        """Compute the embedding tensor for a single recording.

        Args:
            x (np.ndarray): Audio samples of one recording.

        Returns:
            The value returned by `get_single_emb` for the padded audio
            (`embed_item` converts it to a numpy array).
        """
        # NOTE(review): the arguments look like "32 kHz, 10 seconds" while audio is
        # resampled to SAMPLE_RATE (16 kHz) -- confirm against
        # qbv.helpers.utils_test.padding that this is intended.
        x = padding(x, 32000, 10)
        x = torch.from_numpy(x)
        return get_single_emb(self.model, "MN", x)

    def embed_item(self, item: np.ndarray) -> np.ndarray:
        """Embed one recording and return the embedding as a numpy array.

        Args:
            item (np.ndarray): Audio samples of the recording.

        Returns:
            np.ndarray: The embedding, detached from the graph, with singleton
                dimensions squeezed out, and moved to the CPU.
        """
        return self(item).detach().squeeze().cpu().numpy()

    def compute_scores(
        self, embedding_files: dict[str, str], queries: dict[str, np.ndarray]
    ) -> dict[str, dict[str, float]]:
        """Compute the cosine similarity between every query and every stored item embedding.

        Args:
            embedding_files (dict[str, str]): Maps item ids to `.npy` files holding the item embeddings.
            queries (dict[str, np.ndarray]): Maps query ids to the query audio.

        Returns:
            dict[str, dict[str, float]]: `scores[query_id][item_id]` is the cosine
                similarity of the pair. A pair in which either embedding has zero
                norm is scored 0.0 instead of NaN, so the result stays valid JSON.
        """
        scores = {key: {} for key in queries}

        # Embed each query once and keep its norm, so neither is recomputed
        # for every item in the loop below.
        query_embs = {}
        for key, audio in queries.items():
            query_emb = self.embed_item(audio)
            query_embs[key] = (query_emb, norm(query_emb))

        for item, emb_file in tqdm(embedding_files.items()):
            embedding = np.load(emb_file)
            embedding_norm = norm(embedding)
            for query_name, (query_emb, query_norm) in query_embs.items():
                denominator = embedding_norm * query_norm
                if denominator == 0:
                    # Cosine similarity is undefined for a zero vector; avoid 0/0 -> NaN.
                    scores[query_name][item] = 0.0
                else:
                    sim = np.dot(embedding, query_emb) / denominator
                    scores[query_name][item] = sim.item()
        return scores


### TODO: Change the block below

In [None]:
"""
CHANGE THIS BLOCK: Download any resources you require in this block (git repos, checkpoints etc.), and implement and instantiate your model.
"""

# class YourAwesomeModel(QBVModel):
#     def embed_item(self, audio: np.ndarray) -> np.ndarray:
#         pass
#
#     def compute_scores(
#         self, embedding_files: dict[str, str], queries: dict[str, np.ndarray]
#     ) -> dict[str, dict[str, float]]:
#         pass

# The test block calls `qbv_model.embed_item` and `qbv_model.compute_scores`,
# so the model instance must stay bound to the name `qbv_model`.
qbv_model = MobileNetV3()


### Test Block


In [None]:
"""
DO NOT MODIFY THIS BLOCK. This block calls your model to generate embeddings and compute scores. Make sure that your model is able to run this block without any errors.
"""

import glob
import json
import os

from resampy import resample
import soundfile as sf

items_path = "items"
embeddings_path = "embeddings"
queries_path = "queries"

embedding_files = {}
query_files = {}


def _load_audio(path):
    """Read an audio file and return it as mono audio resampled to SAMPLE_RATE."""
    audio, sr = sf.read(path)
    if audio.ndim > 1:
        # Multi-channel files are read as (frames, channels). Down-mix them so
        # the model receives one-dimensional audio and resampling runs along
        # the time axis rather than across channels.
        audio = audio.mean(axis=1)
    return resample(audio, sr, SAMPLE_RATE)


def _stem(path):
    """Return the file name of `path` without directory and extension."""
    return os.path.splitext(os.path.basename(path))[0]


# np.save does not create missing directories.
os.makedirs(embeddings_path, exist_ok=True)

# Sorted so that files are processed in a reproducible order.
for item in sorted(glob.glob(os.path.join(items_path, "*.wav"))):
    item_name = _stem(item)
    audio = _load_audio(item)
    emb = qbv_model.embed_item(audio)

    emb_file = os.path.join(embeddings_path, item_name + ".npy")
    np.save(emb_file, emb)

    embedding_files[item_name] = emb_file

for query in sorted(glob.glob(os.path.join(queries_path, "*.wav"))):
    query_name = _stem(query)
    query_files[query_name] = _load_audio(query)

scores = qbv_model.compute_scores(embedding_files, query_files)

with open("scores.json", "w") as f:
    json.dump(scores, f)
