# Querying Audio with CLAP embeddings

## In this walkthrough, we will take a dataset of audio files and embed them using the CLAP model (https://huggingface.co/docs/transformers/v4.30.0/en/model_doc/clap#transformers.ClapModel)

## Installation Requirements

In [1]:
!pip install librosa
!pip install datasets
!pip install transformers
!pip install torch



In [2]:
from datasets import load_dataset
from transformers import AutoProcessor, ClapModel, AutoTokenizer
import numpy as np
import torch
import vexpresso
from vexpresso.utils import ResourceRequest, DataType

## Load Data

Here we load a dataset of audio files from https://huggingface.co/datasets/ashraq/esc50

In [3]:
# Load the ESC-50 dataset from the Hugging Face Hub (a locally cached copy is reused when one exists).
dataset = load_dataset("ashraq/esc50")

Using custom data configuration ashraq--esc50-1000c3b73cc1500f
Found cached dataset parquet (/home/kokkgoblin/.cache/huggingface/datasets/ashraq___parquet/ashraq--esc50-1000c3b73cc1500f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Convert the `train` split to a dictionary

In [4]:
# Turn the train split into a plain dict of columns, with the 'audio' entry
# replaced by the column obtained from indexing the split directly.
# NOTE(review): presumably to_dict() alone does not give the audio in the form wanted downstream — confirm.
dictionary = {**dataset['train'].to_dict(), 'audio': dataset['train']['audio']}
audios = dictionary['audio']

## Create Collection

Let's create a collection from the audio clips that we downloaded!

In [5]:
# Wrap the column dictionary in a vexpresso collection, selecting the Ray backend.
collection = vexpresso.create(data=dictionary, backend="ray")

[32m2023-06-13 13:41:30.375[0m | [1mINFO    [0m | [36mdaft.context[0m:[36mrunner[0m:[36m71[0m - [1mUsing RayRunner[0m
2023-06-13 13:41:30,377	INFO worker.py:946 -- Connecting to existing Ray cluster at address: 10.0.0.118:57158
2023-06-13 13:41:30,377	INFO worker.py:963 -- Calling ray.init() again after it has already been called.


In [6]:
# Preview the first five rows together with the column names and types.
collection.show(5)

filename Utf8,fold Int64,target Int64,category Utf8,esc10 Boolean,src_file Int64,take Utf8,"audio Struct[array: List[item:Float64], path: Null, sampling_rate: Int64]"
1-100032-A-0.wav,1,0,dog,True,100032,A,"{'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
1-100038-A-14.wav,1,14,chirping_birds,False,100038,A,"{'array': [-0.0118408203125, -0.103363037109375, -0.14141..."
1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A,"{'array': [-0.0069580078125, -0.01251220703125, -0.011260..."
1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B,"{'array': [0.538970947265625, 0.396270751953125, 0.267395..."
1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,"{'array': [-0.0003662109375, -0.000701904296875, -0.00079..."


Let's keep only the `A` takes, filtering out the `B` (and any other) takes

In [7]:
# Keep only the rows whose "take" column equals "A", then call execute() on the filtered collection.
collection = collection.filter({"take":{"eq":"A"}}).execute()

In [8]:
# Preview again: the "B" take (1-100210-B-36.wav) should no longer appear among the first rows.
collection.show(5)

filename Utf8,fold Int64,target Int64,category Utf8,esc10 Boolean,src_file Int64,take Utf8,"audio Struct[array: List[item:Float64], path: Null, sampling_rate: Int64]"
1-100032-A-0.wav,1,0,dog,True,100032,A,"{'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
1-100038-A-14.wav,1,14,chirping_birds,False,100038,A,"{'array': [-0.0118408203125, -0.103363037109375, -0.14141..."
1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A,"{'array': [-0.0069580078125, -0.01251220703125, -0.011260..."
1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,"{'array': [-0.0003662109375, -0.000701904296875, -0.00079..."
1-101336-A-30.wav,1,30,door_wood_knock,False,101336,A,"{'array': [0.0001220703125, 0.00018310546875, 0.000122070..."


Let's take a look at the different categories!

In [9]:
# List the distinct category labels present in the collection (np.unique returns them sorted).
np.unique(collection["category"].to_list())

array(['airplane', 'breathing', 'brushing_teeth', 'can_opening',
       'car_horn', 'cat', 'chainsaw', 'chirping_birds', 'church_bells',
       'clapping', 'clock_alarm', 'clock_tick', 'coughing', 'cow',
       'crackling_fire', 'crickets', 'crow', 'crying_baby', 'dog',
       'door_wood_creaks', 'door_wood_knock', 'drinking_sipping',
       'engine', 'fireworks', 'footsteps', 'frog', 'glass_breaking',
       'hand_saw', 'helicopter', 'hen', 'insects', 'keyboard_typing',
       'laughing', 'mouse_click', 'pig', 'pouring_water', 'rain',
       'rooster', 'sea_waves', 'sheep', 'siren', 'sneezing', 'snoring',
       'thunderstorm', 'toilet_flush', 'train', 'vacuum_cleaner',
       'washing_machine', 'water_drops', 'wind'], dtype='<U16')

Because this is a demo, let's only get one sound from each category

In [10]:
def unique_filter(category):
    """Flag the first occurrence of every category value.

    Args:
        category: iterable of hashable category labels, one per row.

    Returns:
        A list with one entry per input label: the string "valid" the first
        time a label is seen and None for every later repeat.
    """
    seen = set()
    flags = []
    for label in category:
        is_first = label not in seen
        seen.add(label)
        flags.append("valid" if is_first else None)
    return flags

In [11]:
# Flag the first row of each category with unique_filter (stored in a new
# "filter_valid" column), then keep only the rows flagged "valid".
collection = (
    collection
    .apply(unique_filter, collection["category"], to="filter_valid")
    .filter({"filter_valid": {"eq": "valid"}})
    .execute()
)

In [12]:
# Preview: every remaining row now carries the new "filter_valid" column.
collection.show(5)

filename Utf8,fold Int64,target Int64,category Utf8,esc10 Boolean,src_file Int64,take Utf8,"audio Struct[array: List[item:Float64], path: Null, sampling_rate: Int64]",filter_valid Python
1-100032-A-0.wav,1,0,dog,True,100032,A,"{'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",valid
1-100038-A-14.wav,1,14,chirping_birds,False,100038,A,"{'array': [-0.0118408203125, -0.103363037109375, -0.14141...",valid
1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A,"{'array': [-0.0069580078125, -0.01251220703125, -0.011260...",valid
1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,"{'array': [-0.0003662109375, -0.000701904296875, -0.00079...",valid
1-101336-A-30.wav,1,30,door_wood_knock,False,101336,A,"{'array': [0.0001220703125, 0.00018310546875, 0.000122070...",valid


## Multimodal CLAP Embedding function

In [13]:
class ClAPEmbeddingsFunction:
    """Callable that embeds audio arrays or text strings with a CLAP model.

    The model, processor and tokenizer are all loaded from the
    "laion/clap-htsat-unfused" checkpoint. The model is moved to CUDA when a
    GPU is available; otherwise everything stays on the CPU.
    """

    def __init__(self):
        checkpoint = "laion/clap-htsat-unfused"

        self.model = ClapModel.from_pretrained(checkpoint)
        self.processor = AutoProcessor.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.device = torch.device('cpu')

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            self.model = self.model.to(self.device)

    def __call__(self, inp, inp_type):
        """Embed a batch of inputs.

        Args:
            inp: the batch to embed; raw audio arrays when ``inp_type`` is
                "audio", strings when it is "text".
            inp_type: either "audio" or "text"; selects the preprocessing
                step and the model head that is used.

        Returns:
            The features returned by the model, as a NumPy array on the CPU.

        Raises:
            ValueError: if ``inp_type`` is neither "audio" nor "text".
        """
        if inp_type == "audio":
            # NOTE(review): no `sampling_rate` is passed, so the processor emits a
            # warning and falls back to its own default rate — confirm the clips match it.
            inputs = self.processor(audios=inp, return_tensors="pt", padding=True)
            extract_features = self.model.get_audio_features
        elif inp_type == "text":
            inputs = self.tokenizer(inp, padding=True, return_tensors="pt")
            extract_features = self.model.get_text_features
        else:
            # Fail loudly instead of silently returning None for a mistyped mode.
            raise ValueError(
                f"Unsupported inp_type {inp_type!r}; expected 'audio' or 'text'."
            )

        # Every tensor handed to the model must live on the same device as the model.
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Pure inference: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            features = extract_features(**inputs)
        return features.cpu().numpy()

## Now let's embed the audio arrays!

This may take a while. Note that after the filters above we are only embedding one clip per category (50 clips), not all 2000 audio files in the dataset.

In [14]:
# Embed the "audio.array" field with ClAPEmbeddingsFunction in "audio" mode,
# writing the result to a new "audio_embeddings" column and requesting one GPU.
collection = collection.embed(
    collection["audio.array"],
    inp_type="audio",
    embedding_fn=ClAPEmbeddingsFunction,
    to="audio_embeddings",
    resource_request=ResourceRequest(num_gpus=1),
).execute()

[2m[36m(single_partition_pipeline pid=34698)[0m It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


[2m[36m(single_partition_pipeline pid=34698)[0m dict_keys(['input_features', 'is_longer'])


In [15]:
# Preview: the collection now includes an "audio_embeddings" column holding one vector per clip.
collection.show(5)

filename Utf8,fold Int64,target Int64,category Utf8,esc10 Boolean,src_file Int64,take Utf8,"audio Struct[array: List[item:Float64], path: Null, sampling_rate: Int64]",filter_valid Python,audio.array Python,audio_embeddings Python
1-100032-A-0.wav,1,0,dog,True,100032,A,"{'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",valid,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","<np.ndarray shape=(512,) dtype=float32>"
1-100038-A-14.wav,1,14,chirping_birds,False,100038,A,"{'array': [-0.0118408203125, -0.103363037109375, -0.14141...",valid,"[-0.0118408203125, -0.103363037109375, -0.14141845703125,...","<np.ndarray shape=(512,) dtype=float32>"
1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A,"{'array': [-0.0069580078125, -0.01251220703125, -0.011260...",valid,"[-0.0069580078125, -0.01251220703125, -0.011260986328125,...","<np.ndarray shape=(512,) dtype=float32>"
1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,"{'array': [-0.0003662109375, -0.000701904296875, -0.00079...",valid,"[-0.0003662109375, -0.000701904296875, -0.00079345703125,...","<np.ndarray shape=(512,) dtype=float32>"
1-101336-A-30.wav,1,30,door_wood_knock,False,101336,A,"{'array': [0.0001220703125, 0.00018310546875, 0.000122070...",valid,"[0.0001220703125, 0.00018310546875, 0.0001220703125, 0.00...","<np.ndarray shape=(512,) dtype=float32>"
