Install libraries as needed

In [1]:
!pip install sounddevice
!pip install soundfile
!pip install numpy
!pip install torch
!pip install torchaudio

Defaulting to user installation because normal site-packages is not writeable


Clone ImageBind into the current directory

In [10]:
# Turn the current working directory into a Git repository and pull the `main` branch of
# ImageBind into it, so the repository's files land next to this notebook (presumably so
# that `data` and `models` can be imported directly further down).
# NOTE(review): this cell is probably not re-runnable as-is; `git remote add origin` is
# expected to fail once the remote already exists. Confirm before relying on Run All.
!git init
!git remote add origin https://github.com/facebookresearch/ImageBind.git
!git pull origin main

Initialized empty Git repository in C:/Users/Dell/Downloads/COE/197Z/p2/.git/


From https://github.com/facebookresearch/ImageBind
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> origin/main


Download SPEECHCOMMANDS Dataset

In [12]:
import requests
import os
from tqdm import tqdm
import gzip
import shutil
import os
import tarfile

# Fetch and unpack the Speech Commands v0.02 dataset in three stages
# (download .tar.gz -> gunzip to .tar -> untar to a folder). Each stage is skipped when
# its output already exists, so re-running the cell does not repeat finished work.

file_path = "./speech_commands_v0.02.tar.gz"
if not os.path.exists(file_path):
    url = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
    folder_path = './'  # Replace with the desired folder path

    file_name = url.split('/')[-1]  # Extract the file name from the URL
    file_path = os.path.join(folder_path, file_name)

    # Write to a temporary name and rename at the end: an interrupted transfer must not
    # leave a truncated archive that the existence check above accepts on the next run.
    partial_path = file_path + '.part'
    block_size = 1024 * 1024  # 1 MB chunks; 1 KB chunks are needlessly slow for a multi-GB file

    with requests.get(url, stream=True, timeout=30) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the archive.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        with open(partial_path, 'wb') as archive_file, \
                tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
            # The loop variable is deliberately not called `data`: that name is the
            # ImageBind module imported later in this notebook.
            for chunk in response.iter_content(block_size):
                archive_file.write(chunk)
                progress_bar.update(len(chunk))

    os.replace(partial_path, file_path)

    print(f"File downloaded and saved to {file_path}")

if not os.path.exists('./speech_commands_v0.02.tar'):
    print("Extracting gz...")
    file_path = './speech_commands_v0.02.tar.gz'  # Replace with the path to your .gz file
    folder_path = './'  # Replace with the desired folder path

    # Extract the file name from the path
    file_name = os.path.basename(file_path)

    # Construct the output file path by removing the .gz extension
    output_file_path = os.path.join(folder_path, file_name[:-len('.gz')])

    # Same temporary-name-then-rename approach as the download, for the same reason.
    partial_path = output_file_path + '.part'
    with gzip.open(file_path, 'rb') as gz_file, open(partial_path, 'wb') as out_file:
        shutil.copyfileobj(gz_file, out_file)
    os.replace(partial_path, output_file_path)

    print(f"File extracted and saved to {output_file_path}")

if not os.path.exists('./speech_commands_v0.02/'):
    print("Extracting tar...")
    file_path = './speech_commands_v0.02.tar'  # Replace with the path to your .tar file
    folder_path = './'  # Replace with the desired folder path

    # Extract the file name from the path
    file_name = os.path.basename(file_path)

    # Construct the output folder path by removing the .tar extension
    output_folder_path = os.path.join(folder_path, file_name[:-len('.tar')])

    with tarfile.open(file_path, 'r') as tar:
        # The archive was fetched over plain HTTP, so treat it as untrusted: where the
        # interpreter supports extraction filters, refuse members that would escape the
        # destination folder or create special files.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(output_folder_path, filter='data')
        else:
            tar.extractall(output_folder_path)

    print(f"File extracted and saved to {output_folder_path}")

100%|██████████| 2.43G/2.43G [02:13<00:00, 18.3MB/s] 


File downloaded and saved to ./speech_commands_v0.02.tar.gz
Extracting gz...
File extracted and saved to ./speech_commands_v0.02.tar
Extracting tar...
File extracted and saved to ./speech_commands_v0.02


Import Modules and Declare Functions

In [16]:
import sounddevice as sd
import soundfile as sf
import numpy as np
import data
import torch, torchaudio
from models import imagebind_model
from models.imagebind_model import ModalityType

def record_to_wav(filename, duration, sample_rate):
    """Record audio with sounddevice and store it as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Length of the recording in seconds.
        sample_rate: Sampling rate in Hz, used for both recording and writing.

    Returns:
        ``filename``, unchanged, so the call can be used inline as a path.

    Note:
        The sample rate and a single (mono) channel are written to
        ``sd.default``, so they remain in effect after this function returns.
    """
    recorder_defaults = sd.default
    recorder_defaults.samplerate = sample_rate
    recorder_defaults.channels = 1  # mono

    print("Recording started. Speak into the microphone now!")
    frame_count = int(duration * sample_rate)
    recording = sd.rec(frame_count, dtype='float32')

    # sd.rec() was started without blocking; wait here until the buffer is filled.
    sd.wait()

    sf.write(filename, recording, sample_rate)
    print(f"Recording saved as {filename}.")

    return filename

def pad_wav(input_path, duration=5, output_path="./temp_sound.wav"):
    """Zero-pad a WAV file symmetrically so it lasts exactly ``duration`` seconds.

    Args:
        input_path: Path of the WAV file to read.
        duration: Target length in seconds.
        output_path: Where the padded copy is written. Defaults to the scratch
            file ``./temp_sound.wav``, which is overwritten on every call.

    Returns:
        ``input_path`` unchanged when the clip is already at least ``duration``
        seconds long (nothing is written); otherwise ``output_path``.
    """
    # Load the waveform from the input WAV file
    waveform, sample_rate = torchaudio.load(input_path)

    target_samples = int(duration * sample_rate)
    missing_samples = target_samples - waveform.size(-1)
    if missing_samples <= 0:
        # Already long enough: hand back the original file rather than a copy.
        return input_path

    # Split the silence between both ends; when the deficit is odd the extra sample
    # goes at the end so the result is exactly `target_samples` long.
    leading = missing_samples // 2
    trailing = missing_samples - leading

    # Pad along the last (time) axis only, so every channel is padded independently
    # instead of the channels being flattened into one another.
    padded_waveform = torch.nn.functional.pad(waveform, (leading, trailing))

    # Save the padded waveform to a new WAV file
    torchaudio.save(output_path, padded_waveform, sample_rate)

    return output_path


Read Testing list file

In [17]:
# Load the official test split: one dataset-relative WAV path per line.
file_path = './speech_commands_v0.02/testing_list.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. a trailing empty line): an empty entry would later break
    # the code that splits each path on "/" to recover the class name.
    test_list = [line.strip() for line in file if line.strip()]

Pick whether the sample is recorded from the microphone or chosen at random from the test split

In [21]:
# Ask where the demo clip comes from. `input()` returns a string, and both "0" and "1"
# are truthy, so the answer has to be compared explicitly rather than tested with `if choice:`.
choice = input("Enter 0 if you want to record your own voice, 1 if you want to use a sample from test split: ").strip()
if choice == "0":
    # Record 5 seconds at 16 kHz into the scratch file.
    sample = record_to_wav("./temp_sound.wav", 5, 16000)
    correct_class = "recorded, not in test split"
elif choice == "1":
    rand_sample = np.random.randint(len(test_list))
    sample = "./speech_commands_v0.02/"+test_list[rand_sample]
    print(sample)
    # The path is "./speech_commands_v0.02/<class>/<file>.wav", so the class is component 2.
    correct_class = sample.split("/")[2]
else:
    raise ValueError(f"Expected 0 or 1, got {choice!r}")

Recording started. Speak into the microphone now!
Recording saved as ./temp_sound.wav.


Demo: Classify sound sample and show correct class if from test samples

In [22]:
# Candidate labels; they double as the text prompts the audio clip is matched against.
CLASSES = ['silence', 'unknown', 'backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy', 'house', 'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero']
text_list = CLASSES

# Pad the chosen clip and wrap its path in a one-element list for the loader.
wav = pad_wav(sample)
audio_paths = [wav]

# Run on the GPU when one is available, otherwise on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Build the pretrained model, switch it to inference mode and move it to the device.
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Prepare one entry per modality (text first, then audio).
inputs = {}
inputs[ModalityType.TEXT] = data.load_and_transform_text(text_list, device)
inputs[ModalityType.AUDIO] = data.load_and_transform_audio_data(audio_paths, device)

# Inference only, so gradients are not tracked.
with torch.no_grad():
    embeddings = model(inputs)

# Score the clip against every label and normalise the scores over the labels.
audio_text_similarity = embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T
results = torch.softmax(audio_text_similarity, dim=-1)
res_list = results.tolist()[0]
best_index = res_list.index(max(res_list))
print("Imagebind Inference: ",CLASSES[best_index])
print("Ground Truth: ", correct_class)


Evaluating the model by running all or part of the test split

In [None]:
test_percentage = 100  # Percentage of the test split to evaluate; takes values from 0 to 100
# NOTE: this keeps only the first `test_percentage` percent of the list, in file order,
# and overwrites `test_list`. Re-running the cell with a value below 100 therefore
# shrinks the list again; re-run the cell that reads the testing list to restore it.
test_list = test_list[:int(len(test_list)*test_percentage/100)]

print_flag = False
score = 0

# The text prompts are the same for every clip, so embed them once up front instead of
# once per test sample.
with torch.no_grad():
    text_embeddings = model(
        {ModalityType.TEXT: data.load_and_transform_text(text_list, device)}
    )[ModalityType.TEXT]

for sound in test_list:
    # Entries in the testing list are relative to the dataset folder ("<class>/<file>.wav"):
    # the class is the first path component and the file has to be opened under the
    # dataset root.
    correct_class = sound.split("/")[0]
    audio_paths = [pad_wav("./speech_commands_v0.02/" + sound)]

    with torch.no_grad():
        audio_embeddings = model(
            {ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device)}
        )[ModalityType.AUDIO]

    results = torch.softmax(audio_embeddings @ text_embeddings.T, dim=-1)
    res_list = results.tolist()[0]
    predicted_class = CLASSES[res_list.index(max(res_list))]
    if print_flag:
        print("Imagebind Inference: ", predicted_class)
        print("Ground Truth: ", correct_class)
    if correct_class == predicted_class:
        score += 1

In [None]:
# Fraction of evaluated clips whose top-scoring label matched the ground truth.
num_test_samples = len(test_list)
# With test_percentage = 0 nothing was evaluated; report NaN instead of dividing by zero.
accuracy = score/num_test_samples if num_test_samples else float("nan")
print("Imagebind Zero shot" )
print("Number of test samples: ", num_test_samples)
print(" Accuracy : ", accuracy )
print("TripLet Loss res 15 accuracy")
print("SOTA Accuracy : 0.9856") #TripletLoss-res15: https://github.com/roman-vygon/triplet_loss_kws

In [None]:
# Side-by-side comparison of the measured zero-shot accuracy with the reference figure,
# laid out as a plain-text table.
table_rows = (
    "| Model                 | Training Approach   | Evaluation     | Accuracy   |",
    "|-----------------------|---------------------|----------------|------------|",
    "| Imagebind             | Unsupervised        | Zero Shot      | {:.4f}     |".format(accuracy),
    "| TripletLoss-res15     | Unsupervised        | Not zero or few| {:.4f}     |".format(0.9856),
)
print("SOTA Models Scores")
for table_row in table_rows:
    print(table_row)