<a href="https://colab.research.google.com/github/karthik19-cloud/GenAI-Training/blob/main/GenAI-L4/02-find-similar-audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1) Clean out conflicting packages
!pip uninstall -y torch torchaudio torchvision tensorflow opencv-python opencv-python-headless opencv-contrib-python numba speechbrain numpy

# 2) Install matching torch/torchaudio + compatible numpy (GPU wheels, CUDA 12.1).
# For CPU-only, change the index URL to .../cpu
!pip install --quiet torch==2.3.1 torchaudio==2.3.1 numpy==1.26.3 --index-url https://download.pytorch.org/whl/cu121
!pip install --quiet speechbrain


Found existing installation: torch 2.9.0+cu126
Uninstalling torch-2.9.0+cu126:
  Successfully uninstalled torch-2.9.0+cu126
Found existing installation: torchaudio 2.9.0+cu126
Uninstalling torchaudio-2.9.0+cu126:
  Successfully uninstalled torchaudio-2.9.0+cu126
Found existing installation: torchvision 0.24.0+cu126
Uninstalling torchvision-0.24.0+cu126:
  Successfully uninstalled torchvision-0.24.0+cu126
Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
Found existing installation: opencv-python 4.12.0.88
Uninstalling opencv-python-4.12.0.88:
  Successfully uninstalled opencv-python-4.12.0.88
Found existing installation: opencv-python-headless 4.12.0.88
Uninstalling opencv-python-headless-4.12.0.88:
  Successfully uninstalled opencv-python-headless-4.12.0.88
Found existing installation: opencv-contrib-python 4.12.0.88
Uninstalling opencv-contrib-python-4.12.0.88:
  Successfully uninstalled opencv-contrib-python-4

In [2]:
import torch, torchaudio, numpy as np

# Sanity check: report which versions the kernel actually imported, then the
# audio I/O backends torchaudio can see.
for label, module in (("Torch:", torch), ("Torchaudio:", torchaudio), ("NumPy:", np)):
    print(label, module.__version__)
print("Backends:", torchaudio.list_audio_backends())


Torch: 2.3.1+cu121
Torchaudio: 2.3.1+cu121
NumPy: 2.0.2
Backends: ['ffmpeg', 'soundfile']


In [6]:
!pip install --quiet speechbrain


In [7]:
import os, glob, pickle
from typing import List, Tuple
import torch, torchaudio
import torch.nn.functional as F
from google.colab import files

# Directory scanned recursively for library audio; main() creates it if missing.
AUDIO_LIBRARY_DIR = "/content/audio_library"
# Pickle file in which the (path, embedding) index is cached between runs.
EMBED_CACHE_PATH = "/content/audio_embeddings.pkl"
# Run the encoder on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Sample rate (Hz) that load_audio() resamples to by default.
# NOTE(review): presumably 16 kHz is what the pretrained encoder expects — confirm.
TARGET_SR = 16000

# SpeechBrain 1.0 moved the pretrained-model interfaces to `speechbrain.inference`;
# `speechbrain.pretrained` survives only as a deprecated alias. Prefer the new
# location and fall back to the old one so pre-1.0 installs keep working.
try:
    from speechbrain.inference.speaker import EncoderClassifier
except ImportError:
    from speechbrain.pretrained import EncoderClassifier

# Fetch the pretrained encoder named by `source` (files are stored under
# `savedir`) and place it on DEVICE.
model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": DEVICE},
    savedir="/content/speechbrain_ecapa"
)

def load_audio(path: str, target_sr: int = TARGET_SR) -> torch.Tensor:
    """Read an audio file and return it as a waveform at ``target_sr``.

    Args:
        path: Audio file to read with ``torchaudio.load``.
        target_sr: Desired sample rate in Hz; the waveform is resampled only
            when the file's own rate differs.

    Returns:
        The waveform with its first axis squeezed away (when that axis has
        size 1).
    """
    waveform, sample_rate = torchaudio.load(path)

    # More than one row on the first axis: average them into a single row
    # (treats axis 0 as channels, i.e. a mono downmix).
    channel_count = waveform.shape[0]
    if channel_count > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    needs_resample = sample_rate != target_sr
    if needs_resample:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=target_sr
        )

    return waveform.squeeze(0)

def compute_embedding(wav: torch.Tensor) -> torch.Tensor:
    """Encode one waveform with the module-level ``model``.

    Args:
        wav: Waveform tensor for a single clip; a leading batch axis of size 1
            is added before encoding.

    Returns:
        The encoder output with the batch axis removed, moved to the CPU.
        NOTE(review): the result may still have a singleton leading axis,
        depending on what ``encode_batch`` returns — confirm before relying
        on it being 1-D.
    """
    batch = wav.unsqueeze(0).to(DEVICE)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoded = model.encode_batch(batch)
    return encoded.squeeze(0).cpu()

def index_library(audio_dir: str, cache_path: str = None) -> List[Tuple[str, torch.Tensor]]:
    """Embed every audio file under ``audio_dir`` and optionally cache the result.

    Args:
        audio_dir: Root directory, searched recursively for files ending in
            .wav, .mp3, .flac, .ogg or .m4a (case-insensitive).
        cache_path: If truthy, the finished index is pickled to this path.

    Returns:
        A list of ``(path, embedding)`` tuples in sorted path order, so the
        index (and tie-breaking in later rankings) is reproducible.
    """
    audio_exts = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
    candidates = glob.glob(os.path.join(audio_dir, "**", "*.*"), recursive=True)
    # glob returns entries in arbitrary order and also matches directories
    # whose names contain a dot; keep real files only and sort them.
    audio_files = sorted(
        p for p in candidates
        if p.lower().endswith(audio_exts) and os.path.isfile(p)
    )

    index = [(path, compute_embedding(load_audio(path))) for path in audio_files]

    if cache_path:
        with open(cache_path, "wb") as f:
            pickle.dump(index, f)
    return index

def load_index(cache_path: str):
    """Load a previously pickled index, or return ``None`` if none is usable.

    Args:
        cache_path: Path of the pickle cache file. A falsy value or a missing
            file yields ``None``.

    Returns:
        The unpickled object, or ``None`` when there is no cache file or the
        file cannot be unpickled (for example, it is empty or was truncated by
        an interrupted write). ``None`` lets the caller rebuild the index
        instead of crashing.
    """
    if not cache_path or not os.path.exists(cache_path):
        return None
    # SECURITY: pickle.load can execute arbitrary code. Only point this at a
    # cache file this notebook wrote itself, never at an untrusted download.
    try:
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as exc:
        print(f"Ignoring unreadable index cache {cache_path!r}: {exc}")
        return None

def find_similar(query_path: str, index: List[Tuple[str, torch.Tensor]], top_k: int = 5):
    """Rank indexed files by cosine similarity to a query clip.

    Args:
        query_path: Audio file to embed and compare against the index.
        index: ``(path, embedding)`` pairs, as produced by ``index_library``.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` ``(path, similarity)`` tuples, highest similarity first.
    """
    # Embeddings are flattened to 1-D before comparison. compute_embedding only
    # strips the batch axis, so a singleton leading axis may remain (SpeechBrain
    # documents encode_batch output as (batch, 1, emb_dim) — confirm for the
    # installed version). On a (1, D) tensor, cosine_similarity(dim=0) returns D
    # values and .item() raises. flatten() is a no-op for 1-D embeddings.
    q_emb = compute_embedding(load_audio(query_path)).flatten()
    scores = []
    for path, emb in index:
        sim = F.cosine_similarity(q_emb, emb.flatten(), dim=0).item()
        scores.append((path, sim))
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

def main():
    """Build or load the library index, then rank it against an uploaded clip.

    Steps: ensure the library directory exists, load the cached index (or
    build and cache it), prompt for a query upload through Colab, and print
    the five most similar library files.
    """
    os.makedirs(AUDIO_LIBRARY_DIR, exist_ok=True)
    index = load_index(EMBED_CACHE_PATH)
    # `not index` covers both "no cache" (None) and an empty cache ([]) that
    # was written while the library had no files; an empty cache must not be
    # trusted forever, so rebuild in both cases.
    if not index:
        print("Building index...")
        index = index_library(AUDIO_LIBRARY_DIR, cache_path=EMBED_CACHE_PATH)
        print(f"Indexed {len(index)} files.")
    else:
        print(f"Loaded cached index with {len(index)} files.")

    if not index:
        # Nothing to search against: do not ask for an upload.
        print(f"No audio files found in {AUDIO_LIBRARY_DIR}. Add files and run again.")
        return

    print("Upload a query audio file to search against the library.")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return
    # Only the first uploaded file is used as the query.
    query_name = next(iter(uploaded))
    # NOTE(review): assumes uploads are saved under /content (the notebook's
    # working directory) — confirm.
    query_path = f"/content/{query_name}"
    results = find_similar(query_path, index, top_k=5)
    for rank, (path, sim) in enumerate(results, 1):
        print(f"{rank}. {path} — cosine similarity: {sim:.4f}")

if __name__ == "__main__":
    main()


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject