<a href="https://colab.research.google.com/github/jeff-ai-ml/genai/blob/main/search_similar_audio_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torchaudio
from transformers import ClapModel, ClapProcessor
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
from IPython.display import Audio, display

# --- 1. Install necessary libraries ---

In [2]:
!pip install transformers torchaudio soundfile scikit-learn matplotlib

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

# --- 2. Load the pre-trained CLAP model and processor ---

In [3]:
# Using a CLAP model for semantic audio understanding
# 'laion/clap-htsat-unfused' is a good choice for general audio, including music.
# It's multimodal, meaning it learns audio semantics well.
model_name = "laion/clap-htsat-unfused"
processor = ClapProcessor.from_pretrained(model_name)
model = ClapModel.from_pretrained(model_name)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Using device: {device}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/614M [00:00<?, ?B/s]

Using device: cpu


In [4]:
# Get the target sample rate and mono status expected by the CLAP model
# CLAP models often expect specific sample rates (e.g., 48000 Hz)
# Check processor.feature_extractor.sampling_rate for the exact value
TARGET_SAMPLING_RATE = processor.feature_extractor.sampling_rate
MONO_AUDIO = True # CLAP models usually expect mono audio

In [5]:
# --- Function Definition: get_audio_clap_embedding ---
# This function is the core of getting embeddings.
def get_audio_clap_embedding(audio_path, model, processor, device, target_sr, mono):
    """Loads audio with torchaudio, resamples, and gets CLAP embedding."""
    try:
        # Load audio using torchaudio. Returns (channels, samples) and sample_rate
        waveform, sr = torchaudio.load(audio_path)

        # Move waveform to device immediately
        waveform = waveform.to(device)

        # Resample if necessary
        if sr != target_sr:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr).to(device)
            waveform = resampler(waveform)

        # Convert to mono if requested and if it's currently multi-channel
        if mono and waveform.shape[0] > 1:
            # Average the channels to make it mono. Resulting shape will be (1, N_samples)
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Ensure it's a 1D tensor (samples,) if it's mono (e.g., (1, N_samples) becomes (N_samples,))
        # The CLAP processor's `audios` argument often expects a 1D tensor for a single audio file,
        # and then it internally adds the batch dimension.
        if waveform.shape[0] == 1:
            waveform = waveform.squeeze(0) # Remove the channel dimension if it's 1

        # Prepare inputs for the CLAP model using the processor
        # The processor expects a 1D tensor (samples,) for a single audio input.
        inputs = processor(audios=waveform, sampling_rate=target_sr, return_tensors="pt")
        # Ensure the tensors in the inputs dictionary are also on the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            audio_features = model.get_audio_features(**inputs)

        # Move the features back to CPU, convert to numpy, and flatten
        return audio_features.cpu().numpy().flatten()

    except Exception as e:
        print(f"Could not process audio {os.path.basename(audio_path)}: {e}")
        return None # Return None if processing failed


In [32]:
# --- 3. Prepare your uploaded audio files ---
audio_dir = "/content/uploaded_songs"

In [33]:
# Get a list of all MP3 file paths from the directory
uploaded_audio_paths = glob.glob(os.path.join(audio_dir, "*.mp3"))
uploaded_audio_paths.sort() # Sort for consistent order

In [34]:
if not uploaded_audio_paths:
    print(f"Error: No MP3 files found in '{audio_dir}'.")
    print("Please ensure you have manually uploaded your songs to this directory in Colab.")
    print("Go to the left sidebar (folder icon) -> content -> uploaded_songs, then drag and drop your MP3 files there.")
    exit()

print(f"Found {len(uploaded_audio_paths)} uploaded songs:")
for path in uploaded_audio_paths:
    print(f"- {path}")

Found 10 uploaded songs:
- /content/uploaded_songs/Bill Haley - Rock Around the Clock lyrics.mp3
- /content/uploaded_songs/Blue - All Rise.mp3
- /content/uploaded_songs/Bob Marley & The Wailers - Buffalo Soldier (Official Music Video).mp3
- /content/uploaded_songs/Den Harrow - Catch the Fox.mp3
- /content/uploaded_songs/Dr. Alban - Enemies (1997).mp3
- /content/uploaded_songs/Dvorak_ Symphony No. 9 - Movement 3.mp3
- /content/uploaded_songs/Junoon - Sayonee (Official Music Video).mp3
- /content/uploaded_songs/Rihanna - Unfaithful (Official Music Video).mp3
- /content/uploaded_songs/The Jon Spencer Blues Explosion - Bellbottoms.mp3
- /content/uploaded_songs/Westlife - Hit You With the Real Thing (Official Audio).mp3


# --- 4. Generate embeddings for all uploaded songs using CLAP ---

In [35]:
print(f"\nGenerating embeddings for uploaded songs (target SR: {TARGET_SAMPLING_RATE})...")
uploaded_embeddings = []
valid_uploaded_audio_paths = []

for audio_path in uploaded_audio_paths:
    embedding = get_audio_clap_embedding(audio_path, model, processor, device, TARGET_SAMPLING_RATE, MONO_AUDIO)
    if embedding is not None:
        uploaded_embeddings.append(embedding)
        valid_uploaded_audio_paths.append(audio_path)
    else:
        print(f"Skipping {os.path.basename(audio_path)} due to embedding error.")

uploaded_embeddings = np.array(uploaded_embeddings)

if uploaded_embeddings.size == 0:
    print("No valid audio embeddings could be generated. Exiting.")
    exit()

print(f"Generated embeddings for {uploaded_embeddings.shape[0]} songs, each with dimension {uploaded_embeddings.shape[1]}.")



Generating embeddings for uploaded songs (target SR: 48000)...
Generated embeddings for 10 songs, each with dimension 512.


# --- 5. Define  query song ---

In [36]:
query_dir = "/content/query_song"

In [37]:
# Get a list of all MP3 file paths from the directory
uploaded_query_paths = glob.glob(os.path.join(query_dir, "*.mp3"))
uploaded_query_paths.sort() # Sort for consistent order
uploaded_query_paths

['/content/query_song/All Day Jolly Day - Official Video Manadhai Thirudivittai Prabhu Deva Kausalya #ddmusic.mp3',
 '/content/query_song/Karu Karu Vizhigalal - 4K Video கர கர Pachaikili Muthucharam Sarath Kumar Harris Jayaraj.mp3',
 '/content/query_song/Meghamai Vanthu Pogiren - Video Song Thullatha Manamum Thullum Vijay Simran Sun Music.mp3',
 '/content/query_song/Nerrukku Ner Movie songs Akhila Akhila Song Vijay Suriya Simran Kausalya Deva.mp3',
 '/content/query_song/Unakkena Naan - Video Song Kadhalil Vizhunthen Nakkul Sunaina Sun Music.mp3']

In [38]:
query_song_path = uploaded_query_paths[4]
query_song_path

'/content/query_song/Unakkena Naan - Video Song Kadhalil Vizhunthen Nakkul Sunaina Sun Music.mp3'

# --- 6. Generate embedding for the query song ---

In [39]:
query_embedding = get_audio_clap_embedding(query_song_path, model, processor, device, TARGET_SAMPLING_RATE, MONO_AUDIO)

if query_embedding is None:
    print("Failed to generate embedding for the query song. Exiting.")
    exit()
else:
  print("Embedding for query song is done!")
  print(query_embedding)

Embedding for query song is done!
[-9.98901669e-03  1.85331609e-02  8.94368067e-03  1.41465636e-02
  7.46547580e-02 -5.31663112e-02  4.63174433e-02  3.90328141e-03
  1.47913350e-02 -3.37423049e-02  6.56297524e-03 -6.57081231e-02
  6.35769404e-03  5.95381334e-02 -5.41418232e-02  5.09539619e-02
  1.75404884e-02 -9.09257233e-02  3.93575914e-02  2.34574974e-02
  6.85465662e-03  3.71740758e-02  1.05366055e-02  1.47407325e-02
 -6.74979538e-02  2.26960629e-02 -2.92732697e-02  1.89444311e-02
  1.58825163e-02 -1.01748444e-02 -4.91517223e-02  3.80121730e-02
  3.74070778e-02 -8.27240059e-04  2.04896983e-02 -2.29677893e-02
 -5.07833473e-02  7.67703023e-05 -1.28756478e-01 -3.21044996e-02
  1.84012316e-02  1.49922390e-02 -3.06258928e-02  7.56200077e-03
 -4.26411368e-02 -1.88315101e-02  1.79433022e-02 -3.51163708e-02
  4.04477492e-02 -3.08790803e-02 -1.50049170e-02 -3.17196921e-02
 -4.07794639e-02  3.41788307e-02  4.87461612e-02  4.46591387e-03
 -1.51026668e-02 -7.22825900e-03 -6.35011643e-02  1.0980

# --- 7. Perform similarity search (Cosine Similarity) ---

In [40]:
query_embedding_reshaped = query_embedding.reshape(1, -1)
similarities = cosine_similarity(query_embedding_reshaped, uploaded_embeddings)[0]

# --- 8. Rank and display results ---

In [41]:
results = []
for i, score in enumerate(similarities):
    results.append((score, valid_uploaded_audio_paths[i]))

results.sort(key=lambda x: x[0], reverse=True)

print("\n--- Semantic Audio Search Results (Highest Similarity First) ---")
print(f"Query Song: {os.path.basename(query_song_path)}")
display(Audio(query_song_path)) # Display audio player for the query song


--- Semantic Audio Search Results (Highest Similarity First) ---
Query Song: Unakkena Naan - Video Song Kadhalil Vizhunthen Nakkul Sunaina Sun Music.mp3


In [42]:
# Display similar songs
print("\nSimilar Songs:")
for i, (score, path) in enumerate(results):
    print(f"{i+1}. Song: {os.path.basename(path)}, Similarity: {score:.4f}")
    # You can uncomment the line below to play each similar song in Colab output
    # Be cautious with many songs; it can make the output very long.
    # if score < 0.999: # Avoid playing the exact same song again if it's the query
    #     display(Audio(path))


Similar Songs:
1. Song: Westlife - Hit You With the Real Thing (Official Audio).mp3, Similarity: 0.7586
2. Song: Dr. Alban - Enemies (1997).mp3, Similarity: 0.7327
3. Song: Rihanna - Unfaithful (Official Music Video).mp3, Similarity: 0.6885
4. Song: Den Harrow - Catch the Fox.mp3, Similarity: 0.6534
5. Song: Bob Marley & The Wailers - Buffalo Soldier (Official Music Video).mp3, Similarity: 0.6501
6. Song: Blue - All Rise.mp3, Similarity: 0.6433
7. Song: Junoon - Sayonee (Official Music Video).mp3, Similarity: 0.6101
8. Song: The Jon Spencer Blues Explosion - Bellbottoms.mp3, Similarity: 0.5231
9. Song: Bill Haley - Rock Around the Clock lyrics.mp3, Similarity: 0.5129
10. Song: Dvorak_ Symphony No. 9 - Movement 3.mp3, Similarity: 0.4319
