### Audio Similarity Search: Compare a query track (sorcetrack.mp3) with the audio_database files (including originaltrack.mp3)

In [28]:
import torchaudio
import torch
from scipy.spatial.distance import cdist
import numpy as np
import os
import pandas as pd

# Announce the run with a title framed by two 80-character rules.
banner_rule = "=" * 80
print(
    banner_rule,
    "Audio Similarity Search - Comparing originaltrack.mp3 with audio_database",
    banner_rule,
    sep="\n",
)

Audio Similarity Search - Comparing originaltrack.mp3 with audio_database


In [29]:
# Build the pre-trained Wav2Vec2 (base) network from its torchaudio bundle
# and switch it to evaluation mode before any embeddings are extracted.
print("\nLoading Wav2Vec2 model...")
bundle = torchaudio.pipelines.WAV2VEC2_BASE
model = bundle.get_model().eval()  # Module.eval() returns the module itself
print("✓ Model loaded")

def extract_embedding(audio_tensor, encoder=None):
    """Reduce a waveform to one fixed-length embedding vector.

    The waveform is down-mixed to mono, passed through the encoder, and the
    resulting frame-level features are mean-pooled over time.

    Parameters
    ----------
    audio_tensor : torch.Tensor
        Waveform shaped ``(channels, samples)``. A 1-D ``(samples,)`` tensor
        is treated as a single mono channel.
    encoder : callable, optional
        Called as ``encoder(waveform)``; must return a ``(features, extra)``
        pair where ``features`` is shaped ``(batch, time_steps, feature_dim)``.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array of length ``feature_dim``.

    Raises
    ------
    ValueError
        If the waveform is neither 1-D nor 2-D, or contains no samples.
    """
    if encoder is None:
        encoder = model

    # Promote a bare (samples,) waveform to (1, samples). Without this the
    # down-mix below would average over the *samples* axis instead of channels.
    if audio_tensor.ndim == 1:
        audio_tensor = audio_tensor.unsqueeze(0)
    elif audio_tensor.ndim != 2:
        raise ValueError(
            f"Expected a 1-D or 2-D waveform, got {audio_tensor.ndim}-D "
            f"with shape {tuple(audio_tensor.shape)}"
        )

    if audio_tensor.numel() == 0:
        raise ValueError("Cannot extract an embedding from empty audio")

    # Convert stereo (or any multi-channel audio) to mono.
    if audio_tensor.shape[0] > 1:
        audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)

    with torch.inference_mode():
        features, _ = encoder(audio_tensor)
        # features shape: (batch, time_steps, feature_dim)
        pooled = torch.mean(features, dim=1)  # mean pool over time

    # The batch axis is 1 after the mono down-mix, so flattening yields a
    # vector of length feature_dim.
    return pooled.detach().cpu().numpy().reshape(-1).astype(np.float32)


Loading Wav2Vec2 model...
✓ Model loaded


In [30]:
# Load the query track that every database file will be compared against.
original_path = 'sorcetrack.mp3'
print(f"\n✓ Loading {original_path}...")
original_audio, orig_sr = torchaudio.load(original_path)

# The encoder must be fed audio at the sample rate it was trained on. Read
# that rate from the bundle rather than hard-coding it, so swapping the
# pipeline cannot silently leave a stale constant behind.
target_sr = int(bundle.sample_rate)

# Resample if needed
if orig_sr != target_sr:
    resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
    original_audio = resampler(original_audio)

original_embedding = extract_embedding(original_audio)
print(f"✓ Original embedding shape: {original_embedding.shape}")


✓ Loading sorcetrack.mp3...
✓ Original embedding shape: (768,)


In [31]:
# Embed every supported audio file found in audio_database.
audio_folder = 'audio_database/'
supported_exts = ('.mp3', '.wav', '.flac')
# Match extensions case-insensitively (e.g. "TRACK.MP3") and skip directories
# that merely happen to be named like an audio file.
audio_files = sorted(
    f for f in os.listdir(audio_folder)
    if f.lower().endswith(supported_exts)
    and os.path.isfile(os.path.join(audio_folder, f))
)
print(f"\n✓ Found {len(audio_files)} files in {audio_folder}")

# Sample rate the encoder expects, taken from the bundle instead of a literal.
target_sr = int(bundle.sample_rate)

embeddings_list = []
valid_files = []  # kept index-aligned with embeddings_list

print("\nProcessing audio files...")
for i, file in enumerate(audio_files, 1):
    print(f"[{i}/{len(audio_files)}] {file}...", end=" ")
    try:
        audio_path = os.path.join(audio_folder, file)
        audio, sr = torchaudio.load(audio_path)

        # Resample if needed
        if sr != target_sr:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
            audio = resampler(audio)

        embedding = extract_embedding(audio)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole scan, but
        # report the full error (with its type) so the cause can be diagnosed.
        print(f"✗ Error: {type(e).__name__}: {e}")
        continue

    embeddings_list.append(embedding)
    valid_files.append(file)
    print(f"✓ shape={embedding.shape}")

print(f"\n✓ Processed {len(embeddings_list)} files successfully")


✓ Found 3 files in audio_database/

Processing audio files...
[1/3] deviatedtrack1.mp3... ✓ shape=(768,)
[2/3] moredeviatedtrack.mp3... ✓ shape=(768,)
[3/3] originaltrack.mp3... ✓ shape=(768,)

✓ Processed 3 files successfully


In [32]:
# Validate the embeddings, then score every database file against the query.
print("\nAligning embeddings...")
print(f"Original embedding shape: {original_embedding.shape}")
print(f"Database embedding shapes: {[e.shape for e in embeddings_list]}")

# Fail early with a clear message; np.vstack on an empty list raises an
# error that does not point at the real cause.
assert embeddings_list, "No database embeddings were produced - nothing to compare against"

# Check rank before indexing shape[0], which would fail on a 0-D array.
assert original_embedding.ndim == 1, f"Original embedding is not 1D: {original_embedding.ndim}D"

# Get the feature dimension (every database embedding must match it)
feature_dim = original_embedding.shape[0]
print(f"\nFeature dimension: {feature_dim}")

# Verify all embeddings are 1D with the correct dimension. A mismatch stops
# the cell here instead of surfacing later inside vstack/cdist.
for i, emb in enumerate(embeddings_list):
    assert emb.ndim == 1, f"Embedding {i} is not 1D: {emb.ndim}D"
    assert emb.shape[0] == feature_dim, (
        f"❌ Embedding {i} has wrong dimension: {emb.shape[0]} vs {feature_dim}"
    )

# Create 2D arrays for cdist: one query row, one row per database file
original_2d = original_embedding.reshape(1, -1)
database_2d = np.vstack(embeddings_list)

print(f"\nFinal shapes before cdist:")
print(f"  Original: {original_2d.shape}")
print(f"  Database: {database_2d.shape}")

# Compute similarities: cdist returns cosine *distance*, so similarity is
# 1 - distance. Row 0 is the single query row.
print("\nComputing similarity scores...")
distances = cdist(original_2d, database_2d, metric='cosine')
similarity_scores = 1 - distances[0]

print(f"✓ Computed {len(similarity_scores)} similarity scores")


Aligning embeddings...
Original embedding shape: (768,)
Database embedding shapes: [(768,), (768,), (768,)]

Feature dimension: 768

Final shapes before cdist:
  Original: (1, 768)
  Database: (3, 768)

Computing similarity scores...
✓ Computed 3 similarity scores


In [33]:
# Build the results table, most similar file first.
results_df = (
    pd.DataFrame({
        'Audio File': valid_files,
        'Similarity Score': similarity_scores,
        'Similarity %': (similarity_scores * 100).round(2),
    })
    .sort_values('Similarity Score', ascending=False)
    .reset_index(drop=True)
)

# Display full results
print("\n" + "=" * 80)
# Take the query name from original_path so the header cannot drift from the
# file that was actually loaded.
print(f"Similarity Results ({original_path} vs audio_database):")
print("=" * 80)
print(results_df.to_string(index=False))

# Display top matches with visual bars
print("\n" + "=" * 80)
print("Top Matches:")
print("=" * 80)
bar_width = 50  # one cell per 2 percentage points
top_matches = results_df.head(5)
for file, pct in zip(top_matches['Audio File'], top_matches['Similarity %']):
    # Cosine similarity can be negative and may be NaN (presumably for a
    # zero-norm embedding). Clamp the fill so the bar always spans exactly
    # bar_width cells and int() never sees NaN.
    filled = 0 if pd.isna(pct) else min(max(int(pct / 2), 0), bar_width)
    bar = '█' * filled + '░' * (bar_width - filled)
    print(f"{file:<40} [{bar}] {pct:.1f}%")

# Statistics
# NOTE: with a single database file the sample standard deviation is NaN.
similarity_pct = results_df['Similarity %']
print("\n" + "=" * 80)
print("Statistical Summary:")
print("=" * 80)
print(f"Mean Similarity:   {similarity_pct.mean():.2f}%")
print(f"Median Similarity: {similarity_pct.median():.2f}%")
print(f"Max Similarity:    {similarity_pct.max():.2f}%")
print(f"Min Similarity:    {similarity_pct.min():.2f}%")
print(f"Std Deviation:     {similarity_pct.std():.2f}%")

print("\n✓ Audio similarity search completed successfully!")


Similarity Results (sorcetrack.mp3 vs audio_database):
           Audio File  Similarity Score  Similarity %
    originaltrack.mp3          0.929007         92.90
moredeviatedtrack.mp3          0.893584         89.36
   deviatedtrack1.mp3          0.795134         79.51

Top Matches:
originaltrack.mp3                        [██████████████████████████████████████████████░░░░] 92.9%
moredeviatedtrack.mp3                    [████████████████████████████████████████████░░░░░░] 89.4%
deviatedtrack1.mp3                       [███████████████████████████████████████░░░░░░░░░░░] 79.5%

Statistical Summary:
Mean Similarity:   87.26%
Median Similarity: 89.36%
Max Similarity:    92.90%
Min Similarity:    79.51%
Std Deviation:     6.94%

✓ Audio similarity search completed successfully!
