In [1]:
!pip install torch torchaudio transformers librosa faiss-cpu tqdm


Defaulting to user installation because normal site-packages is not writeable


In [12]:
import torch
# Report whether CUDA is usable and, if it is, the name of the device at index 0.
cuda_ok = torch.cuda.is_available()
print("GPU available:", cuda_ok)
if cuda_ok:
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("Device:", "None")


GPU available: True
Device: NVIDIA GeForce GTX 1070


In [13]:
import os

# Change this if needed
AUDIO_FOLDER = "/home/ivan/PycharmProjects/MPr/audio_samples/fma_large/fma_large"

# Set to an integer (e.g. 300) to keep only the first N files; None keeps all of them.
MAX_FILES = None


def find_mp3_files(folder, limit=None):
    """Recursively collect the paths of all ``.mp3`` files under ``folder``.

    Args:
        folder: Directory to walk. A missing directory yields an empty list,
            because ``os.walk`` silently produces nothing for it.
        limit: If not None, keep only the first ``limit`` paths (after sorting).

    Returns:
        A sorted list of file paths. Sorting matters: ``os.walk`` returns
        entries in filesystem order, which can differ between runs and
        machines, and any positional resume index into this list is only
        meaningful when the order is stable.
    """
    found = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(folder)
        for name in names
        if name.endswith(".mp3")
    )
    return found if limit is None else found[:limit]


mp3_files = find_mp3_files(AUDIO_FOLDER, MAX_FILES)

print(f"Loaded {len(mp3_files)} files.")


Loaded 106574 files.


In [16]:
import torch
from transformers import ClapProcessor, ClapModel

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained CLAP checkpoint: the model is moved to the chosen device,
# and the matching processor is loaded from the same checkpoint name.
model = ClapModel.from_pretrained("laion/clap-htsat-unfused").to(device)
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")


In [17]:
def embed_audio(filepath, verbose=True):
    """Embed one audio file with the module-level CLAP ``processor`` and ``model``.

    The file is loaded with torchaudio, averaged over channels to mono,
    resampled to 48 kHz when needed, then trimmed or zero-padded to exactly
    30 seconds (1,440,000 samples) before being passed to the processor.

    Args:
        filepath: Path of the audio file to load.
        verbose: When True (the default, matching the previous behaviour)
            print a progress line for every processing step. Pass False to
            keep the output readable when embedding many files. Errors are
            always printed.

    Returns:
        The audio features returned by ``model.get_audio_features``, moved to
        the CPU and flattened into a 1-D NumPy array.

    Raises:
        Exception: Any error raised while loading or embedding is printed
            together with its traceback and then re-raised unchanged.
    """
    # Imported here so the function does not depend on which notebook cells
    # have already run. ``traceback`` was previously never imported, which
    # made the error handler itself fail with a NameError and hide the real
    # cause of the failure.
    import traceback

    import torchaudio

    def log(message):
        if verbose:
            print(message)

    try:
        waveform, sr = torchaudio.load(filepath)
        log(f"📂 Loaded '{filepath}' — shape: {waveform.shape}, sample rate: {sr}")

        # Average across the first dimension to get mono, keeping a leading axis.
        waveform = waveform.mean(dim=0).unsqueeze(0)  # mono

        if sr != 48000:
            log(f"🔁 Resampling from {sr} → 48000 Hz")
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=48000)(waveform)

        # ✂️ Trim to exactly 30s = 1,440,000 samples
        max_len = 48000 * 30
        if waveform.shape[1] > max_len:
            log(f"✂️ Trimming waveform to 30s: {waveform.shape[1]} → {max_len}")
            waveform = waveform[:, :max_len]
        elif waveform.shape[1] < max_len:
            log(f"📏 Padding waveform to 30s: {waveform.shape[1]} → {max_len}")
            pad_len = max_len - waveform.shape[1]
            # Zero-pad on the right only.
            waveform = torch.nn.functional.pad(waveform, (0, pad_len))

        # NOTE(review): the CLAP feature extractor may apply its own
        # truncation/padding according to its configured maximum length —
        # confirm that the full 30 s window is what actually gets embedded
        # and whether the result is deterministic.
        inputs = processor(
            audios=waveform.squeeze(0).cpu().numpy(),  # Convert to 1D NumPy array
            sampling_rate=48000,
            return_tensors="pt"
        ).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            embedding = model.get_audio_features(**inputs)

        log(f"✅ Embedded {filepath}, vector shape: {embedding.shape}")
        return embedding.cpu().numpy().flatten()

    except Exception as e:
        print(f"❌ ERROR embedding {filepath}:\n{e}")
        traceback.print_exc()
        raise


In [None]:
import torch
import torchaudio
import numpy as np
from tqdm import tqdm

SAVE_EVERY = 1000  # write a checkpoint after every 1000 processed files
output_dir = "embeddings_large"
os.makedirs(output_dir, exist_ok=True)

# Buffers for the current checkpoint window; they are emptied after each save.
embeddings = []
filenames = []
# (path, error message) for every file that could not be embedded.
failures = []

# Change this to resume (e.g. 22000 if it crashed before). Vectors buffered
# since the last checkpoint are lost on a crash, so resume from the index of
# the last saved checkpoint.
start_idx = 0

for i, path in enumerate(tqdm(mp3_files[start_idx:], desc="Embedding audio files"), start=start_idx):
    try:
        vec = embed_audio(path)
    except Exception as e:
        # Keep going on bad files, but remember what failed and why.
        failures.append((path, str(e)))
    else:
        embeddings.append(vec)
        filenames.append(path)

    # Periodic save. Checkpoints go to the same ``output_dir`` as the final
    # save. The buffer is skipped when empty because np.vstack raises on an
    # empty list (possible if every file in the window failed).
    if (i + 1) % SAVE_EVERY == 0 and embeddings:
        np.save(os.path.join(output_dir, f"vectors_{i+1}.npy"), np.vstack(embeddings))
        np.save(os.path.join(output_dir, f"filenames_{i+1}.npy"), np.array(filenames))
        print(f"✅ Saved checkpoint at {i+1} tracks.")
        # Free RAM: the saved vectors no longer need to stay in memory.
        embeddings.clear()
        filenames.clear()

# Final save for remaining tracks
if embeddings:
    np.save(f"{output_dir}/vectors_final.npy", np.vstack(embeddings))
    np.save(f"{output_dir}/filenames_final.npy", np.array(filenames))
    print("✅ Final save complete.")

# Surface failures instead of leaving them silently in a list.
if failures:
    print(f"⚠️ {len(failures)} files could not be embedded; see `failures` for details.")


In [29]:
import os
import numpy as np

# np.save does not create missing parent directories, so make sure the target exists.
os.makedirs("embeddings", exist_ok=True)

# NOTE(review): the embedding loop clears `embeddings` and `filenames` after
# every checkpoint, so at this point the lists hold only the tracks processed
# since the last checkpoint — confirm that this is the data meant to be saved here.
np.save("embeddings/audio_vectors_large.npy", embeddings)
np.save("embeddings/audio_filenames_large.npy", np.array(filenames))


In [30]:
import os

# Confirm that both output files exist. The label is derived from the path
# itself so that it always names the file that is actually being checked.
for saved_path in (
    "embeddings/audio_vectors_large.npy",
    "embeddings/audio_filenames_large.npy",
):
    print(f"{os.path.basename(saved_path)}:", os.path.exists(saved_path))


audio_vectors.npy: True
audio_filenames.npy: True


In [31]:
import numpy as np

# Read both saved arrays back from disk to check the round trip.
vectors = np.load("embeddings/audio_vectors_large.npy")
filenames = np.load("embeddings/audio_filenames_large.npy")

# Expected shapes: (N, 512) for the vectors and (N,) for the filenames.
print("✅ Vectors shape:", np.shape(vectors))
print("✅ Filenames shape:", np.shape(filenames))


✅ Vectors shape: (300, 512)
✅ Filenames shape: (300,)
