In [1]:
pip install torch torchvision torchaudio transformers librosa soundfile

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pydub


Note: you may need to restart the kernel to use updated packages.


In [1]:
# Sanity check: report which transformers release this kernel actually imports.
import transformers

installed_release = transformers.__version__
print(installed_release)

4.57.3


<h1><b>CLAP Model</b></h1>

In [1]:
from pydub import AudioSegment
import librosa, soundfile as sf

from transformers import AutoProcessor, AutoModel
import torch

# CLAP similarity between two consecutive excerpts of the same track:
# the tail of excerpt A is compared against the head of excerpt B.
SAMPLE_RATE = 48000     # target rate passed to every librosa.load call below
EXCERPT_SECONDS = 30    # length of each excerpt cut from the full track
WINDOW_SECONDS = 5      # length of the tail/head windows that get embedded

# Load MP3 and export to WAV
sound = AudioSegment.from_mp3("backToFriends.mp3")
sound.export("backToFriends.wav", format="wav")

y, sr = librosa.load("backToFriends.wav", sr=SAMPLE_RATE)

# First excerpt is Song A
a = y[:EXCERPT_SECONDS * sr]
sf.write("songA.wav", a, sr)

# Take the next excerpt as Song B
b = y[EXCERPT_SECONDS * sr:2 * EXCERPT_SECONDS * sr]
sf.write("songB.wav", b, sr)

# Re-read the excerpts from disk so the arrays match what the saved WAV files contain
# (later cells load songA.wav / songB.wav directly).
a, sr = librosa.load("songA.wav", sr=SAMPLE_RATE, mono=True)
b, sr = librosa.load("songB.wav", sr=SAMPLE_RATE, mono=True)

# Load CLAP model + processor
model_id = "laion/clap-htsat-fused"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

# Slice the last WINDOW_SECONDS of A and the first WINDOW_SECONDS of B
a_window = a[-WINDOW_SECONDS * sr:]
b_window = b[:WINDOW_SECONDS * sr]

# Process into model inputs. The keyword is `audio`; the older `audios` spelling is
# deprecated (the previously recorded run emitted a warning pointing at that call).
inputs = processor(audio=[a_window, b_window], return_tensors="pt", sampling_rate=sr)

# Get embeddings (inference only, so gradient tracking is switched off)
with torch.no_grad():
    embeddings = model.get_audio_features(**inputs)

# Compare similarity: cosine between the two per-clip embedding vectors
cosine_sim = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0)
print("Similarity score:", cosine_sim.item())

  inputs = processor(audios=[a_window, b_window], return_tensors="pt", sampling_rate=sr)


Similarity score: 0.7260153889656067


In [4]:
from pydub import AudioSegment
import librosa, soundfile as sf

from transformers import AutoProcessor, AutoModel
import torch

# CLAP similarity between two different tracks:
# the tail of the first excerpt of "backToFriends" versus the head of "finesse".
# NOTE(review): the saved output of this cell is an AttributeError
# ("'NoneType' object has no attribute 'from_pretrained'"). The cause is not visible
# in this cell — the same loading code succeeded in the previous CLAP cell — so
# presumably it is a kernel/environment issue; re-run on a freshly restarted kernel to confirm.
SAMPLE_RATE = 48000     # target rate passed to every librosa.load call below
EXCERPT_SECONDS = 30    # length of the excerpt cut from each track
WINDOW_SECONDS = 5      # length of the tail/head windows that get embedded

# Load MP3 and export to WAV
sound = AudioSegment.from_mp3("backToFriends.mp3")
sound.export("backToFriends.wav", format="wav")

sound2 = AudioSegment.from_mp3("finesse.mp3")
sound2.export("finesse.wav", format="wav")

y, sr = librosa.load("backToFriends.wav", sr=SAMPLE_RATE)
fi, srf = librosa.load("finesse.wav", sr=SAMPLE_RATE)

# Opening excerpt of each track, saved to disk for the later cells that read these files
a = y[:EXCERPT_SECONDS * sr]
sf.write("songA.wav", a, sr)

fines = fi[:EXCERPT_SECONDS * srf]
sf.write("fines.wav", fines, srf)

# Re-read the excerpts so the arrays match what the saved WAV files contain
a, sr = librosa.load("songA.wav", sr=SAMPLE_RATE, mono=True)
fines, srf = librosa.load("fines.wav", sr=SAMPLE_RATE, mono=True)

# Load CLAP model + processor
model_id = "laion/clap-htsat-fused"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

# Slice the last WINDOW_SECONDS of A and the first WINDOW_SECONDS of the second track.
# Each window is sized with its own clip's sample rate (both are SAMPLE_RATE here).
a_window = a[-WINDOW_SECONDS * sr:]
f_window = fines[:WINDOW_SECONDS * srf]

# Process into model inputs. The keyword is `audio`; the older `audios` spelling is deprecated.
inputs = processor(audio=[a_window, f_window], return_tensors="pt", sampling_rate=sr)

# Get embeddings (inference only, so gradient tracking is switched off)
with torch.no_grad():
    embeddings = model.get_audio_features(**inputs)

# Compare similarity: cosine between the two per-clip embedding vectors
cosine_sim = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0)
print("Similarity score:", cosine_sim.item())



AttributeError: 'NoneType' object has no attribute 'from_pretrained'

<h1><b>MERT Model</b></h1>

In [7]:
from transformers import AutoProcessor, AutoModel
import torch, librosa

# MERT similarity: end of song A versus start of song B.
model_id = "m-a-p/MERT-v1-330M"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

# Both excerpts are read as mono with a 24 kHz target rate
a, sr = librosa.load("songA.wav", sr=24000, mono=True)
b, sr = librosa.load("songB.wav", sr=24000, mono=True)

# Final 3 s of A and opening 3 s of B
window_samples = 3 * sr
a_window = a[-window_samples:]
b_window = b[:window_samples]

# The processor takes the clips as a list under the `raw_speech` keyword
waveforms = [a_window, b_window]
inputs = processor(
    raw_speech=waveforms,
    sampling_rate=sr,
    return_tensors="pt",
    padding=True,
)

# Inference only: no gradients needed
with torch.no_grad():
    outputs = model(**inputs)
    # mean pooling over dim 1 of the last hidden state (presumably the time/frame axis)
    embeddings = outputs.last_hidden_state.mean(dim=1)

# Cosine similarity between the pooled vectors of the two clips
emb_a, emb_b = embeddings[0], embeddings[1]
cosine_sim = torch.nn.functional.cosine_similarity(emb_a, emb_b, dim=0)
print("Similarity score:", cosine_sim.item())


Similarity score: 0.9401344060897827


from transformers import AutoProcessor, AutoModel
import torch, librosa

# MERT similarity across two different tracks: end of songA.wav versus start of fines.wav.
model_id = "m-a-p/MERT-v1-330M"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

# Read both clips as mono with a 24 kHz target rate
a, sr = librosa.load("songA.wav", sr=24000, mono=True)
b, sr = librosa.load("fines.wav", sr=24000, mono=True)

# 2-second windows: tail of the first clip, head of the second
tail_start = -2 * sr
head_end = 2 * sr
a_window = a[tail_start:]
b_window = b[:head_end]

# Batch the two waveforms and preprocess them (the processor takes them as `raw_speech`)
waveforms = [a_window, b_window]
processor_kwargs = dict(sampling_rate=sr, return_tensors="pt", padding=True)
inputs = processor(raw_speech=waveforms, **processor_kwargs)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # mean pooling over dim 1 (presumably the time/frame axis)
    embeddings = hidden.mean(dim=1)

# Cosine similarity of the two pooled embeddings
cosine_sim = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0)
print("Similarity score:", cosine_sim.item())

In [None]:
**Trying different window lengths**

In [4]:
from transformers import AutoProcessor, AutoModel
import torch, librosa
import numpy as np

# Sweep several window lengths and report which one gives the highest MERT cosine
# similarity between the end of songA.wav and the start of fines.wav.
model_id = "m-a-p/MERT-v1-330M"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

# Load audio at 24kHz
a, sr = librosa.load("songA.wav", sr=24000, mono=True)
b, sr = librosa.load("fines.wav", sr=24000, mono=True)

# Define window lengths (in seconds)
window_lengths = [1, 2, 3, 4, 5]  # try 1–5 seconds
best_score = None   # stays None until at least one window has actually been scored
best_window = None

for w in window_lengths:
    # seconds * sampling rate (samples per second) = number of samples in the window
    num_samples = int(w * sr)

    # A window longer than either clip cannot be cut; say so instead of skipping silently
    if len(a) < num_samples or len(b) < num_samples:
        print(f"Window: {w}s → skipped (a clip is shorter than the window)")
        continue

    # Last w seconds of song A, first w seconds of song B
    a_window = a[-num_samples:]
    b_window = b[:num_samples]

    # Preprocess
    inputs = processor(
        raw_speech=[a_window, b_window],
        sampling_rate=sr,
        return_tensors="pt",
        padding=True
    )

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)
        # mean pooling over dim 1 of the last hidden state
        embeddings = outputs.last_hidden_state.mean(dim=1)

    cosine_sim = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0)
    score = cosine_sim.item()

    print(f"Window: {w}s → Similarity: {score:.4f}")

    # The first scored window always becomes the initial best, whatever its value
    if best_score is None or score > best_score:
        best_score = score
        best_window = w

if best_window is None:
    # Every candidate was skipped, so there is no score to report
    print("\nNo window length could be evaluated (a clip is shorter than every candidate window).")
else:
    print(f"\nBest window length: {best_window}s (Similarity: {best_score:.4f})")


Window: 1s → Similarity: 0.6942
Window: 2s → Similarity: 0.7546
Window: 3s → Similarity: 0.7626
Window: 4s → Similarity: 0.7685
Window: 5s → Similarity: 0.7706

Best window length: 5s (Similarity: 0.7706)
