In [None]:
from structure_derivation.model.model import StructureDerivationModel, StructureDerivationModelConfig
import os
import torch
import librosa

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CHECKPOINTS_DIR = "/keshav/musical_structure_metrics/structure_derivation/artifacts/structure_derivation_model/checkpoint/"

# Build the model with its default configuration and place it on the device.
config = StructureDerivationModelConfig()
model = StructureDerivationModel(config)
model.to(device)

ckpt = torch.load(os.path.join(CHECKPOINTS_DIR, "checkpoint.pt"), map_location=device)
if "module" in ckpt["model"]:
    # DDP-style checkpoint: the weights are nested under the "module" key.
    # `model` is built unwrapped above, so only go through `.module` when the
    # attribute actually exists; otherwise load straight into the model.
    getattr(model, "module", model).load_state_dict(ckpt["model"]["module"])
else:
    # Plain checkpoint: ckpt["model"] is the state dict itself.
    model.load_state_dict(ckpt["model"])

audio_paths = [
    '/mnt/data/music_reward/musiccaps/data/JDWPJ1AiDKc.wav',
    '/mnt/data/music_reward/musiccaps/data/JDWPJ1AiDKc.wav'
]

# Example input: only the first path is used; it is loaded as 32 kHz mono.
audio, sr = librosa.load(audio_paths[0], sr=32000, mono=True)

audio_tensor = torch.tensor(audio).unsqueeze(0).to(device)  # (1, T)
model.eval()
with torch.no_grad():
    output = model(audio_tensor, infer_mode=True)

print(output['latent_output'].shape)  # Example output key

  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [1]:
import librosa
import os
import numpy as np
import torch
import torch.nn.functional as F
from structure_derivation.model.model import StructureDerivationModel, StructureDerivationModelConfig
from vendi_score import vendi

# Run on the GPU when CUDA is usable, otherwise on the CPU.
# Assign torch.device('cpu') here instead to force CPU inference.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def load_model(ckpt_path, device):
    """Build a StructureDerivationModel and restore its weights from a checkpoint.

    Args:
        ckpt_path: Path to a checkpoint readable by ``torch.load``. The loaded
            object must have a ``"model"`` entry holding either a state dict
            or a mapping whose ``"module"`` entry is the state dict.
        device: Device the model is moved to and the checkpoint is mapped onto.

    Returns:
        The model, on ``device`` and switched to eval mode.
    """
    config = StructureDerivationModelConfig()
    model = StructureDerivationModel(config)
    model.to(device)

    ckpt = torch.load(ckpt_path, map_location=device)
    state = ckpt["model"]
    if "module" in state:
        # DDP-style checkpoint: weights are nested under "module". The model
        # built above is not wrapped, so only go through `.module` when that
        # attribute exists; otherwise load directly into the model.
        target = getattr(model, "module", model)
        target.load_state_dict(state["module"])
    else:
        model.load_state_dict(state)
    model.eval()
    return model

def split_audio(audio_path, segment_seconds=10, target_sr=32000):
    """Load an audio file and cut it into equal-length, non-overlapping segments.

    Args:
        audio_path: Path handed to ``librosa.load``.
        segment_seconds: Length of each segment in seconds; may be fractional.
        target_sr: Sample rate passed to ``librosa.load``. ``None`` is
            forwarded as-is (librosa documents this as "keep the native rate").

    Returns:
        ``(segments, sr)``: a list of consecutive slices of the loaded mono
        waveform, each exactly ``round(segment_seconds * sr)`` samples long,
        and the sample rate returned by ``librosa.load``. A trailing remainder
        shorter than one segment is dropped, so the list is empty when the
        clip is shorter than one segment.

    Raises:
        ValueError: If the segment length works out to less than one sample.
    """
    audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
    # Size segments from the rate librosa actually returned (so target_sr=None
    # works) and round to an int so fractional durations are valid.
    segment_len = int(round(segment_seconds * sr))
    if segment_len <= 0:
        raise ValueError(
            f"segment_seconds={segment_seconds!r} at sr={sr!r} gives a segment shorter than one sample"
        )
    # Only whole segments are kept; the tail remainder is discarded.
    full_segments = len(audio) // segment_len
    segments = [
        audio[index * segment_len:(index + 1) * segment_len]
        for index in range(full_segments)
    ]
    return segments, sr

def compute_embeddings(model, segments, device):
    """Embed every segment with the model and stack the results along dim 0.

    Each segment is converted to a float32 tensor with a leading batch axis,
    moved to ``device`` and passed to ``model(..., infer_mode=True)``; the
    ``"latent_output"`` entry of each result is collected.

    Returns:
        The per-segment latent outputs concatenated along dim 0
        (expected to be (N, D) when each latent output is (1, D)).
    """
    latents = []
    # Gradients are never needed here, so disable them for the whole pass.
    with torch.no_grad():
        for segment in segments:
            batch = torch.tensor(segment, dtype=torch.float32).unsqueeze(0).to(device)  # (1, T)
            latents.append(model(batch, infer_mode=True)["latent_output"])
    return torch.cat(latents, dim=0)

def compute_structure_derivation(embeddings):
    """Cosine similarity of the first embedding against every later one.

    Row 0 is the reference; rows 1..N-1 are each compared with it along dim 1.

    Returns:
        A NumPy array with one similarity per non-reference row.
    """
    anchor = embeddings[0][None]  # (1, D), broadcast against the remaining rows
    others = embeddings[1:]
    return F.cosine_similarity(anchor, others, dim=1).cpu().numpy()

def compute_vendi(embeddings):
    """Vendi score of the embeddings under a cosine-similarity kernel.

    Rows are L2-normalised so their pairwise dot products are cosine
    similarities; the resulting (N, N) matrix is handed to ``vendi.score_K``.

    Returns:
        Whatever ``vendi.score_K`` returns for that similarity matrix.
    """
    unit_rows = F.normalize(embeddings, p=2, dim=1)  # (N, D)
    kernel = (unit_rows @ unit_rows.T).cpu().numpy()  # (N, N)
    return vendi.score_K(kernel)


# ----------------- Usage -----------------
CHECKPOINTS_DIR = "/keshav/musical_structure_metrics/structure_derivation/artifacts/structure_derivation_model/checkpoint/"
ckpt_path = os.path.join(CHECKPOINTS_DIR, "checkpoint.pt")

model = load_model(ckpt_path, device)

audio_path = '/mnt/data/marble/mtg_jamendo/mtg-jamendo-dataset/data/raw_30s_audio/91/1092591.mp3'
segments, sr = split_audio(audio_path, segment_seconds=10, target_sr=32000)
print(f"Split into {len(segments)} segments.")

# Both metrics compare segments with each other. With no segments the
# embedding step fails with an opaque error, and with one segment the mean
# below is taken over an empty array, so stop early with a clear message.
if len(segments) < 2:
    raise ValueError(
        f"Need at least 2 full segments to compare, got {len(segments)}: {audio_path}"
    )

embeddings = compute_embeddings(model, segments, device)
print("Embeddings shape:", embeddings.shape)  # (N, D)

# Similarity of every later segment to the first one (S1).
similarities = compute_structure_derivation(embeddings)
print("Cosine similarities with S1:", similarities)

# Average Structure Derivation score
avg_structure_derivation = similarities.mean()
print("Average Structure Derivation similarity with S1:", avg_structure_derivation)

# Vendi score
vendi_score = compute_vendi(embeddings)
print("Vendi score:", vendi_score)


  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Split into 24 segments.
Embeddings shape: torch.Size([24, 768])
Cosine similarities with S1: [0.52317435 0.50659543 0.5110918  0.39754653 0.3822517  0.4601025
 0.47347623 0.41313756 0.5016664  0.3899931  0.4722015  0.46710354
 0.42274976 0.42616975 0.4232423  0.45664036 0.41684464 0.5037488
 0.43335164 0.47571242 0.5043582  0.5803537  0.6478833 ]
Average Structure Derivation similarity with S1: 0.4691042
Vendi score: 2.6937397


In [1]:
from structure_derivation.model.model import StructureDerivationModel, StructureDerivationModelConfig
import os
import torch

def load_model(ckpt_path, device):
    """Build a StructureDerivationModel and restore its weights from a checkpoint.

    Args:
        ckpt_path: Path to a checkpoint readable by ``torch.load``. The loaded
            object must have a ``"model"`` entry holding either a state dict
            or a mapping whose ``"module"`` entry is the state dict.
        device: Device the model is moved to and the checkpoint is mapped onto.

    Returns:
        The model, on ``device`` and switched to eval mode.
    """
    config = StructureDerivationModelConfig()
    model = StructureDerivationModel(config)
    model.to(device)

    ckpt = torch.load(ckpt_path, map_location=device)
    state = ckpt["model"]
    if "module" in state:
        # DDP-style checkpoint: weights are nested under "module". The model
        # built above is not wrapped, so only go through `.module` when that
        # attribute exists; otherwise load directly into the model.
        target = getattr(model, "module", model)
        target.load_state_dict(state["module"])
    else:
        model.load_state_dict(state)
    model.eval()
    return model

# Use the second GPU when the host has one, otherwise the first GPU,
# otherwise the CPU. Hard-coding 'cuda:1' fails on single-GPU machines.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

CHECKPOINTS_DIR = "/keshav/musical_structure_metrics/structure_derivation/artifacts/structure_derivation_model/checkpoint/"
ckpt_path = os.path.join(CHECKPOINTS_DIR, "checkpoint.pt")
# Single place for the export directory so the two saves below cannot drift apart.
HF_EXPORT_DIR = "/keshav/musical_structure_metrics/structure_derivation/artifacts/structure_derivation_model/huggingface/"

# Default config, annotated with the import paths of the config and model classes.
config = StructureDerivationModelConfig()
config.auto_map = {
    "AutoConfig": "structure_derivation.model.model.StructureDerivationModelConfig",
    "AutoModel": "structure_derivation.model.model.StructureDerivationModel"
}
model = load_model(ckpt_path, device)

# The config is saved after the model, into the same directory.
# NOTE(review): presumably this overwrites the config file written by
# model.save_pretrained, so the auto_map entry ends up in the export; confirm.
model.save_pretrained(HF_EXPORT_DIR)
config.save_pretrained(HF_EXPORT_DIR)

  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [2]:
from structure_derivation.model.model import StructureDerivationModel, StructureDerivationModelConfig

# Round-trip check: reload the model and its config from the local export directory.
_export_dir = "/keshav/musical_structure_metrics/structure_derivation/artifacts/structure_derivation_model/huggingface/"
model = StructureDerivationModel.from_pretrained(_export_dir)
config = StructureDerivationModelConfig.from_pretrained(_export_dir)

In [None]:
from huggingface_hub import login

import os

# Authenticate with the Hugging Face Hub without writing the token into the
# notebook. The token is read from the HF_TOKEN environment variable; when it
# is unset, login(token=None) falls back to an interactive prompt.
# Alternatively, run `huggingface-cli login` once in a shell beforehand.
login(token=os.environ.get("HF_TOKEN"))

# Upload the model weights and the config to the same Hub repository.
model.push_to_hub("keshavbhandari/Structure-Derivation")
config.push_to_hub("keshavbhandari/Structure-Derivation")

Processing Files (1 / 1): 100%|██████████|  121MB /  121MB, 28.5MB/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  


CommitInfo(commit_url='https://huggingface.co/keshavbhandari/Structure-Derivation/commit/127f7747c57dc7d6703670ef6fc31b808e374ce9', commit_message='Upload model', commit_description='', oid='127f7747c57dc7d6703670ef6fc31b808e374ce9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/keshavbhandari/Structure-Derivation', endpoint='https://huggingface.co', repo_type='model', repo_id='keshavbhandari/Structure-Derivation'), pr_revision=None, pr_num=None)

In [None]:
from structure_derivation.model.model import StructureDerivationModel

# Reload from the Hub using the exact repo id the model was pushed under,
# so the id is spelled the same way everywhere in the notebook.
model = StructureDerivationModel.from_pretrained(
    "keshavbhandari/Structure-Derivation"
)

In [5]:
import librosa
import os
import numpy as np
import torch
import torch.nn.functional as F
from vendi_score import vendi

# Inference in this cell is pinned to the CPU.
device = torch.device("cpu")

def split_audio(audio_path, segment_seconds=10, target_sr=32000):
    """Load an audio file and cut it into equal-length, non-overlapping segments.

    Args:
        audio_path: Path handed to ``librosa.load``.
        segment_seconds: Length of each segment in seconds; may be fractional.
        target_sr: Sample rate passed to ``librosa.load``. ``None`` is
            forwarded as-is (librosa documents this as "keep the native rate").

    Returns:
        ``(segments, sr)``: a list of consecutive slices of the loaded mono
        waveform, each exactly ``round(segment_seconds * sr)`` samples long,
        and the sample rate returned by ``librosa.load``. A trailing remainder
        shorter than one segment is dropped, so the list is empty when the
        clip is shorter than one segment.

    Raises:
        ValueError: If the segment length works out to less than one sample.
    """
    audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
    # Size segments from the rate librosa actually returned (so target_sr=None
    # works) and round to an int so fractional durations are valid.
    segment_len = int(round(segment_seconds * sr))
    if segment_len <= 0:
        raise ValueError(
            f"segment_seconds={segment_seconds!r} at sr={sr!r} gives a segment shorter than one sample"
        )
    # Only whole segments are kept; the tail remainder is discarded.
    full_segments = len(audio) // segment_len
    segments = [
        audio[index * segment_len:(index + 1) * segment_len]
        for index in range(full_segments)
    ]
    return segments, sr

def compute_embeddings(model, segments, device):
    """Embed every segment with the model and stack the results along dim 0.

    Each segment is converted to a float32 tensor with a leading batch axis,
    moved to ``device`` and passed to ``model(..., infer_mode=True)``; the
    ``"latent_output"`` entry of each result is collected.

    Returns:
        The per-segment latent outputs concatenated along dim 0
        (expected to be (N, D) when each latent output is (1, D)).
    """
    latents = []
    # Gradients are never needed here, so disable them for the whole pass.
    with torch.no_grad():
        for segment in segments:
            batch = torch.tensor(segment, dtype=torch.float32).unsqueeze(0).to(device)  # (1, T)
            latents.append(model(batch, infer_mode=True)["latent_output"])
    return torch.cat(latents, dim=0)

def compute_structure_derivation(embeddings):
    """Cosine similarity of the first embedding against every later one.

    Row 0 is the reference; rows 1..N-1 are each compared with it along dim 1.

    Returns:
        A NumPy array with one similarity per non-reference row.
    """
    anchor = embeddings[0][None]  # (1, D), broadcast against the remaining rows
    others = embeddings[1:]
    return F.cosine_similarity(anchor, others, dim=1).cpu().numpy()

def compute_vendi(embeddings):
    """Vendi score of the embeddings under a cosine-similarity kernel.

    Rows are L2-normalised so their pairwise dot products are cosine
    similarities; the resulting (N, N) matrix is handed to ``vendi.score_K``.

    Returns:
        Whatever ``vendi.score_K`` returns for that similarity matrix.
    """
    unit_rows = F.normalize(embeddings, p=2, dim=1)  # (N, D)
    kernel = (unit_rows @ unit_rows.T).cpu().numpy()  # (N, N)
    return vendi.score_K(kernel)

# Check with loaded model from Hugging Face
audio_path = '/mnt/data/marble/mtg_jamendo/mtg-jamendo-dataset/data/raw_30s_audio/91/1092591.mp3'
device = torch.device('cpu')

# `model` comes from an earlier cell, so its device and train/eval mode are
# whatever that cell left behind. Place it explicitly so the inputs built in
# compute_embeddings are on the same device as the weights.
model.to(device)
model.eval()

segments, sr = split_audio(audio_path, segment_seconds=10, target_sr=32000)
print(f"Split into {len(segments)} segments.")

# Both metrics compare segments with each other. With no segments the
# embedding step fails with an opaque error, and with one segment the mean
# below is taken over an empty array, so stop early with a clear message.
if len(segments) < 2:
    raise ValueError(
        f"Need at least 2 full segments to compare, got {len(segments)}: {audio_path}"
    )

embeddings = compute_embeddings(model, segments, device)
print("Embeddings shape:", embeddings.shape)  # (N, D)

# Similarity of every later segment to the first one (S1).
similarities = compute_structure_derivation(embeddings)
print("Cosine similarities with S1:", similarities)

# Average Structure Derivation score
avg_structure_derivation = similarities.mean()
print("Average Structure Derivation similarity with S1:", avg_structure_derivation)

# Vendi score
vendi_score = compute_vendi(embeddings)
print("Vendi score:", vendi_score)

Split into 24 segments.
Embeddings shape: torch.Size([24, 768])
Cosine similarities with S1: [0.52317435 0.50659543 0.5110919  0.39754656 0.38225168 0.46010253
 0.47347632 0.41313758 0.5016665  0.38999307 0.47220153 0.46710354
 0.42274985 0.42616975 0.42324224 0.45664036 0.41684467 0.50374883
 0.43335167 0.4757125  0.5043582  0.5803536  0.6478831 ]
Average Structure Derivation similarity with S1: 0.4691042
Vendi score: 2.693742
