***Imports***

In [2]:
import os
import re
import pathlib
import numpy as np
import pandas as pd
import torch
from laion_clap import CLAP_Module

***Utility Functions***

In [3]:
def l2norm(x, axis=-1, eps=1e-12):
    """
    Apply L2 normalization to vectors along a given axis.
    """
    norm = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / (norm + eps)

AUDIO_EXT = ".wav"

# Generic regex that matches any <stem>_<type>_<desc>_<scale>.wav
GENERIC_AUDIO_RE = re.compile(
    r"^(?P<stem>.+?)_(?P<type>eq|reverb|rvb)_(?P<desc>[A-Za-z0-9\-]+)_(?P<scale>0\.\d+|1(?:\.0)?)\.wav$",
    re.IGNORECASE
)

def parse_audio_files(audio_dir: str, valid_types=None, audio_ext=".wav") -> pd.DataFrame:
    if not os.path.isdir(audio_dir):
        raise FileNotFoundError(f"❌ Directory does not exist: {audio_dir}")

    if valid_types is not None:
        valid_types = set(t.lower() for t in valid_types)

    rows = []
    for root, _, files in os.walk(audio_dir):
        for f in files:
            if pathlib.Path(f).suffix.lower() != audio_ext:
                continue
            m = GENERIC_AUDIO_RE.match(f)
            if not m:
                continue
            file_type = m.group("type").lower()
            if valid_types and file_type not in valid_types:
                continue
            rows.append({
                "path": os.path.join(root, f),
                "type": file_type,
                "stem": m.group("stem"),
                "descriptor": m.group("desc").lower(),
                "scale": float(m.group("scale"))
            })

    if not rows:
        print(f"⚠️ No matching files found in: {audio_dir}")
        return pd.DataFrame(columns=["path", "type", "stem", "descriptor", "scale"])

    return pd.DataFrame(rows).sort_values(["type", "descriptor", "scale", "path"]).reset_index(drop=True)




def classify_trend(df_desc):
    """
    Classify the trend of delta_target values across scale levels.

    Returns:
        str: Trend type (monotonic or peak location)
    """
    g = df_desc.set_index("scale").reindex(EXPECTED_SCALES)
    if g["delta_target"].isna().any():
        return "Insufficient data"

    d0, d1, d2 = g["delta_target"].tolist()

    # Check monotonicity (with tolerance)
    if d0 <= d1 + EPS and d1 <= d2 + EPS:
        return "Monotonic up"
    if d0 >= d1 - EPS and d1 >= d2 - EPS:
        return "Monotonic down"

    # Identify peak location
    peak_idx = int(np.argmax([d0, d1, d2]))
    return {0: "Peak low (0.3)", 1: "Peak mid (0.6)", 2: "Peak high (1.0)"}[peak_idx]


**EQ**

***Configuration***

In [7]:
# List of EQ descriptors used in filenames and CLAP embeddings
EQ_DESCRIPTORS = [
    'warm', 'cold', 'soft', 'loud', 'bright', 'soothing', 'harsh', 'heavy', 'cool',
    'smooth', 'calm', 'clear', 'tinny', 'sharp', 'hard', 'crisp', 'mellow', 'dark',
    'peaceful', 'gentle'
]

# === PATHS ===
# NOTE: Replace these with actual file paths when running
EQ_DIR = " "   # Directory containing EQ-modified audio files
EQ_ORIGINAL_PATH =  " "  # Path to the original unmodified audio file

***Load Model & Embed Text***

In [5]:
laion_eq = CLAP_Module(enable_fusion=False)
laion_eq.load_ckpt()  # default pretrained
laion_eq.eval()

EQ_TEXT_LABELS = EQ_DESCRIPTORS
EQ_TEXT_EMBS = laion_eq.get_text_embedding(EQ_TEXT_LABELS, use_tensor=False)  # Shape: [M, D]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2.we

***Parse Audio Files***

In [9]:
eq_files_df = parse_audio_files(EQ_DIR, valid_types=["eq"])
print(eq_files_df.head(10))

                                                path    stem descriptor  scale
0  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar     bright    0.3
1  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar     bright    0.6
2  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar     bright    1.0
3  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar       calm    0.3
4  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar       calm    0.6
5  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar       calm    1.0
6  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar      clear    0.3
7  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar      clear    0.6
8  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar      clear    1.0
9  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  guitar       cold    0.3


***Embed Audio***

In [10]:
# Get audio embeddings: shape = [1 + N, D]
eq_audio_paths = [EQ_ORIGINAL_PATH] + eq_files_df["path"].tolist()
eq_audio_embs = laion_eq.get_audio_embedding_from_filelist(x=eq_audio_paths, use_tensor=False)

# Split original and manipulated embeddings
eq_orig_emb = eq_audio_embs[0:1, :]     # Shape: [1, D]
eq_manip_embs = eq_audio_embs[1:, :]    # Shape: [N, D]

# Normalize all embeddings (L2 norm)
A_orig = l2norm(eq_orig_emb)
A_manip = l2norm(eq_manip_embs)
T = l2norm(EQ_TEXT_EMBS)  # Text embeddings

***Compute Similarity Delta***

In [26]:
# Original audio similarity to each text descriptor → [M]
eq_orig_sims = (A_orig @ T.T)[0]

# Manipulated audio similarity to each descriptor → [N, M]
eq_manip_sims = A_manip @ T.T

# Map descriptor name to index in the text embedding matrix
eq_desc2idx = {desc: i for i, desc in enumerate(EQ_TEXT_LABELS)}

rows = []
for i, row in eq_files_df.iterrows():
    desc = row["descriptor"]
    idx = eq_desc2idx[desc]
    
    s_target = float(eq_manip_sims[i, idx])      # Similarity of manipulated audio
    s_orig_target = float(eq_orig_sims[idx])     # Similarity of original audio

    rows.append({
        "path": row["path"],
        "stem": row["stem"],
        "descriptor": desc,
        "scale": float(row["scale"]),
        "sim_target": s_target,
        "sim_orig_target": s_orig_target,
        "delta_target": s_target - s_orig_target
    })

# Final DataFrame of results
laion_eq_target_df = pd.DataFrame(rows).sort_values(["descriptor", "scale"]).reset_index(drop=True)
laion_eq_target_df.head(12)


Unnamed: 0,path,stem,descriptor,scale,sim_target,sim_orig_target,delta_target
0,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,bright,0.3,-0.011053,0.001072,-0.012124
1,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,bright,0.6,0.020476,0.001072,0.019404
2,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,bright,1.0,0.060727,0.001072,0.059655
3,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,calm,0.3,-0.001792,0.001695,-0.003487
4,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,calm,0.6,0.012207,0.001695,0.010512
5,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,calm,1.0,0.029213,0.001695,0.027518
6,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,clear,0.3,0.001004,-0.006024,0.007028
7,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,clear,0.6,0.025466,-0.006024,0.03149
8,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,clear,1.0,0.076112,-0.006024,0.082136
9,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar,cold,0.3,-0.08427,-0.072623,-0.011647


***Aggregate Metrics***

In [27]:
# Overall mean delta_target by scale
laion_eq_overall = (
    laion_eq_target_df
    .groupby("scale")["delta_target"]
    .mean()
    .reset_index()
    .sort_values("scale")
)

# Mean delta_target by descriptor and scale
laion_eq_by_desc = (
    laion_eq_target_df
    .groupby(["descriptor", "scale"])["delta_target"]
    .mean()
    .reset_index()
    .sort_values(["descriptor", "scale"])
)

laion_eq_overall, laion_eq_by_desc.head(10)

# Save to CSV
OUT = "./outputs/LAION_CLAP"
os.makedirs(OUT, exist_ok=True)

laion_eq_overall.to_csv(f"{OUT}/laionclap_eq_mean_delta_by_scale.csv", index=False)
laion_eq_by_desc.to_csv(f"{OUT}/laionclap_eq_delta_by_descriptor_and_scale.csv", index=False)
laion_eq_overall, laion_eq_by_desc.head(60)


(   scale  delta_target
 0    0.3      0.017558
 1    0.6      0.042074
 2    1.0      0.075825,
    descriptor  scale  delta_target
 0      bright    0.3     -0.012124
 1      bright    0.6      0.019404
 2      bright    1.0      0.059655
 3        calm    0.3     -0.003487
 4        calm    0.6      0.010512
 5        calm    1.0      0.027518
 6       clear    0.3      0.007028
 7       clear    0.6      0.031490
 8       clear    1.0      0.082136
 9        cold    0.3     -0.011647
 10       cold    0.6     -0.035722
 11       cold    1.0      0.007908
 12       cool    0.3      0.038752
 13       cool    0.6      0.018034
 14       cool    1.0      0.017952
 15      crisp    0.3      0.006799
 16      crisp    0.6      0.010322
 17      crisp    1.0      0.088049
 18       dark    0.3      0.043593
 19       dark    0.6      0.075318
 20       dark    1.0      0.104756
 21     gentle    0.3     -0.000326
 22     gentle    0.6     -0.027566
 23     gentle    1.0     -0.033722
 24

***Data Analysis***

In [28]:
#trend analysis
EXPECTED_SCALES = [0.3, 0.6, 1.0]
EPS = 1e-6 

# Apply trend classification per descriptor
trend_rows = []
for desc, group in laion_eq_target_df.groupby("descriptor", as_index=False):
    trend = classify_trend(group[["scale", "delta_target"]].copy())
    trend_rows.append({"descriptor": desc, "trend_type": trend})

# Final trend classification DataFrame
laion_trend_df = pd.DataFrame(trend_rows).sort_values("trend_type").reset_index(drop=True)
display(laion_trend_df)

# Count occurrences of each trend type
laion_trend_counts = laion_trend_df["trend_type"].value_counts()
display(laion_trend_counts)



Unnamed: 0,descriptor,trend_type
0,warm,Monotonic down
1,cool,Monotonic down
2,gentle,Monotonic down
3,soothing,Monotonic up
4,soft,Monotonic up
5,smooth,Monotonic up
6,sharp,Monotonic up
7,mellow,Monotonic up
8,loud,Monotonic up
9,heavy,Monotonic up


trend_type
Monotonic up       14
Monotonic down      3
Peak high (1.0)     2
Peak mid (0.6)      1
Name: count, dtype: int64

In [29]:
OUT = "./outputs/LAION_CLAP"
os.makedirs(OUT, exist_ok=True)

# Attach model tag and standardize columns
laion_eq_targets = laion_eq_target_df.copy()
laion_eq_targets["model"] = "LAION-CLAP"
laion_eq_targets = laion_eq_targets[
    ["model","descriptor","scale","sim_target","sim_orig_target","delta_target","path","stem"]
].sort_values(["descriptor","scale","path"])

laion_eq_trends = laion_trend_df.copy()
laion_eq_trends["model"] = "LAION-CLAP"
laion_eq_trends = laion_eq_trends[["model","descriptor","trend_type"]].sort_values(["descriptor"])

# Save CSVs
laion_eq_targets.to_csv(f"{OUT}/laionclap_eq_targets.csv", index=False)
laion_eq_trends.to_csv (f"{OUT}/laionclap_eq_trends.csv",  index=False)

display(laion_eq_targets.head(6))
display(laion_eq_trends.head(6))


Unnamed: 0,model,descriptor,scale,sim_target,sim_orig_target,delta_target,path,stem
0,LAION-CLAP,bright,0.3,-0.011053,0.001072,-0.012124,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar
1,LAION-CLAP,bright,0.6,0.020476,0.001072,0.019404,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar
2,LAION-CLAP,bright,1.0,0.060727,0.001072,0.059655,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar
3,LAION-CLAP,calm,0.3,-0.001792,0.001695,-0.003487,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar
4,LAION-CLAP,calm,0.6,0.012207,0.001695,0.010512,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar
5,LAION-CLAP,calm,1.0,0.029213,0.001695,0.027518,/Users/lindseydeng/Desktop/timbre_semantics_ex...,guitar


Unnamed: 0,model,descriptor,trend_type
10,LAION-CLAP,bright,Monotonic up
15,LAION-CLAP,calm,Monotonic up
14,LAION-CLAP,clear,Monotonic up
17,LAION-CLAP,cold,Peak high (1.0)
1,LAION-CLAP,cool,Monotonic down
13,LAION-CLAP,crisp,Monotonic up


**Reverb**

In [2]:
# === CONFIG ===
RVB_DESCRIPTORS = [
  'echo', 'distant', 'warm', 'spacious', 'loud', 'deep', 
    'muffled', 'church', 'big', 'distorted', 'hollow', 'sad', 'soft', 
    'bass', 'strong', 'low', 'haunting', 'clear','tinny', 'hall']

RVB_DIR = " " 
RVB_ORIGINAL_PATH = " "

In [3]:
laion_rvb = CLAP_Module(enable_fusion=False)
laion_rvb.load_ckpt()  # default pretrained
laion_rvb.eval()


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2.we

CLAP_Module(
  (model): CLAP(
    (audio_branch): HTSAT_Swin_Transformer(
      (spectrogram_extractor): Spectrogram(
        (stft): STFT(
          (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(480,), bias=False)
          (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(480,), bias=False)
        )
      )
      (logmel_extractor): LogmelFilterBank()
      (spec_augmenter): SpecAugmentation(
        (time_dropper): DropStripes()
        (freq_dropper): DropStripes()
      )
      (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (patch_embed): PatchEmbed(
        (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4))
        (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (layers): ModuleList(
        (0): BasicLayer(
          dim=96, input_resolution=(64, 64), depth=2
          (blocks): ModuleList(
            (0): SwinTransformerBlock(
      

In [4]:
RVB_TEXT_LABELS = RVB_DESCRIPTORS
RVB_TEXT_EMBS = laion_rvb.get_text_embedding(RVB_TEXT_LABELS, use_tensor=False)  # [M, D]
RVB_TEXT_EMBS.shape


(20, 512)

In [5]:
rvb_files_df = parse_audio_files(RVB_DIR, valid_types=["reverb", "rvb"])
rvb_files_df.head(10), len(rvb_files_df)

(                                                path    stem descriptor  scale
 0  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar       bass    0.3
 1  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar       bass    0.6
 2  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar       bass    1.0
 3  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar        big    0.3
 4  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar        big    0.6
 5  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar        big    1.0
 6  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar     church    0.3
 7  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar     church    0.6
 8  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar     church    1.0
 9  //Users/lindseydeng/Desktop/timbre_semantics_e...  guitar      clear    0.3,
 60)

In [6]:
rvb_audio_paths = [RVB_ORIGINAL_PATH] + rvb_files_df["path"].tolist()
rvb_audio_embs = laion_rvb.get_audio_embedding_from_filelist(x=rvb_audio_paths, use_tensor=False)  # [1+N, D]
rvb_orig_emb = rvb_audio_embs[0:1, :]     # [1, D]
rvb_manip_embs = rvb_audio_embs[1:, :]    # [N, D]
rvb_manip_embs.shape, rvb_orig_emb.shape

# Normalize
A_orig = l2norm(rvb_orig_emb)          # [1, D]
A_manip = l2norm(rvb_manip_embs)       # [N, D]
T = l2norm(RVB_TEXT_EMBS)              # [M, D]

rvb_manip_embs.shape, rvb_orig_emb.shape


((60, 512), (1, 512))

In [9]:
# Original audio similarity to each text descriptor → [M]
rvb_orig_sims = (A_orig @ T.T)[0]

# Manipulated audio similarity to each descriptor → [N, M]
rvb_manip_sims = A_manip @ T.T

# Map descriptor name to index in the text embedding matrix
rvb_desc2idx = {d:i for i, d in enumerate(RVB_TEXT_LABELS)}
rows = []
for i, r in rvb_files_df.iterrows():
    desc = r["descriptor"]
    idx = rvb_desc2idx[desc]
    s_target = float(rvb_manip_sims[i, idx])
    s_orig_target = float(rvb_orig_sims[idx])
    rows.append({
        "path": r["path"],
        "stem": r["stem"],
        "descriptor": desc,
        "scale": float(r["scale"]),
        "sim_target": s_target,           # cosine(sim) in [-1, 1]
        "sim_orig_target": s_orig_target,
        "delta_target": s_target - s_orig_target
    })

laion_rvb_target_df = pd.DataFrame(rows).sort_values(["descriptor","scale"]).reset_index(drop=True)
laion_rvb_target_df.head(12)
    


Unnamed: 0,path,stem,descriptor,scale,sim_target,sim_orig_target,delta_target
0,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,bass,0.3,0.109753,0.080024,0.029729
1,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,bass,0.6,0.118151,0.080024,0.038126
2,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,bass,1.0,0.019157,0.080024,-0.060867
3,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,big,0.3,0.07218,0.045002,0.027178
4,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,big,0.6,0.091153,0.045002,0.04615
5,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,big,1.0,0.079543,0.045002,0.034541
6,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,church,0.3,-0.025536,-0.079649,0.054112
7,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,church,0.6,-0.009863,-0.079649,0.069786
8,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,church,1.0,0.063787,-0.079649,0.143436
9,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar,clear,0.3,0.015447,0.000297,0.015149


In [10]:
laion_rvb_overall = (
    laion_rvb_target_df.groupby("scale")["delta_target"]
    .mean()
    .reset_index()
    .sort_values("scale")
)
laion_rvb_by_desc = (
    laion_rvb_target_df.groupby(["descriptor","scale"])["delta_target"]
    .mean()
    .reset_index()
    .sort_values(["descriptor","scale"])
)

laion_rvb_overall, laion_rvb_by_desc.head(60)


(   scale  delta_target
 0    0.3      0.024491
 1    0.6      0.036284
 2    1.0      0.090863,
    descriptor  scale  delta_target
 0        bass    0.3      0.029729
 1        bass    0.6      0.038126
 2        bass    1.0     -0.060867
 3         big    0.3      0.027178
 4         big    0.6      0.046150
 5         big    1.0      0.034541
 6      church    0.3      0.054112
 7      church    0.6      0.069786
 8      church    1.0      0.143436
 9       clear    0.3      0.015149
 10      clear    0.6      0.011591
 11      clear    1.0      0.004892
 12       deep    0.3      0.047259
 13       deep    0.6      0.064997
 14       deep    1.0      0.258337
 15    distant    0.3      0.022482
 16    distant    0.6      0.077123
 17    distant    1.0      0.133342
 18  distorted    0.3     -0.039165
 19  distorted    0.6     -0.058085
 20  distorted    1.0      0.049977
 21       echo    0.3      0.012167
 22       echo    0.6      0.014727
 23       echo    1.0      0.084676
 24

In [11]:
#trend analysis
EXPECTED_SCALES = [0.3, 0.6, 1.0]
EPS = 1e-6 

# Apply trend classification per descriptor
trend_rows = []
for desc, group in laion_rvb_target_df.groupby("descriptor", as_index=False):
    trend = classify_trend(group[["scale", "delta_target"]].copy())
    trend_rows.append({"descriptor": desc, "trend_type": trend})

# Final trend classification DataFrame
laion_trend_df = pd.DataFrame(trend_rows).sort_values("trend_type").reset_index(drop=True)
display(laion_trend_df)

# Count occurrences of each trend type
laion_trend_counts = laion_trend_df["trend_type"].value_counts()
display(laion_trend_counts)


Unnamed: 0,descriptor,trend_type
0,warm,Monotonic down
1,clear,Monotonic down
2,strong,Monotonic up
3,spacious,Monotonic up
4,soft,Monotonic up
5,low,Monotonic up
6,hollow,Monotonic up
7,tinny,Monotonic up
8,haunting,Monotonic up
9,echo,Monotonic up


trend_type
Monotonic up       12
Monotonic down      2
Peak high (1.0)     2
Peak low (0.3)      2
Peak mid (0.6)      2
Name: count, dtype: int64

In [12]:
OUT = "./outputs/LAION_CLAP"
os.makedirs(OUT, exist_ok=True)

# Attach model tag and standardize columns
laion_rvb_targets = laion_rvb_target_df.copy()
laion_rvb_targets["model"] = "LAION-CLAP"
laion_rvb_targets = laion_rvb_targets[
    ["model","descriptor","scale","sim_target","sim_orig_target","delta_target","path","stem"]
].sort_values(["descriptor","scale","path"])

laion_rvb_trends = laion_trend_df.copy()
laion_rvb_trends["model"] = "LAION-CLAP"
laion_rvb_trends = laion_rvb_trends[["model","descriptor","trend_type"]].sort_values(["descriptor"])

# Save CSVs
laion_rvb_targets.to_csv(f"{OUT}/laionclap_rvb_targets.csv", index=False)
laion_rvb_trends.to_csv (f"{OUT}/laionclap_rvb_trends.csv",  index=False)

display(laion_rvb_targets.head(6))
display(laion_rvb_trends.head(6))


Unnamed: 0,model,descriptor,scale,sim_target,sim_orig_target,delta_target,path,stem
0,LAION-CLAP,bass,0.3,0.109753,0.080024,0.029729,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar
1,LAION-CLAP,bass,0.6,0.118151,0.080024,0.038126,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar
2,LAION-CLAP,bass,1.0,0.019157,0.080024,-0.060867,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar
3,LAION-CLAP,big,0.3,0.07218,0.045002,0.027178,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar
4,LAION-CLAP,big,0.6,0.091153,0.045002,0.04615,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar
5,LAION-CLAP,big,1.0,0.079543,0.045002,0.034541,//Users/lindseydeng/Desktop/timbre_semantics_e...,guitar


Unnamed: 0,model,descriptor,trend_type
19,LAION-CLAP,bass,Peak mid (0.6)
18,LAION-CLAP,big,Peak mid (0.6)
12,LAION-CLAP,church,Monotonic up
1,LAION-CLAP,clear,Monotonic down
11,LAION-CLAP,deep,Monotonic up
10,LAION-CLAP,distant,Monotonic up
