***Imports***

In [28]:
import os
import re
import pathlib
import numpy as np
import pandas as pd
import torch
from laion_clap import CLAP_Module

***Utility Functions***

In [49]:
def l2norm(x, axis=-1, eps=1e-12):
    """
    Rescale vectors to (approximately) unit Euclidean length.

    The norm is computed along `axis` with the reduced dimension kept so it
    broadcasts against `x`; `eps` is added to the norm before dividing so an
    all-zero vector does not trigger a division by zero.
    """
    magnitude = np.linalg.norm(x, axis=axis, keepdims=True)
    denominator = magnitude + eps
    return x / denominator

def spearman_corr(x, y):
    """
    Spearman rank correlation between two equally long sequences.

    Both inputs are converted to average ranks (ties share their mean rank)
    and the Pearson correlation of those ranks is returned as a float.
    NaN is returned when either rank vector has (numerically) zero spread.
    """
    ranked = [
        pd.Series(values).rank(method="average").to_numpy()
        for values in (x, y)
    ]
    # A constant rank vector makes the correlation undefined.
    if any(np.std(r) < 1e-12 for r in ranked):
        return np.nan
    rank_x, rank_y = ranked
    return float(np.corrcoef(rank_x, rank_y)[0, 1])

# Audio file extension constant.
AUDIO_EXT = ".wav"

# Filename grammar: <stem>_<type>_<desc>_<scale>.wav
#   type  : eq | reverb | rvb
#   desc  : letters, digits and hyphens
#   scale : 0.<digits>, 1 or 1.0
# The whole pattern is matched case-insensitively.
GENERIC_AUDIO_RE = re.compile(
    r"^(?P<stem>.+?)_(?P<type>eq|reverb|rvb)_(?P<desc>[A-Za-z0-9\-]+)_(?P<scale>0\.\d+|1(?:\.0)?)\.wav$",
    flags=re.IGNORECASE,
)

def parse_audio_files(audio_dir: str, valid_types=None, audio_ext=".wav") -> pd.DataFrame:
    """
    Recursively collect audio files named ``<stem>_<type>_<desc>_<scale>.wav``.

    Filenames are parsed with the module-level ``GENERIC_AUDIO_RE``; files that
    do not match it are skipped silently.

    Args:
        audio_dir: Directory to walk (subdirectories included).
        valid_types: Optional manipulation type(s) to keep, compared
            case-insensitively. A single string is treated as one type.
            ``None`` or an empty iterable keeps every type.
        audio_ext: File extension to keep. Compared case-insensitively; the
            leading dot is optional. Note that ``GENERIC_AUDIO_RE`` itself
            only matches names ending in ``.wav``.

    Returns:
        DataFrame with columns ``path``, ``type``, ``stem``, ``descriptor``
        and ``scale``, sorted by type, descriptor, scale and path with a fresh
        0..N-1 index. When nothing matches, a warning is printed and an empty
        DataFrame with the same columns is returned.

    Raises:
        FileNotFoundError: If ``audio_dir`` is not an existing directory.
    """
    if not os.path.isdir(audio_dir):
        raise FileNotFoundError(f"❌ Directory does not exist: {audio_dir}")

    columns = ["path", "type", "stem", "descriptor", "scale"]

    # A bare string would otherwise be iterated character by character.
    if isinstance(valid_types, str):
        valid_types = [valid_types]
    if valid_types is not None:
        valid_types = {t.lower() for t in valid_types}

    # The file suffix is lower-cased below, so normalise the requested
    # extension the same way; otherwise ".WAV" or "wav" would match nothing.
    audio_ext = audio_ext.lower()
    if audio_ext and not audio_ext.startswith("."):
        audio_ext = "." + audio_ext

    rows = []
    for root, _, files in os.walk(audio_dir):
        for f in files:
            if pathlib.Path(f).suffix.lower() != audio_ext:
                continue
            m = GENERIC_AUDIO_RE.match(f)
            if not m:
                continue
            file_type = m.group("type").lower()
            if valid_types and file_type not in valid_types:
                continue
            rows.append({
                "path": os.path.join(root, f),
                "type": file_type,
                "stem": m.group("stem"),
                "descriptor": m.group("desc").lower(),
                "scale": float(m.group("scale"))
            })

    if not rows:
        print(f"⚠️ No matching files found in: {audio_dir}")
        return pd.DataFrame(columns=columns)

    # os.walk order is filesystem dependent; sorting makes the row order
    # (and therefore any positional alignment downstream) deterministic.
    return (
        pd.DataFrame(rows)
        .sort_values(["type", "descriptor", "scale", "path"])
        .reset_index(drop=True)
    )



def classify_trend(df_desc, expected_scales=None, eps=None):
    """
    Classify how ``delta_target`` evolves across three scale levels.

    Args:
        df_desc: DataFrame with ``scale`` and ``delta_target`` columns for a
            single descriptor. Several rows with the same scale are averaged
            before classification.
        expected_scales: The three scale levels, lowest to highest. Defaults
            to the module-level ``EXPECTED_SCALES`` when omitted.
        eps: Tolerance for the monotonicity checks. Defaults to the
            module-level ``EPS`` when omitted.

    Returns:
        str: ``"Insufficient data"`` if any expected scale has no value,
        ``"Monotonic up"`` / ``"Monotonic down"`` (within tolerance), or
        ``"Peak low (<s0>)"`` / ``"Peak mid (<s1>)"`` / ``"Peak high (<s2>)"``
        naming the scale with the largest value.

    Raises:
        ValueError: If the scale list does not contain exactly three levels.
    """
    scales = list(EXPECTED_SCALES if expected_scales is None else expected_scales)
    tol = EPS if eps is None else eps

    # Averaging per scale keeps the index unique; reindexing a frame with
    # duplicate scale labels (e.g. several stems per descriptor) would raise.
    per_scale = df_desc.groupby("scale")["delta_target"].mean().reindex(scales)
    if per_scale.isna().any():
        return "Insufficient data"

    if len(scales) != 3:
        raise ValueError(f"classify_trend expects exactly 3 scale levels, got {len(scales)}")
    d0, d1, d2 = per_scale.tolist()

    # Check monotonicity (with tolerance); a flat series counts as "up"
    # because that branch is tested first.
    if d0 <= d1 + tol and d1 <= d2 + tol:
        return "Monotonic up"
    if d0 >= d1 - tol and d1 >= d2 - tol:
        return "Monotonic down"

    # Not monotonic: report where the maximum sits.
    peak_idx = int(np.argmax([d0, d1, d2]))
    position = ("low", "mid", "high")[peak_idx]
    return f"Peak {position} ({scales[peak_idx]})"



**EQ**

***Configuration***

In [30]:

# Descriptor words for the EQ experiment. They appear in the audio filenames
# and are also the text prompts embedded with CLAP.
EQ_DESCRIPTORS = [
    "warm", "cold", "soft", "loud", "bright",
    "soothing", "harsh", "heavy", "cool", "smooth",
    "calm", "clear", "tinny", "sharp", "hard",
    "crisp", "mellow", "dark", "peaceful", "gentle",
]

# === PATHS ===
# NOTE: machine-specific absolute paths; replace them before running elsewhere.

# Directory containing the EQ-modified audio files.
EQ_DIR = (
    "/Users/lindseydeng/Desktop/Timbre_Study/timbre_semantics_experiment2"
    "/test_audio/eq_audio"
)

# Path to the original, unmodified audio file.
EQ_ORIGINAL_PATH = (
    "/Users/lindseydeng/Desktop/Timbre_Study/timbre_semantics_experiment2"
    "/test_audio/guitar.wav"
)

***Load Model & Text Embeddings***

In [31]:
# Build the LAION-CLAP wrapper used for the EQ experiment, with fusion disabled.
laion_eq = CLAP_Module(enable_fusion=False)
laion_eq.load_ckpt()  # Called without arguments: loads the library's default pretrained checkpoint
laion_eq.eval()  # NOTE(review): presumably switches the model to inference mode — confirm against laion_clap

# The EQ descriptor words are used unchanged as the text prompts.
EQ_TEXT_LABELS = EQ_DESCRIPTORS
# One embedding row per label, in EQ_TEXT_LABELS order.
# NOTE(review): use_tensor=False presumably returns a NumPy array — confirm against laion_clap.
EQ_TEXT_EMBS = laion_eq.get_text_embedding(EQ_TEXT_LABELS, use_tensor=False)  # Shape: [M, D]


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2.we

***Parse Audio Files***

In [50]:
# Re-assigns the same value AUDIO_EXT already has from the utility cell.
AUDIO_EXT = ".wav"
# EQ-only filename pattern: <stem>_eq_<desc>_<scale>.wav, case-insensitive.
# NOTE(review): no visible cell references EQ_FNAME_RE; parse_audio_files
# matches filenames with GENERIC_AUDIO_RE instead.
EQ_FNAME_RE = re.compile(
    r"^(?P<stem>.+?)_eq_(?P<desc>[A-Za-z0-9\-]+)_(?P<scale>0\.\d+|1\.0|1)\.wav$",
    re.IGNORECASE
)

# Collect the EQ variants under EQ_DIR, keeping only files whose type tag is "eq".
eq_files_df = parse_audio_files(EQ_DIR, valid_types=["eq"])
# Preview the first rows and report how many files were parsed.
print(eq_files_df.head(10))
print(f"✅ Total files found: {len(eq_files_df)}")


                                                path type    stem descriptor  \
0  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar     bright   
1  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar     bright   
2  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar     bright   
3  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar       calm   
4  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar       calm   
5  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar       calm   
6  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar      clear   
7  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar      clear   
8  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar      clear   
9  /Users/lindseydeng/Desktop/Timbre_Study/timbre...   eq  guitar       cold   

   scale  
0    0.3  
1    0.6  
2    1.0  
3    0.3  
4    0.6  
5    1.0  
6    0.3  
7    0.6  
8    1.0  
9    0.3 

***Embed Audio***

In [52]:
# Embed the unmodified recording together with every EQ variant in one call.
# The original goes first so it ends up in row 0 of the result: [1 + N, D].
eq_audio_paths = [EQ_ORIGINAL_PATH, *eq_files_df["path"].tolist()]
eq_audio_embs = laion_eq.get_audio_embedding_from_filelist(
    x=eq_audio_paths,
    use_tensor=False,
)

# Row 0 is the original (kept 2-D: [1, D]); the remaining rows follow the
# order of eq_files_df ([N, D]).
eq_orig_emb, eq_manip_embs = eq_audio_embs[:1, :], eq_audio_embs[1:, :]

# L2-normalise audio and text embeddings with l2norm.
A_orig = l2norm(eq_orig_emb)
A_manip = l2norm(eq_manip_embs)
T = l2norm(EQ_TEXT_EMBS)


***Compute Similarity Delta***

In [54]:
# Original audio similarity to each text descriptor → [M]
eq_orig_sims = (A_orig @ T.T)[0]

# Manipulated audio similarity to each descriptor → [N, M]
eq_manip_sims = A_manip @ T.T

# A_orig / A_manip / T are generic names that the reverb section also assigns.
# Fail loudly if they no longer belong to the EQ data instead of silently
# pairing files with the wrong similarity rows.
if eq_manip_sims.shape != (len(eq_files_df), len(EQ_TEXT_LABELS)):
    raise ValueError(
        f"Similarity matrix shape {eq_manip_sims.shape} does not match "
        f"{len(eq_files_df)} EQ files x {len(EQ_TEXT_LABELS)} descriptors; "
        "re-run the EQ embedding cell."
    )

# Map descriptor name to index in the text embedding matrix
eq_desc2idx = {desc: i for i, desc in enumerate(EQ_TEXT_LABELS)}

rows = []
# iterrows() yields index *labels*; enumerate() supplies the positional row
# number that the NumPy similarity matrix needs, so the lookup stays correct
# even if eq_files_df does not carry a 0..N-1 index.
for i, (_, row) in enumerate(eq_files_df.iterrows()):
    desc = row["descriptor"]
    if desc not in eq_desc2idx:
        raise KeyError(
            f"Descriptor {desc!r} (from {row['path']}) is not in EQ_TEXT_LABELS"
        )
    idx = eq_desc2idx[desc]

    s_target = float(eq_manip_sims[i, idx])      # Similarity of manipulated audio
    s_orig_target = float(eq_orig_sims[idx])     # Similarity of original audio

    rows.append({
        "path": row["path"],
        "stem": row["stem"],
        "descriptor": desc,
        "scale": float(row["scale"]),
        "sim_target": s_target,
        "sim_orig_target": s_orig_target,
        "delta_target": s_target - s_orig_target
    })

# Final DataFrame of results. Explicit columns keep the sort below valid even
# when no files were found.
laion_eq_target_df = (
    pd.DataFrame(
        rows,
        columns=["path", "stem", "descriptor", "scale",
                 "sim_target", "sim_orig_target", "delta_target"],
    )
    .sort_values(["descriptor", "scale"])
    .reset_index(drop=True)
)
laion_eq_target_df.head(12)


Unnamed: 0,path,stem,descriptor,scale,sim_target,sim_orig_target,delta_target
0,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,bright,0.3,-0.037304,-0.01368,-0.023624
1,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,bright,0.6,0.013013,-0.01368,0.026693
2,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,bright,1.0,0.059596,-0.01368,0.073276
3,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,calm,0.3,0.000181,0.004824,-0.004644
4,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,calm,0.6,0.02004,0.004824,0.015216
5,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,calm,1.0,0.049583,0.004824,0.044759
6,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,clear,0.3,-0.005885,-0.015323,0.009438
7,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,clear,0.6,0.035166,-0.015323,0.050488
8,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,clear,1.0,0.111233,-0.015323,0.126556
9,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar,cold,0.3,-0.084302,-0.070721,-0.013581


***Aggregate Metrics***

In [55]:
# Overall mean delta_target by scale
laion_eq_overall = (
    laion_eq_target_df
    .groupby("scale")["delta_target"]
    .mean()
    .reset_index()
    .sort_values("scale")
)

# Mean delta_target by descriptor and scale
laion_eq_by_desc = (
    laion_eq_target_df
    .groupby(["descriptor", "scale"])["delta_target"]
    .mean()
    .reset_index()
    .sort_values(["descriptor", "scale"])
)

# Save to CSV
OUT = "./outputs/LAION_CLAP"
os.makedirs(OUT, exist_ok=True)

laion_eq_overall.to_csv(f"{OUT}/laionclap_eq_mean_delta_by_scale.csv", index=False)
laion_eq_by_desc.to_csv(f"{OUT}/laionclap_eq_delta_by_descriptor_and_scale.csv", index=False)

# Preview of both tables. This must be the final expression of the cell:
# Jupyter only renders the value of the last expression, so placing it before
# the CSV export displayed nothing.
laion_eq_overall, laion_eq_by_desc.head(10)

***Data Analysis***

In [36]:
# --- Trend analysis ---

# Module-level settings read by classify_trend.
EXPECTED_SCALES = [0.3, 0.6, 1.0]   # scale levels each descriptor must cover
EPS = 1e-6                          # tolerance for the monotonicity checks

# One classification per descriptor, based on its (scale, delta_target) rows.
trend_rows = [
    {
        "descriptor": desc,
        "trend_type": classify_trend(group[["scale", "delta_target"]].copy()),
    }
    for desc, group in laion_eq_target_df.groupby("descriptor", as_index=False)
]

# Trend table, ordered by trend type.
laion_trend_df = (
    pd.DataFrame(trend_rows)
    .sort_values("trend_type")
    .reset_index(drop=True)
)
display(laion_trend_df)

# How many descriptors fall into each trend type.
laion_trend_counts = laion_trend_df["trend_type"].value_counts()
display(laion_trend_counts)


Unnamed: 0,descriptor,trend_type
0,warm,Monotonic down
1,gentle,Monotonic down
2,smooth,Monotonic up
3,sharp,Monotonic up
4,peaceful,Monotonic up
5,mellow,Monotonic up
6,loud,Monotonic up
7,heavy,Monotonic up
8,harsh,Monotonic up
9,crisp,Monotonic up


trend_type
Monotonic up       11
Peak mid (0.6)      3
Monotonic down      2
Peak high (1.0)     2
Peak low (0.3)      2
Name: count, dtype: int64

In [56]:
# === Output directory ===
OUT = "./outputs/LAION_CLAP"
os.makedirs(OUT, exist_ok=True)

# === Per-file targets: tag with the model name, reorder columns, sort ===
laion_eq_targets = (
    laion_eq_target_df
    .assign(model="LAION-CLAP")
    [["model", "descriptor", "scale", "sim_target", "sim_orig_target", "delta_target", "path", "stem"]]
    .sort_values(["descriptor", "scale", "path"])
)

# === Per-descriptor trends: tag with the model name, sort by descriptor ===
laion_eq_trends = (
    laion_trend_df
    .assign(model="LAION-CLAP")
    [["model", "descriptor", "trend_type"]]
    .sort_values("descriptor")
)

# === Write both tables to CSV (index column omitted) ===
laion_eq_targets.to_csv(f"{OUT}/laionclap_eq_targets.csv", index=False)
laion_eq_trends.to_csv(f"{OUT}/laionclap_eq_trends.csv", index=False)

# Preview the first rows of each table.
display(laion_eq_targets.head(6))
display(laion_eq_trends.head(6))


Unnamed: 0,model,descriptor,scale,sim_target,sim_orig_target,delta_target,path,stem
0,LAION-CLAP,bright,0.3,-0.037304,-0.01368,-0.023624,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar
1,LAION-CLAP,bright,0.6,0.013013,-0.01368,0.026693,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar
2,LAION-CLAP,bright,1.0,0.059596,-0.01368,0.073276,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar
3,LAION-CLAP,calm,0.3,0.000181,0.004824,-0.004644,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar
4,LAION-CLAP,calm,0.6,0.02004,0.004824,0.015216,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar
5,LAION-CLAP,calm,1.0,0.049583,0.004824,0.044759,/Users/lindseydeng/Desktop/Timbre_Study/timbre...,guitar


Unnamed: 0,model,descriptor,trend_type
19,LAION-CLAP,bright,Peak mid (0.6)
16,LAION-CLAP,calm,Peak low (0.3)
11,LAION-CLAP,clear,Monotonic up
13,LAION-CLAP,cold,Peak high (1.0)
10,LAION-CLAP,cool,Monotonic up
9,LAION-CLAP,crisp,Monotonic up


**Reverb**

In [43]:
# Descriptor words for the reverb experiment.
RVB_DESCRIPTORS = [
    "echo", "distant", "warm", "spacious", "loud",
    "deep", "muffled", "church", "big", "distorted",
    "hollow", "sad", "soft", "bass", "strong",
    "low", "haunting", "clear", "tinny", "hall",
]

# === PATHS ===
# NOTE: machine-specific absolute paths; replace them before running elsewhere.

# Directory passed to the reverb file parser.
RVB_DIR = (
    "/Users/lindseydeng/Desktop/Timbre_Study/timbre_semantics_experiment2"
    "/test_audio/reverb"
)

# Path to the original, unmodified audio file.
RVB_ORIGINAL_PATH = (
    "/Users/lindseydeng/Desktop/Timbre_Study/timbre_semantics_experiment2"
    "/test_audio/guitar.wav"
)


***Load Model & Text Embeddings (Reverb)***

In [40]:
# Build a LAION-CLAP wrapper for the reverb experiment, with fusion disabled.
# NOTE(review): this uses the same constructor arguments and checkpoint call
# as the EQ section's model; a single shared instance may be enough — confirm.
laion_rvb = CLAP_Module(enable_fusion=False)
laion_rvb.load_ckpt()  # Called without arguments: loads the library's default checkpoint
laion_rvb.eval()  # NOTE(review): presumably switches the model to inference mode — confirm


# The reverb descriptor words are used unchanged as the text prompts.
RVB_TEXT_LABELS = RVB_DESCRIPTORS
# One embedding row per label, in RVB_TEXT_LABELS order.
RVB_TEXT_EMBS = laion_rvb.get_text_embedding(RVB_TEXT_LABELS, use_tensor=False)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2.we

In [46]:
import glob
import os

# Sanity check: report which directory is scanned and list a few of the
# .wav files found anywhere beneath it.
print("RVB_DIR is:", RVB_DIR)
files = glob.glob(os.path.join(RVB_DIR, "**/*.wav"), recursive=True)
print(f"Found {len(files)} .wav files")

# Show the base names of the first ten hits only.
for f in files[:10]:
    base_name = os.path.basename(f)
    print("-", base_name)


RVB_DIR is: /Users/lindseydeng/Desktop/Timbre_Study/timbre_semantics_experiment2/test_audio/reverb
Found 60 .wav files
- guitar_reverb_deep_0.6.wav
- guitar_reverb_strong_0.3.wav
- guitar_reverb_big_0.6.wav
- guitar_reverb_warm_0.6.wav
- guitar_reverb_spacious_0.6.wav
- guitar_reverb_haunting_0.3.wav
- guitar_reverb_spacious_0.3.wav
- guitar_reverb_sad_1.0.wav
- guitar_reverb_haunting_0.6.wav
- guitar_reverb_hollow_1.0.wav


***Parse Audio Files***

In [51]:
# Collect the reverb variants under RVB_DIR; both the "reverb" and "rvb"
# type tags are accepted.
rvb_files_df = parse_audio_files(RVB_DIR, valid_types=["reverb", "rvb"])
# Cell output: a tuple of the first ten rows and the total number of files.
rvb_files_df.head(10), len(rvb_files_df)
# Disabled statement: would build the reverb path list with the original file first.
#rvb_audio_paths = [RVB_ORIGINAL_PATH] + rvb_files_df["path"].tolist()


(                                                path    type    stem  \
 0  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 1  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 2  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 3  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 4  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 5  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 6  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 7  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 8  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 9  /Users/lindseydeng/Desktop/Timbre_Study/timbre...  reverb  guitar   
 
   descriptor  scale  
 0       bass    0.3  
 1       bass    0.6  
 2       bass    1.0  
 3        big    0.3  
 4        big    0.6  
 5        big    1.0  
 6     church    0.3  
 7     chur

***Embed Audio***

In [None]:
# File list for embedding: the original recording first (row 0 of the result),
# followed by the reverb variants in rvb_files_df order. Defined here because
# no other active statement in the notebook assigns rvb_audio_paths, so the
# call below would otherwise raise NameError.
rvb_audio_paths = [RVB_ORIGINAL_PATH] + rvb_files_df["path"].tolist()

rvb_audio_embs = laion_rvb.get_audio_embedding_from_filelist(x=rvb_audio_paths, use_tensor=False)  # [1+N, D]
rvb_orig_emb = rvb_audio_embs[0:1, :]     # [1, D]
rvb_manip_embs = rvb_audio_embs[1:, :]    # [N, D]

# Normalize
# NOTE: A_orig / A_manip / T are the same names the EQ section uses; after
# this cell they hold the reverb embeddings.
A_orig = l2norm(rvb_orig_emb)          # [1, D]
A_manip = l2norm(rvb_manip_embs)       # [N, D]
T = l2norm(RVB_TEXT_EMBS)              # [M, D]

# Shape check shown as the cell output. It has to be the last expression;
# Jupyter does not render an expression in the middle of a cell.
rvb_manip_embs.shape, rvb_orig_emb.shape


***Compute Similarity***

In [None]:
# Original audio similarity to each text descriptor → [M]
rvb_orig_sims = (A_orig @ T.T)[0]

# Manipulated audio similarity to each descriptor → [N, M]
rvb_manip_sims = A_manip @ T.T

# A_orig / A_manip / T are generic names that the EQ section also assigns.
# Fail loudly if they do not belong to the reverb data instead of silently
# pairing files with the wrong similarity rows.
if rvb_manip_sims.shape != (len(rvb_files_df), len(RVB_TEXT_LABELS)):
    raise ValueError(
        f"Similarity matrix shape {rvb_manip_sims.shape} does not match "
        f"{len(rvb_files_df)} reverb files x {len(RVB_TEXT_LABELS)} descriptors; "
        "re-run the reverb embedding cell."
    )

# Map descriptor name to index in the text embedding matrix
rvb_desc2idx = {d: i for i, d in enumerate(RVB_TEXT_LABELS)}

rows = []
# iterrows() yields index *labels*; enumerate() supplies the positional row
# number that the NumPy similarity matrix needs, so the lookup stays correct
# even if rvb_files_df does not carry a 0..N-1 index.
for i, (_, r) in enumerate(rvb_files_df.iterrows()):
    desc = r["descriptor"]
    if desc not in rvb_desc2idx:
        raise KeyError(
            f"Descriptor {desc!r} (from {r['path']}) is not in RVB_TEXT_LABELS"
        )
    idx = rvb_desc2idx[desc]
    s_target = float(rvb_manip_sims[i, idx])      # similarity of the manipulated file to its own descriptor
    s_orig_target = float(rvb_orig_sims[idx])     # similarity of the original file to that descriptor
    rows.append({
        "path": r["path"],
        "stem": r["stem"],
        "descriptor": desc,
        "scale": float(r["scale"]),
        "sim_target": s_target,           # cosine(sim) in [-1, 1]
        "sim_orig_target": s_orig_target,
        "delta_target": s_target - s_orig_target
    })

# Explicit columns keep the sort below valid even when no files were found.
laion_rvb_target_df = (
    pd.DataFrame(
        rows,
        columns=["path", "stem", "descriptor", "scale",
                 "sim_target", "sim_orig_target", "delta_target"],
    )
    .sort_values(["descriptor", "scale"])
    .reset_index(drop=True)
)
laion_rvb_target_df.head(12)

***Aggregate Metrics***

In [None]:
# Overall mean delta_target by scale
laion_rvb_overall = (
    laion_rvb_target_df.groupby("scale")["delta_target"]
    .mean()
    .reset_index()
    .sort_values("scale")
)
# Mean delta_target by descriptor and scale
laion_rvb_by_desc = (
    laion_rvb_target_df.groupby(["descriptor","scale"])["delta_target"]
    .mean()
    .reset_index()
    .sort_values(["descriptor","scale"])
)

# Save to CSV
OUT = "./outputs/LAION_CLAP"
os.makedirs(OUT, exist_ok=True)
laion_rvb_overall.to_csv(f"{OUT}/laionclap_rvb_mean_delta_by_scale.csv", index=False)
laion_rvb_by_desc.to_csv(f"{OUT}/laionclap_rvb_delta_by_descriptor_and_scale.csv", index=False)

# Preview of both tables. This must be the final expression of the cell:
# Jupyter only renders the value of the last expression, so placing it before
# the CSV export displayed nothing.
laion_rvb_overall, laion_rvb_by_desc.head(10)