In [None]:
import numpy as np
import librosa


def _hz_to_cents(f0_hz, ref_hz=440.0):
    """Convert Hz to cents relative to ``ref_hz``; non-finite or non-positive entries become NaN."""
    f0_hz = np.asarray(f0_hz, dtype=float)
    cents = np.full(f0_hz.shape, np.nan, dtype=float)
    voiced = np.isfinite(f0_hz) & (f0_hz > 0)
    cents[voiced] = 1200.0 * np.log2(f0_hz[voiced] / ref_hz)
    return cents


def _fill_unvoiced(cents, label):
    """Linearly interpolate NaN frames of a cents contour; fail clearly if no frame is finite."""
    frames = np.arange(len(cents))
    voiced = np.isfinite(cents)
    if not voiced.any():
        # np.interp would otherwise fail with an opaque "array of sample
        # points is empty" error that does not say which input was at fault.
        raise ValueError(
            f"no voiced frames detected in {label!r}; cannot compute pitch metrics"
        )
    return np.interp(frames, frames[voiced], cents[voiced])


def _pitch_contour_cents(y, sr, hop_length, label):
    """Track f0 with pYIN (E2..E6) and return a gap-free contour in cents re 440 Hz."""
    f0, _, _ = librosa.pyin(
        y,
        fmin=librosa.note_to_hz("E2"),
        fmax=librosa.note_to_hz("E6"),
        sr=sr,
        hop_length=hop_length,
    )
    return _fill_unvoiced(_hz_to_cents(f0), label)


def _mean_chroma(y, sr, hop_length):
    """Return the time-averaged CQT chroma vector of a signal."""
    return librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length).mean(axis=1)


def evaluate_melodic_similarity(
    guitar_wav: str,
    organ_wav: str,
    sr: int = 16000,
    hop_length: int = 256,
):
    """Compare the melodic content of two recordings.

    Both files are loaded as mono at ``sr``. Their pitch contours are tracked
    with pYIN, converted to cents relative to 440 Hz, and unvoiced frames are
    filled by linear interpolation. The contours are then aligned with DTW and
    the per-step pitch differences along the warping path are summarised. A
    cosine similarity between the time-averaged chroma vectors is also reported.

    Args:
        guitar_wav: Path to the first (guitar) audio file.
        organ_wav: Path to the second (organ) audio file.
        sr: Sample rate both files are resampled to on load.
        hop_length: Hop size in samples used for pYIN and chroma.

    Returns:
        A dict of floats with keys ``pitch_dtw_mean_abs_cents``,
        ``pitch_rmse_cents``, ``pitch_within_25c_pct``,
        ``pitch_within_50c_pct`` and ``chroma_cosine_similarity``.

    Raises:
        ValueError: If pYIN finds no voiced frame in either recording, so no
            pitch contour can be built.
    """
    y_g, _ = librosa.load(guitar_wav, sr=sr, mono=True)
    y_o, _ = librosa.load(organ_wav, sr=sr, mono=True)

    c_g = _pitch_contour_cents(y_g, sr, hop_length, guitar_wav)
    c_o = _pitch_contour_cents(y_o, sr, hop_length, organ_wav)

    # DTW absorbs tempo/timing differences; only the warping path is needed.
    _, wp = librosa.sequence.dtw(
        X=c_g[np.newaxis, :],
        Y=c_o[np.newaxis, :],
        metric="euclidean",
    )
    wp = np.asarray(wp)

    diff = c_g[wp[:, 0]] - c_o[wp[:, 1]]
    abs_diff = np.abs(diff)

    chroma_mean_g = _mean_chroma(y_g, sr, hop_length)
    chroma_mean_o = _mean_chroma(y_o, sr, hop_length)
    # The small epsilon guards against division by zero for all-zero chroma.
    chroma_sim = np.dot(chroma_mean_g, chroma_mean_o) / (
        np.linalg.norm(chroma_mean_g) * np.linalg.norm(chroma_mean_o) + 1e-8
    )

    return {
        "pitch_dtw_mean_abs_cents": float(np.mean(abs_diff)),
        "pitch_rmse_cents": float(np.sqrt(np.mean(diff**2))),
        "pitch_within_25c_pct": float(np.mean(abs_diff < 25) * 100),
        "pitch_within_50c_pct": float(np.mean(abs_diff < 50) * 100),
        "chroma_cosine_similarity": float(chroma_sim),
    }


In [10]:
# Melodic comparison: test/heather.wav (guitar) against test_synth.wav (organ).
melody_metrics = evaluate_melodic_similarity(
    guitar_wav="test/heather.wav",
    organ_wav="test_synth.wav",
)


In [11]:
# Bare expression: the notebook shows the current melody_metrics dict as this cell's output.
melody_metrics

{'pitch_dtw_mean_abs_cents': 0.78616352201257,
 'pitch_rmse_cents': 3.5020209798623463,
 'pitch_within_25c_pct': 99.37106918238993,
 'pitch_within_50c_pct': 100.0,
 'chroma_cosine_similarity': 0.999643862247467}

In [12]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
melody_metrics = evaluate_melodic_similarity(
    guitar_wav="test/1 (2) (1).wav",
    organ_wav="test/1.wav",
)

In [13]:
# Bare expression: the notebook shows the current melody_metrics dict as this cell's output.
melody_metrics

{'pitch_dtw_mean_abs_cents': 52.0054697669975,
 'pitch_rmse_cents': 165.2974683544762,
 'pitch_within_25c_pct': 75.90197413206263,
 'pitch_within_50c_pct': 82.91354663036078,
 'chroma_cosine_similarity': 0.9770371317863464}

In [None]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
melody_metrics = evaluate_melodic_similarity(
    guitar_wav="test/2 (2) (1).wav",
    organ_wav="test/2.wav",
)

In [15]:
# Bare expression: the notebook shows the current melody_metrics dict as this cell's output.
melody_metrics

{'pitch_dtw_mean_abs_cents': 84.51744929665753,
 'pitch_rmse_cents': 133.5955435175542,
 'pitch_within_25c_pct': 52.605210420841686,
 'pitch_within_50c_pct': 58.61723446893787,
 'chroma_cosine_similarity': 0.9105223417282104}

In [20]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
melody_metrics = evaluate_melodic_similarity(
    guitar_wav="test/note (2) (1).wav",
    organ_wav="test/note.wav",
)

In [21]:
# Bare expression: the notebook shows the current melody_metrics dict as this cell's output.
melody_metrics

{'pitch_dtw_mean_abs_cents': 13.59223300970875,
 'pitch_rmse_cents': 49.84117168388009,
 'pitch_within_25c_pct': 91.2621359223301,
 'pitch_within_50c_pct': 91.58576051779936,
 'chroma_cosine_similarity': 0.8938866853713989}

In [23]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
melody_metrics = evaluate_melodic_similarity(
    guitar_wav="test/swall 2 (1).wav",
    organ_wav="test/swall.wav",
)

In [24]:
# Bare expression: the notebook shows the current melody_metrics dict as this cell's output.
melody_metrics

{'pitch_dtw_mean_abs_cents': 9.441624365482244,
 'pitch_rmse_cents': 32.462395268972166,
 'pitch_within_25c_pct': 90.35532994923858,
 'pitch_within_50c_pct': 92.38578680203045,
 'chroma_cosine_similarity': 0.707426905632019}

In [25]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
melody_metrics = evaluate_melodic_similarity(
    guitar_wav="test/nirvana (2) (1).wav",
    organ_wav="test/nirvana.wav",
)

In [26]:
# Bare expression: the notebook shows the current melody_metrics dict as this cell's output.
melody_metrics

{'pitch_dtw_mean_abs_cents': 49.025072398456764,
 'pitch_rmse_cents': 161.30783956200278,
 'pitch_within_25c_pct': 75.30674846625767,
 'pitch_within_50c_pct': 84.50920245398773,
 'chroma_cosine_similarity': 0.9157077074050903}

In [None]:
def evaluate_timbre_and_quality(
    guitar_wav: str,
    organ_wav: str,
    sr: int = 16000,
):
    """Summarise timbre differences between two recordings and basic quality stats of the second.

    Both files are loaded as mono at ``sr``.

    Args:
        guitar_wav: Path to the first (guitar) audio file.
        organ_wav: Path to the second (organ) audio file.
        sr: Sample rate both files are resampled to on load.

    Returns:
        A dict of floats:
        ``mfcc_distance`` - Euclidean distance between the time-averaged
        20-coefficient MFCC vectors of the two signals;
        ``spectral_centroid_shift`` - mean spectral centroid of the organ
        signal minus that of the guitar signal;
        ``clipping_pct`` - percentage of organ samples whose absolute value
        exceeds 0.99;
        ``rms_std`` - standard deviation of the organ signal's RMS curve.
    """
    guitar_audio, _ = librosa.load(guitar_wav, sr=sr, mono=True)
    organ_audio, _ = librosa.load(organ_wav, sr=sr, mono=True)

    def mean_mfcc(signal):
        # Average each of the 20 MFCC coefficients over time.
        return librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=20).mean(axis=1)

    def mean_centroid(signal):
        return librosa.feature.spectral_centroid(y=signal, sr=sr).mean()

    guitar_mfcc = mean_mfcc(guitar_audio)
    organ_mfcc = mean_mfcc(organ_audio)

    guitar_centroid = mean_centroid(guitar_audio)
    organ_centroid = mean_centroid(organ_audio)

    # Quality indicators are computed on the organ signal only.
    near_full_scale = np.abs(organ_audio) > 0.99
    organ_rms = librosa.feature.rms(y=organ_audio)[0]

    metrics = {}
    metrics["mfcc_distance"] = float(np.linalg.norm(guitar_mfcc - organ_mfcc))
    metrics["spectral_centroid_shift"] = float(organ_centroid - guitar_centroid)
    metrics["clipping_pct"] = float(np.mean(near_full_scale) * 100)
    metrics["rms_std"] = float(np.std(organ_rms))
    return metrics


In [28]:
# Timbre comparison: test/heather.wav (guitar) against test_synth.wav (organ).
timbre_metrics = evaluate_timbre_and_quality(
    guitar_wav="test/heather.wav",
    organ_wav="test_synth.wav",
)

In [29]:
# Bare expression: the notebook shows the current timbre_metrics dict as this cell's output.
timbre_metrics

{'mfcc_distance': 21.628280639648438,
 'spectral_centroid_shift': -51.84551645644129,
 'clipping_pct': 0.01741640127388535,
 'rms_std': 0.04477958381175995}

In [30]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
timbre_metrics = evaluate_timbre_and_quality(
    guitar_wav="test/1 (2) (1).wav",
    organ_wav="test/1.wav",
)

In [31]:
# Bare expression: the notebook shows the current timbre_metrics dict as this cell's output.
timbre_metrics

{'mfcc_distance': 83.90296936035156,
 'spectral_centroid_shift': -415.14449069322427,
 'clipping_pct': 0.002045157068062827,
 'rms_std': 0.08693229407072067}

In [32]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
timbre_metrics = evaluate_timbre_and_quality(
    guitar_wav="test/2 (2) (1).wav",
    organ_wav="test/2.wav",
)

In [33]:
# Bare expression: the notebook shows the current timbre_metrics dict as this cell's output.
timbre_metrics

{'mfcc_distance': 187.072998046875,
 'spectral_centroid_shift': -196.32429954768844,
 'clipping_pct': 0.003090387658227848,
 'rms_std': 0.15520422160625458}

In [34]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
timbre_metrics = evaluate_timbre_and_quality(
    guitar_wav="test/note (2) (1).wav",
    organ_wav="test/note.wav",
)

In [35]:
# Bare expression: the notebook shows the current timbre_metrics dict as this cell's output.
timbre_metrics

{'mfcc_distance': 167.3216094970703,
 'spectral_centroid_shift': 99.3898740719477,
 'clipping_pct': 0.01775568181818182,
 'rms_std': 0.1684872955083847}

In [36]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
timbre_metrics = evaluate_timbre_and_quality(
    guitar_wav="test/swall 2 (1).wav",
    organ_wav="test/swall.wav",
)

In [37]:
# Bare expression: the notebook shows the current timbre_metrics dict as this cell's output.
timbre_metrics

{'mfcc_distance': 101.84064483642578,
 'spectral_centroid_shift': -29.706965167943167,
 'clipping_pct': 0.009964923469387755,
 'rms_std': 0.15601694583892822}

In [38]:
# Forward slashes keep these relative paths valid on Windows, macOS and Linux.
timbre_metrics = evaluate_timbre_and_quality(
    guitar_wav="test/nirvana (2) (1).wav",
    organ_wav="test/nirvana.wav",
)

In [39]:
# Bare expression: the notebook shows the current timbre_metrics dict as this cell's output.
timbre_metrics

{'mfcc_distance': 173.7267608642578,
 'spectral_centroid_shift': -576.381065788094,
 'clipping_pct': 0.0023344123505976097,
 'rms_std': 0.08905413746833801}