In [29]:
from pesq import pesq
import librosa

ref, sr = librosa.load("male.wav", sr=16000)
deg, _ = librosa.load("output_from_hifigan.wav", sr=16000)

score = pesq(sr, ref, deg, 'wb')  # 'wb' = wideband
print("PESQ Score:", score)


PESQ Score: 1.3531402349472046


In [30]:
from pystoi import stoi
import librosa

# Load audio (ref = original, deg = reconstructed)
ref, sr = librosa.load("scale.wav", sr=16000)
deg, _ = librosa.load("output_from_hifigan.wav", sr=16000)

# Make sure lengths match
min_len = min(len(ref), len(deg))
ref = ref[:min_len]
deg = deg[:min_len]

# Calculate STOI
score = stoi(ref, deg, sr, extended=False)
print("STOI Score:", score)


STOI Score: 0.30145119395815145


In [31]:
import librosa
import numpy as np

def calculate_mcd(ref_wav, deg_wav, sr=16000, n_mfcc=13):
    # MFCCs extract karo
    ref_mfcc = librosa.feature.mfcc(y=ref_wav, sr=sr, n_mfcc=n_mfcc)
    deg_mfcc = librosa.feature.mfcc(y=deg_wav, sr=sr, n_mfcc=n_mfcc)
    
    # Frames align karo (shortest length use karenge)
    min_frames = min(ref_mfcc.shape[1], deg_mfcc.shape[1])
    ref_mfcc = ref_mfcc[:, :min_frames]
    deg_mfcc = deg_mfcc[:, :min_frames]
    
    # (exclude 0th coefficient) → distortion perceptual hota hai higher cepstral coeffs se
    ref_mfcc = ref_mfcc[1:, :]
    deg_mfcc = deg_mfcc[1:, :]
    
    # MCD formula (log-spectral distance)
    diff = ref_mfcc - deg_mfcc
    dist = np.sqrt((diff ** 2).sum(axis=0))
    mcd = (10.0 / np.log(10)) * np.mean(dist)
    return mcd

# ---------------------------
# Example run
# ---------------------------
ref, sr = librosa.load("scale.wav", sr=16000)
deg, _ = librosa.load("output_from_hifigan.wav", sr=16000)

min_len = min(len(ref), len(deg))
ref, deg = ref[:min_len], deg[:min_len]

mcd_score = calculate_mcd(ref, deg, sr=16000)
print("MCD Score:", mcd_score, "dB")


MCD Score: 354.0697160356897 dB


In [32]:
import numpy as np
import librosa
from pesq import pesq
from pystoi import stoi
import python_speech_features as psf
from dtw import accelerated_dtw

def evaluate(ref_path, deg_path, sr=16000):
    # Load
    ref, _ = librosa.load(ref_path, sr=sr)
    deg, _ = librosa.load(deg_path, sr=sr)

    # Normalize
    ref = ref / np.max(np.abs(ref))
    deg = deg / np.max(np.abs(deg))

    # Length match
    min_len = min(len(ref), len(deg))
    ref, deg = ref[:min_len], deg[:min_len]

    # PESQ
    pesq_score = pesq(sr, ref, deg, 'wb')

    # STOI
    stoi_score = stoi(ref, deg, sr, extended=False)

    # MCD
    ref_mfcc = psf.mfcc(ref, sr)
    deg_mfcc = psf.mfcc(deg, sr)
    dist, _, _, _ = accelerated_dtw(ref_mfcc, deg_mfcc, dist='euclidean')
    mcd = dist / len(ref_mfcc)

    print(f"✅ PESQ : {pesq_score:.3f}")
    print(f"✅ STOI : {stoi_score:.3f}")
    print(f"✅ MCD  : {mcd:.3f} dB")
    




In [19]:
evaluate("scale.wav", "output_from_hifigan.wav")

✅ PESQ : 1.104
✅ STOI : 0.301
✅ MCD  : 69.444 dB


In [34]:
import json
import librosa
import numpy as np

def extract_mel_from_wav(wav_path, config_path, output_npy="input_mel.npy"):
    # ---- Load config ----
    with open(config_path) as f:
        config = json.load(f)

    sr = config["sampling_rate"]
    n_fft = config["n_fft"]
    hop_length = config["hop_size"]
    win_length = config["win_size"]
    n_mels = config["num_mels"]
    fmin = config["fmin"]
    fmax = config["fmax"]

    print(f"✅ Using Config: sr={sr}, n_fft={n_fft}, hop={hop_length}, win={win_length}, mels={n_mels}")

    # ---- Load wav ----
    y, _ = librosa.load("scale.wav", sr=sr)

    # ---- Extract Mel ----
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        power=1.0
    )

    # Convert to log-mel (HiFi-GAN usually expects log scale)
    mel_db = np.log(np.clip(mel, a_min=1e-5, a_max=None))

    # Save mel
    np.save(output_npy, mel_db)
    print(f"✅ Mel saved to {output_npy}, shape={mel_db.shape}")

    return mel_db


# ------------------ Run Example ------------------
if __name__ == "__main__":
    mel = extract_mel_from_wav("input.wav", "config_v1.json", "input_mel.npy")


✅ Using Config: sr=22050, n_fft=1024, hop=256, win=1024, mels=80
✅ Mel saved to input_mel.npy, shape=(80, 684)
