Installations 

In [2]:
!pip install jiwer
!pip install pystoi
!pip install pesq


Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0
Collecting pystoi
  Downloading pystoi-0.4.1-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading pystoi-0.4.1-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: pystoi
Successfully installed pystoi-0.4.1
Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pesq
  Building wheel for

In [3]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect


NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install -y sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=141d024f09c0bc8ee9e285bb8a64ecf4a71a1d489b4a540b4da16b15c9343621
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.2).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libwavpack1
Suggested packages:
  libsox-fmt-all
The following NEW packages will be installed

'\nRemember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!\nAlternatively, you can uncomment the exit() below to crash and restart the kernel, in the case\nthat you want to use the "Run All Cells" (or similar) option.\n'

In [4]:
exit()

**Output 1**

In [4]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
from nemo.collections.asr.models import ASRModel
from jiwer import wer, cer
from pesq import pesq
from pystoi import stoi
from scipy.signal import correlate

# Initialize ASR model
asr_model = ASRModel.from_pretrained("stt_en_conformer_ctc_large").cuda()

# Paths
folder_path = "/kaggle/input/outputs1/Outputs1"
ground_truth_audio = os.path.join(folder_path, "ground_truth (1).wav")

# Transcribe ground truth
gt_text = asr_model.transcribe([ground_truth_audio])[0]
gt_text = str(gt_text).strip()  # 👈 FORCE IT TO BE STRING
print(f"Ground Truth Text:\n{gt_text}\n")

# Load ground truth audio (for audio metrics)
gt_audio, gt_sr = librosa.load(ground_truth_audio, sr=16000)

# Initialize results list
results = []

# Files to evaluate (excluding unwanted files)
files = [
    f for f in os.listdir(folder_path)
    if f.endswith('.wav') and f not in ["ground_truth (1).wav"]
]

# Loop through each file
for file_name in tqdm(files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)

    # Transcribe prediction
    pred_text = asr_model.transcribe([file_path])[0]
    pred_text = str(pred_text).strip()  # 👈 FORCE IT TO BE STRING

    # Load predicted audio
    pred_audio, pred_sr = librosa.load(file_path, sr=16000)

    # Text metrics
    wer_score = wer(gt_text, pred_text)
    cer_score = cer(gt_text, pred_text)

    # Audio metrics
    min_len = min(len(gt_audio), len(pred_audio))
    snr = 10 * np.log10(np.sum(gt_audio[:min_len] ** 2) / np.sum((gt_audio[:min_len] - pred_audio[:min_len]) ** 2))

    # PESQ
    try:
        pesq_score = pesq(16000, gt_audio[:min_len], pred_audio[:min_len], 'wb')
    except:
        pesq_score = None

    # LSD
    def compute_lsd(ref, deg):
        eps = 1e-8
        ref_spec = np.abs(np.fft.rfft(ref) + eps)
        deg_spec = np.abs(np.fft.rfft(deg) + eps)
        lsd = np.sqrt(np.mean((20 * np.log10(ref_spec / deg_spec)) ** 2))
        return lsd

    lsd_score = compute_lsd(gt_audio[:min_len], pred_audio[:min_len])

    # STOI
    try:
        stoi_score = stoi(gt_audio[:min_len], pred_audio[:min_len], 16000, extended=False)
    except:
        stoi_score = None

    # Save results
    results.append({
        "File": file_name,
        "WER": wer_score,
        "CER": cer_score,
        "SNR": snr,
        "PESQ": pesq_score,
        "LSD": lsd_score,
        "STOI": stoi_score,
    })

# Make a table
df = pd.DataFrame(results)

# Print final comparison table
print("\n🔵 Final Comparison Table:")
print(df)

# Also save to CSV
df.to_csv("/kaggle/working/comparison_metrics.csv", index=False)
print("\n✅ Metrics saved to /kaggle/working/comparison_metrics1.csv")


[NeMo I 2025-04-27 05:59:28 nemo_logging:393] Found existing object /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
[NeMo I 2025-04-27 05:59:28 nemo_logging:393] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo
[NeMo I 2025-04-27 05:59:28 nemo_logging:393] Instantiating model from pre-trained checkpoint
[NeMo I 2025-04-27 05:59:28 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2025-04-27 05:59:29 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    

[NeMo I 2025-04-27 05:59:29 nemo_logging:393] PADDING: 0
[NeMo I 2025-04-27 05:59:30 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.


Transcribing: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


Ground Truth Text:
Hypothesis(score=tensor(-3.6206), y_sequence=tensor([128, 128, 128, 128, 128, 128, 128, 128, 128, 128,  27, 128, 128, 128,
          7, 128,  15, 128, 128, 128,  63, 128,   6, 128, 128,  88,   8,  21,
         21,  21,  18, 128,   6, 128,   3,   9,   9,   8,  32,  58, 128,   1,
         20, 128, 116, 128, 128,  61,  14,   6, 128,  68, 128,  23, 128, 128,
          7, 128,  58, 128, 128,  14, 128,  28,   5, 128, 128, 128, 128,  51,
        128, 128,  58, 128,  15,  32,   3, 128,  10, 128,   2, 128,  29,  26,
        128,  54, 128, 128,  54, 128,  17, 128,  80,   4,   3, 128,   3,  25,
        128,   1,  19,   9,  13, 128,  41,  87,  13, 128,   1,  42,  22, 128,
         14,  28,   5, 128, 128,   1,  38, 128, 128,   7, 128, 128, 128,  94,
        128, 128, 128, 105, 128,   2, 128, 128, 128, 128, 128, 128, 128, 128,
         59, 128, 128, 128,  90,  15, 128, 128,  32,   3, 128,  56, 128, 128,
          1,   1,  13,  11,  76, 128, 128,  91,  26, 128,  26,  24,  24,  79,


Processing files:   0%|          | 0/8 [00:00<?, ?it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  8.05it/s][A
Processing files:  12%|█▎        | 1/8 [00:01<00:08,  1.25s/it]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  8.09it/s][A
Processing files:  25%|██▌       | 2/8 [00:02<00:06,  1.04s/it]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.07it/s][A
Processing files:  38%|███▊      | 3/8 [00:02<00:04,  1.14it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.76it/s][A
Processing files:  50%|█████     | 4/8 [00:03<00:03,  1.26it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.69it/s][A
Processing files:  62%|██████▎   | 5/8 [00:04<00:02,  1.24it/s]
Transcribing:   0%|          | 0/1


🔵 Final Comparison Table:
                         File       WER       CER        SNR      PESQ  \
0          linear_interp1.wav  0.308163  0.156929 -15.372345  1.433730   
1          spline_interp1.wav  0.287755  0.148777 -15.459622  1.634957   
2    output_predicted_sr1.wav  0.257143  0.132473   4.640611  1.662787   
3    output_predicted_se1.wav  0.275510  0.138927   4.523526  1.708377   
4  output_predicted_cbam1.wav  0.432653  0.240489  -0.000065  1.067094   
5   output_predicted_eca1.wav  0.244898  0.128736   4.612405  1.632130   
6           cubic_interp1.wav  0.255102  0.131793 -15.466371  1.678383   
7         nearest_interp1.wav  0.312245  0.164742 -15.547533  1.081836   

         LSD      STOI  
0  22.165276  0.792342  
1  28.919697  0.792651  
2  22.267670  0.808254  
3  24.361490  0.802890  
4  65.691605  0.580299  
5  24.367996  0.803787  
6  32.263535  0.789958  
7  14.946141  0.790325  

✅ Metrics saved to /kaggle/working/comparison_metrics1.csv





**Output2**

In [5]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
from nemo.collections.asr.models import ASRModel
from jiwer import wer, cer
from pesq import pesq
from pystoi import stoi
from scipy.signal import correlate

# Initialize ASR model
asr_model = ASRModel.from_pretrained("stt_en_conformer_ctc_large").cuda()

# Paths
folder_path = "/kaggle/input/outputs2/Outputs2"
ground_truth_audio = os.path.join(folder_path, "ground_truth.wav")

# Transcribe ground truth
gt_text = asr_model.transcribe([ground_truth_audio])[0]
gt_text = str(gt_text).strip()  # 👈 FORCE IT TO BE STRING
print(f"Ground Truth Text:\n{gt_text}\n")

# Load ground truth audio (for audio metrics)
gt_audio, gt_sr = librosa.load(ground_truth_audio, sr=16000)

# Initialize results list
results = []

# Files to evaluate (excluding unwanted files)
files = [
    f for f in os.listdir(folder_path)
    if f.endswith('.wav') and f not in ["ground_truth.wav"]
]

# Loop through each file
for file_name in tqdm(files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)

    # Transcribe prediction
    pred_text = asr_model.transcribe([file_path])[0]
    pred_text = str(pred_text).strip()  # 👈 FORCE IT TO BE STRING

    # Load predicted audio
    pred_audio, pred_sr = librosa.load(file_path, sr=16000)

    # Text metrics
    wer_score = wer(gt_text, pred_text)
    cer_score = cer(gt_text, pred_text)

    # Audio metrics
    min_len = min(len(gt_audio), len(pred_audio))
    snr = 10 * np.log10(np.sum(gt_audio[:min_len] ** 2) / np.sum((gt_audio[:min_len] - pred_audio[:min_len]) ** 2))

    # PESQ
    try:
        pesq_score = pesq(16000, gt_audio[:min_len], pred_audio[:min_len], 'wb')
    except:
        pesq_score = None

    # LSD
    def compute_lsd(ref, deg):
        eps = 1e-8
        ref_spec = np.abs(np.fft.rfft(ref) + eps)
        deg_spec = np.abs(np.fft.rfft(deg) + eps)
        lsd = np.sqrt(np.mean((20 * np.log10(ref_spec / deg_spec)) ** 2))
        return lsd

    lsd_score = compute_lsd(gt_audio[:min_len], pred_audio[:min_len])

    # STOI
    try:
        stoi_score = stoi(gt_audio[:min_len], pred_audio[:min_len], 16000, extended=False)
    except:
        stoi_score = None

    # Save results
    results.append({
        "File": file_name,
        "WER": wer_score,
        "CER": cer_score,
        "SNR": snr,
        "PESQ": pesq_score,
        "LSD": lsd_score,
        "STOI": stoi_score,
    })

# Make a table
df = pd.DataFrame(results)

# Print final comparison table
print("\n🔵 Final Comparison Table:")
print(df)

# Also save to CSV
df.to_csv("/kaggle/working/comparison_metrics2.csv", index=False)
print("\n✅ Metrics saved to /kaggle/working/comparison_metrics2.csv")


[NeMo I 2025-04-27 06:01:35 nemo_logging:393] Found existing object /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
[NeMo I 2025-04-27 06:01:35 nemo_logging:393] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo
[NeMo I 2025-04-27 06:01:35 nemo_logging:393] Instantiating model from pre-trained checkpoint
[NeMo I 2025-04-27 06:01:35 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2025-04-27 06:01:36 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    

[NeMo I 2025-04-27 06:01:36 nemo_logging:393] PADDING: 0
[NeMo I 2025-04-27 06:01:37 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.


Transcribing: 100%|██████████| 1/1 [00:00<00:00,  7.82it/s]


Ground Truth Text:
Hypothesis(score=tensor(-3.9283), y_sequence=tensor([128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,   1, 128,  76,
          8, 128,  16, 128, 128, 100, 128, 128,  64, 128, 128, 128,   4, 128,
        128, 128, 128, 128, 128, 128, 128, 128,   1,  13,   9,   9,   2, 128,
          2,   2,  89, 128,   1,  93, 128, 128,  76, 128,   4, 128,  14, 128,
          1,  57, 128,  15, 128,   5, 128,   4, 128, 128,  58, 128, 128, 128,
         21, 128,  53, 128,  16, 128,  78, 128, 128,   2, 128, 128,  37, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
        128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,   1, 128,
         13,   9,   2, 128,   2,   2,  89,  89, 128,   1,  93, 128, 128,  76,
        128,   4, 128,  14, 128,   1,  57, 128,  15,  15, 128,   5,   4,   4,
        128, 128, 128,   1, 128,  83, 128,  52, 128,   5, 128, 128,   1, 114,


Processing files:   0%|          | 0/8 [00:00<?, ?it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.07it/s][A
Processing files:  12%|█▎        | 1/8 [00:00<00:04,  1.48it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 11.46it/s]
Processing files:  25%|██▌       | 2/8 [00:01<00:03,  1.50it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 11.06it/s]
Processing files:  38%|███▊      | 3/8 [00:02<00:03,  1.36it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  8.81it/s][A
Processing files:  50%|█████     | 4/8 [00:02<00:02,  1.41it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.13it/s][A
Processing files:  62%|██████▎   | 5/8 [00:03<00:02,  1.28it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.32it/s][A
Processing files:  75%|███████▌ 


🔵 Final Comparison Table:
                         File       WER       CER       SNR      PESQ  \
0    output_predicted_se2.wav  0.245243  0.121833  3.621123  1.624688   
1   output_predicted_eca2.wav  0.205074  0.109337  4.584846  1.600289   
2  output_predicted_cbam2.wav  0.353066  0.202707 -0.000312  1.046489   
3    output_predicted_sr2.wav  0.226216  0.121138  4.100775  1.547909   
4          spline_interp2.wav  0.215645  0.113502 -8.972124  1.606007   
5           cubic_interp2.wav  0.211416  0.105519 -9.040170  1.673419   
6          linear_interp2.wav  0.247357  0.125651 -7.975488  1.461244   
7         nearest_interp2.wav  0.257928  0.134675 -9.441505  1.074889   

         LSD      STOI  
0  24.014624  0.787060  
1  24.005753  0.787970  
2  62.851753  0.471944  
3  25.624973  0.781667  
4  32.743362  0.794877  
5  38.544273  0.793590  
6  24.231424  0.795313  
7  15.077976  0.793344  

✅ Metrics saved to /kaggle/working/comparison_metrics2.csv





**Output3**

In [6]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
from nemo.collections.asr.models import ASRModel
from jiwer import wer, cer
from pesq import pesq
from pystoi import stoi
from scipy.signal import correlate

# Initialize ASR model
asr_model = ASRModel.from_pretrained("stt_en_conformer_ctc_large").cuda()

# Paths
folder_path = "/kaggle/input/outputs3/Outputs3"
ground_truth_audio = os.path.join(folder_path, "ground_truth.wav")

# Transcribe ground truth
gt_text = asr_model.transcribe([ground_truth_audio])[0]
gt_text = str(gt_text).strip()  # 👈 FORCE IT TO BE STRING
print(f"Ground Truth Text:\n{gt_text}\n")

# Load ground truth audio (for audio metrics)
gt_audio, gt_sr = librosa.load(ground_truth_audio, sr=16000)

# Initialize results list
results = []

# Files to evaluate (excluding unwanted files)
files = [
    f for f in os.listdir(folder_path)
    if f.endswith('.wav') and f not in ["ground_truth.wav"]
]

# Loop through each file
for file_name in tqdm(files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)

    # Transcribe prediction
    pred_text = asr_model.transcribe([file_path])[0]
    pred_text = str(pred_text).strip()  # 👈 FORCE IT TO BE STRING

    # Load predicted audio
    pred_audio, pred_sr = librosa.load(file_path, sr=16000)

    # Text metrics
    wer_score = wer(gt_text, pred_text)
    cer_score = cer(gt_text, pred_text)

    # Audio metrics
    min_len = min(len(gt_audio), len(pred_audio))
    snr = 10 * np.log10(np.sum(gt_audio[:min_len] ** 2) / np.sum((gt_audio[:min_len] - pred_audio[:min_len]) ** 2))

    # PESQ
    try:
        pesq_score = pesq(16000, gt_audio[:min_len], pred_audio[:min_len], 'wb')
    except:
        pesq_score = None

    # LSD
    def compute_lsd(ref, deg):
        eps = 1e-8
        ref_spec = np.abs(np.fft.rfft(ref) + eps)
        deg_spec = np.abs(np.fft.rfft(deg) + eps)
        lsd = np.sqrt(np.mean((20 * np.log10(ref_spec / deg_spec)) ** 2))
        return lsd

    lsd_score = compute_lsd(gt_audio[:min_len], pred_audio[:min_len])

    # STOI
    try:
        stoi_score = stoi(gt_audio[:min_len], pred_audio[:min_len], 16000, extended=False)
    except:
        stoi_score = None

    # Save results
    results.append({
        "File": file_name,
        "WER": wer_score,
        "CER": cer_score,
        "SNR": snr,
        "PESQ": pesq_score,
        "LSD": lsd_score,
        "STOI": stoi_score,
    })

# Make a table
df = pd.DataFrame(results)

# Print final comparison table
print("\n🔵 Final Comparison Table:")
print(df)

# Also save to CSV
df.to_csv("/kaggle/working/comparison_metrics3.csv", index=False)
print("\n✅ Metrics saved to /kaggle/working/comparison_metrics3.csv")


[NeMo I 2025-04-27 06:03:07 nemo_logging:393] Found existing object /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
[NeMo I 2025-04-27 06:03:07 nemo_logging:393] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo
[NeMo I 2025-04-27 06:03:07 nemo_logging:393] Instantiating model from pre-trained checkpoint
[NeMo I 2025-04-27 06:03:08 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2025-04-27 06:03:08 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    

[NeMo I 2025-04-27 06:03:08 nemo_logging:393] PADDING: 0
[NeMo I 2025-04-27 06:03:10 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.4.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.


Transcribing: 100%|██████████| 1/1 [00:00<00:00,  7.43it/s]


Ground Truth Text:
Hypothesis(score=tensor(-2.4027), y_sequence=tensor([128, 128, 128, 128,  27, 128, 128, 128,   1,  13,   9,   2, 128,   2,
          2,  89, 128,   1,  93, 128, 128,  76, 128,   4,  14, 128, 128, 128,
         98, 128, 128, 128,  40, 128, 128,  57, 128,   1,  14, 128,   6,  52,
        128,   5, 128,  23, 128,  85,  30,   4,   4,  10, 128, 128,  21, 128,
         42, 128, 128,   2, 128, 128, 128, 128, 128, 103, 128, 128, 128, 128,
        128, 128, 128, 128,  97, 128, 128, 128,  21,  19,  19,   4, 128,  23,
        128, 128,   6,  30, 128, 128,  59,  20, 128,  68, 128,  14,  26, 128,
        128,  10, 128,  35,   8,  12, 128, 128, 128, 128, 121, 128, 128,  35,
         19,   8,   3, 128,  23, 128, 128,  85, 128, 128, 128, 128,  78, 128,
         49, 128,   8, 128,  32,   2, 128, 128,  55, 128,  83, 128,   4, 128,
         26, 128, 128, 128,  73, 128, 128, 128, 128,   7, 128,   1,  86, 128,
          3, 128, 128, 100, 128,  41, 128,   4, 128,  14,   3, 128,  59, 128,


Processing files:   0%|          | 0/8 [00:00<?, ?it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 10.22it/s]
Processing files:  12%|█▎        | 1/8 [00:00<00:05,  1.35it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 10.81it/s]
Processing files:  25%|██▌       | 2/8 [00:01<00:04,  1.33it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 10.76it/s]
Processing files:  38%|███▊      | 3/8 [00:02<00:03,  1.34it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  8.34it/s][A
Processing files:  50%|█████     | 4/8 [00:02<00:02,  1.33it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.47it/s][A
Processing files:  62%|██████▎   | 5/8 [00:03<00:02,  1.24it/s]
Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████| 1/1 [00:00<00:00,  9.28it/s][A
Processing files:  75%|███████▌  | 6/8 [00:04<00:01,  1.26it/s]
Transcribing:   0%|       


🔵 Final Comparison Table:
                         File       WER       CER        SNR      PESQ  \
0           cubic_interp3.wav  0.384615  0.198060 -17.469535  1.752456   
1    output_predicted_sr3.wav  0.392713  0.202743   4.934082  1.612601   
2          linear_interp3.wav  0.425101  0.218133 -17.376537  1.459545   
3    output_predicted_se3.wav  0.370445  0.190699   4.805605  1.715643   
4  output_predicted_cbam3.wav  0.497976  0.282703  -0.000016  1.054587   
5   output_predicted_eca3.wav  0.350202  0.179659   5.781649  1.635221   
6         nearest_interp3.wav  0.437247  0.228170 -17.533460  1.077972   
7          spline_interp3.wav  0.412955  0.213449 -17.462772  1.679796   

         LSD      STOI  
0  31.605200  0.801315  
1  24.739801  0.808348  
2  21.572214  0.800090  
3  23.120979  0.813292  
4  64.146622  0.633239  
5  23.389359  0.813414  
6  14.461169  0.798942  
7  28.234589  0.799750  

✅ Metrics saved to /kaggle/working/comparison_metrics3.csv



