In [None]:
'''
Previous approach using raw data processing.
'''

In [None]:
# Mount Google Drive at /content/drive (Colab only) so later cells can read
# and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install nemo_toolkit['asr']
!pip install torchcodec
!pip install datasets huggingface_hub

In [None]:
import os, shutil, requests
import torch
import torchaudio
import pandas as pd
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import nemo.collections.asr as nemo_asr
import glob
from datasets import load_dataset, Audio
from datasets import load_from_disk
from huggingface_hub import hf_hub_download
from tqdm import tqdm

## IBM-Granite


In [None]:
# 1. UNIFIED DATASET LOADER
_AUDIO_EXTENSIONS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")


def _load_mono_audio(audio_path):
    """Load an audio file with torchaudio and down-mix it to one channel.

    Returns:
        (wav, sr): the waveform tensor (first dimension is the channel axis,
        reduced to size 1) and the sample rate reported by torchaudio.
        No resampling happens here.
    """
    wav, sr = torchaudio.load(audio_path, normalize=True)
    if wav.shape[0] > 1:
        # Average all channels but keep the leading channel dimension.
        wav = wav.mean(dim=0).unsqueeze(0)
    return wav, sr


def _read_transcript(txt_path):
    """Return the stripped UTF-8 text of txt_path, or "" if it does not exist."""
    if not os.path.exists(txt_path):
        return ""
    # Context manager so the handle is closed even when many files are read.
    with open(txt_path, "r", encoding="utf-8") as handle:
        return handle.read().strip()


def load_dataset_samples(dataset, base_path):
    """Load every audio sample of a dataset into memory.

    Args:
        dataset: one of "us_medical_45", "afrispeech" or "primock".
        base_path: project root; the data is read from below "{base_path}/data".

    Returns:
        A list of tuples (uid, wav, sr, transcript, speaker, row_idx):
        - folder datasets: speaker is "unknown", row_idx is None and
          transcript is the content of "<uid>.txt" ("" when missing);
        - primock: one tuple per available doctor / patient recording, where
          speaker is "doctor" or "patient" and row_idx is the row's index
          label in primock57_details.csv.
        Files that fail to load are reported on stdout and skipped.

    Raises:
        ValueError: if dataset is not one of the supported names.
    """
    samples = []

    # A. MERGED LOADER FOR: US_MEDICAL + AFRISPEECH
    if dataset in ["us_medical_45", "afrispeech"]:
        if dataset == "us_medical_45":
            audio_folder = f"{base_path}/data/{dataset}/audio"
            transcript_folder = f"{base_path}/data/{dataset}/transcripts"
        else:
            audio_folder = f"{base_path}/data/afrispeech"
            transcript_folder = audio_folder   # transcripts in same folder

        file_list = sorted(
            f for f in os.listdir(audio_folder)
            if f.lower().endswith(_AUDIO_EXTENSIONS)
        )

        for fname in file_list:
            uid = os.path.splitext(fname)[0]
            audio_path = os.path.join(audio_folder, fname)

            # optional transcript
            transcript = _read_transcript(os.path.join(transcript_folder, uid + ".txt"))

            # load audio
            try:
                wav, sr = _load_mono_audio(audio_path)
            except Exception as e:
                print(f"Error loading audio {audio_path}: {e}")
                continue

            samples.append((uid, wav, sr, transcript, "unknown", None))

        return samples

    # B. PRIMOCK (doctor + patient audio inside CSV)
    elif dataset == "primock":
        csv_path = f"{base_path}/data/Primock-57/primock57_details.csv"
        df = pd.read_csv(csv_path)

        # (speaker tag, audio path column, transcript column, label used in error messages)
        speaker_columns = (
            ("doctor", "doctor_audio_path", "doctor_utterances", "Doctor"),
            ("patient", "patient_audio_path", "patient_utterances", "Patient"),
        )

        for row_idx, row in df.iterrows():
            uid = row["utterance_id"]

            for speaker, path_col, text_col, label in speaker_columns:
                if pd.isna(row[path_col]):
                    continue  # no recording for this speaker in this row
                try:
                    wav, sr = _load_mono_audio(row[path_col])
                except Exception as e:
                    print(f"{label} audio error for {uid}: {e}")
                    continue
                transcript = row.get(text_col, "")
                samples.append((uid, wav, sr, transcript, speaker, row_idx))

        return samples

    else:
        raise ValueError(f"Unknown dataset: {dataset}")

In [None]:
# 2. IBM Granite CONFIG
device = "cuda" if torch.cuda.is_available() else "cpu"
dataset = "primock"   # <-- CHANGE HERE TO REQUIRED DATASET: us_medical_45 or afrispeech or primock
base_path = "/content/drive/MyDrive/Colab Notebooks/BioRAMP/ASR"
output_csv = f"{base_path}/data/asr_results/{dataset}_granite_asr.csv"

# Every recording is resampled to this rate and then cut into fixed-length
# chunks that are transcribed one at a time.
target_sample_rate = 16000
chunk_size_seconds = 30
chunk_size_samples = chunk_size_seconds * target_sample_rate

# 3. LOAD MODEL
print("Loading IBM Granite 8B...")
model_name = "ibm-granite/granite-speech-3.3-8b"
processor = AutoProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name, trust_remote_code=True).to(device)

system_prompt = (
    "Knowledge Cutoff Date: April 2024.\n"
    "Today's Date: April 9, 2025.\n"
    "You are Granite, developed by IBM. Transcribe speech verbatim."
)
user_prompt_base = "<|audio|> Please transcribe the speech into written format."

# The text prompt is the same for every chunk of every recording, so it is
# rendered once here instead of once per sample. A broken chat template now
# fails immediately rather than producing an "ERROR: ..." row for every sample.
chat = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt_base}
]
text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# 4. LOAD SAMPLES
samples = load_dataset_samples(dataset, base_path)
print(f"Loaded {len(samples)} audio samples.")

# 5. SETUP PRIMOCK RESULT TABLE IF NEEDED
# For Primock the transcripts are added as two new columns on a copy of the
# details CSV; the copy is later written to output_csv (the source CSV itself
# is not modified).
primock_df = None
if dataset == "primock":
    primock_path = f"{base_path}/data/Primock-57/primock57_details.csv"
    primock_df = pd.read_csv(primock_path)
    primock_df_results = primock_df.copy()
    primock_df_results["IBM-Granite-doctor"] = ""
    primock_df_results["IBM-Granite-patient"] = ""

# 6. ASR LOOP (UNIFIED)
records = []
record_columns = ["utterance_id", "duration_sec", "human_transcript", "source", "IBM-Granite"]

for uid, wav, sr, transcript_text, speaker, row_idx in samples:
    print("Processing:", uid, "| Speaker:", speaker)

    # resample if needed
    if sr != target_sample_rate:
        wav = torchaudio.functional.resample(wav, sr, target_sample_rate)
        sr = target_sample_rate

    duration_sec = wav.shape[1] / sr

    # Split along the time axis; the last chunk may be shorter than the rest.
    chunks = torch.split(wav, chunk_size_samples, dim=1)

    # RUN ASR
    # Any failure on any chunk replaces the whole transcript of this recording
    # with an "ERROR: ..." marker so the run can continue with the next sample.
    try:
        generated_texts = []

        for chunk in tqdm(chunks, desc="Generating transcript..."):
            model_inputs = processor(
                text,
                chunk,
                device=device, # Computation device; returned tensors are put on CPU
                return_tensors="pt",
            ).to(device)

            # Generate
            model_outputs = model.generate(
                **model_inputs,
                max_new_tokens=1000,
                num_beams=1,
                do_sample=False,
                min_length=1,
                top_p=1.0,
                repetition_penalty=1.0,
                length_penalty=1.0,
                temperature=1.0,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,)

            # Drop the prompt tokens so that only newly generated tokens are decoded.
            num_input_tokens = model_inputs["input_ids"].shape[-1]
            new_tokens = torch.unsqueeze(model_outputs[0, num_input_tokens:], dim=0)

            output_text = tokenizer.batch_decode(new_tokens, add_special_tokens=False, skip_special_tokens=True)[0]
            generated_texts.append(output_text)
        granite_text = " ".join(generated_texts)
    except Exception as e:
        granite_text = f"ERROR: {e}"
        print("ASR Error:", e)

    # CASE 1: PRIMOCK - fill the speaker's column on the matching row.
    # row_idx comes from the loader's own read of the same CSV, so it matches
    # the index labels of primock_df_results.
    if dataset == "primock":
        if speaker == "doctor":
            primock_df_results.at[row_idx, "IBM-Granite-doctor"] = granite_text
        elif speaker == "patient":
            primock_df_results.at[row_idx, "IBM-Granite-patient"] = granite_text

    # CASE 2: OTHER DATASETS - one result record per audio file
    else:
        records.append({
            "utterance_id": uid,
            "duration_sec": round(duration_sec, 2),
            "human_transcript": transcript_text,
            "source": dataset,
            "IBM-Granite": granite_text
        })

# 7. SAVE RESULTS
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# PRIMOCK -> details table plus the two Granite columns
if dataset == "primock":
    primock_df_results.to_csv(output_csv, index=False)
    print("\n Primock CSV updated:", output_csv)

# OTHER DATASETS -> one row per audio file
else:
    # Explicit columns keep the header in place even when no sample was loaded.
    df = pd.DataFrame(records, columns=record_columns)
    df.to_csv(output_csv, index=False)
    print(f"\n {dataset} Saved ASR results:", output_csv)


## Nvidia-Parakeet


In [None]:
# 1. CREATE JOB LIST FOR ALL DATASETS so we can run it in batches since it keeps timing out
def get_job_list(dataset, base_path):
    """Build the table of transcription jobs for a dataset without loading audio.

    Args:
        dataset: one of "us_medical_45", "afrispeech" or "primock".
        base_path: project root; the data is read from below "{base_path}/data".

    Returns:
        A DataFrame with the columns audio_path, utterance_id, transcript,
        speaker and row_idx (always present, even when there are no jobs).
        - folder datasets: one job per audio file, speaker "unknown", row_idx
          None, transcript taken from the first existing "<uid>.txt",
          "<uid>.tsv" or "<uid>.csv" ("" when none exists);
        - primock: up to two jobs per CSV row (doctor, patient), with row_idx
          set to the row's index label in primock57_details.csv.

    Raises:
        ValueError: if dataset is not one of the supported names.
    """
    job_columns = ["audio_path", "utterance_id", "transcript", "speaker", "row_idx"]
    jobs = []

    # CASE A — US_Medical and Afrispeech Dataset
    if dataset in ["us_medical_45", "afrispeech"]:
        if dataset == "us_medical_45":
            audio_folder = f"{base_path}/data/{dataset}/audio"
            transcript_folder = f"{base_path}/data/{dataset}/transcripts"
        else:
            audio_folder = f"{base_path}/data/afrispeech"
            transcript_folder = audio_folder  # the transcripts are in the same folder; modify this if needed

        files = sorted(
            f for f in os.listdir(audio_folder)
            if f.lower().endswith((".wav", ".mp3", ".flac", ".ogg", ".m4a"))
        )

        for fname in files:
            uid = os.path.splitext(fname)[0]
            audio_path = os.path.join(audio_folder, fname)

            transcript = ""
            for ext in [".txt", ".tsv", ".csv"]:
                tpath = os.path.join(transcript_folder, uid + ext)
                if os.path.exists(tpath):
                    # Close the handle promptly and read as UTF-8, like the
                    # Granite loader does.
                    with open(tpath, "r", encoding="utf-8") as handle:
                        transcript = handle.read().strip()
                    break

            jobs.append({
                "audio_path": audio_path,
                "utterance_id": uid,
                "transcript": transcript,
                "speaker": "unknown",
                "row_idx": None
            })

    # CASE B — PRIMOCK (doctor + patient per row)
    elif dataset == "primock":
        primock_path = f"{base_path}/data/Primock-57/primock57_details.csv"
        df = pd.read_csv(primock_path)

        for idx, row in df.iterrows():
            for speaker in ("doctor", "patient"):
                audio_path = row[f"{speaker}_audio_path"]
                if pd.isna(audio_path):
                    continue  # no recording for this speaker in this row
                jobs.append({
                    "audio_path": audio_path,
                    "utterance_id": row["utterance_id"],
                    "transcript": row.get(f"{speaker}_utterances", ""),
                    "speaker": speaker,
                    "row_idx": idx
                })

    else:
        raise ValueError(f"Unsupported dataset: {dataset}")

    # Explicit columns so an empty job list still has the expected schema.
    return pd.DataFrame(jobs, columns=job_columns)

In [None]:
# 2. NVIDIA PARAKEET BATCH PIPELINE
# Transcribes one slice [START, START + BATCH_SIZE) of the job list and writes
# the result to its own batch CSV; re-run with a different START per batch.

dataset = "us_medical_45"   # CHANGE THIS TO THE REQUIRED DATASET: us_medical_45 or afrispeech or primock
base_path = "/content/drive/MyDrive/Colab Notebooks/BioRAMP/ASR"

#set batch size to run in batches due to timeout observed when running all at once
START = 0 #change to desired start index for batch
BATCH_SIZE = 100 #change to the length of the file if you want to run all at once

model_name = "nvidia/parakeet-tdt-0.6b-v2"
nvidia_model = nemo_asr.models.ASRModel.from_pretrained(model_name)
print("Loaded NVIDIA Parakeet")

# GET JOB LIST (BATCHABLE)
job_df = get_job_list(dataset, base_path)
batch = job_df.iloc[START: START + BATCH_SIZE]

print(f"\nProcessing batch {START} → {START + len(batch)}")
print(f"Total jobs available: {len(job_df)}\n")

if dataset == "primock":
    primock_path = f"{base_path}/data/Primock-57/primock57_details.csv"
    primock_df = pd.read_csv(primock_path)

    # Keep only the CSV rows touched by this batch and remember where each
    # original row ended up after the index reset.
    # NOTE(review): the batch slices *jobs*, and a Primock row contributes up
    # to two jobs (doctor, patient). A row whose two jobs fall on either side
    # of a batch boundary appears in two batch files, each with only one of
    # the transcript columns filled.
    batch_idxs = batch["row_idx"].unique()
    primock_df_results = primock_df.loc[batch_idxs].copy().reset_index(drop=True)
    idx_map = {orig: i for i, orig in enumerate(batch_idxs)}

    if "Nvidia-Parakeet-doctor" not in primock_df_results.columns:
        primock_df_results["Nvidia-Parakeet-doctor"] = ""
    if "Nvidia-Parakeet-patient" not in primock_df_results.columns:
        primock_df_results["Nvidia-Parakeet-patient"] = ""

records = []
record_columns = ["utterance_id", "audio_file", "human_transcript", "source", "Nvidia-Parakeet"]

# 3. PROCESS BATCH
for _, job in batch.iterrows():
    audio_path = job["audio_path"]
    uid        = job["utterance_id"]
    transcript = job["transcript"]
    speaker    = job["speaker"]
    row_idx    = job["row_idx"]

    print("--", audio_path)

    # RUN ASR
    # A failure is stored as an "ERROR: ..." marker so the batch keeps going,
    # and is also printed so it is visible while the cell runs.
    try:
        asr_text = nvidia_model.transcribe([audio_path])[0].text
    except Exception as e:
        asr_text = f"ERROR: {e}"
        print("ASR Error:", e)

    # SAVE TO PRIMOCK RESULT COPY
    if dataset == "primock":
        local_idx = idx_map[row_idx]
        if speaker == "doctor":
            primock_df_results.at[local_idx, "Nvidia-Parakeet-doctor"] = asr_text
        else:
            primock_df_results.at[local_idx, "Nvidia-Parakeet-patient"] = asr_text

    # Other datasets
    else:
        records.append({
            "utterance_id": uid,
            "audio_file": audio_path,
            "human_transcript": transcript,
            "source": dataset,
            "Nvidia-Parakeet": asr_text
        })

# 4. SAVE OUTPUT
# The batches folder may not exist yet on a first run.
batch_dir = f"{base_path}/data/asr_results/batches"
os.makedirs(batch_dir, exist_ok=True)

if dataset == "primock":
    outpath = f"{batch_dir}/primock_nvidia_batch_{START}.csv"
    primock_df_results.to_csv(outpath, index=False)
    print("\n Saved Primock PARAKEET batch results:", outpath)

else:
    outpath = f"{batch_dir}/{dataset}_nvidia_batch_{START}.csv"
    # Explicit columns keep the header in place for an empty batch, so the
    # file can still be read back when the batches are combined.
    pd.DataFrame(records, columns=record_columns).to_csv(outpath, index=False)
    print(f"\n Saved {dataset} PARAKEET batch results::", outpath)


In [None]:
# Combine the per-batch Parakeet outputs into a single CSV.
# Uses `dataset` and `base_path` as set in the Parakeet batch cell.
output_csv = f"{base_path}/data/asr_results/{dataset}_nvidia_asr.csv"
batch_pattern = f"{base_path}/data/asr_results/batches/{dataset}_nvidia_batch_*.csv"


def _batch_start(path):
    """Sort key: numeric START suffix of a batch file name; other names go last."""
    stem = os.path.splitext(os.path.basename(path))[0]
    suffix = stem.rsplit("_", 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), path)
    return (1, 0, path)


# glob returns files in arbitrary order; sort by batch start so rows keep the
# job-list order (batch_100 must come after batch_20, not before).
files = sorted(glob.glob(batch_pattern), key=_batch_start)
if not files:
    raise FileNotFoundError(f"No batch files found matching: {batch_pattern}")

df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

df.to_csv(output_csv, index=False)
print(f"Merged {dataset}_nvidia_asr.csv")

## Combine the results of both models (NVIDIA Parakeet and IBM Granite) into one CSV

In [None]:
# Join the Parakeet and Granite result files of one dataset on utterance_id
# and write the combined table next to them.
dataset = "us_medical_45" # CHANGE THIS TO THE REQUIRED DATASET: us_medical_45 or afrispeech or primock
results_dir = "/content/drive/MyDrive/Colab Notebooks/BioRAMP/ASR/data/asr_results"

nvidia_df = pd.read_csv(f"{results_dir}/{dataset}_nvidia_asr.csv")
granite_df = pd.read_csv(f"{results_dir}/{dataset}_granite_asr.csv")

# Only the Granite-specific columns are taken from the Granite file; everything
# else comes from the Parakeet file.
if dataset == "primock":
    granite_columns = ['utterance_id', 'IBM-Granite-doctor', 'IBM-Granite-patient']
else:
    granite_columns = ['utterance_id', 'IBM-Granite', 'duration_sec']

merged_df = nvidia_df.merge(
    granite_df[granite_columns],
    on="utterance_id",
    how="inner"
)

# An inner join silently drops utterances missing from either file, so report
# the row counts to make any loss visible.
print(f"Merged rows: {len(merged_df)} (Parakeet: {len(nvidia_df)}, Granite: {len(granite_df)})")

merged_df.to_csv(f"{results_dir}/{dataset}_asr.csv", index=False)
