<a href="https://www.kaggle.com/code/veertiiiiwari/whisper-hindi?scriptVersionId=286266354" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install jiwer

# Dataset

# from youtube

In [None]:
!pip install yt-dlp
!yt-dlp -x --audio-format wav "https://www.youtube.com/watch?v=sGAe5K79h-g" -o "raw.wav"

# single audio

In [None]:
!pip install pydub

import os

from pydub import AudioSegment, silence

# Recording to segment. To segment the YouTube download instead, load
# "raw.wav" here.
audio = AudioSegment.from_wav("/kaggle/input/recording1/1.wav")

# Cut wherever the level stays below -40 dBFS for at least 1000 ms.
chunks = silence.split_on_silence(audio, min_silence_len=1000, silence_thresh=-40)

os.makedirs("wavs", exist_ok=True)

# One WAV per detected utterance, numbered in order of appearance.
for i, chunk in enumerate(chunks):
    print(i)
    out = f"wavs/utt_{i:04d}.wav"
    chunk.export(out, format="wav")

# Multiple Audios 

In [None]:
!pip install pydub

from pydub import AudioSegment, silence
import os
from glob import glob

# Input folder containing many wav files
INPUT_DIR = "/kaggle/input/whisper-hindi-dataset-h4-ankit/whisper_hinid_dataset_h4_ankit"

# Output folder
OUTPUT_DIR = "wavs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# glob() yields files in arbitrary filesystem order. Sorting keeps the
# utt_XXXXX numbering (and therefore the file <-> transcript pairing that is
# built later) identical from one run to the next.
wav_files = sorted(glob(os.path.join(INPUT_DIR, "*.wav")))
print("Total WAV files found:", len(wav_files))

chunk_counter = 0   # global counter so chunk names stay unique across files

for file_path in wav_files:
    print("Processing:", file_path)

    # Load audio
    audio = AudioSegment.from_wav(file_path)

    # Split wherever the level stays below -40 dBFS for at least 1000 ms
    chunks = silence.split_on_silence(
        audio,
        min_silence_len=1000,   # adjust if needed
        silence_thresh=-40      # dBFS threshold
    )

    # Save chunks
    for chunk in chunks:
        out_path = os.path.join(OUTPUT_DIR, f"utt_{chunk_counter:05d}.wav")
        chunk.export(out_path, format="wav")
        chunk_counter += 1

print("Done! Total chunks saved:", chunk_counter)


# Transcripts from whisper v3 large hindi

In [None]:
!pip install git+https://github.com/openai/whisper.git

In [None]:
import os

import whisper

# Load Whisper's "large" checkpoint; decoding is forced to Hindi below.
model = whisper.load_model("large")

# Rows of [file name, transcript], in sorted file-name order.
metadata = []

wav_names = [name for name in sorted(os.listdir("wavs")) if name.endswith(".wav")]
for f in wav_names:
    result = model.transcribe(f"wavs/{f}", language="hi")
    text = result["text"].strip()
    print(text)
    metadata.append([f, text])


# Transcripts from whisper collabora hindi

In [None]:
!pip install transformers torchaudio sentencepiece accelerate

In [None]:
import os
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Hindi fine-tuned Whisper Large v2
processor = WhisperProcessor.from_pretrained("collabora/whisper-large-v2-hindi")
model = WhisperForConditionalGeneration.from_pretrained(
    "collabora/whisper-large-v2-hindi"
).to(device)
model.eval()

metadata = []

def transcribe(path):
    """Transcribe one audio file with the module-level ``processor``/``model``.

    Args:
        path: Path of the audio file to read with ``torchaudio.load``.

    Returns:
        The decoded transcript with surrounding whitespace stripped.
    """
    # torchaudio.load yields a (channels, samples) tensor.
    audio, sr = torchaudio.load(path)

    # Down-mix to mono. squeeze() only removes the channel axis of
    # single-channel files, so a stereo chunk would reach the processor as a
    # 2-D array instead of one waveform.
    if audio.dim() > 1:
        audio = audio.mean(dim=0)

    # Resample to 16k
    if sr != 16000:
        audio = torchaudio.functional.resample(audio, sr, 16000)

    # Preprocess
    inputs = processor(
        audio.numpy(),
        sampling_rate=16000,
        return_tensors="pt"
    ).to(device)

    # Predict
    pred_ids = model.generate(inputs["input_features"])

    # Decode
    text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    return text.strip()


# -------------------------
# Transcription Loop
# -------------------------
# Transcribe every chunk in file-name order; the running count is the only
# progress output.
count = 0
for f in sorted(os.listdir("wavs")):
    if not f.endswith(".wav"):
        continue
    count += 1
    text = transcribe(f"wavs/{f}")
    print(count)
    metadata.append([f, text])


In [None]:
import re

# Hindi number map (0–100)
hindi_numbers = {
    0:"‡§∂‡•Ç‡§®‡•ç‡§Ø", 1:"‡§è‡§ï", 2:"‡§¶‡•ã", 3:"‡§§‡•Ä‡§®", 4:"‡§ö‡§æ‡§∞", 5:"‡§™‡§æ‡§Ç‡§ö", 6:"‡§õ‡§π", 7:"‡§∏‡§æ‡§§", 8:"‡§Ü‡§†", 9:"‡§®‡•å",
    10:"‡§¶‡§∏", 11:"‡§ó‡•ç‡§Ø‡§æ‡§∞‡§π", 12:"‡§¨‡§æ‡§∞‡§π", 13:"‡§§‡•á‡§∞‡§π", 14:"‡§ö‡•å‡§¶‡§π", 15:"‡§™‡§Ç‡§¶‡•ç‡§∞‡§π", 16:"‡§∏‡•ã‡§≤‡§π", 17:"‡§∏‡§§‡•ç‡§∞‡§π", 18:"‡§Ö‡§ü‡•ç‡§†‡§æ‡§∞‡§π", 19:"‡§â‡§®‡•ç‡§®‡•Ä‡§∏",
    20:"‡§¨‡•Ä‡§∏", 21:"‡§á‡§ï‡•ç‡§ï‡•Ä‡§∏", 22:"‡§¨‡§æ‡§á‡§∏", 23:"‡§§‡•á‡§à‡§∏", 24:"‡§ö‡•å‡§¨‡•Ä‡§∏", 25:"‡§™‡§ö‡•ç‡§ö‡•Ä‡§∏", 26:"‡§õ‡§¨‡•ç‡§¨‡•Ä‡§∏", 27:"‡§∏‡§§‡•ç‡§§‡§æ‡§à‡§∏", 28:"‡§Ö‡§ü‡•ç‡§†‡§æ‡§à‡§∏", 29:"‡§â‡§®‡§§‡•Ä‡§∏",
    30:"‡§§‡•Ä‡§∏", 31:"‡§á‡§ï‡§§‡•Ä‡§∏", 32:"‡§¨‡§§‡•ç‡§§‡•Ä‡§∏", 33:"‡§§‡•à‡§Ç‡§§‡•Ä‡§∏", 34:"‡§ö‡•å‡§Ç‡§§‡•Ä‡§∏", 35:"‡§™‡•à‡§Ç‡§§‡•Ä‡§∏", 36:"‡§õ‡§§‡•ç‡§§‡•Ä‡§∏", 37:"‡§∏‡•à‡§Ç‡§§‡•Ä‡§∏", 38:"‡§Ö‡§°‡§º‡§§‡•Ä‡§∏", 39:"‡§â‡§®‡§§‡§æ‡§≤‡•Ä‡§∏",
    40:"‡§ö‡§æ‡§≤‡•Ä‡§∏", 41:"‡§á‡§ï‡§§‡§æ‡§≤‡•Ä‡§∏", 42:"‡§¨‡§Ø‡§æ‡§≤‡•Ä‡§∏", 43:"‡§§‡•à‡§Ç‡§§‡§æ‡§≤‡•Ä‡§∏", 44:"‡§ö‡§µ‡§æ‡§≤‡•Ä‡§∏", 45:"‡§™‡•à‡§Ç‡§§‡§æ‡§≤‡•Ä‡§∏", 46:"‡§õ‡§ø‡§Ø‡§æ‡§≤‡•Ä‡§∏", 47:"‡§∏‡•à‡§Ç‡§§‡§æ‡§≤‡•Ä‡§∏", 48:"‡§Ö‡§°‡§º‡§§‡§æ‡§≤‡•Ä‡§∏", 49:"‡§â‡§®‡§ö‡§æ‡§∏",
    50:"‡§™‡§ö‡§æ‡§∏", 51:"‡§á‡§ï‡•ç‡§Ø‡§æ‡§µ‡§®", 52:"‡§¨‡§æ‡§µ‡§®", 53:"‡§§‡§ø‡§∞‡•á‡§™‡§®", 54:"‡§ö‡•å‡§µ‡§®", 55:"‡§™‡§ö‡§™‡§®", 56:"‡§õ‡§™‡•ç‡§™‡§®", 57:"‡§∏‡§§‡•ç‡§§‡§æ‡§µ‡§®", 58:"‡§Ö‡§ü‡•ç‡§†‡§æ‡§µ‡§®", 59:"‡§â‡§®‡§∏‡§†",
    60:"‡§∏‡§æ‡§†", 61:"‡§á‡§ï‡§∏‡§†", 62:"‡§¨‡§æ‡§∏‡§†", 63:"‡§§‡§ø‡§∞‡•á‡§∏‡§†", 64:"‡§ö‡•å‡§Ç‡§∏‡§†", 65:"‡§™‡•à‡§Ç‡§∏‡§†", 66:"‡§õ‡§ø‡§Ø‡§æ‡§∏‡§†", 67:"‡§∏‡§°‡§º‡§∏‡§†", 68:"‡§Ö‡§°‡§º‡§∏‡§†", 69:"‡§â‡§®‡§π‡§§‡•ç‡§§‡§∞",
    70:"‡§∏‡§§‡•ç‡§§‡§∞", 71:"‡§á‡§ï‡§π‡§§‡•ç‡§§‡§∞", 72:"‡§¨‡§π‡§§‡•ç‡§§‡§∞", 73:"‡§§‡§ø‡§π‡§§‡•ç‡§§‡§∞", 74:"‡§ö‡•å‡§π‡§§‡•ç‡§§‡§∞", 75:"‡§™‡§ö‡§π‡§§‡•ç‡§§‡§∞", 76:"‡§õ‡§ø‡§π‡§§‡•ç‡§§‡§∞", 77:"‡§∏‡§§‡§π‡§§‡•ç‡§§‡§∞", 78:"‡§Ö‡§†‡§π‡§§‡•ç‡§§‡§∞", 79:"‡§â‡§®‡•ç‡§Ø‡§æ‡§∏‡•Ä",
    80:"‡§Ö‡§∏‡•ç‡§∏‡•Ä", 81:"‡§á‡§ï‡•ç‡§Ø‡§æ‡§∏‡•Ä", 82:"‡§¨‡§Ø‡§æ‡§∏‡•Ä", 83:"‡§§‡§ø‡§∞‡§æ‡§∏‡•Ä", 84:"‡§ö‡•å‡§∞‡§æ‡§∏‡•Ä", 85:"‡§™‡§ö‡§æ‡§∏‡•Ä", 86:"‡§õ‡§ø‡§Ø‡§æ‡§∏‡•Ä", 87:"‡§∏‡§§‡•ç‡§§‡§æ‡§∏‡•Ä", 88:"‡§Ö‡§ü‡•ç‡§†‡§æ‡§∏‡•Ä", 89:"‡§®‡§µ‡§æ‡§∏‡•Ä",
    90:"‡§®‡§¨‡•ç‡§¨‡•á", 91:"‡§á‡§ï‡•ç‡§Ø‡§æ‡§®‡§¨‡•á", 92:"‡§¨‡§æ‡§®‡§µ‡•á", 93:"‡§§‡§ø‡§∞‡§æ‡§®‡§µ‡•á", 94:"‡§ö‡•å‡§∞‡§æ‡§®‡§µ‡•á", 95:"‡§™‡§ö‡§æ‡§®‡§µ‡•á", 96:"‡§õ‡§ø‡§Ø‡§æ‡§®‡§µ‡•á", 97:"‡§∏‡§§‡•ç‡§§‡§æ‡§®‡§µ‡•á", 98:"‡§Ö‡§ü‡•ç‡§†‡§æ‡§®‡§µ‡•á", 99:"‡§®‡§ø‡§®‡•ç‡§Ø‡§æ‡§®‡§µ‡•á",
    100:"‡§∏‡•å"
}

def convert_numbers_to_hindi(text):
    """Replace standalone digit runs in *text* with their Hindi words.

    Numbers that have no entry in ``hindi_numbers`` are left as they are.
    """
    def _to_word(found):
        digits = found.group()
        return hindi_numbers.get(int(digits), digits)

    return re.sub(r"\b\d+\b", _to_word, text)


In [None]:
import re

# Rows of [file name, cleaned transcript].
cleaned = []

for f, text in metadata:
    # Spell out numbers (e.g. 90) as Hindi words first; the next step would
    # otherwise just delete the digits.
    t = convert_numbers_to_hindi(text)

    # Keep only Devanagari characters, whitespace and ?.!,'
    t = re.sub(r"[^\u0900-\u097F\s?.!,']", "", t)

    # Collapse whitespace runs and trim the ends.
    t = re.sub(r"\s+", " ", t).strip()

    cleaned.append([f, t])


In [None]:
# Write one "wavs/<file>|<transcript>" row per chunk. The transcripts are
# Devanagari, so pin UTF-8 instead of relying on the platform default encoding.
with open("metadata.csv", "w", encoding="utf-8") as out:
    for f, t in cleaned:
        out.write(f"wavs/{f}|{t}\n")

In [None]:
%%bash
mkdir -p wavs_16k

for f in wavs/*.wav; do
    ffmpeg -y -i "$f" -ar 16000 -ac 1 "wavs_16k/$(basename "$f")"
done

In [None]:
import soundfile as sf
import os

INPUT_META = "/kaggle/working/metadata.csv"
AUDIO_DIR = "/kaggle/working/wavs_16k"
OUTPUT_META = "/kaggle/working/metadata_clean.csv"

# Rows "<wav file>|<transcript>" that pass every check below.
clean_lines = []

with open(INPUT_META, encoding="utf-8") as f:
    for line in f:
        line = line.strip()

        # 0. Blank or delimiter-less rows cannot be split into two fields;
        #    skip them instead of crashing on the tuple unpack.
        if "|" not in line:
            if line:
                print("Malformed line:", line)
            continue
        wav_path, text = line.split("|", 1)

        # 1. Remove 'wavs/' prefix
        wav = os.path.basename(wav_path)

        # 2. Skip incomplete transcripts
        if "..." in text:
            print("Skipping incomplete:", text)
            continue

        # 3. Normalize multiple spaces
        text = " ".join(text.split())
        word_count = len(text.split())

        # 4. Skip too-short transcripts (<3 words)
        if word_count < 3:
            print("Short text:", text)
            continue

        # 5. Skip too-long transcripts (>30 words)
        if word_count > 30:
            print("long text:", text)
            continue

        # 6. Check audio exists
        audio_file = f"{AUDIO_DIR}/{wav}"
        if not os.path.exists(audio_file):
            print("Missing audio:", wav)
            continue

        # 7. Check sample rate. sf.info() only reads the file header, so the
        #    samples are not decoded just to learn the rate.
        sr = sf.info(audio_file).samplerate
        if sr != 16000:
            print("Bad SR:", wav)
            continue

        clean_lines.append(f"{wav}|{text}")

# Write cleaned metadata
with open(OUTPUT_META, "w", encoding="utf-8") as f:
    f.write("\n".join(clean_lines))

print("Done. Clean samples:", len(clean_lines))

In [None]:
import shutil

# Paths
audio_dir = "/kaggle/working/wavs_16k"
metadata_files = ["/kaggle/working/metadata_clean.csv", "/kaggle/working/metadata.csv"]
output_zip = "/kaggle/working/piper_dataset.zip"

# Create a temporary folder to hold all files
import os
tmp_dir = "/kaggle/working/tmp_dataset"
os.makedirs(tmp_dir, exist_ok=True)

# Copy metadata files
for f in metadata_files:
    shutil.copy(f, tmp_dir)

# Copy audio folder
shutil.copytree(audio_dir, os.path.join(tmp_dir, "wavs_16k"), dirs_exist_ok=True)

# Zip everything
shutil.make_archive("/kaggle/working/piper_dataset", 'zip', tmp_dir)

print("Dataset zipped successfully! Download from: /kaggle/working/piper_dataset.zip")

# Audio processing

In [None]:
!pip install librosa soundfile -q
import os
import librosa
import soundfile as sf
import numpy as np


In [None]:
!pip install noisereduce -q


In [None]:
import noisereduce as nr

def clean_audio(
    input_path,
    output_path,
    target_sr=16000,
    top_db=25
):
    """Resample, denoise, trim and peak-normalise one audio file.

    Args:
        input_path: Audio file to read (loaded as mono at its native rate).
        output_path: Where the processed audio is written.
        target_sr: Sample rate of the written file.
        top_db: Threshold passed to ``librosa.effects.trim`` for removing
            leading and trailing silence.
    """
    audio, sr = librosa.load(input_path, sr=None, mono=True)

    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    # Noise reduction (light)
    audio = nr.reduce_noise(y=audio, sr=target_sr, prop_decrease=0.8)

    # Remove leading/trailing silence
    audio, _ = librosa.effects.trim(audio, top_db=top_db)

    # Peak-normalise. The array method computes the peak once in native code;
    # the size guard keeps a fully trimmed (empty) signal from raising.
    peak = abs(audio).max() if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    sf.write(output_path, audio, target_sr)


In [None]:
input_folder = "/kaggle/input/whisper-hindi1/wavs_16k"
output_folder = "/kaggle/working/wavs_cleaned_new"

os.makedirs(output_folder, exist_ok=True)

# Run clean_audio on every WAV, keeping the file name unchanged in the
# output folder.
for file in os.listdir(input_folder):
    if file.endswith(".wav"):
        in_path = os.path.join(input_folder, file)
        out_path = os.path.join(output_folder, file)
        clean_audio(in_path, out_path)

print("All audio files cleaned and saved.")


# Training whisper tiny hindi

In [None]:
!pip install git+https://github.com/openai/whisper.git -q
!pip install transformers datasets soundfile librosa evaluate -q

In [None]:
!pip install -q pyarrow==14.0.2 rich==13.7.1 transformers datasets==2.18.0 evaluate==0.4.1 soundfile librosa


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import pandas as pd
import librosa
import soundfile as sf
import os
import evaluate
from tqdm import tqdm
import matplotlib.pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Use WHISPER TINY ----
model_name = "openai/whisper-tiny"  # <<<<<< IMPORTANT

processor = WhisperProcessor.from_pretrained(
    model_name,
    language="hi",
    task="transcribe"
)

feature_extractor = processor.feature_extractor
tokenizer = processor.tokenizer

model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)

# Force Hindi
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="hi",
    task="transcribe"
)
model.config.suppress_tokens = []


In [None]:
# in working
#df = pd.read_csv("/kaggle/working/metadata_clean.csv", sep="|", header=None)
#df.columns = ["file", "transcript"]
#df["path"] = df["file"].apply(lambda x: f"/kaggle/working/wavs_16k/{x}")


In [None]:
# Load metadata from the new location
df = pd.read_csv("/kaggle/input/whisper-hindi1/metadata_clean.csv", sep="|", header=None)
df.columns = ["file", "transcript"]

# Update path to the new audio folder
WAV_DIR = "/kaggle/working/wavs_cleaned_new"
df["path"] = df["file"].apply(lambda x: os.path.join(WAV_DIR, x))

# Quick check
print(df.head())


In [None]:
train_df = df.sample(frac=0.9, random_state=42)
val_df   = df.drop(train_df.index)


In [None]:
class HindiWhisperDataset(Dataset):
    """Map-style dataset producing Whisper inputs with fixed-length labels.

    Each item is built from one dataframe row whose ``path`` column names an
    audio file and whose ``transcript`` column holds the target text. It relies
    on the module-level ``feature_extractor`` and ``tokenizer``.
    """

    def __init__(self, dataframe, max_len=120):
        """Store *dataframe* and the label length *max_len* (in tokens)."""
        self.df = dataframe
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"input_features", "labels"}`` for row *idx*."""
        row = self.df.iloc[idx]

        # --- Audio ---
        audio, sr = sf.read(row["path"])
        if sr != 16000:
            # librosa >= 0.10 takes the sample rates as keywords only; the
            # positional form raises TypeError.
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        input_features = feature_extractor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features[0]

        # --- Text ---
        text = row["transcript"]

        # Pad/truncate every transcript to max_len tokens so the default
        # DataLoader collation can stack the labels.
        tok = tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # Mask padding through the attention mask (not by token value) so that
        # padded positions are ignored by the loss.
        # NOTE(review): labels keep whatever special tokens the tokenizer
        # prepends; confirm that matches the model's label shifting.
        labels = tok["input_ids"]
        labels = labels.masked_fill(tok["attention_mask"] == 0, -100)
        labels = labels[0]

        return {
            "input_features": input_features,
            "labels": labels
        }


In [None]:
train_set = HindiWhisperDataset(train_df)
val_set   = HindiWhisperDataset(val_df)

train_loader = DataLoader(train_set, batch_size=8, shuffle=True)


In [None]:
wer_metric = evaluate.load("wer")

def evaluate_model(model, val_df):
    """Return the word error rate of *model* on *val_df*, in percent.

    Every row's ``path`` is transcribed with ``model.generate`` and compared
    with its ``transcript`` using the module-level ``wer_metric``. The model is
    switched to eval mode and left that way.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        val_df: Dataframe with ``path`` and ``transcript`` columns.
    """
    model.eval()
    preds, refs = [], []

    for _, row in tqdm(val_df.iterrows(), total=len(val_df)):
        audio, sr = sf.read(row["path"])
        if sr != 16000:
            # librosa >= 0.10 takes the sample rates as keywords only; the
            # positional form raises TypeError.
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        input_features = feature_extractor(
            audio, sampling_rate=16000, return_tensors="pt"
        ).input_features.to(device)

        with torch.no_grad():
            pred_ids = model.generate(input_features)

        pred = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0]
        preds.append(pred)
        refs.append(row["transcript"])

    return wer_metric.compute(predictions=preds, references=refs) * 100


In [None]:
# Fine-tune `model` for a fixed number of epochs, measuring validation WER
# after each one.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 10
wer_list = []  # validation WER (%) per finished epoch

for epoch in range(epochs):
    # evaluate_model() leaves the model in eval mode, so re-enable training
    # mode at the start of every epoch.
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in pbar:
        input_features = batch["input_features"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the forward pass return the training loss.
        outputs = model(
            input_features=input_features,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        pbar.set_postfix({"loss": loss.item()})

    # Evaluate on the validation split after each epoch
    W = evaluate_model(model, val_df)
    wer_list.append(W)
    print(f"Epoch {epoch+1} WER = {W:.2f}%")

    # Redraw the WER curve with every epoch completed so far.
    plt.plot(wer_list)
    plt.xlabel("Epoch")
    plt.ylabel("WER (%)")
    plt.title("Validation WER (Whisper-Tiny Hindi)")
    plt.show()


In [None]:
model.save_pretrained("whisper-tiny-hindi_2")
processor.save_pretrained("whisper-tiny-hindi_2")


In [None]:
import shutil

# Path to your saved model
MODEL_DIR = "whisper-tiny-hindi_2"

# Create a zip of the model directory. make_archive() appends ".zip" to
# base_name and returns the path it actually wrote, so report that value
# rather than a separately hard-coded name that can drift out of sync.
ZIP_PATH = shutil.make_archive(base_name="whisper-tiny-hindi", format="zip", root_dir=MODEL_DIR)

print(f"Saved zipped model at {ZIP_PATH}")


In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
import soundfile as sf
import librosa
import os

device = "cuda" if torch.cuda.is_available() else "cpu"

# The fine-tuned model and processor were written with
# save_pretrained("whisper-tiny-hindi_2"); load that same directory.
# NOTE(review): assumes the notebook's working directory is /kaggle/working.
model_path = "/kaggle/working/whisper-tiny-hindi_2"

processor = WhisperProcessor.from_pretrained(model_path)
tokenizer = processor.tokenizer
feature_extractor = processor.feature_extractor

model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device)
model.eval()

print("Loaded fine-tuned Whisper-tiny Hindi model!")


In [None]:
def transcribe_audio(path):
    """Transcribe one audio file to Hindi text with the module-level model.

    Args:
        path: Audio file readable by ``soundfile``.

    Returns:
        The decoded transcript, special tokens removed.
    """
    audio, sr = sf.read(path)

    if sr != 16000:
        # librosa >= 0.10 takes the sample rates as keywords only; the
        # positional form raises TypeError.
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

    inputs = feature_extractor(
        audio, sampling_rate=16000, return_tensors="pt"
    ).input_features.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs,
            language="hi",
            task="transcribe"
        )

    text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return text


In [None]:
test_folder = "/kaggle/input/whisper-hindi1/wavs_16k"
pred_results = []

for f in sorted(os.listdir(test_folder)):
    if f.endswith(".wav"):
        full_path = os.path.join(test_folder, f)
        text = transcribe_audio(full_path)
        pred_results.append([f, text])
        print(f"{f} ‚Üí {text}")


In [None]:
import os
import torch
import soundfile as sf
import librosa
import pandas as pd
from transformers import WhisperForConditionalGeneration, WhisperProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ---------------------------------------------------------------
# LOAD FINE-TUNED MODEL
# ---------------------------------------------------------------
# The fine-tuned model and processor were written with
# save_pretrained("whisper-tiny-hindi_2"); load that same directory.
# NOTE(review): assumes the notebook's working directory is /kaggle/working.
fine_tuned_path = "/kaggle/working/whisper-tiny-hindi_2"

processor_ft = WhisperProcessor.from_pretrained(fine_tuned_path)
tokenizer_ft = processor_ft.tokenizer
feature_extractor_ft = processor_ft.feature_extractor

model_ft = WhisperForConditionalGeneration.from_pretrained(fine_tuned_path).to(device)
model_ft.eval()

print("Loaded FINE-TUNED Whisper Tiny Hindi!")


# ---------------------------------------------------------------
# LOAD BASELINE MODEL (not fine-tuned in this notebook)
# ---------------------------------------------------------------
# Baseline is collabora/whisper-tiny-hindi, a Hindi-only fine-tuned Whisper Tiny
processor_base = WhisperProcessor.from_pretrained("collabora/whisper-tiny-hindi")
tokenizer_base = processor_base.tokenizer
feature_extractor_base = processor_base.feature_extractor

model_base = WhisperForConditionalGeneration.from_pretrained("collabora/whisper-tiny-hindi").to(device)
model_base.eval()

print("Loaded BASELINE Whisper Tiny Hindi (non fine‚Äëtuned)")



# ---------------------------------------------------------------
# COMMON TRANSCRIPTION FUNCTION
# ---------------------------------------------------------------
def transcribe(path, model, processor, tokenizer):
    """Transcribe one audio file to Hindi text with the given model.

    Args:
        path: Audio file readable by ``soundfile``.
        model: Model whose ``generate`` produces token ids.
        processor: Processor whose ``feature_extractor`` builds the inputs.
        tokenizer: Tokenizer used to decode the generated ids.

    Returns:
        The decoded transcript, special tokens removed.
    """
    audio, sr = sf.read(path)

    if sr != 16000:
        # librosa >= 0.10 takes the sample rates as keywords only; the
        # positional form raises TypeError.
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

    inputs = processor.feature_extractor(
        audio,
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs,
            language="hi",
            task="transcribe"
        )

    text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return text


# ---------------------------------------------------------------
# RUN BOTH MODELS ON TEST SET
# ---------------------------------------------------------------
test_folder = "/kaggle/input/whisper-hindi1/wavs_16k"

results = []

print("\n========= Running Predictions =========\n")

for idx, f in enumerate(sorted(os.listdir(test_folder))):
    if not f.endswith(".wav"):
        continue

    full_path = os.path.join(test_folder, f)

    pred_ft = transcribe(full_path, model_ft, processor_ft, tokenizer_ft)
    pred_base = transcribe(full_path, model_base, processor_base, tokenizer_base)

    # PRINT ONLY FIRST 50 FILES
    if idx < 50:
        print(f"\nFile: {f}")
        print("Fine-tuned ‚Üí ", pred_ft)
        print("Baseline   ‚Üí ", pred_base)

    # SAVE ALL FILES
    results.append([f, pred_ft, pred_base])



# ---------------------------------------------------------------
# SAVE COMPARISON CSV
# ---------------------------------------------------------------
df = pd.DataFrame(results, columns=["filename", "fine_tuned", "baseline"])
df.to_csv("/kaggle/working/whisper_hindi_comparison.csv", index=False)

print("\nSaved comparison CSV at: /kaggle/working/whisper_hindi_comparison.csv")


# Collabora model fine-tuning

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import librosa
import soundfile as sf
from transformers import WhisperProcessor, WhisperForConditionalGeneration


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
model_name = "collabora/whisper-tiny-hindi"

processor = WhisperProcessor.from_pretrained(
    model_name,
    language="hi",
    task="transcribe"
)

model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.to(device)

# Force Hindi decoding
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="hi",
    task="transcribe"
)
model.config.suppress_tokens = []


In [None]:
class HindiASRDataset(Dataset):
    """Map-style dataset producing Whisper inputs with unpadded labels.

    Each item is built from one dataframe row whose ``path`` column names an
    audio file and whose ``transcript`` column holds the target text, using
    the module-level ``processor``. Labels have different lengths per item, so
    batches need a padding collate function.
    """

    def __init__(self, df):
        """Store the dataframe to read rows from."""
        self.df = df

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"input_features", "labels"}`` for row *idx*."""
        row = self.df.iloc[idx]
        audio, sr = sf.read(row["path"])

        if sr != 16000:
            # librosa >= 0.10 takes the sample rates as keywords only; the
            # positional form raises TypeError.
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        input_features = processor.feature_extractor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features[0]

        labels = processor.tokenizer(
            row["transcript"],
            return_tensors="pt"
        ).input_ids[0]

        return {
            "input_features": input_features,
            "labels": labels
        }


In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Stack a list of dataset items into one training batch.

    Args:
        batch: Items with an ``input_features`` tensor (same shape for every
            item) and a 1-D ``labels`` tensor (any length).

    Returns:
        Dict with stacked ``input_features`` and ``labels`` right-padded with
        -100, the value the loss ignores.
    """
    input_features = torch.stack(
        [item["input_features"] for item in batch]
    )

    # Pad directly with -100. Padding with the tokenizer's pad id and then
    # masking every position equal to that id also masks genuine tokens that
    # share the id. Whisper tokenizers commonly reuse the end-of-text token as
    # the pad token, so that approach hides the real end-of-text label and the
    # model is never trained to stop generating.
    labels = pad_sequence(
        [item["labels"] for item in batch],
        batch_first=True,
        padding_value=-100
    )

    return {
        "input_features": input_features,
        "labels": labels
    }


In [None]:
# Use the dataset defined for this model. HindiASRDataset reads the Collabora
# `processor` and returns variable-length labels, which is what collate_fn
# pads. HindiWhisperDataset instead depends on the module-level
# `feature_extractor`/`tokenizer`, which were bound to a different model in an
# earlier section.
train_dataset = HindiASRDataset(train_df)
val_dataset   = HindiASRDataset(val_df)

train_loader = DataLoader(
    train_dataset,
    batch_size=8,      # safer for Kaggle
    shuffle=True,
    collate_fn=collate_fn
)


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 10
wer_list = []


In [None]:
# Fine-tune the Collabora model for `epochs` epochs, measuring validation WER
# after each one.
for epoch in range(epochs):
    # Validation below switches to eval mode, so re-enable training mode at
    # the start of every epoch.
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in pbar:
        input_features = batch["input_features"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the forward pass return the training loss.
        outputs = model(
            input_features=input_features,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        pbar.set_postfix({"loss": loss.item()})

    # -------- VALIDATION (WER) --------
    # NOTE(review): evaluate_model() uses the module-level `feature_extractor`
    # and `tokenizer`, not `processor`; confirm they belong to this model.
    model.eval()
    W = evaluate_model(model, val_df)   # same function as in the Whisper-Tiny section
    wer_list.append(W)

    print(f"Epoch {epoch+1} WER = {W:.2f}%")

    # New figure per epoch showing the WER of all epochs so far (1-based x axis).
    plt.figure()
    plt.plot(range(1, len(wer_list) + 1), wer_list, marker="o")
    plt.xlabel("Epoch")
    plt.ylabel("WER (%)")
    plt.title("Validation WER (Collabora Whisper-Tiny Hindi)")
    plt.grid(True)
    plt.show()


In [None]:
save_path = "/kaggle/working/collabora-whisper-tiny-hindi-ft"

model.save_pretrained(save_path)
processor.save_pretrained(save_path)

print("Model saved at:", save_path)


In [None]:
import shutil

# Path to your saved model
MODEL_DIR = "/kaggle/working/collabora-whisper-tiny-hindi-ft"
ZIP_PATH = "/kaggle/working/collabora-whisper-tiny-hindi-ft.zip"

# Create a zip
shutil.make_archive(base_name="/kaggle/working/collabora-whisper-tiny-hindi-ft", format="zip", root_dir=MODEL_DIR)

print(f"Saved zipped model at {ZIP_PATH}")


In [None]:
import os
import torch
import librosa
import soundfile as sf
import pandas as pd
from transformers import WhisperProcessor, WhisperForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

ft_path = "/kaggle/working/collabora-whisper-tiny-hindi-ft"

processor_ft = WhisperProcessor.from_pretrained(ft_path)
model_ft = WhisperForConditionalGeneration.from_pretrained(ft_path).to(device)
model_ft.eval()

# Force Hindi
model_ft.config.forced_decoder_ids = processor_ft.get_decoder_prompt_ids(
    language="hi",
    task="transcribe"
)
model_ft.config.suppress_tokens = []

print("Loaded FINE-TUNED Collabora model")

base_model_name = "collabora/whisper-tiny-hindi"

processor_base = WhisperProcessor.from_pretrained(
    base_model_name,
    language="hi",
    task="transcribe"
)

model_base = WhisperForConditionalGeneration.from_pretrained(
    base_model_name
).to(device)
model_base.eval()

# Force Hindi
model_base.config.forced_decoder_ids = processor_base.get_decoder_prompt_ids(
    language="hi",
    task="transcribe"
)
model_base.config.suppress_tokens = []

print("Loaded BASELINE Collabora model")


In [None]:
def transcribe(audio_path, model, processor):
    """Transcribe one audio file with the given model and processor.

    Args:
        audio_path: Audio file readable by ``soundfile``.
        model: Model whose ``generate`` produces token ids.
        processor: Processor providing ``feature_extractor`` and ``tokenizer``.

    Returns:
        The decoded transcript, special tokens removed.
    """
    audio, sr = sf.read(audio_path)

    if sr != 16000:
        # librosa >= 0.10 takes the sample rates as keywords only; the
        # positional form raises TypeError.
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

    inputs = processor.feature_extractor(
        audio,
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(inputs)

    text = processor.tokenizer.batch_decode(
        predicted_ids,
        skip_special_tokens=True
    )[0]

    return text


In [None]:
test_folder = "/kaggle/input/whisper-hindi1/wavs_16k"

results = []

print("\n===== Running Collabora Model Comparison =====\n")

files = sorted(os.listdir(test_folder))

for idx, file in enumerate(tqdm(files, desc="Transcribing audio files")):
    if not file.endswith(".wav"):
        continue

    path = os.path.join(test_folder, file)

    pred_ft = transcribe(path, model_ft, processor_ft)
    pred_base = transcribe(path, model_base, processor_base)

    # PRINT ONLY FIRST 5 FILES
    if idx < 5:
        print(f"\nFile: {file}")
        print("Fine-tuned :", pred_ft)
        print("Baseline  :", pred_base)

    # SAVE ALL FILES
    results.append([file, pred_ft, pred_base])


df = pd.DataFrame(
    results,
    columns=["filename", "fine_tuned_collabora", "baseline_collabora"]
)

csv_path = "/kaggle/working/collabora_finetune_vs_baseline.csv"
df.to_csv(csv_path, index=False)

print("\nSaved comparison CSV at:", csv_path)


# Compute WER from CSV

In [None]:
!pip install jiwer -q
import pandas as pd
from jiwer import wer
import re


In [None]:
pred_df = pd.read_csv("/kaggle/working/collabora_finetune_vs_baseline.csv")
ref_df  = pd.read_csv("/kaggle/input/metadata-clean-header/metadata-clean-header.csv")


In [None]:
print(pred_df.columns)
print(ref_df.columns)


In [None]:
ref_df["filename"] = ref_df["Audio_no"]
ref_df = ref_df.rename(columns={"Transcript": "reference"})
ref_df.head()

In [None]:
df = pred_df.merge(
    ref_df[["filename", "reference"]],
    on="filename",
    how="inner"
)


In [None]:
df.head()

In [None]:
def normalize(text):
    """Reduce *text* to space-separated Devanagari words for WER scoring.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        Text containing only Devanagari characters and single spaces, with
        sentence-ending danda marks removed.
    """
    text = str(text).lower()
    # The danda and double danda (U+0964, U+0965) lie inside the Devanagari
    # block, so the filter below would keep them glued to the previous word
    # and every sentence-final word would count as an error. Replace them
    # with a space so words separated only by a danda stay separate.
    text = re.sub(r"[\u0964\u0965]", " ", text)
    text = re.sub(r"[^\u0900-\u097F\s]", "", text)  # keep Hindi only
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [None]:
df["ref_norm"] = df["reference"].apply(normalize)
df["baseline_norm"] = df["baseline_collabora"].apply(normalize)
df["finetuned_norm"] = df["fine_tuned_collabora"].apply(normalize)


In [None]:
df.head()

In [None]:
df["baseline_wer"] = df.apply(
    lambda x: wer(x["ref_norm"], x["baseline_norm"]),
    axis=1
)

df["finetuned_wer"] = df.apply(
    lambda x: wer(x["ref_norm"], x["finetuned_norm"]),
    axis=1
)


In [None]:
baseline_avg = df["baseline_wer"].mean() * 100
finetuned_avg = df["finetuned_wer"].mean() * 100

print(f"Baseline WER   : {baseline_avg:.2f}%")
print(f"Fine-tuned WER : {finetuned_avg:.2f}%")


In [None]:
df["improved"] = df["finetuned_wer"] < df["baseline_wer"]

print("Improved files:", df["improved"].sum())
print("Total files   :", len(df))


In [None]:
final_csv = "/kaggle/working/collabora_wer_evaluation.csv"
df.to_csv(final_csv, index=False)

print("Saved full evaluation CSV at:", final_csv)


In [None]:
improved_pct = (df["improved"].sum() / len(df)) * 100
print(f"Fine-tuned model performed better on {improved_pct:.2f}% of files")


In [None]:
df = df[df["ref_norm"].str.len() > 0]
df = df[df["ref_norm"].str.split().apply(len) >= 3]


In [None]:
df.sort_values("finetuned_wer", ascending=False).head(49)[
    ["filename", "reference", "baseline_collabora", "fine_tuned_collabora",
     "baseline_wer", "finetuned_wer"]
]


In [None]:
df["baseline_wer"].mean()


In [None]:
from jiwer import wer

baseline_corpus_wer = wer(
    list(df["ref_norm"]),
    list(df["baseline_norm"])
) * 100

finetuned_corpus_wer = wer(
    list(df["ref_norm"]),
    list(df["finetuned_norm"])
) * 100

print(f"Baseline CORPUS WER   : {baseline_corpus_wer:.2f}%")
print(f"Fine-tuned CORPUS WER : {finetuned_corpus_wer:.2f}%")


# Compare Whisper fine-tune and Collabora fine-tune

In [None]:
# Whisper fine-tuned predictions
whisper_df = pd.read_csv("/kaggle/working/whisper_hindi_comparison.csv")

# Collabora fine-tuned predictions
collabora_df = pd.read_csv("/kaggle/working/collabora_finetune_vs_baseline.csv")

# Reference
ref_df = pd.read_csv("/kaggle/input/metadata-clean-header/metadata-clean-header.csv")




In [None]:
ref_df["filename"] = ref_df["Audio_no"]

ref_df = ref_df.rename(columns={"Transcript": "reference"})


In [None]:
whisper_df = whisper_df[["filename", "fine_tuned"]]
collabora_df = collabora_df[["filename", "fine_tuned_collabora"]]
ref_df = ref_df[["filename", "reference"]]


In [None]:
df = ref_df \
    .merge(whisper_df, on="filename", how="inner") \
    .merge(collabora_df, on="filename", how="inner")


In [None]:
df.head()

In [None]:
import re

def normalize(text):
    """Reduce *text* to space-separated Devanagari words, minus fillers.

    Punctuation (including the danda) is removed, everything outside the
    Devanagari block is dropped, and a short list of filler words is deleted
    when it appears as a whole whitespace-delimited word.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The normalised text with single spaces and no leading/trailing space.
    """
    text = str(text).lower()

    # Remove Hindi danda / double danda and punctuation. The code points are
    # written as escapes so they survive any re-encoding of this file.
    text = re.sub(r"[\u0964\u0965|,!?;:]", " ", text)

    # Keep only Hindi characters and spaces
    text = re.sub(r"[^\u0900-\u097F\s]", "", text)

    # Remove common fillers (optional): "haan" (two spellings), "a", "an",
    # "hoon", "hai". Whitespace lookarounds are used instead of \b because
    # Devanagari vowel signs are not word characters for `re`: \b would match
    # a filler inside a longer word and miss one that ends in a vowel sign.
    text = re.sub(
        r"(?<!\S)(?:\u0939\u093e\u0902|\u0939\u093e\u0901|\u0905\u0902|\u0905"
        r"|\u0939\u0942\u0901|\u0939\u0948)(?!\S)",
        "",
        text,
    )

    # Normalize spaces
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [None]:
df["ref_norm"] = df["reference"].apply(normalize)
df["whisper_norm"] = df["fine_tuned"].apply(normalize)
df["collabora_norm"] = df["fine_tuned_collabora"].apply(normalize)


In [None]:
df = df[df["ref_norm"].str.split().apply(len) >= 3]


In [None]:
whisper_corpus_wer = wer(
    list(df["ref_norm"]),
    list(df["whisper_norm"])
) * 100

collabora_corpus_wer = wer(
    list(df["ref_norm"]),
    list(df["collabora_norm"])
) * 100

print(f"Whisper Fine-tuned WER    : {whisper_corpus_wer:.2f}%")
print(f"Collabora Fine-tuned WER  : {collabora_corpus_wer:.2f}%")


In [None]:
df.head()

In [None]:
def truncate_to_ref(pred, ref):
    """Keep at most as many leading words of *pred* as *ref* contains."""
    limit = len(ref.split())
    kept = pred.split()[:limit]
    return " ".join(kept)


In [None]:
df["collabora_trunc"] = df.apply(
    lambda x: truncate_to_ref(x["collabora_norm"], x["ref_norm"]),
    axis=1
)

df["whisper_trunc"] = df.apply(
    lambda x: truncate_to_ref(x["whisper_norm"], x["ref_norm"]),
    axis=1
)


df.head()

In [None]:
from jiwer import wer

whisper_corpus_wer = wer(
    list(df["ref_norm"]),
    list(df["whisper_trunc"])
) * 100

collabora_corpus_wer = wer(
    list(df["ref_norm"]),
    list(df["collabora_trunc"])
) * 100

print(f"Whisper Fine-tuned CORPUS WER    : {whisper_corpus_wer:.2f}%")
print(f"Collabora Fine-tuned CORPUS WER  : {collabora_corpus_wer:.2f}%")


In [None]:
df.sort_values("collabora_norm", ascending=False).head(3)[
    ["reference", "whisper_norm", "collabora_norm"]
]


# Post-processing to remove over-generation

In [None]:
def remove_fillers(text):
    """Cut *text* at the first connective phrase that signals over-generation.

    Everything from the earliest marker phrase onwards is discarded. A marker
    at the very start of the text is ignored, because cutting there would
    erase the whole prediction.

    Args:
        text: Predicted transcript.

    Returns:
        The kept prefix, stripped of surrounding whitespace.
    """
    # Written as escapes so the phrases survive any re-encoding of this file.
    markers = (
        "\u0914\u0930 \u092e\u0948\u0902",               # "aur main"
        "\u0914\u0930 \u092b\u093f\u0930",               # "aur phir"
        "\u091c\u093f\u0938\u092e\u0947\u0902",          # "jisme"
        "\u0907\u0938\u0915\u0947 \u092c\u093e\u0926",   # "iske baad"
        "\u092f\u0939\u093e\u0901 \u092a\u0930",         # "yahan par"
        "\u0907\u0938 \u0924\u0930\u0939 \u0938\u0947",  # "is tarah se"
    )

    text = text.strip()
    cut = len(text)
    for marker in markers:
        # Search from index 1 so a marker at position 0 never triggers a cut.
        pos = text.find(marker, 1)
        if pos != -1:
            cut = min(cut, pos)
    return text[:cut].strip()
df["collabora_pp"] = df["collabora_norm"].apply(remove_fillers)


In [None]:
def stop_on_repetition(text, max_repeats=2):
    """Truncate *text* just before a word would exceed its allowed count.

    Args:
        text: Whitespace-separated words.
        max_repeats: How many times a word may appear before the text is cut.
            The default of 2 cuts at a word's third occurrence.

    Returns:
        The kept words joined by single spaces.
    """
    seen = {}   # word -> number of times already kept
    kept = []
    for word in text.split():
        # A dict lookup replaces list.count(), which rescanned the kept words
        # for every input word.
        if seen.get(word, 0) >= max_repeats:
            break
        seen[word] = seen.get(word, 0) + 1
        kept.append(word)
    return " ".join(kept)


In [None]:
def truncate_to_reference(pred, ref):
    """Return the leading words of *pred*, capped at the word count of *ref*."""
    budget = len(ref.split())
    words = []
    for word in pred.split():
        if len(words) == budget:
            break
        words.append(word)
    return " ".join(words)

df["collabora_pp"] = df.apply(
    lambda x: truncate_to_reference(x["collabora_norm"], x["ref_norm"]),
    axis=1
)


def post_process(pred, ref=None):
    """Normalise *pred*, cut it at filler phrases and optionally cap its length.

    When *ref* is given, the result is additionally truncated to the
    reference's word count.
    """
    cleaned = remove_fillers(normalize(pred))
    if ref is None:
        return cleaned
    return truncate_to_reference(cleaned, ref)
df["collabora_pp"] = df.apply(
    lambda x: post_process(x["collabora_norm"], x["ref_norm"]),
    axis=1
)


In [None]:
from jiwer import wer

collabora_pp_wer = wer(
    list(df["ref_norm"]),
    list(df["collabora_pp"])
) * 100

print(f"Collabora Post-Processed CORPUS WER: {collabora_pp_wer:.2f}%")
