We will use laurefindele-o-catto/whisperX 

whisper model - tsugitsugi converted
alignment model - sohan.wav2vec2 - from dl sprint 22 

In [None]:
# Clone YOUR modified WhisperX repo with Bengali alignment support
!git clone https://github.com/laurefindele-o-catto/whisperX
%cd whisperX


# Install your modified WhisperX
!pip install -e .

# Install other dependencies
!pip install transformers torch torchaudio
!pip install huggingface_hub pandas numpy librosa soundfile
!pip install accelerate evaluate jiwer tqdm

print("All dependencies installed!")
print("Using YOUR modified WhisperX with Bengali wav2vec2 alignment")

**RESTART HERE!**

**Optimzed Environment**

In [1]:
# Handle cuDNN loading issues for GPU optimization
import os

original_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
cudnn_paths = [
    "/opt/conda/lib/python3.10/site-packages/nvidia/cudnn/lib/",
    "/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/lib/",
    "/usr/local/cuda/lib64/"
]

for cudnn_path in cudnn_paths:
    if os.path.exists(cudnn_path):
        os.environ['LD_LIBRARY_PATH'] = original_ld_path + ":" + cudnn_path
        print(f"cuDNN path added: {cudnn_path}")
        break

# Suppress warnings for cleaner output
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
print("Environment optimized!")

cuDNN path added: /usr/local/cuda/lib64/
Environment optimized!


**Import and Setup**

In [2]:
import whisperx
import torch
import torchaudio
import numpy as np
import pandas as pd
import os
import gc
import glob
from tqdm.auto import tqdm
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
else:
    print("Running on CPU - will be slower but functional")

print(f"PyTorch Version: {torch.__version__}")

Using device: cuda
GPU: Tesla P100-PCIE-16GB
GPU Memory: 17.1 GB
CUDA Version: 12.6
PyTorch Version: 2.8.0+cu126


**Configs and Paths**

In [4]:
# Your Custom Bengali Whisper Model (CTranslate2 format on HuggingFace)
BENGALI_WHISPER_MODEL = "pawmeow/whisper-tugstugi-bengali-ct2"


# Data Paths - UPDATE THESE WITH ACTUAL PATHS
TRAIN_AUDIO_PATH = "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/train/audio"
TRAIN_ANNOTATION_PATH = "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/train/annotation"
TEST_AUDIO_PATH = "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/test/audio"

# Output Configuration
OUTPUT_CSV_PATH = "submission.csv"

# Processing Configuration
CONFIG = {
    'batch_size': 1 if device == "cuda" else 1,
    'compute_type': "float32" if device == "cuda" else "int8",
    'chunk_size': 30,
    'use_alignment': True, 
    #'beam_size': 5, default 5, higher increases accuracy
    'language': 'bn',
    'max_audio_length': None,
    'vad_onset': 0.500,
    'vad_offset': 0.363,
}

print("Configuration:")
print(f"Whisper Model: {BENGALI_WHISPER_MODEL}")
print(f"Alignment: Bengali wav2vec2 (from alignment.py)")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Verify paths exist
paths_to_check = {
    "Train Audio": TRAIN_AUDIO_PATH,
    "Train Annotations": TRAIN_ANNOTATION_PATH,
    "Test Audio": TEST_AUDIO_PATH
}

print("\nPath Verification:")
for name, path in paths_to_check.items():
    if os.path.exists(path):
        count = len([f for f in os.listdir(path) if f.endswith('.wav' if 'audio' in name.lower() else '.txt')])
        print(f"{name}: {path} ({count} files)")
    else:
        print(f"{name}: {path} (UPDATE THIS PATH!)")

Configuration:
Whisper Model: pawmeow/whisper-tugstugi-bengali-ct2
Alignment: Bengali wav2vec2 (from alignment.py)
  batch_size: 1
  compute_type: float32
  chunk_size: 30
  use_alignment: True
  language: bn
  max_audio_length: None
  vad_onset: 0.5
  vad_offset: 0.363

Path Verification:
Train Audio: /kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/train/audio (113 files)
Train Annotations: /kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/train/annotation (113 files)
Test Audio: /kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition/transcription/transcription/test/audio (24 files)


**Model**

In [27]:
print("ü§ñ Loading the Custom Bengali Whisper Model...")


from omegaconf import ListConfig
from omegaconf.base import ContainerMetadata, Metadata
from omegaconf.nodes import AnyNode
from typing import Any, List, Dict, Optional
from collections import defaultdict
import torch
from torch.torch_version import TorchVersion
from pyannote.audio.core.model import Introspection
from pyannote.audio.core.task import Specifications, Problem, Resolution

torch.serialization.add_safe_globals([
    ListConfig,ContainerMetadata,Metadata,
    AnyNode,    Any,
    List,    Dict,
    Optional,    list,   
    dict,  tuple, set,
    defaultdict,   # collections.defaultdict
    TorchVersion, Introspection, Specifications, Problem, Resolution
])


whisper_model = None
try:
    whisper_model = whisperx.load_model(
        BENGALI_WHISPER_MODEL,
        device=device,
        compute_type=CONFIG['compute_type'],
        language=CONFIG['language']
    )
    print(f"Custom Bengali Whisper model loaded: {BENGALI_WHISPER_MODEL}")
except Exception as e:
    print(f"‚ùå Error loading custom model: {e}")
    # whisper_model = whisperx.load_model(
    #     "large-v2",
    #     device=device,
    #     compute_type=CONFIG['compute_type'],
    #     language=CONFIG['language']
    # )


ü§ñ Loading the Custom Bengali Whisper Model...
2026-02-10 18:42:37 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint whisperX/whisperx/assets/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu126. Bad things might happen unless you revert torch to 1.x.
‚ùå Error loading custom model: Inference.__init__() got an unexpected keyword argument 'token'
Falling back to standard large-v2...


**Alignment Model**

In [None]:
alignment_model = None
metadata = None
alignment_available = False

if CONFIG['use_alignment']:
    try:
        alignment_model, metadata = whisperx.load_align_model(
            language_code=CONFIG['language'],
            device=device
        )
        alignment_available = True
        print(f"Bengali Alignment Model loaded")
    except Exception as e:
        print(f"Bengali alignment model unavailable: {e}")
        print("Proceeding with segment-level timestamps only")
        alignment_available = False
        CONFIG['use_alignment'] = False
else:
    print("üìù Alignment disabled in configuration")


**Audio Processing Pipeline**

In [None]:
def transcribe_bengali_audio(audio_path, use_alignment=True):
    try:
        # Step 1: Load audio
        print(f"Loading: {os.path.basename(audio_path)}")
        audio = whisperx.load_audio(audio_path)
        
        audio_duration = len(audio) / 16000

        #add preprocessing here
        
        if CONFIG['max_audio_length'] is not None and audio_duration > CONFIG['max_audio_length']:
            print(f"Audio too long ({audio_duration:.1f}s), truncating to {CONFIG['max_audio_length']}s")
            audio = audio[:CONFIG['max_audio_length'] * 16000]
        else:
            print(f"üìä Audio duration: {audio_duration:.1f}s")
        
        # Step 2: Transcribe with custom Bengali Whisper model

        result = whisper_model.transcribe(
            audio,
            batch_size=CONFIG['batch_size'],
            language=CONFIG['language'],
            chunk_size=CONFIG['chunk_size'],
            print_progress=False
        )
        
        # Step 3: Apply WhisperX's built-in alignment (uses your wav2vec2 from alignment.py)
        
        if use_alignment and alignment_available and alignment_model is not None:
            try:
                result = whisperx.align(
                    result["segments"],
                    alignment_model,
                    metadata,
                    audio,
                    device,
                    return_char_alignments=False,
                    print_progress=False
                )
            except Exception as align_error:
                print(f"Alignment failed: {align_error}")
        
        # Step 4: Extract transcription text
        full_transcription = ""
        
        if "segments" in result and result["segments"]:
            segment_texts = []
            for segment in result["segments"]:
                if "text" in segment and segment["text"].strip():
                    segment_texts.append(segment["text"].strip())
            full_transcription = " ".join(segment_texts)
        
        # Step 5: Clean up transcription
        if full_transcription:
            full_transcription = " ".join(full_transcription.split()).strip()
        
        print(f"Transcription complete ({len(full_transcription)} chars)")
        return full_transcription
        
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return ""

print("Audio transcription pipeline ready!")

**Check Test Files**

In [None]:
test_files = []
if os.path.exists(TEST_AUDIO_PATH):
    # Find all .wav files in test directory
    wav_files = glob.glob(os.path.join(TEST_AUDIO_PATH, "*.wav"))
    test_files = [os.path.basename(f) for f in wav_files]
    test_files.sort()  # Sort for consistent processing order
    
    print(f"Found {len(test_files)} test audio files")
    
    # Show first few files as preview
    print(f"\nFirst 5 test files:")
    for i, fname in enumerate(test_files[:5]):
        fpath = os.path.join(TEST_AUDIO_PATH, fname)
        if os.path.exists(fpath):
            # Get file size
            size_mb = os.path.getsize(fpath) / (1024*1024)
            print(f"  {i+1:2d}. {fname} ({size_mb:.1f} MB)")
        else:
            print(f"  {i+1:2d}. {fname} (file not found)")
    
    if len(test_files) > 5:
        print(f"     ... and {len(test_files) - 5} more files")
        
else:
    print(f"Test audio directory not found: {TEST_AUDIO_PATH}")
    print("Please update TEST_AUDIO_PATH in the configuration section")
    

if len(test_files) == 0:
    print("No test files found! Please check your path configuration.")

**Inference Loop**

In [None]:

if len(test_files) == 0:
    print("No test files found. Please check your paths and rerun.")
else:
    results = []
    processed_count = 0
    error_count = 0
    
    for idx, test_filename in enumerate(tqdm(test_files, desc="Processing audio files")):
        try:
            audio_file_path = os.path.join(TEST_AUDIO_PATH, test_filename)
            
            if not os.path.exists(audio_file_path):
                print(f"File not found: {audio_file_path}")
                results.append({
                    "filename": test_filename,
                    "transcription": ""
                })
                error_count += 1
                continue
            
            # Transcribe the audio file
            transcription = transcribe_bengali_audio(
                audio_file_path, 
                use_alignment=CONFIG['use_alignment']
            )
            
            # Store result
            results.append({
                "filename": test_filename,
                "transcription": transcription
            })
            
            processed_count += 1
            
            # Show progress and sample result
            if idx == 0:  # Show first result as example
                print(f"\nFirst Sample:")
                print(f"   Text: {transcription[:100]}{'...' if len(transcription) > 100 else ''}")
            
            # Memory cleanup every 10 files
            if (idx + 1) % 10 == 0:
                print(f"Memory cleanup... ({idx + 1}/{len(test_files)} processed)")
                gc.collect()
                if device == "cuda":
                    torch.cuda.empty_cache()
            
        except KeyboardInterrupt:
            print("\nProcessing interrupted by user")
            break
            
        except Exception as e:
            print(f"Error processing {test_filename}: {e}")
            results.append({
                "filename": test_filename,
                "transcription": ""
            })
            error_count += 1
            continue
    
    # Final processing summary
    print(f"Successfully processed: {processed_count}/{len(test_files)}")
    print(f"Errors encountered: {error_count}")

**Submission CSV**

In [None]:
if len(test_files) > 0 and 'results' in locals():
    print("üìù Creating competition submission file...")
    
    submission_df = pd.DataFrame(results)
    
    # Ensure correct column names for competition
    submission_df.columns = ['filename', 'transcription']
    
    # Show submission preview
    print("Submission Preview:")
    print(submission_df.head(10))
    
    submission_df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')
    
    print(f"\nSubmission saved to: {OUTPUT_CSV_PATH}")
        
else:
    print("Error.")