In [None]:
!pip install whisper_timestamped

Collecting whisper_timestamped
  Downloading whisper_timestamped-1.15.9-py3-none-any.whl.metadata (1.4 kB)
Collecting dtw-python (from whisper_timestamped)
  Downloading dtw_python-1.7.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (7.5 kB)
Collecting openai-whisper (from whisper_timestamped)
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton>=2 (from openai-whisper->whisper_timestamped)
  Downloading triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Downloading whisper_timestamped-1.15.9-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[

In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Set
import requests
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Audio processing
from pydub import AudioSegment
from pydub.silence import detect_silence

# ASR with timestamps
import whisper_timestamped as whisper

In [None]:
class HindiDisfluencyDetectorV2:
    def __init__(self, disfluency_list_path: str = "/content/Speech Disfluencies List - Sheet1.csv"):
        self.load_disfluency_patterns(disfluency_list_path)
        self.whisper_model = None

        self.min_filler_duration = 0.15
        self.min_prolongation_duration = 0.25
        self.min_repetition_duration = 0.30
        self.min_hesitation_duration = 0.40
        self.min_false_start_duration = 0.15

        self.min_confidence = 0.60

        self.max_repetition_gap = 0.5

        self.common_words = {
            'जी', 'हाँ', 'नहीं', 'है', 'था', 'थे', 'हम', 'को', 'जो',
            'के', 'ने', 'तो', 'पर', 'से', 'में', 'और', 'या', 'भी',
            'अच्छा', 'ठीक', 'सही', 'अरे', 'अच्छी', 'पूरी', 'पराठा'
        }

        self.known_fillers = {
            'उम्म', 'हम्म', 'अम्म', 'एम्म', 'मतलब', 'जैसे', 'वैसे',
            'यानी', 'क्या', 'अरे', 'हाँ', 'तो', 'वो', 'ये', 'उह',
            'आह', 'ओह', 'एह', 'इह'
        }

    def load_disfluency_patterns(self, csv_path: str):
        df = pd.read_csv(csv_path)

        self.fillers = self._extract_patterns(df['Filled Pause'])
        self.repetitions = self._extract_patterns(df['Repetition'])
        self.false_starts = self._extract_patterns(df['False Start'])
        self.prolongations = self._extract_patterns(df['Prolongation'])
        self.self_corrections = self._extract_patterns(df['Self-Correction'])

        print(f"✓ Loaded {len(self.fillers)} filler patterns")

    def _extract_patterns(self, series: pd.Series) -> List[str]:
        return [str(x).strip() for x in series.dropna() if str(x).strip()]

    def load_whisper_model(self, model_size: str = "medium"):
        """Load a Whisper model and keep it on ``self.whisper_model``.

        Args:
            model_size: Model name forwarded to ``whisper.load_model``.
        """
        print(f"Loading Whisper {model_size} model...")
        loaded_model = whisper.load_model(model_size)
        self.whisper_model = loaded_model
        print("✓ Whisper model loaded")

    def transcribe_with_timestamps(self, audio_path: str) -> Dict:
        if self.whisper_model is None:
            self.load_whisper_model()

        audio_path = os.path.abspath(audio_path)
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        initial_prompt = (
            "उम्म... मतलब... जैसे... तो... हम्म... मैं... मैं... "
            "क्या... क्या बोल रहे थे... अरे... हाँ... सोoooo... "
            "अच्छ्छ्छा... वो... वो... ये... ये..."
        )

        try:
            result = whisper.transcribe(
                self.whisper_model,
                audio_path,
                language="hi",
                detect_disfluencies=True,
                initial_prompt=initial_prompt,
                vad=False
            )
        except Exception as e:
            print(f"Error during transcription: {e}")
            raise

        return result

    def normalize_word(self, word: str) -> str:
        """Drop punctuation, trim surrounding whitespace and lowercase ``word``.

        The removed characters are: danda, comma, period, ellipsis, hyphen,
        em dash, ``!``, ``?``, ``;`` and ``:``.
        """
        punctuation_table = str.maketrans('', '', '।,.…-—!?;:')
        return word.translate(punctuation_table).strip().lower()

    def is_common_word(self, word: str) -> bool:
        normalized = self.normalize_word(word)
        return normalized in self.common_words

    def is_known_filler(self, word: str) -> bool:
        normalized = self.normalize_word(word)
        return normalized in self.known_fillers

    def detect_fillers(self, word_timestamps: List[Dict]) -> List[Dict]:
        detections = []

        for word_info in word_timestamps:
            word = word_info.get('text', '').strip()
            start = word_info.get('start', 0)
            end = word_info.get('end', 0)
            duration = end - start
            confidence = word_info.get('confidence', 0.0)

            if duration < self.min_filler_duration:
                continue
            if confidence < self.min_confidence:
                continue

            if self.is_known_filler(word):
                detections.append({
                    'type': 'filler',
                    'subtype': self.normalize_word(word),
                    'text': word,
                    'start': start,
                    'end': end,
                    'confidence': confidence
                })
                continue

            normalized_word = self.normalize_word(word)
            for filler in self.fillers:
                normalized_filler = self.normalize_word(filler)
                if normalized_word == normalized_filler:
                    detections.append({
                        'type': 'filler',
                        'subtype': filler,
                        'text': word,
                        'start': start,
                        'end': end,
                        'confidence': confidence
                    })
                    break

        return detections

    def detect_repetitions(self, word_timestamps: List[Dict]) -> List[Dict]:
        detections = []
        seen = set()

        for i in range(len(word_timestamps) - 1):
            current = word_timestamps[i].get('text', '').strip()
            next_word = word_timestamps[i + 1].get('text', '').strip()

            current_norm = self.normalize_word(current)
            next_norm = self.normalize_word(next_word)

            if not current_norm or not next_norm or len(current_norm) < 2:
                continue

            if self.is_common_word(current):
                continue

            if current_norm == next_norm:
                start = word_timestamps[i].get('start', 0)
                end = word_timestamps[i + 1].get('end', 0)
                duration = end - start

                if duration < self.min_repetition_duration:
                    continue

                conf1 = word_timestamps[i].get('confidence', 0.0)
                conf2 = word_timestamps[i + 1].get('confidence', 0.0)
                avg_conf = (conf1 + conf2) / 2

                if avg_conf < self.min_confidence:
                    continue

                gap = word_timestamps[i + 1].get('start', 0) - word_timestamps[i].get('end', 0)
                if gap > self.max_repetition_gap:
                    continue

                key = (round(start, 2), round(end, 2))
                if key in seen:
                    continue
                seen.add(key)

                detections.append({
                    'type': 'repetition',
                    'subtype': 'immediate',
                    'text': f"{current} {next_word}",
                    'start': start,
                    'end': end,
                    'confidence': avg_conf
                })

        return detections

    def detect_prolongations(self, word_timestamps: List[Dict]) -> List[Dict]:
        detections = []
        seen = set()

        for word_info in word_timestamps:
            word = word_info.get('text', '').strip()
            start = word_info.get('start', 0)
            end = word_info.get('end', 0)
            duration = end - start
            confidence = word_info.get('confidence', 0.0)

            if duration < self.min_prolongation_duration:
                continue
            if confidence < self.min_confidence:
                continue

            if self.is_common_word(word):
                continue

            has_elongation = False

            if '...' in word or '…' in word:
                has_elongation = True
            elif re.search(r'([a-zA-Z])\1{4,}', word):
                has_elongation = True
            elif re.search(r'([ाीुूेैोौं])\1{4,}', word):
                has_elongation = True

            if has_elongation:
                key = (round(start, 2), round(end, 2))
                if key in seen:
                    continue
                seen.add(key)

                detections.append({
                    'type': 'prolongation',
                    'subtype': 'elongation',
                    'text': word,
                    'start': start,
                    'end': end,
                    'confidence': confidence
                })

        return detections

    def detect_false_starts(self, word_timestamps: List[Dict]) -> List[Dict]:
        detections = []
        seen = set()

        for word_info in word_timestamps:
            word = word_info.get('text', '').strip()
            start = word_info.get('start', 0)
            end = word_info.get('end', 0)
            duration = end - start
            confidence = word_info.get('confidence', 0.0)

            if duration < self.min_false_start_duration:
                continue
            if confidence < self.min_confidence:
                continue

            if self.is_common_word(word):
                continue

            has_truncation = word.endswith('—') or word.endswith('-')

            word_clean = self.normalize_word(word)
            is_suspicious = (
                len(word_clean) <= 2 and
                not word_clean.isdigit() and
                duration < 0.25
            )

            if has_truncation or is_suspicious:
                key = (round(start, 2), round(end, 2))
                if key in seen:
                    continue
                seen.add(key)

                detections.append({
                    'type': 'false_start',
                    'subtype': 'truncated',
                    'text': word,
                    'start': start,
                    'end': end,
                    'confidence': confidence
                })

        return detections

    def detect_hesitations(self, audio_path: str, word_timestamps: List[Dict]) -> List[Dict]:
        detections = []
        seen = set()

        try:
            audio = AudioSegment.from_file(audio_path)

            silences = detect_silence(
                audio,
                min_silence_len=int(self.min_hesitation_duration * 1000),
                silence_thresh=audio.dBFS - 16
            )

            for i in range(len(word_timestamps) - 1):
                end_current = word_timestamps[i].get('end', 0) * 1000
                start_next = word_timestamps[i + 1].get('start', 0) * 1000

                gap_duration = start_next - end_current

                if gap_duration >= self.min_hesitation_duration * 1000:
                    start_sec = end_current / 1000
                    end_sec = start_next / 1000

                    key = (round(start_sec, 2), round(end_sec, 2))
                    if key in seen:
                        continue
                    seen.add(key)

                    detections.append({
                        'type': 'hesitation',
                        'subtype': 'pause',
                        'text': '[PAUSE]',
                        'start': start_sec,
                        'end': end_sec,
                        'confidence': 1.0,
                        'duration_ms': gap_duration
                    })

        except Exception as e:
            print(f"Warning: Could not detect hesitations: {e}")

        return detections

    def remove_overlaps(self, detections: List[Dict]) -> List[Dict]:
        """Drop detections that heavily overlap one already kept.

        Detections are visited by ascending ``start`` (higher confidence first
        on ties). Two detections clash when their shared time exceeds 50% of
        either one's duration. On a clash with the first clashing kept entry,
        the candidate replaces it only if its confidence is strictly higher;
        otherwise the candidate is discarded.

        Args:
            detections: Dicts with ``start`` and ``end``; ``confidence`` is
                optional and treated as 0 when missing.

        Returns:
            The kept detections (not re-sorted after replacements); ``[]`` for
            empty input.
        """
        if not detections:
            return []

        def heavily_overlapping(candidate: Dict, kept: Dict) -> bool:
            """True when the shared span is over half of either duration."""
            shared_start = max(candidate['start'], kept['start'])
            shared_end = min(candidate['end'], kept['end'])
            if not shared_start < shared_end:
                return False

            shared = shared_end - shared_start
            candidate_len = candidate['end'] - candidate['start']
            kept_len = kept['end'] - kept['start']

            candidate_share = shared / candidate_len if candidate_len > 0 else 0
            kept_share = shared / kept_len if kept_len > 0 else 0
            return candidate_share > 0.5 or kept_share > 0.5

        ordered = sorted(
            detections,
            key=lambda item: (item['start'], -item.get('confidence', 0))
        )

        survivors = []
        for candidate in ordered:
            # Only the first clashing survivor is considered.
            clash_at = next(
                (position for position, kept in enumerate(survivors)
                 if heavily_overlapping(candidate, kept)),
                None,
            )

            if clash_at is None:
                survivors.append(candidate)
            elif candidate.get('confidence', 0) > survivors[clash_at].get('confidence', 0):
                del survivors[clash_at]
                survivors.append(candidate)

        return survivors

    def detect_all_disfluencies(self, audio_path: str) -> Tuple[List[Dict], Dict]:
        print(f"\nProcessing: {audio_path}")

        print("  → Transcribing with Whisper...")
        result = self.transcribe_with_timestamps(audio_path)

        word_timestamps = []
        for segment in result.get('segments', []):
            for word in segment.get('words', []):
                word_timestamps.append(word)

        print(f"  → Found {len(word_timestamps)} words")

        all_detections = []

        print("  → Detecting fillers...")
        fillers = self.detect_fillers(word_timestamps)
        all_detections.extend(fillers)
        print(f"     Found {len(fillers)} fillers")

        print("  → Detecting repetitions...")
        repetitions = self.detect_repetitions(word_timestamps)
        all_detections.extend(repetitions)
        print(f"     Found {len(repetitions)} repetitions")

        print("  → Detecting prolongations...")
        prolongations = self.detect_prolongations(word_timestamps)
        all_detections.extend(prolongations)
        print(f"     Found {len(prolongations)} prolongations")

        print("  → Detecting false starts...")
        false_starts = self.detect_false_starts(word_timestamps)
        all_detections.extend(false_starts)
        print(f"     Found {len(false_starts)} false starts")

        print("  → Detecting hesitations...")
        hesitations = self.detect_hesitations(audio_path, word_timestamps)
        all_detections.extend(hesitations)
        print(f"     Found {len(hesitations)} hesitations")

        print("  → Removing overlaps...")
        all_detections = self.remove_overlaps(all_detections)

        all_detections.sort(key=lambda x: x['start'])

        print(f"  ✓ Total unique disfluencies: {len(all_detections)}")

        return all_detections, result


In [None]:
class AudioClipper:
    """Cuts disfluency spans out of a recording and saves them as WAV clips."""

    def __init__(self, output_dir: str = "output/disfluency_clips"):
        """Create the clip directory (including parents) if it is missing.

        Args:
            output_dir: Directory that receives the exported clips.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _safe_filename(name: str) -> str:
        """Replace path separators and other filename-hostile characters with ``_``.

        Subtypes can come straight from the pattern CSV; a ``/`` in one would
        otherwise turn the clip name into a path inside a missing directory.
        """
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name)

    def _export_clip(self, audio, start_sec: float, end_sec: float,
                     output_filename: str, padding_ms: int) -> str:
        """Slice ``audio`` (with padding, clamped to its bounds), normalize and export it."""
        start_ms = max(0, int(start_sec * 1000) - padding_ms)
        end_ms = min(len(audio), int(end_sec * 1000) + padding_ms)
        clip = audio[start_ms:end_ms].normalize()
        output_path = self.output_dir / output_filename
        clip.export(output_path, format="wav")
        return str(output_path)

    def extract_clip(self, audio_path: str, start_sec: float, end_sec: float,
                     output_filename: str, padding_ms: int = 200) -> Optional[str]:
        """Export one padded, normalized clip from an audio file.

        Args:
            audio_path: Source audio file.
            start_sec: Clip start in seconds.
            end_sec: Clip end in seconds.
            output_filename: File name created inside ``self.output_dir``.
            padding_ms: Milliseconds added before and after the span.

        Returns:
            The clip path as a string, or ``None`` if loading or exporting
            failed (the error is printed).
        """
        try:
            audio = AudioSegment.from_file(audio_path)
            return self._export_clip(audio, start_sec, end_sec, output_filename, padding_ms)
        except Exception as e:
            print(f"Error extracting clip: {e}")
            return None

    def process_disfluencies(self, audio_path: str, disfluencies: List[Dict],
                            recording_id: str) -> List[Dict]:
        """Export a clip for every disfluency and annotate the dicts in place.

        The source audio is decoded once and reused for all clips instead of
        being reloaded per clip.

        Args:
            audio_path: Source audio file.
            disfluencies: Dicts with ``type``, ``start`` and ``end``
                (``subtype`` optional).
            recording_id: Prefix used in the clip file names.

        Returns:
            The same list, each dict extended with ``clip_filename``,
            ``clip_path`` (``None`` when the clip could not be written) and
            ``duration_sec``.
        """
        print(f"\nExtracting {len(disfluencies)} clips...")
        if not disfluencies:
            return disfluencies

        try:
            audio = AudioSegment.from_file(audio_path)
        except Exception as e:
            # Keep going so every entry still receives its metadata fields.
            print(f"Error extracting clip: {e}")
            audio = None

        for idx, disf in enumerate(tqdm(disfluencies)):
            disf_type = disf['type']
            subtype = disf.get('subtype', 'unknown')
            filename = self._safe_filename(
                f"{recording_id}_disf_{idx:03d}_{disf_type}_{subtype}.wav"
            )

            clip_path = None
            if audio is not None:
                try:
                    clip_path = self._export_clip(
                        audio, disf['start'], disf['end'], filename, 200
                    )
                except Exception as e:
                    print(f"Error extracting clip: {e}")

            disf['clip_filename'] = filename
            disf['clip_path'] = clip_path
            disf['duration_sec'] = disf['end'] - disf['start']
        return disfluencies


In [None]:
class AudioPreprocessor:
    """Static helpers for downloading recordings and converting them to WAV."""

    @staticmethod
    def _default_output_path(input_path: str) -> str:
        """Derive ``<name>_preprocessed.wav`` next to ``input_path``.

        Only the final extension is replaced, whatever it is. A plain
        ``str.replace('.wav', ...)`` returns the input path unchanged for
        names such as ``a.mp3`` or ``a.WAV``, which made the export overwrite
        the source file.
        """
        root, _extension = os.path.splitext(input_path)
        return f"{root}_preprocessed.wav"

    @staticmethod
    def _candidate_urls(url: str) -> List[str]:
        """Return the URLs to try, in order: the original, then the ``upload_goai`` variant if the URL matches."""
        candidates = [url]

        if 'joshtalks-data-collection/hq_data' in url:
            match = re.search(r'joshtalks-data-collection/hq_data/[a-z]{2}/(.*)', url)
            if match:
                remaining_path = match.group(1)
                candidates.append(
                    url.split('joshtalks-data-collection')[0] + f'upload_goai/{remaining_path}'
                )

        return candidates

    @staticmethod
    def preprocess_audio(input_path: str, output_path: Optional[str] = None,
                         target_sr: int = 16000, normalize: bool = True) -> str:
        """Convert a recording to mono WAV at ``target_sr`` and export it.

        Args:
            input_path: Source audio file.
            output_path: Destination; defaults to ``<input name>_preprocessed.wav``
                in the same directory.
            target_sr: Frame rate the audio is converted to when it differs.
            normalize: Whether to normalize the audio before export.

        Returns:
            The path of the exported file, or ``input_path`` unchanged if
            anything failed (the error is printed).
        """
        try:
            audio = AudioSegment.from_file(input_path)

            if audio.channels > 1:
                audio = audio.set_channels(1)

            if audio.frame_rate != target_sr:
                audio = audio.set_frame_rate(target_sr)

            if normalize:
                audio = audio.normalize()

            if output_path is None:
                output_path = AudioPreprocessor._default_output_path(input_path)

            audio.export(output_path, format="wav")
            return output_path
        except Exception as e:
            print(f"Error preprocessing audio: {e}")
            return input_path

    @staticmethod
    def download_audio(url: str, output_path: str) -> bool:
        """Download ``url`` to ``output_path``, trying a fallback URL on 404.

        The body is streamed into ``<output_path>.part`` and moved into place
        only after the download finishes, so an interrupted transfer never
        leaves a truncated file at ``output_path``.

        Args:
            url: Primary download URL.
            output_path: Destination file.

        Returns:
            True when a file was downloaded; False otherwise (the reason is
            printed).
        """
        temp_path = f"{output_path}.part"

        for attempt_url in AudioPreprocessor._candidate_urls(url):
            try:
                # The context manager releases the streamed connection.
                with requests.get(attempt_url, stream=True, timeout=30) as response:
                    response.raise_for_status()

                    with open(temp_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                os.replace(temp_path, output_path)
                return True
            except requests.exceptions.HTTPError as e:
                status_code = e.response.status_code if e.response is not None else None
                if status_code == 404:
                    # Not at this location; try the next candidate URL.
                    continue
                print(f"Error downloading audio: {e}")
                return False
            except Exception as e:
                print(f"Error downloading audio: {e}")
                return False
            finally:
                # Discard whatever a failed attempt left behind.
                if os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass

        print(f"Error: Audio file not found at any URL pattern")
        return False


In [None]:
def main():
    """Run the V2 pipeline on the first five recordings of the dataset.

    Returns early, after printing an error, when ``ffmpeg`` is not on PATH.
    """
    import shutil

    separator = "=" * 60
    if shutil.which("ffmpeg") is None:
        print(separator)
        print("❌ ERROR: ffmpeg is not installed or not in PATH")
        print(separator)
        print("\nSee INSTALLATION_GUIDE.md for installation instructions.")
        return

    pipeline_settings = {
        'dataset_path': "/content/FT_Data_-_data.csv",
        'disfluency_list_path': "/content/Speech Disfluencies List - Sheet1.csv",
        'output_dir': "output",
    }
    pipeline = DisfluencyPipelineV2(**pipeline_settings)

    results_df = pipeline.process_dataset(max_recordings=5, start_idx=0)

    print("\n✓ Pipeline V2 completed successfully!")
    print(f"✓ Check output/ directory for results")

    nothing_found = results_df is None or len(results_df) == 0
    if nothing_found:
        print("\n⚠️  WARNING: No disfluencies detected or no audio files available")


In [None]:
class DisfluencyPipelineV2:
    """Complete pipeline for disfluency detection - Production Version."""

    def __init__(self, dataset_path: str = "/content/FT_Data_-_data.csv",
                 disfluency_list_path: str = "/content/Speech Disfluencies List - Sheet1.csv",
                 output_dir: str = "output",
                 local_audio_dir: str = None):
        """Create the output folders and the detector, clipper and preprocessor.

        Args:
            dataset_path: CSV read by ``process_dataset``.
            disfluency_list_path: Pattern CSV handed to the detector.
            output_dir: Root folder for clips, cached audio and results.
            local_audio_dir: Optional folder checked for audio files before
                downloading.
        """
        self.dataset_path = dataset_path

        output_root = Path(output_dir)
        output_root.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_root

        self.local_audio_dir = None if not local_audio_dir else Path(local_audio_dir)

        # Pipeline components
        self.detector = HindiDisfluencyDetectorV2(disfluency_list_path)
        clips_dir = output_root / "disfluency_clips"
        self.clipper = AudioClipper(output_dir=str(clips_dir))
        self.preprocessor = AudioPreprocessor()

        # Downloaded or copied recordings are kept here between runs.
        self.audio_cache = output_root / "audio_files"
        self.audio_cache.mkdir(exist_ok=True)

    def process_recording(self, recording_id: str, audio_url: str,
                         user_id: str = None) -> Tuple[List[Dict], str]:
        """Process a single recording."""
        print(f"\n{'='*60}")
        print(f"Processing Recording: {recording_id}")
        print(f"{'='*60}")

        # Check for local audio
        audio_filename = f"{recording_id}_audio.wav"
        audio_path = self.audio_cache / audio_filename

        if self.local_audio_dir and not audio_path.exists():
            local_path = self.local_audio_dir / audio_filename
            if local_path.exists():
                print(f"Using local audio file: {local_path}")
                import shutil
                shutil.copy(local_path, audio_path)

        # Download if needed
        if not audio_path.exists():
            print(f"Downloading audio from: {audio_url}")
            success = self.preprocessor.download_audio(audio_url, str(audio_path))
            if not success:
                print(f"Failed to download audio for {recording_id}")
                return [], None
        else:
            print(f"Using cached audio: {audio_path}")

        # Preprocess
        print("Preprocessing audio...")
        preprocessed_path = self.preprocessor.preprocess_audio(str(audio_path))

        # Detect disfluencies
        disfluencies, transcription = self.detector.detect_all_disfluencies(preprocessed_path)

        # Extract clips
        disfluencies = self.clipper.process_disfluencies(
            preprocessed_path,
            disfluencies,
            recording_id
        )

        # Add metadata
        for disf in disfluencies:
            disf['recording_id'] = recording_id
            disf['user_id'] = user_id

        return disfluencies, preprocessed_path

    def process_dataset(self, max_recordings: int = None,
                       start_idx: int = 0) -> pd.DataFrame:
        """Process entire dataset."""
        df = pd.read_csv(self.dataset_path)
        print(f"\nLoaded dataset: {len(df)} recordings")

        if max_recordings:
            df = df.iloc[start_idx:start_idx + max_recordings]
            print(f"Processing {len(df)} recordings (from index {start_idx})")

        # Load Whisper model once
        self.detector.load_whisper_model(model_size="medium")

        # Process each recording
        all_disfluencies = []

        for idx, row in df.iterrows():
            try:
                disfluencies, _ = self.process_recording(
                    recording_id=str(row['recording_id']),
                    audio_url=row['rec_url_gcp'],
                    user_id=str(row['user_id'])
                )

                all_disfluencies.extend(disfluencies)
            except Exception as e:
                print(f"Error processing recording {row['recording_id']}: {e}")
                continue

        # Create results DataFrame
        results_df = pd.DataFrame(all_disfluencies)

        if len(results_df) == 0:
            print("\n⚠️  No disfluencies detected!")
            return results_df

        # Reorder columns
        column_order = [
            'recording_id', 'user_id', 'type', 'subtype',
            'start', 'end', 'duration_sec',
            'text', 'confidence',
            'clip_filename', 'clip_path'
        ]

        for col in column_order:
            if col not in results_df.columns:
                results_df[col] = None

        results_df = results_df[column_order]

        # Save results
        output_csv = self.output_dir / "disfluency_results_v2.csv"
        results_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
        print(f"\n✓ Results saved to: {output_csv}")
        print(f"✓ Total disfluencies detected: {len(results_df)}")

        # Print summary
        self._print_summary(results_df)

        return results_df

    def _print_summary(self, df: pd.DataFrame):
        """Print summary statistics."""
        print(f"\n{'='*60}")
        print("SUMMARY STATISTICS")
        print(f"{'='*60}")

        print(f"\nTotal disfluencies: {len(df)}")
        print(f"Unique recordings: {df['recording_id'].nunique()}")

        print("\nDisfluency types:")
        type_counts = df['type'].value_counts()
        for disf_type, count in type_counts.items():
            percentage = (count / len(df)) * 100
            print(f"  {disf_type:15s}: {count:4d} ({percentage:5.1f}%)")

        print(f"\nDuration statistics:")
        print(f"  Mean:   {df['duration_sec'].mean():.3f}s")
        print(f"  Median: {df['duration_sec'].median():.3f}s")
        print(f"  Min:    {df['duration_sec'].min():.3f}s")
        print(f"  Max:    {df['duration_sec'].max():.3f}s")

        print(f"\nConfidence statistics:")
        print(f"  Mean:   {df['confidence'].mean():.3f}")
        print(f"  Median: {df['confidence'].median():.3f}")
        print(f"  Min:    {df['confidence'].min():.3f}")
        print(f"  Max:    {df['confidence'].max():.3f}")

        # Quality checks
        print(f"\n{'='*60}")
        print("QUALITY CHECKS")
        print(f"{'='*60}")

        micro_segments = len(df[df['duration_sec'] < 0.05])
        low_conf = len(df[df['confidence'] < 0.6])

        print(f"\nMicro-segments (<0.05s): {micro_segments}")
        print(f"Low confidence (<0.6): {low_conf}")

        if micro_segments == 0 and low_conf == 0:
            print("\n✅ All quality checks passed!")
        else:
            print("\n⚠️  Some quality issues detected")


In [None]:
import os
import pandas as pd
from datetime import datetime

# Location where process_dataset writes the V2 results.
RESULTS_V2_PATH = 'output/disfluency_results_v2.csv'


def _format_pct_change(new_value, old_value) -> str:
    """Format the relative change from old to new as a signed percentage.

    Returns 'n/a' when the change is undefined (missing values or a zero
    baseline) instead of raising ZeroDivisionError or printing inf/nan.
    """
    if pd.isna(new_value) or pd.isna(old_value) or old_value == 0:
        return 'n/a'
    return f"{(new_value - old_value) / old_value * 100:+.1f}%"


print("="*70)
print("RUNNING PRODUCTION VERSION (V2) - STRICT VALIDATION")
print("="*70)

# Backup old results if they exist
if os.path.exists('disfluency_results.csv'):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_name = f'disfluency_results_v1_{timestamp}.csv'
    import shutil
    shutil.copy('disfluency_results.csv', backup_name)
    print(f"\n✓ Backed up V1 results to: {backup_name}")

# Run V2 pipeline
print("\nRunning V2 Pipeline...")
print("-"*70)

main()

print("\n" + "="*70)
print("COMPARISON: V1 vs V2")
print("="*70)

# main() writes no file when ffmpeg is missing or nothing was detected.
results_available = os.path.exists(RESULTS_V2_PATH)

if not results_available:
    print(f"\n⚠️  No V2 results found at {RESULTS_V2_PATH}; skipping comparison")
else:
    # Load results
    v2_df = pd.read_csv(RESULTS_V2_PATH)

    # Find the most recent V1 backup. The timestamp in the name sorts
    # chronologically, whereas os.listdir order is arbitrary.
    v1_files = sorted(f for f in os.listdir('.') if f.startswith('disfluency_results_v1_'))
    if v1_files:
        v1_df = pd.read_csv(v1_files[-1])

        print(f"\n{'Metric':<30} {'V1 (Old)':<15} {'V2 (New)':<15} {'Change':<15}")
        print("-"*70)

        # Total detections
        v1_total = len(v1_df)
        v2_total = len(v2_df)
        change_pct = ((v2_total - v1_total) / v1_total * 100) if v1_total > 0 else 0
        print(f"{'Total Detections':<30} {v1_total:<15} {v2_total:<15} {change_pct:+.1f}%")

        # By type
        print(f"\n{'By Type:':<30}")
        all_types = set(v1_df['type'].unique()) | set(v2_df['type'].unique())
        for dtype in sorted(all_types):
            v1_count = len(v1_df[v1_df['type'] == dtype])
            v2_count = len(v2_df[v2_df['type'] == dtype])
            change = ((v2_count - v1_count) / v1_count * 100) if v1_count > 0 else (100 if v2_count > 0 else 0)
            print(f"  {dtype:<28} {v1_count:<15} {v2_count:<15} {change:+.1f}%")

        # Quality metrics
        print(f"\n{'Quality Metrics:':<30}")

        v1_micro = len(v1_df[v1_df['duration_sec'] < 0.05])
        v2_micro = len(v2_df[v2_df['duration_sec'] < 0.05])
        if v2_micro == 0 and v1_micro > 0:
            micro_change = '-100%'
        else:
            # Covers v1_micro == 0, which used to divide by zero.
            micro_change = _format_pct_change(v2_micro, v1_micro)
        print(f"  {'Micro-segments (<0.05s)':<28} {v1_micro:<15} {v2_micro:<15} {micro_change}")

        v1_avg_dur = v1_df['duration_sec'].mean()
        v2_avg_dur = v2_df['duration_sec'].mean()
        print(f"  {'Avg Duration (s)':<28} {v1_avg_dur:<15.3f} {v2_avg_dur:<15.3f} {_format_pct_change(v2_avg_dur, v1_avg_dur)}")

        v1_avg_conf = v1_df['confidence'].mean()
        v2_avg_conf = v2_df['confidence'].mean()
        print(f"  {'Avg Confidence':<28} {v1_avg_conf:<15.3f} {v2_avg_conf:<15.3f} {_format_pct_change(v2_avg_conf, v1_avg_conf)}")

        print("\n" + "="*70)
        print("IMPROVEMENTS")
        print("="*70)

        v1_fillers = len(v1_df[v1_df['type'] == 'filler'])
        v2_fillers = len(v2_df[v2_df['type'] == 'filler'])

        improvements = []
        if v2_total < v1_total * 0.5:
            improvements.append(f"✅ Reduced false positives by {100 - (v2_total/v1_total*100):.1f}%")
        if v2_micro == 0 and v1_micro > 0:
            improvements.append(f"✅ Eliminated all {v1_micro} micro-segments")
        if v2_fillers > 0 and v1_fillers == 0:
            improvements.append(f"✅ Now detecting fillers ({v2_fillers} found)")
        if v2_avg_conf > v1_avg_conf:
            if v1_avg_conf > 0:
                improvements.append(f"✅ Improved average confidence by {((v2_avg_conf - v1_avg_conf) / v1_avg_conf * 100):.1f}%")
            else:
                improvements.append("✅ Improved average confidence")
        if v2_avg_dur > v1_avg_dur:
            improvements.append(f"✅ Increased average duration (more meaningful segments)")

        if improvements:
            for imp in improvements:
                print(f"\n{imp}")
        else:
            print("\n⚠️  No significant improvements detected")

    else:
        print("\nV2 Results:")
        print(f"  Total detections: {len(v2_df)}")
        print(f"  By type:")
        for dtype, count in v2_df['type'].value_counts().items():
            print(f"    {dtype}: {count}")

print("\n" + "="*70)
if results_available:
    print("✓ V2 Pipeline Complete!")
    print(f"✓ Results saved to: {RESULTS_V2_PATH}")
else:
    print("⚠️  V2 Pipeline finished without producing a results file")
print("="*70)


RUNNING PRODUCTION VERSION (V2) - STRICT VALIDATION

Running V2 Pipeline...
----------------------------------------------------------------------

    ╔══════════════════════════════════════════════════════════════╗
    ║   Hindi Speech Disfluency Detection Pipeline V2             ║
    ║   PRODUCTION VERSION - Strict Validation                    ║
    ╚══════════════════════════════════════════════════════════════╝
    
✓ Loaded 33 filler patterns

Loaded dataset: 104 recordings
Processing 5 recordings (from index 0)
Loading Whisper medium model...


100%|█████████████████████████████████████| 1.42G/1.42G [00:17<00:00, 86.8MiB/s]


✓ Whisper model loaded

Processing Recording: 825780
Downloading audio from: https://storage.googleapis.com/joshtalks-data-collection/hq_data/hi/967179/825780_audio.wav
Preprocessing audio...

Processing: output/audio_files/825780_audio_preprocessed.wav
  → Transcribing with Whisper...


100%|██████████| 38896/38896 [21:37<00:00, 29.98frames/s]


  → Found 347 words
  → Detecting fillers...
     Found 0 fillers
  → Detecting repetitions...
     Found 0 repetitions
  → Detecting prolongations...
     Found 0 prolongations
  → Detecting false starts...
     Found 0 false starts
  → Detecting hesitations...
     Found 2 hesitations
  → Removing overlaps...
  ✓ Total unique disfluencies: 2

Extracting 2 clips...


100%|██████████| 2/2 [00:00<00:00, 89.96it/s]


Processing Recording: 825727
Downloading audio from: https://storage.googleapis.com/joshtalks-data-collection/hq_data/hi/967179/825727_audio.wav





Preprocessing audio...

Processing: output/audio_files/825727_audio_preprocessed.wav
  → Transcribing with Whisper...


100%|██████████| 39031/39031 [22:08<00:00, 29.37frames/s]


  → Found 751 words
  → Detecting fillers...
     Found 6 fillers
  → Detecting repetitions...
     Found 1 repetitions
  → Detecting prolongations...
     Found 0 prolongations
  → Detecting false starts...
     Found 5 false starts
  → Detecting hesitations...
     Found 5 hesitations
  → Removing overlaps...
  ✓ Total unique disfluencies: 16

Extracting 16 clips...


100%|██████████| 16/16 [00:00<00:00, 130.00it/s]


Processing Recording: 988596
Downloading audio from: https://storage.googleapis.com/joshtalks-data-collection/hq_data/hi/1147542/988596_audio.wav





Preprocessing audio...

Processing: output/audio_files/988596_audio_preprocessed.wav
  → Transcribing with Whisper...


100%|██████████| 39298/39298 [22:40<00:00, 28.88frames/s]


  → Found 368 words
  → Detecting fillers...
     Found 0 fillers
  → Detecting repetitions...
     Found 0 repetitions
  → Detecting prolongations...
     Found 2 prolongations
  → Detecting false starts...
     Found 0 false starts
  → Detecting hesitations...
     Found 2 hesitations
  → Removing overlaps...
  ✓ Total unique disfluencies: 4

Extracting 4 clips...


100%|██████████| 4/4 [00:00<00:00, 113.12it/s]


Processing Recording: 990175
Downloading audio from: https://storage.googleapis.com/joshtalks-data-collection/hq_data/hi/1147542/990175_audio.wav





Preprocessing audio...

Processing: output/audio_files/990175_audio_preprocessed.wav
  → Transcribing with Whisper...


100%|██████████| 40054/40054 [18:25<00:00, 36.22frames/s]


  → Found 513 words
  → Detecting fillers...
     Found 6 fillers
  → Detecting repetitions...
     Found 7 repetitions
  → Detecting prolongations...
     Found 6 prolongations
  → Detecting false starts...
     Found 2 false starts
  → Detecting hesitations...
     Found 23 hesitations
  → Removing overlaps...
  ✓ Total unique disfluencies: 39

Extracting 39 clips...


100%|██████████| 39/39 [00:00<00:00, 130.12it/s]



Processing Recording: 526266
Downloading audio from: https://storage.googleapis.com/joshtalks-data-collection/hq_data/hi/639950/526266_audio.wav
Preprocessing audio...

Processing: output/audio_files/526266_audio_preprocessed.wav
  → Transcribing with Whisper...


100%|██████████| 47603/47603 [20:40<00:00, 38.37frames/s]


  → Found 395 words
  → Detecting fillers...
     Found 11 fillers
  → Detecting repetitions...
     Found 0 repetitions
  → Detecting prolongations...
     Found 0 prolongations
  → Detecting false starts...
     Found 15 false starts
  → Detecting hesitations...
     Found 50 hesitations
  → Removing overlaps...
  ✓ Total unique disfluencies: 76

Extracting 76 clips...


100%|██████████| 76/76 [00:00<00:00, 93.43it/s]



✓ Results saved to: output/disfluency_results_v2.csv
✓ Total disfluencies detected: 137

SUMMARY STATISTICS

Total disfluencies: 137
Unique recordings: 5

Disfluency types:
  hesitation     :   82 ( 59.9%)
  filler         :   23 ( 16.8%)
  false_start    :   21 ( 15.3%)
  prolongation   :    8 (  5.8%)
  repetition     :    3 (  2.2%)

Duration statistics:
  Mean:   4.299s
  Median: 1.200s
  Min:    0.160s
  Max:    30.000s

Confidence statistics:
  Mean:   0.923
  Median: 1.000
  Min:    0.614
  Max:    1.000

QUALITY CHECKS

Micro-segments (<0.05s): 0
Low confidence (<0.6): 0

✅ All quality checks passed!

✓ Pipeline V2 completed successfully!
✓ Check output/ directory for results

COMPARISON: V1 vs V2

V2 Results:
  Total detections: 137
  By type:
    hesitation: 82
    filler: 23
    false_start: 21
    prolongation: 8
    repetition: 3

✓ V2 Pipeline Complete!
✓ Results saved to: output/disfluency_results_v2.csv
