<a href="https://colab.research.google.com/github/kavish-24/Konkani_Mentall_Health/blob/main/DataPreparationPrudentMedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install evaluate odfpy pydub


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting odfpy
  Downloading odfpy-1.4.1.tar.gz (717 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/717.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/717.0 kB[0m [31m23.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.0/717.0 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: odfpy
  Building wheel for odfpy (setup.py) ... [?25l[?25hdone
  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160673 sha256=b7cfddab34573cc66ae5e2184355776e8038788af4edb902ddc1314d9388d1f

In [None]:
from odf import text, teletype
from odf.opendocument import OpenDocumentText
from odf.style import Style, TextProperties
from odf.text import P
import os
import re

def extract_text_from_odt(odt_path, skip_bold=True, debug=False):
    """
    Extract the text of an ODT file, skipping bold text and stripping noise.

    A paragraph or span counts as bold when the style it references (looked up
    among the document's automatic and named styles) has text properties whose
    font weight contains "bold". Everything nested inside a bold element is
    treated as bold too.

    Every paragraph that still has text afterwards is cleaned:
    runs of two or more dots become a space, the characters
    ! ? , ; : " ' ( ) [ ] { } - — * are removed, and whitespace is collapsed
    to single spaces.

    Whitespace that separates non-bold runs (whitespace-only text nodes and
    the text:s / text:tab / text:line-break elements) is kept as a separator,
    so neighbouring words are not fused together.

    Args:
        odt_path: Path to input ODT file
        skip_bold: If True, skip all bold text
        debug: If True, print debug information

    Returns:
        List of cleaned non-bold text paragraphs

    Raises:
        FileNotFoundError: If odt_path does not exist
        ValueError: If the file cannot be loaded as an ODT document
    """
    if not os.path.exists(odt_path):
        raise FileNotFoundError(f"ODT file not found: {odt_path}")

    # Imported here because the cell's top-level imports only bring in
    # OpenDocumentText from odf.opendocument.
    from odf.opendocument import load

    try:
        doc = load(odt_path)
    except Exception as e:
        # Keep the original exception as the cause for easier debugging
        raise ValueError(f"Failed to load ODT file: {e}") from e

    bold_styles = set()

    # Scan automatic styles first, then named styles, for a bold font weight
    for style_container in (doc.automaticstyles, doc.styles):
        for style in style_container.getElementsByType(Style):
            style_name = style.getAttribute('name')
            for prop in style.getElementsByType(TextProperties):
                font_weight = prop.getAttribute('fontweight')
                if font_weight and 'bold' in str(font_weight).lower():
                    bold_styles.add(style_name)
                    if debug:
                        print(f"   Found bold style: {style_name}")

    print(f"\n🔍 Detected {len(bold_styles)} bold styles: {bold_styles}")

    extracted_parts = []
    skipped_count = 0
    kept_count = 0
    skipped_text_samples = []
    kept_text_samples = []

    # Elements that stand for whitespace and carry no text children
    whitespace_tags = ("text:s", "text:tab", "text:line-break")

    def get_style_name(node):
        """Get style name from a node using multiple methods"""
        # Try different attribute names; an element that does not support
        # the attribute raises, which simply means "no style here".
        for attr_name in ['stylename', 'style-name']:
            try:
                style = node.getAttribute(attr_name)
                if style:
                    return style
            except Exception:
                pass

        # Try namespace-aware retrieval
        try:
            style = node.getAttrNS(
                "urn:oasis:names:tc:opendocument:xmlns:text:1.0",
                "style-name"
            )
            if style:
                return style
        except Exception:
            pass

        return None

    def is_node_bold(node):
        """Check if a node has bold styling"""
        # Only elements can reference a style; text nodes never do
        if node.nodeType != node.ELEMENT_NODE:
            return False
        style_name = get_style_name(node)
        return bool(style_name) and style_name in bold_styles

    def process_node(node, parent_is_bold=False):
        """Recursively process a node and return its non-bold text"""
        nonlocal skipped_count, kept_count

        # Bold status is inherited from any bold ancestor
        current_is_bold = parent_is_bold or is_node_bold(node)
        drop = skip_bold and current_is_bold

        if node.nodeType == node.TEXT_NODE:
            node_text = node.data
            preview = node_text.strip()[:50]

            if not node_text.strip():
                # Whitespace-only node: keep it as a word separator unless it
                # belongs to bold text. It is not counted as kept/skipped.
                return "" if drop else node_text

            if drop:
                skipped_count += 1
                if len(skipped_text_samples) < 5:
                    skipped_text_samples.append(preview)
                if debug:
                    print(f"   [SKIP] TEXT_NODE (bold): {repr(preview)}")
                return ""

            kept_count += 1
            if len(kept_text_samples) < 5:
                kept_text_samples.append(preview)
            if debug:
                print(f"   [KEEP] TEXT_NODE: {repr(preview)}")
            return node_text

        if node.nodeType != node.ELEMENT_NODE:
            return ""

        if node.tagName in whitespace_tags:
            # Space / tab / line break: one separator is enough because the
            # paragraph's whitespace is normalised afterwards.
            return "" if drop else " "

        if node.tagName == "text:span":
            # Full text of the span (including nested elements), for reporting
            span_full_text = teletype.extractText(node)

            if drop:
                skipped_count += 1
                if span_full_text.strip() and len(skipped_text_samples) < 5:
                    skipped_text_samples.append(span_full_text.strip()[:50])
                if debug:
                    style_name = get_style_name(node)
                    print(f"   [SKIP] SPAN (bold - {style_name}): {repr(span_full_text[:50])}")
                # Don't process children if the span is bold
                return ""

            if debug and span_full_text.strip():
                style_name = get_style_name(node)
                print(f"   [KEEP] SPAN ({style_name}): {repr(span_full_text[:50])}")

        # Spans that are kept and all other elements: recurse into children
        return "".join(
            process_node(child, current_is_bold) for child in node.childNodes
        )

    for paragraph in doc.getElementsByType(text.P):
        # Check if paragraph itself is bold
        para_is_bold = is_node_bold(paragraph)

        if debug and para_is_bold:
            print(f"\n⚠️  Paragraph itself is BOLD - will skip all content")

        para_text = "".join(
            process_node(node, para_is_bold) for node in paragraph.childNodes
        )

        if para_text.strip():
            # Clean text: remove punctuation and normalize spaces
            para_text = re.sub(r'\.{2,}', ' ', para_text)  # Replace 2+ dots with single space
            para_text = re.sub(r'[!?,;:"\'()\[\]{}\-—*]+', '', para_text)  # Remove other punctuation

            para_text = re.sub(r'\s+', ' ', para_text)  # Normalize spaces
            para_text = para_text.strip()
            if para_text:
                extracted_parts.append(para_text)

    print(f"\n📊 Extraction summary:")
    print(f"   Kept: {kept_count} elements")
    print(f"   Skipped (bold): {skipped_count} elements")
    print(f"   Extracted paragraphs: {len(extracted_parts)}")

    if skipped_count > 0:
        print(f"\n❌ Sample SKIPPED bold text:")
        for sample in skipped_text_samples:
            print(f"   • {repr(sample)}...")

    if kept_count > 0:
        print(f"\n✅ Sample KEPT non-bold text:")
        for sample in kept_text_samples:
            print(f"   • {repr(sample)}...")

    if extracted_parts:
        print(f"\n🧼 Sample CLEANED paragraphs:")
        for i, para in enumerate(extracted_parts[:3]):
            print(f"   • [{i}] {repr(para)[:100]}...")

    if skipped_count == 0 and skip_bold:
        print(f"\n⚠️  WARNING: No bold text was found to skip! Check ODT styles.")

    return extracted_parts

def create_new_odt(output_path, paragraphs):
    """
    Create a new ODT file with the given paragraphs.

    Each entry of `paragraphs` becomes one text paragraph, in order. The
    parent directory of `output_path` is created when it does not exist.

    Args:
        output_path: Path to save the new ODT file
        paragraphs: List of text paragraphs to include

    Raises:
        ValueError: If the directory cannot be created or the file cannot be
            saved (the original exception is attached as the cause)
    """
    doc = OpenDocumentText()

    for para_text in paragraphs:
        doc.text.addElement(P(text=para_text))

    try:
        # A bare filename has an empty dirname, and os.makedirs("") raises,
        # so only create the directory when there is one to create.
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        doc.save(output_path)
        print(f"\n💾 New ODT file created: {output_path}")
    except Exception as e:
        raise ValueError(f"Failed to save new ODT file: {e}") from e

def main():
    """Extract the non-bold text of the input ODT and save it as a new ODT.

    The input and output locations come from the TRANSCRIPT_FILE_PATH and
    OUTPUT_ODT_PATH environment variables, with Google Drive paths as the
    defaults. Any failure is reported on stdout and then re-raised.
    """
    # Configuration
    source_odt = os.environ.get("TRANSCRIPT_FILE_PATH", "/content/drive/MyDrive/Anju Project (1)/Audio Prudent media (1)/August 2017 (1)/dataset/10  AUG PRIME.odt")
    target_odt = os.environ.get("OUTPUT_ODT_PATH", "/content/drive/MyDrive/Anju Project (1)/Audio Prudent media (1)/August 2017 (1)/dataset/10 AUG PRIME_non_bold.odt")
    verbose = True

    banner = "=" * 70

    print(banner)
    print("EXTRACTING NON-BOLD TEXT AND REMOVING ALL NOISE")
    print(banner)

    try:
        print("\n📄 Extracting text from ODT file...")
        cleaned = extract_text_from_odt(source_odt, skip_bold=True, debug=verbose)

        print(f"\n✓ Extracted {len(cleaned)} paragraphs")
        if cleaned:
            first_preview = repr(cleaned[0])[:150]
            print(f"   Sample cleaned paragraph: {first_preview}...")

        print("\n📝 Creating new ODT file...")
        create_new_odt(target_odt, cleaned)

        print("\n" + banner)
        print("✅ PROCESS COMPLETE!")
        print(banner)

    except Exception as e:
        # Report the failure, then let it propagate to the caller
        print(f"\n❌ Process failed: {e}")
        raise


if __name__ == "__main__":
    main()

In [None]:
"""
Simplified Audio Segmentation Using Whisper-Only Approach
=========================================================

This script uses Whisper's built-in capabilities for accurate audio-text alignment
without complex manual matching. Perfect for Google Colab.

Approach:
1. Use Silero VAD to detect speech segments
2. Let Whisper transcribe each segment with word timestamps
3. Validate and create training manifest

No complex alignment needed - Whisper handles it!
"""

# ============================================================================
# INSTALLATION (Run this first in Colab)
# ============================================================================
"""
!pip install -q faster-whisper
!pip install -q torch torchaudio
!pip install -q librosa soundfile
!pip install -q odfpy
!pip install -q tqdm
!apt-get install -y ffmpeg
"""

# ============================================================================
# IMPORTS
# ============================================================================
import os
import json
import re
import unicodedata
from pathlib import Path
from typing import List, Dict, Any
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import librosa
import soundfile as sf
import torch
from tqdm import tqdm

# For ODT reading
from odf.opendocument import load
from odf import text, teletype

# Whisper
from faster_whisper import WhisperModel

# ============================================================================
# CONFIGURATION
# ============================================================================
class Config:
    """Tunable settings for the VAD + Whisper segmentation pipeline."""

    # --- Paths (adjust for your Colab / Drive layout) -----------------------
    OUTPUT_DIR = "/content/drive/MyDrive/dataset/whisper_segments"

    # --- Whisper ------------------------------------------------------------
    # Model size; one of: tiny, base, small, medium, large
    WHISPER_MODEL = "small"
    # Run on the GPU with float16 when CUDA is available, otherwise on the
    # CPU with int8.
    DEVICE, COMPUTE_TYPE = (
        ("cuda", "float16") if torch.cuda.is_available() else ("cpu", "int8")
    )

    # --- Voice activity detection -------------------------------------------
    VAD_THRESHOLD = 0.5
    MIN_SPEECH_DURATION = 0.5   # seconds
    MIN_SILENCE_DURATION = 0.3  # seconds

    # --- Segment length -----------------------------------------------------
    MIN_SEGMENT_DURATION = 1.0
    MAX_SEGMENT_DURATION = 30.0
    TARGET_SEGMENT_DURATION = 10.0  # ideal segment length

    # --- Language -----------------------------------------------------------
    LANGUAGE = "mr"  # Marathi (closest to Konkani in Whisper)

    # --- Quality thresholds -------------------------------------------------
    MIN_CONFIDENCE = 0.3  # minimum word probability
    MIN_WORDS_PER_SEGMENT = 3


# ============================================================================
# SILERO VAD (Voice Activity Detection)
# ============================================================================
class SileroVAD:
    """Speech-segment detector built on Silero VAD.

    If the Silero model cannot be loaded, `self.model` is set to None and
    detection falls back to a simple RMS-energy threshold.
    """

    def __init__(self):
        """Load the Silero VAD model through torch.hub."""
        print("Loading Silero VAD model...")
        try:
            hub_result = torch.hub.load(
                repo_or_dir='snakers4/silero-vad',
                model='silero_vad',
                force_reload=False,
                onnx=False
            )
            self.model, utils = hub_result
            # The first utility is the timestamp extraction function
            self.get_speech_timestamps = utils[0]
            print("✓ Silero VAD loaded")
        except Exception as e:
            print(f"⚠ Could not load Silero VAD: {e}")
            print("Falling back to energy-based VAD")
            self.model = None

    def detect_speech(self, audio_path, threshold=0.5, min_speech_ms=250, min_silence_ms=100):
        """Return the speech segments of an audio file.

        Args:
            audio_path: Path of the audio file to analyse.
            threshold: Speech threshold passed to Silero.
            min_speech_ms: Minimum speech duration passed to Silero (ms).
            min_silence_ms: Minimum silence duration passed to Silero (ms).

        Returns:
            List of {'start': seconds, 'end': seconds} dicts. The threshold
            and duration arguments are not used by the energy fallback.
        """
        # Silero is run on 16 kHz mono audio
        waveform, sample_rate = librosa.load(audio_path, sr=16000, mono=True)

        if self.model is None:
            # No Silero model available: use the energy heuristic instead
            return self._energy_based_vad(waveform, sample_rate)

        timestamps = self.get_speech_timestamps(
            torch.from_numpy(waveform),
            self.model,
            threshold=threshold,
            min_speech_duration_ms=min_speech_ms,
            min_silence_duration_ms=min_silence_ms,
            sampling_rate=16000
        )

        # Silero reports sample indices; convert them to seconds
        detected = []
        for stamp in timestamps:
            detected.append({
                'start': stamp['start'] / 16000,
                'end': stamp['end'] / 16000
            })
        return detected

    def _energy_based_vad(self, audio, sr, frame_length=2048, hop_length=512):
        """Fallback VAD: frames whose RMS exceeds 1.5x the mean RMS are speech."""
        energy = librosa.feature.rms(y=audio, frame_length=frame_length, hop_length=hop_length)[0]
        voiced_mask = energy > np.mean(energy) * 1.5

        # Start time of every frame, in seconds
        frame_times = librosa.frames_to_time(
            np.arange(len(voiced_mask)), sr=sr, hop_length=hop_length
        )

        found = []
        onset = None  # start time of the run currently open, if any

        for frame_time, voiced in zip(frame_times, voiced_mask):
            if voiced:
                if onset is None:
                    onset = frame_time
            elif onset is not None:
                found.append({'start': onset, 'end': frame_time})
                onset = None

        # Close a run that is still open at the end of the audio
        if onset is not None:
            found.append({'start': onset, 'end': frame_times[-1]})

        return found


# ============================================================================
# WHISPER ALIGNER
# ============================================================================
class WhisperAligner:
    """Segment an audio file with VAD and transcribe the pieces with Whisper.

    Output goes to three sub-directories of the configured output directory:
    "audio" (wav clips), "text" (one txt per clip) and "metadata" (a JSON
    manifest and a JSONL training file per session).
    """

    def __init__(self, config: Config = None):
        """Create the output folders and load the VAD and Whisper models."""
        self.config = config or Config()
        cfg = self.config

        # Output layout
        self.output_dir = Path(cfg.OUTPUT_DIR)
        self.audio_dir = self.output_dir / "audio"
        self.text_dir = self.output_dir / "text"
        self.metadata_dir = self.output_dir / "metadata"

        for folder in (self.audio_dir, self.text_dir, self.metadata_dir):
            folder.mkdir(parents=True, exist_ok=True)

        # Voice activity detector
        self.vad = SileroVAD()

        # Speech recogniser
        print(f"Loading Whisper model: {self.config.WHISPER_MODEL}...")
        self.whisper = WhisperModel(
            cfg.WHISPER_MODEL,
            device=cfg.DEVICE,
            compute_type=cfg.COMPUTE_TYPE
        )
        print(f"✓ Whisper loaded on {self.config.DEVICE}")

    def load_transcript_from_odt(self, odt_path: str) -> str:
        """Return the non-empty paragraphs of an ODT file joined by spaces."""
        print(f"Loading transcript: {Path(odt_path).name}")

        document = load(str(odt_path))
        stripped = (
            teletype.extractText(element).strip()
            for element in document.getElementsByType(text.P)
        )
        paragraphs = [content for content in stripped if content]

        full_text = " ".join(paragraphs)
        print(f"✓ Loaded {len(paragraphs)} paragraphs, {len(full_text)} characters")
        return full_text

    def process_audio_file(
        self,
        audio_path: str,
        odt_path: str = None,
        session_id: str = None
    ) -> List[Dict[str, Any]]:
        """
        Run the full pipeline on one audio file.

        The steps are: detect speech with VAD, merge short detections,
        transcribe every merged chunk with Whisper, filter the resulting
        segments, and write the surviving ones to disk.

        Args:
            audio_path: Path to audio file
            odt_path: Optional path to a reference transcript; it is loaded
                and handed to the save step
            session_id: Session identifier; defaults to the audio file stem

        Returns:
            List of metadata dicts for the saved segments
        """
        audio_path = Path(audio_path)
        session_id = audio_path.stem if session_id is None else session_id

        print(f"\n{'='*80}")
        print(f"PROCESSING: {audio_path.name}")
        print(f"Session: {session_id}")
        print(f"{'='*80}\n")

        # Optional reference transcript
        reference_transcript = (
            self.load_transcript_from_odt(odt_path) if odt_path else None
        )

        # Step 1: voice activity detection, then merging of short detections
        print("Step 1: Detecting speech segments...")
        cfg = self.config
        vad_segments = self.vad.detect_speech(
            str(audio_path),
            threshold=cfg.VAD_THRESHOLD,
            min_speech_ms=int(cfg.MIN_SPEECH_DURATION * 1000),
            min_silence_ms=int(cfg.MIN_SILENCE_DURATION * 1000)
        )
        print(f"✓ Found {len(vad_segments)} speech segments")

        merged_segments = self._merge_short_segments(vad_segments)
        print(f"✓ Merged to {len(merged_segments)} segments")

        # Step 2: transcription of every merged chunk
        print("\nStep 2: Transcribing with Whisper...")
        all_segments = []
        for idx, vad_seg in enumerate(tqdm(merged_segments, desc="Transcribing")):
            all_segments.extend(self._transcribe_chunk(audio_path, idx, vad_seg))

        print(f"\n✓ Transcribed {len(all_segments)} segments")

        # Step 3: quality filtering
        print("\nStep 3: Validating segments...")
        valid_segments = self._validate_segments(all_segments)
        print(f"✓ {len(valid_segments)}/{len(all_segments)} segments passed validation")

        # Step 4: writing clips, texts and manifests
        print("\nStep 4: Saving segments...")
        saved_segments = self._save_segments(
            valid_segments,
            audio_path,
            session_id,
            reference_transcript
        )

        print(f"\n{'='*80}")
        print(f"✓ COMPLETE! Created {len(saved_segments)} segments")
        print(f"{'='*80}\n")

        return saved_segments

    def _transcribe_chunk(self, audio_path: Path, idx: int, vad_seg: Dict) -> List[Dict]:
        """Transcribe one VAD chunk and return its Whisper segments.

        All timestamps in the result are shifted by the chunk start so that
        they refer to the original audio file.
        """
        chunk_start = vad_seg['start']

        # Cut the chunk out of the source file
        chunk_audio, sr = librosa.load(
            str(audio_path),
            sr=16000,
            offset=chunk_start,
            duration=vad_seg['end'] - chunk_start
        )

        # Whisper is given the chunk through a temporary wav file
        temp_audio = self.output_dir / f"temp_{idx}.wav"
        sf.write(temp_audio, chunk_audio, sr)

        results = []
        try:
            segments, info = self.whisper.transcribe(
                str(temp_audio),
                language=self.config.LANGUAGE,
                word_timestamps=True,
                beam_size=5,
                best_of=5,
                temperature=0.0,
                vad_filter=False  # VAD was already applied
            )

            for seg in segments:
                spoken = seg.text.strip()
                if not spoken:
                    continue

                begin = chunk_start + seg.start
                finish = chunk_start + seg.end

                # Word-level details, when Whisper returned any
                if getattr(seg, 'words', None):
                    word_entries = [
                        {
                            'word': w.word.strip(),
                            'start': chunk_start + w.start,
                            'end': chunk_start + w.end,
                            'probability': w.probability
                        }
                        for w in seg.words
                    ]
                else:
                    word_entries = []

                results.append({
                    'start': begin,
                    'end': finish,
                    'duration': finish - begin,
                    'text': spoken,
                    'words': word_entries,
                    'avg_logprob': getattr(seg, 'avg_logprob', 0.0)
                })
        finally:
            # The temporary file is removed whether or not transcription worked
            if temp_audio.exists():
                temp_audio.unlink()

        return results

    def _merge_short_segments(self, segments: List[Dict]) -> List[Dict]:
        """Join a segment with its successor while it is short and the gap is small.

        A segment shorter than TARGET_SEGMENT_DURATION absorbs the next one
        when less than one second separates them. Input dicts are not mutated.
        """
        if not segments:
            return []

        target = self.config.TARGET_SEGMENT_DURATION
        merged = [segments[0].copy()]

        for candidate in segments[1:]:
            tail = merged[-1]
            still_short = (tail['end'] - tail['start']) < target
            close_enough = (candidate['start'] - tail['end']) < 1.0

            if still_short and close_enough:
                tail['end'] = candidate['end']
            else:
                merged.append(candidate.copy())

        return merged

    def _validate_segments(self, segments: List[Dict]) -> List[Dict]:
        """Keep only segments that meet the duration, length and confidence limits."""
        cfg = self.config
        accepted = []

        for seg in segments:
            length = seg['duration']
            if length < cfg.MIN_SEGMENT_DURATION or length > cfg.MAX_SEGMENT_DURATION:
                continue

            transcript = seg['text'].strip()
            if not transcript:
                continue
            if len(transcript.split()) < cfg.MIN_WORDS_PER_SEGMENT:
                continue

            # Mean word probability, only when word details are present
            word_info = seg.get('words')
            if word_info:
                mean_probability = np.mean([w['probability'] for w in word_info])
                if mean_probability < cfg.MIN_CONFIDENCE:
                    continue

            accepted.append(seg)

        return accepted

    def _save_segments(
        self,
        segments: List[Dict],
        audio_path: Path,
        session_id: str,
        reference_transcript: str = None
    ) -> List[Dict]:
        """Write each segment's clip and text, then the session manifests.

        A segment that fails to save is reported and skipped. Returns the
        metadata dicts of the segments that were written.
        """
        saved_segments = []

        # Decode the source audio once and slice it per segment
        print("Loading audio for extraction...")
        audio, sr = librosa.load(str(audio_path), sr=16000, mono=True)

        for idx, seg in enumerate(tqdm(segments, desc="Saving segments")):
            segment_id = f"{session_id}_{idx:04d}"
            audio_file = self.audio_dir / f"{segment_id}.wav"
            text_file = self.text_dir / f"{segment_id}.txt"

            try:
                # Clip
                first_sample = int(seg['start'] * sr)
                last_sample = int(seg['end'] * sr)
                sf.write(audio_file, audio[first_sample:last_sample], sr)

                # Text
                clean_text = self._clean_text(seg['text'])
                with open(text_file, 'w', encoding='utf-8') as f:
                    f.write(clean_text)

                # Metadata entry
                word_info = seg.get('words')
                saved_segments.append({
                    'segment_id': segment_id,
                    'audio_filepath': f"audio/{segment_id}.wav",
                    'text_filepath': f"text/{segment_id}.txt",
                    'text': clean_text,
                    'start_time': float(seg['start']),
                    'end_time': float(seg['end']),
                    'duration': float(seg['duration']),
                    'word_count': len(clean_text.split()),
                    'language': self.config.LANGUAGE,
                    'avg_confidence': float(np.mean([w['probability'] for w in word_info])) if word_info else 0.0
                })

            except Exception as e:
                print(f"\n⚠ Error saving segment {segment_id}: {e}")

        # JSON manifest describing the whole session
        manifest = {
            'session_id': session_id,
            'audio_file': str(audio_path.name),
            'total_segments': len(saved_segments),
            'total_duration': sum(entry['duration'] for entry in saved_segments),
            'language': self.config.LANGUAGE,
            'segments': saved_segments
        }

        manifest_file = self.metadata_dir / f"{session_id}_manifest.json"
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)

        # JSONL file: one {audio, text, duration} record per line
        jsonl_file = self.metadata_dir / f"{session_id}_train.jsonl"
        with open(jsonl_file, 'w', encoding='utf-8') as f:
            for entry in saved_segments:
                record = {
                    'audio': entry['audio_filepath'],
                    'text': entry['text'],
                    'duration': entry['duration']
                }
                f.write(json.dumps(record, ensure_ascii=False) + '\n')

        print(f"\n✓ Saved manifest to: {manifest_file}")
        print(f"✓ Saved JSONL to: {jsonl_file}")

        return saved_segments

    @staticmethod
    def _clean_text(text: str) -> str:
        """Return text in NFC form with whitespace runs collapsed and ends trimmed."""
        normalized = unicodedata.normalize("NFC", text)
        return re.sub(r'\s+', ' ', normalized).strip()


# ============================================================================
# MAIN EXECUTION
# ============================================================================
def main():
    """Process every configured audio file and print an overall summary.

    A missing audio file is reported and skipped. Any other error while
    processing a file is printed with its traceback, and the loop moves on
    to the next file.
    """
    config = Config()
    aligner = WhisperAligner(config)

    # File list - MODIFY THESE PATHS FOR YOUR SETUP
    files_to_process = [
        {
            'audio_path': '/content/drive/MyDrive/dataset/Konkani Prime News_100817.wav',  # Your audio file
            'odt_path': '/content/drive/MyDrive/dataset/10 AUG PRIME_non_bold (1).odt',  # Optional: reference transcript
            'session_id': 'session_001'
        }
    ]

    all_segments = []

    for file_info in files_to_process:
        try:
            source_audio = file_info['audio_path']

            # Skip entries whose audio is not on disk
            if not os.path.exists(source_audio):
                print(f"❌ Audio file not found: {source_audio}")
                continue

            all_segments.extend(
                aligner.process_audio_file(
                    audio_path=source_audio,
                    odt_path=file_info.get('odt_path'),
                    session_id=file_info.get('session_id')
                )
            )

        except Exception as e:
            print(f"❌ Error processing {file_info['audio_path']}: {e}")
            import traceback
            traceback.print_exc()

    # Summary
    overall_duration = sum(entry['duration'] for entry in all_segments)
    print(f"\n{'='*80}")
    print(f"FINAL SUMMARY")
    print(f"{'='*80}")
    print(f"Total segments created: {len(all_segments)}")
    print(f"Total duration: {overall_duration:.2f}s")
    print(f"Output directory: {config.OUTPUT_DIR}")
    print(f"{'='*80}\n")


# ============================================================================
# RUN
# ============================================================================
if __name__ == "__main__":
    main()

Loading Silero VAD model...


Using cache found in /root/.cache/torch/hub/snakers4_silero-vad_master


✓ Silero VAD loaded
Loading Whisper model: small...
✓ Whisper loaded on cuda

PROCESSING: Konkani Prime News_100817.wav
Session: session_001

Loading transcript: 10 AUG PRIME_non_bold (1).odt
✓ Loaded 27 paragraphs, 6076 characters
Step 1: Detecting speech segments...
✓ Found 71 speech segments
✓ Merged to 37 segments

Step 2: Transcribing with Whisper...


Transcribing: 100%|██████████| 37/37 [01:00<00:00,  1.63s/it]



✓ Transcribed 75 segments

Step 3: Validating segments...
✓ 68/75 segments passed validation

Step 4: Saving segments...
Loading audio for extraction...


Saving segments: 100%|██████████| 68/68 [00:01<00:00, 38.34it/s]



✓ Saved manifest to: /content/drive/MyDrive/dataset/whisper_segments/metadata/session_001_manifest.json
✓ Saved JSONL to: /content/drive/MyDrive/dataset/whisper_segments/metadata/session_001_train.jsonl

✓ COMPLETE! Created 68 segments


FINAL SUMMARY
Total segments created: 68
Total duration: 336.08s
Output directory: /content/drive/MyDrive/dataset/whisper_segments



In [None]:
!pip install numpy librosa faster-whisper tqdm odfpy soundfile python-Levenshtein



In [None]:
import os
import re
from pathlib import Path
from odf import text, teletype
from odf.opendocument import load

class PhoneticTranscriptMatcher:
    """Replace Whisper transcript files with the best-matching ODT paragraph.

    Matching is intentionally coarse: a text is reduced to the phonetically
    normalised first letter of each word, and the leading letters of a
    transcript are compared position by position with the leading letters of
    every reference paragraph that has not been used yet.

    Args:
        whisper_dir: Folder containing the Whisper ``.txt`` transcripts.
        odt_path: Path of the ODT document holding the reference text.
        output_dir: Folder that receives the matched reference paragraphs.
        noise_threshold: Minimum word count used by ``is_noisy_transcript``.
    """

    # Vowel marks (matras) and related combining signs; they normalise to ''.
    _VOWEL_MARKS = frozenset(
        ['ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', 'ं', 'ः', '़', 'ृ', 'ॅ']
    )

    # Similar-sounding consonants are folded onto one representative letter.
    _PHONETIC_MAP = {
        # थ / ट / ठ -> ट
        'थ': 'ट', 'ठ': 'ट',
        # ध / ड / ढ -> ड
        'ध': 'ड', 'ढ': 'ड',
        # फ / प -> प
        'फ': 'प',
        # भ / ब -> ब
        'भ': 'ब',
        # छ / च -> च
        'छ': 'च',
        # झ / ज -> ज
        'झ': 'ज',
        # ख / क / घ / ग -> क
        'ख': 'क', 'घ': 'क', 'ग': 'क',
        # ण / न -> न
        'ण': 'न',
        # ष / श / स -> स
        'ष': 'स', 'श': 'स',
        # ळ / ल -> ल
        'ळ': 'ल',
    }

    # A text needs at least this many first letters to take part in matching.
    MIN_LETTERS = 3
    # At most this many leading first letters of a transcript are compared.
    SIGNATURE_LENGTH = 10
    # Matches scoring below this fraction are treated as "no good match".
    MIN_MATCH_SCORE = 0.1

    def __init__(self, whisper_dir, odt_path, output_dir, noise_threshold=10):
        self.whisper_dir = Path(whisper_dir)
        self.odt_path = Path(odt_path)
        self.output_dir = Path(output_dir)
        self.noise_threshold = noise_threshold  # Minimum word count
        self.reference_paragraphs = []
        self.used_paragraphs = set()  # Indices of paragraphs already assigned

    def read_odt(self):
        """Load the non-empty paragraphs of the ODT file.

        Every call replaces ``reference_paragraphs`` and clears
        ``used_paragraphs`` instead of appending to the previous load, so
        running the matcher twice does not duplicate the reference text.

        Returns:
            True when the document was read, False when loading failed.
        """
        print("Reading ODT reference document...")
        try:
            doc = load(self.odt_path)
            paragraphs = []
            for para in doc.getElementsByType(text.P):
                para_text = teletype.extractText(para).strip()
                if para_text:
                    paragraphs.append(para_text)
        except Exception as e:
            # Deliberately broad: any failure while opening or parsing the
            # document is reported to the caller as "could not read".
            print(f"Error reading ODT: {e}")
            return False

        self.reference_paragraphs = paragraphs
        self.used_paragraphs = set()
        print(f"Loaded {len(self.reference_paragraphs)} paragraphs from ODT")
        return True

    def normalize_marathi_phonetic(self, char):
        """Normalize a Marathi/Konkani character to its phonetic equivalent.

        Returns:
            '' for vowel marks, the representative consonant for characters
            listed in the phonetic map, otherwise the lower-cased character.
        """
        if char in self._VOWEL_MARKS:
            return ''
        return self._PHONETIC_MAP.get(char, char.lower())

    def get_first_letters(self, text):
        """Return the normalised first letter of every word in ``text``.

        Words are whitespace-separated; leading non-word characters are
        stripped before the first character is taken, and words that reduce
        to nothing are skipped.
        """
        # NOTE: the parameter name shadows the module-level ``odf.text``
        # import inside this method only; it is kept for keyword callers.
        first_letters = []
        for word in re.findall(r'\S+', text):
            clean_word = re.sub(r'^[^\w]+', '', word)
            if not clean_word:
                continue
            normalized = self.normalize_marathi_phonetic(clean_word[0])
            if normalized:  # vowel marks normalise to '' and are dropped
                first_letters.append(normalized)
        return first_letters

    def is_noisy_transcript(self, content):
        """Return True when ``content`` looks like noise.

        A transcript counts as noise when it is empty or whitespace-only, has
        fewer than ``noise_threshold`` words, or when fewer than 20% of its
        words are distinct.
        """
        if not content:
            return True

        words = content.split()
        # ``not words`` guards the division below: whitespace-only content
        # used to raise ZeroDivisionError when noise_threshold was 0.
        if not words or len(words) < self.noise_threshold:
            return True

        # Check for excessive repetition
        repetition_ratio = len(set(words)) / len(words)
        return repetition_ratio < 0.2

    def match_by_first_letters(self, transcript_text):
        """Find the unused reference paragraph whose first letters match best.

        Returns:
            Tuple ``(paragraph, score, index)``. ``score`` is the fraction of
            compared positions whose letters agree. ``(None, 0, -1)`` is
            returned when the transcript has fewer than ``MIN_LETTERS``
            letters or no paragraph scores above zero.
        """
        transcript_letters = self.get_first_letters(transcript_text)
        if len(transcript_letters) < self.MIN_LETTERS:
            return None, 0, -1

        transcript_signature = transcript_letters[:self.SIGNATURE_LENGTH]
        compare_length = len(transcript_signature)

        best_match = None
        best_score = 0
        best_idx = -1

        for idx, ref_para in enumerate(self.reference_paragraphs):
            # Each reference paragraph may be assigned to one transcript only.
            if idx in self.used_paragraphs:
                continue

            ref_letters = self.get_first_letters(ref_para)
            if len(ref_letters) < self.MIN_LETTERS:
                continue

            # zip stops at the shorter sequence; positions missing from a
            # short paragraph count as mismatches because the divisor is
            # always the transcript's signature length.
            matches = sum(
                1
                for letter, ref_letter in zip(transcript_signature, ref_letters)
                if letter == ref_letter
            )
            score = matches / compare_length

            if score > best_score:
                best_score = score
                best_match = ref_para
                best_idx = idx

        return best_match, best_score, best_idx

    def process_transcripts(self):
        """Match every transcript file and write the reference text for it.

        For each ``.txt`` file in ``whisper_dir`` (searched recursively only
        when the top level has none) the best-matching unused paragraph is
        written to ``output_dir`` under the same file name. Files without a
        good match are skipped. A summary is printed at the end.
        """
        if not self.read_odt():
            print("Failed to read ODT file. Exiting.")
            return

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Check if directory exists
        if not self.whisper_dir.exists():
            print(f"ERROR: Directory not found: {self.whisper_dir}")
            return

        # Prefer files directly inside the folder; otherwise search subfolders.
        # NOTE(review): output names use only the file name, so equally named
        # files from different subfolders would overwrite each other.
        transcript_files = list(self.whisper_dir.glob('*.txt'))
        if not transcript_files:
            transcript_files = list(self.whisper_dir.glob('**/*.txt'))

        if not transcript_files:
            print(f"\nNo .txt files found in: {self.whisper_dir}")
            print("Contents of directory:")
            try:
                for item in self.whisper_dir.iterdir():
                    print(f"  - {item.name}")
            except OSError as e:
                print(f"  Cannot read directory: {e}")
            return

        print(f"\nFound {len(transcript_files)} transcript files")

        processed = 0
        # No noise filtering is applied in this method, so this counter stays
        # at 0; it is kept so the summary output keeps its shape.
        skipped_noise = 0
        no_match = 0

        for txt_file in sorted(transcript_files):
            print(f"\nProcessing: {txt_file.name}")

            # Read transcript
            try:
                transcript_content = txt_file.read_text(encoding='utf-8')
            except (OSError, UnicodeError) as e:
                print(f"  ⚠️  Error reading file: {e}")
                continue

            # Match based on first letter sounds (all files, no noise filtering)
            matched_text, score, idx = self.match_by_first_letters(transcript_content)

            # First letters, for display only
            trans_letters = ''.join(self.get_first_letters(transcript_content)[:15])

            if matched_text is None or score < self.MIN_MATCH_SCORE:
                print(f"  ⚠️  No good match (score: {score:.2f}, letters: {trans_letters})")
                no_match += 1
                continue

            ref_letters = ''.join(self.get_first_letters(matched_text)[:15])
            print(f"  ✓ Match found (score: {score:.2f}, para: {idx})")
            print(f"    Transcript letters: {trans_letters}")
            print(f"    Reference letters:  {ref_letters}")

            # Mark this paragraph as used
            self.used_paragraphs.add(idx)

            # Output only the reference text from the ODT
            output_file = self.output_dir / txt_file.name
            output_file.write_text(matched_text, encoding='utf-8')

            processed += 1

        # Summary
        print("\n" + "="*60)
        print("PROCESSING SUMMARY")
        print("="*60)
        print(f"Total files found: {len(transcript_files)}")
        print(f"Successfully processed: {processed}")
        print(f"Skipped (noise): {skipped_noise}")
        print(f"No good match: {no_match}")
        print(f"Unique paragraphs used: {len(self.used_paragraphs)}/{len(self.reference_paragraphs)}")
        print(f"\nOutput saved to: {self.output_dir}")


# Usage
if __name__ == "__main__":
    # Input/output locations (all under /content/drive/MyDrive):
    #   WHISPER_DIR - folder scanned for Whisper .txt transcripts
    #   ODT_FILE    - reference document whose paragraphs are matched
    #   OUTPUT_DIR  - folder that receives the matched reference text
    WHISPER_DIR = "/content/drive/MyDrive/dataset/whisper_segments/text"
    ODT_FILE = "/content/drive/MyDrive/dataset/10 AUG PRIME_non_bold (1).odt"
    OUTPUT_DIR = "/content/drive/MyDrive/dataset/final_text"

    # Build the matcher, then match every transcript file and print a summary.
    matcher = PhoneticTranscriptMatcher(
        whisper_dir=WHISPER_DIR,
        odt_path=ODT_FILE,
        output_dir=OUTPUT_DIR,
        noise_threshold=10  # Minimum words for is_noisy_transcript(); process_transcripts() does not call that check
    )

    matcher.process_transcripts()

Reading ODT reference document...
Loaded 27 paragraphs from ODT

Found 68 transcript files

Processing: session_001_0000.txt
  ✓ Match found (score: 0.50, para: 0)
    Transcript letters: नबपअ
    Reference letters:  नपपक

Processing: session_001_0001.txt
  ✓ Match found (score: 0.10, para: 1)
    Transcript letters: तवदकबवतआटडकसअउप
    Reference letters:  टनडकवटटडकसउपउलस

Processing: session_001_0002.txt
  ✓ Match found (score: 0.20, para: 8)
    Transcript letters: कसबआबकपवपजततपमब
    Reference letters:  कवटआटडसऑकपउसआकन

Processing: session_001_0003.txt
  ✓ Match found (score: 0.30, para: 9)
    Transcript letters: असकसकरबपककककललउ
    Reference letters:  कपकककपनसकहकमबअस

Processing: session_001_0004.txt
  ✓ Match found (score: 0.30, para: 4)
    Transcript letters: आजतममदलदचसवतहकत
    Reference letters:  आजममदडकवहबदललएप

Processing: session_001_0005.txt
  ✓ Match found (score: 0.20, para: 7)
    Transcript letters: चकबदवसउआमतहन
    Reference letters:  बपयबदएआएमतकपडआ

Processing: sess

In [None]:
!pip install odfpy


Collecting odfpy
  Downloading odfpy-1.4.1.tar.gz (717 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/717.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.0/717.0 kB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: odfpy
  Building wheel for odfpy (setup.py) ... [?25l[?25hdone
  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160673 sha256=4ac8bd269b5e770819bec1763e8a0525afdf4ee7d67515975dfb130d9ce8b277
  Stored in directory: /root/.cache/pip/wheels/36/5d/63/8243a7ee78fff0f944d638fd0e66d7278888f5e2285d7346b6
Successfully built odfpy
Installing collected packages: odfpy
Successfully installed odfpy-1.4.1


In [None]:
import requests
import json
import os

# Smoke test for the OpenRouter chat-completions endpoint: send one short
# prompt and print the raw response plus the assistant's reply.
#
# The key is read from the environment so that no credential is stored in the
# notebook. Any key that was ever committed to version control must be treated
# as compromised and rotated.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
if not API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; export it before running this cell."
    )

url = "https://openrouter.ai/api/v1/chat/completions"

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "HTTP-Referer": "http://localhost",  # optional, but recommended
    "X-Title": "Gemini Test Script",      # optional
    "Content-Type": "application/json"
}

data = {
    "model": "google/gemini-2.5-flash-lite",
    "messages": [
        {"role": "user", "content": "Hello! Test message. What is 2+2?"}
    ]
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.post(url, headers=headers, data=json.dumps(data), timeout=30)

print("Status:", response.status_code)
print("Response:")
print(response.text)

try:
    print("\nAssistant reply:")
    print(json.loads(response.text)["choices"][0]["message"]["content"])
except (ValueError, KeyError, IndexError, TypeError):
    # ValueError: body is not JSON; the others: JSON without the expected
    # choices/message/content structure (e.g. an error payload).
    print("Could not parse response.")


Status: 200
Response:
{"id":"gen-1761842604-WsSfAkC7SjgZvLVTZIT5","provider":"Google","model":"google/gemini-2.5-flash-lite","object":"chat.completion","created":1761842604,"choices":[{"logprobs":null,"finish_reason":"stop","native_finish_reason":"STOP","index":0,"message":{"role":"assistant","content":"Hello! Test message acknowledged.\n\n2 + 2 = **4**","refusal":null,"reasoning":null}}],"usage":{"prompt_tokens":12,"completion_tokens":15,"total_tokens":27,"prompt_tokens_details":{"cached_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"image_tokens":0}}}

Assistant reply:
Hello! Test message acknowledged.

2 + 2 = **4**


In [None]:
import os
import json
import requests
from pathlib import Path
from datetime import datetime
import re
from collections import deque

class SemanticWhisperAligner:
    def __init__(self, gemini_api_key, whisper_folder, odt_path, output_folder):
        """
        Semantic aligner that matches based on meaning, not word count.
        Now includes validation to skip noise/null data.

        Args:
            gemini_api_key: API key for Gemini via OpenRouter
            whisper_folder: Path to folder with Whisper segments
            odt_path: Path to ODT file with correct transcript
            output_folder: Path to save aligned correct transcripts
        """
        self.gemini_api_key = gemini_api_key
        self.whisper_folder = Path(whisper_folder)
        self.odt_path = Path(odt_path)
        self.output_folder = Path(output_folder)
        self.url = "https://openrouter.ai/api/v1/chat/completions"

        # Validate API key
        if not self.gemini_api_key:
            raise ValueError("ERROR: GEMINI_API_KEY is not set!")

        print(f"✓ API Key loaded: {self.gemini_api_key[:8]}...{self.gemini_api_key[-4:]}")

        # Create output folder
        self.output_folder.mkdir(parents=True, exist_ok=True)

        # Progress tracking
        self.progress_file = self.output_folder / "alignment_progress.json"
        self.progress = self._load_progress()

        # Read ODT as full text (not split into words yet)
        self.odt_full_text = self._read_odt()
        self.odt_words = self.odt_full_text.split()
        self.current_position = self.progress.get("current_position", 0)

        print(f"✓ Loaded {len(self.odt_words)} words from ODT reference")

    def _load_progress(self):
        """Load or create progress tracker"""
        if self.progress_file.exists():
            with open(self.progress_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                # Ensure all required keys exist
                data.setdefault("processed_files", [])
                data.setdefault("skipped_files", [])
                data.setdefault("current_position", 0)
                data.setdefault("alignment_log", [])
                if "statistics" not in data:
                    data["statistics"] = {}
                stats = data["statistics"]
                stats.setdefault("total_files", 0)
                stats.setdefault("exact_match", 0)
                stats.setdefault("semantic_match", 0)
                stats.setdefault("fallback", 0)
                stats.setdefault("skipped_noise", 0)
                stats.setdefault("failed_validation", 0)
                stats.setdefault("words_used", 0)
                stats.setdefault("words_remaining", 0)
                return data
        return {
            "processed_files": [],
            "skipped_files": [],
            "current_position": 0,
            "alignment_log": [],
            "statistics": {
                "total_files": 0,
                "exact_match": 0,
                "semantic_match": 0,
                "fallback": 0,
                "skipped_noise": 0,
                "failed_validation": 0,
                "words_used": 0,
                "words_remaining": 0
            }
        }

    def _save_progress(self):
        """Save progress"""
        self.progress["current_position"] = self.current_position
        self.progress["statistics"]["words_remaining"] = len(self.odt_words) - self.current_position
        self.progress["last_updated"] = datetime.now().isoformat()

        with open(self.progress_file, 'w', encoding='utf-8') as f:
            json.dump(self.progress, f, ensure_ascii=False, indent=2)

    def _read_odt(self):
        """Read ODT and return as full text"""
        try:
            from odf import text, teletype
            from odf.opendocument import load

            doc = load(self.odt_path)
            all_text = []

            for paragraph in doc.getElementsByType(text.P):
                para_text = teletype.extractText(paragraph)
                if para_text.strip():
                    all_text.append(para_text.strip())

            full_text = " ".join(all_text)
            return full_text

        except ImportError:
            print("ERROR: odfpy not installed. Run: pip install odfpy")
            return ""
        except Exception as e:
            print(f"ERROR reading ODT: {e}")
            return ""

    def get_odt_context(self, num_words=50):
        """Get next N words from current position as context"""
        end_pos = min(self.current_position + num_words, len(self.odt_words))
        return ' '.join(self.odt_words[self.current_position:end_pos])

    def normalize_text(self, text):
        """Return ``text`` in a canonical form for comparison.

        Danda, comma, semicolon, exclamation mark, question mark, full stop
        and hyphen are removed, runs of whitespace collapse to single spaces,
        and the result is lower-cased.
        """
        without_punctuation = re.sub(r'[।,;!?\.\-]', '', text)
        # split()/join() also trims the ends, so no separate strip is needed.
        return ' '.join(without_punctuation.split()).lower()

    def validate_whisper_input(self, whisper_text):
        """
        Validate if whisper input is meaningful Konkani/Marathi text or just noise.

        Returns:
            Tuple of (is_valid, confidence, reason)
        """
        # Quick checks for obvious noise
        if not whisper_text or len(whisper_text.strip()) < 5:
            return False, 0.0, "Empty or too short"

        # Check for excessive repetition
        words = whisper_text.split()
        if len(words) > 3 and len(set(words)) == 1:
            return False, 0.0, "Repetitive noise"

        # Use Gemini to validate
        prompt = f"""Analyze if this text is meaningful Konkani or Marathi speech, or just noise/null data.

**Text to analyze:**
{whisper_text}

**Instructions:**
1. Check if this contains actual Konkani/Marathi words and phrases
2. Identify if it's just noise, silence markers, or random characters
3. Look for linguistic patterns that indicate real speech

**Respond with ONLY ONE of these:**
- VALID: Meaningful Konkani/Marathi text
- NOISE: Background noise, silence, or random sounds
- PARTIAL: Mix of valid words and noise

**Response:**"""

        headers = {
            "Authorization": f"Bearer {self.gemini_api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": "google/gemini-2.0-flash-exp:free",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1
        }

        try:
            response = requests.post(self.url, headers=headers, json=data, timeout=30)

            if response.status_code == 200:
                result = response.json()
                validation = result['choices'][0]['message']['content'].strip().upper()

                if "VALID" in validation and "NOISE" not in validation:
                    return True, 0.9, "Valid Konkani/Marathi text"
                elif "PARTIAL" in validation:
                    return True, 0.6, "Partial valid content"
                else:
                    return False, 0.2, "Detected as noise/invalid"
            else:
                print(f"  ⚠ Validation API Error: {response.status_code}")
                return True, 0.5, "API error - proceeding with caution"

        except Exception as e:
            print(f"  ⚠ Validation Exception: {str(e)[:100]}")
            return True, 0.5, "Validation failed - proceeding with caution"

    def validate_output_alignment(self, whisper_text, aligned_text):
        """
        Validate that the aligned output actually matches the whisper input semantically.

        Returns:
            Tuple of (is_valid, confidence, reason)
        """
        prompt = f"""Compare these two texts and determine if they represent the same content in Konkani/Marathi.

**Whisper Transcription (may have errors):**
{whisper_text}

**Aligned ODT Text (reference):**
{aligned_text}

**Task:**
Determine if these texts convey the SAME meaning/content, accounting for:
- Spelling variations and transcription errors
- Word boundary differences
- Minor phonetic variations

**Respond with ONLY ONE of these:**
- MATCH: Texts represent the same content
- MISMATCH: Texts are completely different content
- UNCERTAIN: Cannot determine clearly

**Response:**"""

        headers = {
            "Authorization": f"Bearer {self.gemini_api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": "google/gemini-2.0-flash-exp:free",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1
        }

        try:
            response = requests.post(self.url, headers=headers, json=data, timeout=30)

            if response.status_code == 200:
                result = response.json()
                validation = result['choices'][0]['message']['content'].strip().upper()

                if "MATCH" in validation and "MISMATCH" not in validation:
                    return True, 0.9, "Semantic match confirmed"
                elif "UNCERTAIN" in validation:
                    return True, 0.6, "Uncertain but proceeding"
                else:
                    return False, 0.2, "Semantic mismatch detected"
            else:
                print(f"  ⚠ Output Validation API Error: {response.status_code}")
                return True, 0.5, "API error - cannot validate"

        except Exception as e:
            print(f"  ⚠ Output Validation Exception: {str(e)[:100]}")
            return True, 0.5, "Validation failed - cannot verify"

    def semantic_align(self, whisper_text):
        """
        Use Gemini to find the matching ODT text based on MEANING, not word count.

        Returns:
            Tuple of (matched_odt_text, word_count_used, confidence)
        """
        # Get a large context window from ODT (next 100 words)
        odt_context = self.get_odt_context(100)

        prompt = f"""You are aligning a noisy Whisper transcription with clean reference text in Konkani.

**Whisper Segment (noisy, may have errors):**
{whisper_text}

**Reference ODT Text (next 100 words from current position):**
{odt_context}

**Task:**
1. Find the EXACT portion of the ODT text that corresponds to the Whisper segment
2. The Whisper text has errors: wrong word boundaries, spelling mistakes, etc.
3. Focus on MEANING and PHONETICS, not word count
4. Extract and return ONLY the matching portion from ODT text
5. Do NOT add or change anything - extract the exact matching text from ODT

**Important:**
- Whisper has {len(whisper_text.split())} words, but ODT match may have different word count
- Return ONLY the matched ODT text, nothing else

**Matched ODT Text:**"""

        headers = {
            "Authorization": f"Bearer {self.gemini_api_key}",
            "Content-Type": "application/json"
        }

        data = {
            "model": "google/gemini-2.5-flash-lite",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1
        }

        try:
            response = requests.post(self.url, headers=headers, json=data, timeout=60)

            if response.status_code == 200:
                result = response.json()
                matched_text = result['choices'][0]['message']['content'].strip()

                # Remove markdown formatting if present
                matched_text = matched_text.replace('**', '').replace('*', '')
                matched_text = matched_text.strip()

                if not matched_text or len(matched_text) < 10:
                    return None, 0, 0

                # Calculate how many ODT words were matched
                word_count = len(matched_text.split())

                # Verify the match exists in ODT context
                if self.normalize_text(matched_text) in self.normalize_text(odt_context):
                    return matched_text, word_count, 0.9
                else:
                    # Try to find best overlap
                    return matched_text, word_count, 0.7

            else:
                error_msg = response.text[:200] if response.text else "No response"
                print(f"  ⚠ API Error {response.status_code}: {error_msg}")
                return None, 0, 0

        except requests.exceptions.Timeout:
            print(f"  ⚠ API Timeout")
            return None, 0, 0
        except Exception as e:
            print(f"  ⚠ Exception: {str(e)[:100]}")
            return None, 0, 0

    def align_segment(self, whisper_text, filename):
        """
        Align a single whisper segment with ODT text.
        Now includes input validation and output verification.

        Returns:
            Tuple of (aligned_text, method_used, confidence, words_consumed, skip_reason)
        """
        whisper_words = whisper_text.split()
        num_whisper_words = len(whisper_words)

        print(f"\n  {'─'*66}")
        print(f"  📄 Whisper Input:")
        print(f"  Text: '{whisper_text[:100]}{'...' if len(whisper_text) > 100 else ''}'")
        print(f"  Words: {num_whisper_words}")
        print(f"  ODT Position: {self.current_position}/{len(self.odt_words)}")

        # VALIDATION STEP 1: Check if whisper input is valid
        print(f"\n  🔍 Validating input...")
        is_valid, val_confidence, val_reason = self.validate_whisper_input(whisper_text)

        if not is_valid:
            print(f"  ❌ SKIPPING: {val_reason}")
            print(f"  Confidence: {val_confidence:.1%}")
            print(f"  {'─'*66}")
            return None, "skipped_noise", 0, 0, val_reason

        print(f"  ✓ Input validated: {val_reason} (confidence: {val_confidence:.1%})")

        # Check if we have enough ODT words left
        if self.current_position >= len(self.odt_words):
            print(f"  ⚠ No more ODT words available!")
            print(f"  {'─'*66}")
            return None, "no_words", 0, 0, "No ODT words remaining"

        # STEP 1: Try exact match with same word count
        odt_same_count = ' '.join(self.odt_words[self.current_position:self.current_position + num_whisper_words])

        if self.normalize_text(whisper_text) == self.normalize_text(odt_same_count):
            aligned_text = odt_same_count
            method = "exact"
            confidence = 1.0
            words_consumed = num_whisper_words
        else:
            # STEP 2: Use semantic alignment with Gemini
            print(f"  🤖 Using semantic alignment...")
            matched_text, word_count, confidence = self.semantic_align(whisper_text)

            if matched_text and word_count > 0:
                aligned_text = matched_text
                method = "semantic"
                words_consumed = word_count
            else:
                # STEP 3: Fallback - use same word count
                aligned_text = odt_same_count
                method = "fallback"
                confidence = 0.3
                words_consumed = num_whisper_words

        # VALIDATION STEP 2: Verify output alignment
        print(f"\n  🔍 Validating output alignment...")
        output_valid, out_confidence, out_reason = self.validate_output_alignment(whisper_text, aligned_text)

        if not output_valid:
            print(f"  ❌ OUTPUT VALIDATION FAILED: {out_reason}")
            print(f"  Confidence: {out_confidence:.1%}")
            print(f"  {'─'*66}")
            return None, "failed_validation", 0, 0, out_reason

        print(f"  ✓ Output validated: {out_reason} (confidence: {out_confidence:.1%})")

        # Print alignment results
        print(f"\n  ✓ Match Type: {method.upper()}")
        print(f"  📝 ODT Output: '{aligned_text[:100]}{'...' if len(aligned_text) > 100 else ''}'")

        if method == "semantic":
            print(f"\n  🔍 WORD COUNT ANALYSIS:")
            print(f"  ┌──────────────────────────────────────────────┐")
            print(f"  │ Whisper: {num_whisper_words:3d} words → ODT: {words_consumed:3d} words           │")

            if words_consumed < num_whisper_words:
                diff = num_whisper_words - words_consumed
                print(f"  │ ⚠ Whisper split {diff} word(s) incorrectly    │")
            elif words_consumed > num_whisper_words:
                diff = words_consumed - num_whisper_words
                print(f"  │ ⚠ Whisper merged {diff} word(s) incorrectly   │")
            else:
                print(f"  │ ✓ Word counts match                          │")

            print(f"  │ ✓ Consuming {words_consumed} ODT words (CORRECT)       │")
            print(f"  └──────────────────────────────────────────────┘")

        print(f"  Words consumed: {words_consumed}")
        print(f"  Confidence: {confidence:.2%}")
        print(f"  {'─'*66}")

        return aligned_text, method, confidence, words_consumed, None

    def get_whisper_files(self):
        """Get all whisper files sorted"""
        files = list(self.whisper_folder.glob("*.txt"))

        def sort_key(filepath):
            match = re.search(r'session_(\d+)_(\d+)', filepath.name)
            if match:
                return (int(match.group(1)), int(match.group(2)))
            return (0, 0)

        files.sort(key=sort_key)

        # Print files found
        print(f"\n{'='*70}")
        print(f"FILES FOUND IN INPUT DIRECTORY")
        print(f"{'='*70}")
        print(f"Directory: {self.whisper_folder}")
        print(f"Total files found: {len(files)}\n")

        if files:
            print("First 10 files:")
            for i, file in enumerate(files[:10], 1):
                size = file.stat().st_size
                print(f"  {i:2d}. {file.name:<30s} ({size:,} bytes)")

            if len(files) > 10:
                print(f"  ... and {len(files) - 10} more files")
        else:
            print("  ⚠ No .txt files found!")

        print(f"{'='*70}\n")
        return files

    def process_all_files(self, skip_processed=True):
        """Process all whisper segments with validation"""
        whisper_files = self.get_whisper_files()
        self.progress["statistics"]["total_files"] = len(whisper_files)

        print(f"\n{'='*70}")
        print(f"SEMANTIC WHISPER-ODT ALIGNMENT (with Validation)")
        print(f"{'='*70}")
        print(f"Total Whisper segments: {len(whisper_files)}")
        print(f"Total ODT words: {len(self.odt_words)}")
        print(f"Starting position: {self.current_position}")
        print(f"Words remaining: {len(self.odt_words) - self.current_position}")
        print(f"{'='*70}\n")

        for i, whisper_file in enumerate(whisper_files):
            filename = whisper_file.name

            # Skip if processed
            if skip_processed and filename in self.progress["processed_files"]:
                print(f"[{i+1}/{len(whisper_files)}] ⊘ Skipping {filename} (already processed)")
                continue

            # Skip if already marked as skipped
            if skip_processed and filename in self.progress["skipped_files"]:
                print(f"[{i+1}/{len(whisper_files)}] ⊘ Skipping {filename} (marked as noise/invalid)")
                continue

            print(f"\n{'█'*70}")
            print(f"[{i+1}/{len(whisper_files)}] Processing: {filename}")
            print(f"{'█'*70}")

            # Read whisper text
            with open(whisper_file, 'r', encoding='utf-8') as f:
                whisper_text = f.read().strip()

            # Align segment (with validation)
            aligned_text, method, confidence, words_consumed, skip_reason = self.align_segment(whisper_text, filename)

            # Handle skipped files
            if method == "skipped_noise":
                print(f"\n  🚫 File SKIPPED - Reason: {skip_reason}")
                self.progress["skipped_files"].append(filename)
                self.progress["statistics"]["skipped_noise"] += 1
                self.progress["alignment_log"].append({
                    "file": filename,
                    "method": method,
                    "reason": skip_reason,
                    "status": "skipped"
                })
                self._save_progress()
                continue

            # Handle failed validation
            if method == "failed_validation":
                print(f"\n  ❌ File FAILED VALIDATION - Reason: {skip_reason}")
                self.progress["skipped_files"].append(filename)
                self.progress["statistics"]["failed_validation"] += 1
                self.progress["alignment_log"].append({
                    "file": filename,
                    "method": method,
                    "reason": skip_reason,
                    "status": "failed_validation"
                })
                self._save_progress()
                continue

            # Save successfully aligned text
            if aligned_text and words_consumed > 0:
                # Save aligned text
                output_file = self.output_folder / filename
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(aligned_text)

                print(f"\n  💾 Saved to: {output_file.name}")

                # Update position
                self.current_position += words_consumed
                self.progress["statistics"]["words_used"] = self.current_position

                print(f"  📊 ODT Position: {self.current_position}/{len(self.odt_words)} ({len(self.odt_words) - self.current_position} remaining)")

                # Show comparison
                print(f"\n  📋 COMPARISON:")
                print(f"  {'─'*66}")
                print(f"  Whisper words:     {len(whisper_text.split()):3d}")
                print(f"  ODT words matched: {len(aligned_text.split()):3d}")
                print(f"  ODT words consumed: {words_consumed:3d}")
                print(f"  Method:            {method.upper()}")
                print(f"  Confidence:        {confidence:.1%}")
                print(f"  {'─'*66}")

                # Log alignment
                self.progress["alignment_log"].append({
                    "file": filename,
                    "method": method,
                    "confidence": confidence,
                    "whisper_words": len(whisper_text.split()),
                    "odt_words": len(aligned_text.split()),
                    "odt_consumed": words_consumed,
                    "position": self.current_position,
                    "status": "success"
                })

                # Update statistics
                self.progress["processed_files"].append(filename)
                if method == "exact":
                    self.progress["statistics"]["exact_match"] += 1
                elif method == "semantic":
                    self.progress["statistics"]["semantic_match"] += 1
                else:
                    self.progress["statistics"]["fallback"] += 1

                self._save_progress()

            # Warn if running low
            remaining = len(self.odt_words) - self.current_position
            if remaining < 100:
                print(f"\n{'⚠'*35}")
                print(f"⚠ WARNING: Only {remaining} ODT words remaining!")
                print(f"{'⚠'*35}")

        self._print_statistics()
        self._print_detailed_log()

    def _print_statistics(self):
        """Print statistics"""
        stats = self.progress["statistics"]
        total_processed = stats['exact_match'] + stats['semantic_match'] + stats['fallback']

        print(f"\n{'='*70}")
        print("ALIGNMENT STATISTICS")
        print(f"{'='*70}")
        print(f"Total Files:        {stats['total_files']}")
        print(f"\n✓ Successfully Processed: {total_processed}")
        print(f"  Exact Matches:      {stats['exact_match']:3d} ({stats['exact_match']/max(total_processed,1)*100:5.1f}%)")
        print(f"  Semantic Matches:   {stats['semantic_match']:3d} ({stats['semantic_match']/max(total_processed,1)*100:5.1f}%)")
        print(f"  Fallback:           {stats['fallback']:3d} ({stats['fallback']/max(total_processed,1)*100:5.1f}%)")
        print(f"\n❌ Skipped/Failed:")
        print(f"  Noise/Invalid:      {stats['skipped_noise']:3d}")
        print(f"  Failed Validation:  {stats['failed_validation']:3d}")
        print(f"\n📊 ODT Progress:")
        print(f"  Words Used:         {stats['words_used']:,}")
        print(f"  Words Remaining:    {stats['words_remaining']:,}")
        print(f"{'='*70}\n")

    def _print_detailed_log(self):
        """Print detailed log"""
        if not self.progress["alignment_log"]:
            return

        print(f"\n{'='*70}")
        print("DETAILED ALIGNMENT LOG (Last 10 files)")
        print(f"{'='*70}\n")

        recent_logs = self.progress["alignment_log"][-10:]

        for i, log in enumerate(recent_logs, 1):
            status_icon = "✓" if log.get("status") == "success" else "❌"
            print(f"{status_icon} {i}. {log['file']}")
            print(f"   Method: {log['method'].upper():<12s}", end="")

            if log.get("status") == "success":
                print(f" Confidence: {log.get('confidence', 0):.1%}")
                print(f"   Whisper: {log.get('whisper_words')} words → ODT: {log.get('odt_words')} words (consumed {log.get('odt_consumed')})")
                print(f"   Position: {log.get('position')}")
            else:
                print(f" Reason: {log.get('reason', 'Unknown')}")
            print()

        print(f"{'='*70}\n")


# Example usage
if __name__ == "__main__":
    # Dependency: odfpy (in a notebook: !pip install odfpy)

    # The API key is taken from the environment; the three paths point at the
    # Whisper transcripts, the ODT reference text and the output folder.
    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
    WHISPER_FOLDER = "/content/drive/MyDrive/dataset/whisper_segments/text"
    ODT_PATH = "/content/drive/MyDrive/dataset/10 AUG PRIME_non_bold (1).odt"
    OUTPUT_FOLDER = "/content/drive/MyDrive/dataset/corrected_segments"

    aligner = SemanticWhisperAligner(
        gemini_api_key=GEMINI_API_KEY,
        whisper_folder=WHISPER_FOLDER,
        odt_path=ODT_PATH,
        output_folder=OUTPUT_FOLDER,
    )

    # skip_processed=True: files already listed in the saved progress are
    # skipped, so the run can be resumed.
    aligner.process_all_files(skip_processed=True)

    print(
        "\n✓ Alignment complete!",
        f"  Input (Whisper): {WHISPER_FOLDER}",
        f"  Output (Aligned): {OUTPUT_FOLDER}",
        "\n📊 Check 'alignment_progress.json' for detailed logs",
        sep="\n",
    )

✓ API Key loaded: sk-or-v1...e4cf
✓ Loaded 832 words from ODT reference

FILES FOUND IN INPUT DIRECTORY
Directory: /content/drive/MyDrive/dataset/whisper_segments/text
Total files found: 68

First 10 files:
   1. session_001_0000.txt           (75 bytes)
   2. session_001_0001.txt           (509 bytes)
   3. session_001_0002.txt           (475 bytes)
   4. session_001_0003.txt           (431 bytes)
   5. session_001_0004.txt           (243 bytes)
   6. session_001_0005.txt           (145 bytes)
   7. session_001_0006.txt           (295 bytes)
   8. session_001_0007.txt           (252 bytes)
   9. session_001_0008.txt           (208 bytes)
  10. session_001_0009.txt           (99 bytes)
  ... and 58 more files


SEMANTIC WHISPER-ODT ALIGNMENT (with Validation)
Total Whisper segments: 68
Total ODT words: 832
Starting position: 0
Words remaining: 832


██████████████████████████████████████████████████████████████████████
[1/68] Processing: session_001_0000.txt
███████████████████████████

KeyboardInterrupt: 

In [None]:
# Set your API key (after revoking the old one!)
# SECURITY: a real key used to be hard-coded on this line and was saved with
# the notebook, so it must be treated as compromised and revoked. The key is
# now read interactively so it never ends up in the notebook source or in the
# stored cell output.
import os
from getpass import getpass

os.environ["GEMINI_API_KEY"] = getpass("GEMINI_API_KEY: ")


env: GEMINI_API_KEY=<redacted>


In [2]:
from odf.opendocument import load
from odf import text, teletype
import re

def count_konkani_words_from_odt(file_path):
    """Count the word tokens found in the paragraphs of an ODT document.

    A token is either a maximal run of characters from the Devanagari
    Unicode block (U+0900-U+097F) or a maximal run of ASCII letters/digits.

    Args:
        file_path: Path of the ODT file to read.

    Returns:
        Tuple ``(count, words)``: the number of tokens and the tokens
        themselves, in document order.
    """
    document = load(file_path)

    # Flatten each paragraph to plain text and drop the blank ones.
    stripped_paragraphs = (
        teletype.extractText(paragraph).strip()
        for paragraph in document.getElementsByType(text.P)
    )
    full_text = " ".join(chunk for chunk in stripped_paragraphs if chunk)

    # Konkani (Devanagari) runs, or English/number runs.
    tokens = re.findall(r'[\u0900-\u097F]+|[a-zA-Z0-9]+', full_text)
    return len(tokens), tokens

# Example usage
file_path = "/content/drive/MyDrive/dataset/10 AUG PRIME_non_bold (1).odt"
count, words = count_konkani_words_from_odt(file_path)

print("Total Konkani Words:", count)
# The slice bound equals the total reported by the recorded run (844), so the
# "sample" is in practice the whole token list for this document.
print("Sample:", words[:844])


Total Konkani Words: 844
Sample: ['नमस्कार', 'पळोवया', 'प्रुडंट', 'खबरो', 'टॅंकरवाल्यांक', 'ना', 'धरबांद', 'गोंयभरच्या', 'वॉटर', 'टॅंकरांचें', 'ट्रान्स्पोर्ट', 'डिपोर्टमेन्ट', 'करतलो', 'सेफ्टीऑडिट', 'उदका', 'पुरवणे', 'उपरांत', 'लोकांकडच्यान', 'सय', 'घेवपाची', 'सिस्टम', 'प्रुडंटाचे', 'ऑपा', 'वॉट', 'टॅंकर', 'एक्स्पोजेचो', 'इम्पॅक्ट', 'कळसाभंडुरा', 'प्रकल्पाक', 'केंद्राची', 'परवानगी', 'ना', 'प्रॉजॅक्ट', 'जाला', 'म्हण्टा', 'ती', 'फट', 'म्हादय', 'बचाव', 'अभियानान', 'कर्नाटकाक', 'सुप्रिम', 'कोर्टांत', 'केलें', 'एक्स्पोज', 'कोर्टान', 'मागलें', 'केंद्र', 'कर्नाटकाकडच्यान', 'एफिडॅव्हिट', 'एक्टिव्हिस्टांचो', 'कॉंग्रेसीक', 'तेंको', 'रायबंदरा', 'पर्रीकारान', 'कांयच', 'कामां', 'केल्लीं', 'ना', 'लायटउदकाच्यो', 'कटकटी', 'गिरीशाचो', 'रायबंदरा', 'प्रचार', 'पर्रीकार', 'खंय', 'सगळ्यात', 'भ्रष्ट', 'मुख्यमंत्री', 'आयरीशाक', 'जीतो', 'मारता', 'म्हूण', 'दिल्ले', 'धमकेचे', 'केशींत', 'विश्वजीतान', 'हायकोर्टांत', 'भल्लो', 'देड', 'लाख', 'लीगल', 'एड', 'फंड', 'चार्जशीट', 'रद्द', 'केस', 'बंद', '10', 'वर्सां', 'उपरां

In [3]:
import os

# path to your folder containing txt files
folder_path = "/content/drive/MyDrive/dataset/whisper_segments/text"

# os.listdir() returns names in arbitrary order; sorting keeps the segments in
# sequence and matches how load_segments_from_folder() walks the same folder.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):  # only .txt files
        continue

    file_path = os.path.join(folder_path, file_name)

    # open file with utf-8 encoding (the transcripts are Devanagari text)
    with open(file_path, 'r', encoding='utf-8') as file:
        print(f"--- {file_name} ---")
        content = file.read()
        print(content)
        print("\n")  # spacing between files


--- session_001_0000.txt ---
नमस्कार बोई प्रुद अंखोब्रों


--- session_001_0001.txt ---
तेंकर वाल्यक्ना दर्बान, गवाई भर्चा वाटर तेंकर आचा ट्रास्पोट धिपार्ट्मन करतोला सिझ्टी अडिट, उद्का पर वने अप्रान्त लगा कर च्यान से ग्योपाची सिस्टम प्रुडन्दाचा अपा वाटर तेंकर अप्स्पोजेच अ इंपैक.


--- session_001_0002.txt ---
कर सा बन्दूर आप्रकल बाख केंद्राजी पर वान्गिना, प्रोज्यक्त जालम ते ती फुट, माधे बचा अभियनान करनाटकाक सुप्रिम कोटान, के लें अप्स्पोस कोटान मागलें केंद्र करनाटकाक गड्जाने अप्टीडेवेई.


--- session_001_0003.txt ---
अक्तिविस्टान सो कोंगरे सिक्ते को राई बन्रा पर्रे करान काईईच कामा के लिना लाई उद्काचो कद्कती गिरिशान सो राई बन्रा प्राचार पर्रे कर कही सग्यान्त ब्रष्ट मुक्यमंद्री


--- session_001_0004.txt ---
आईरीशाग जी तो मार्टा मुन दिल ले दमके चके शींज विष्वाजी तान है को ताक बल लग देड लाख लिगल एड फुंड


--- session_001_0005.txt ---
चार्षिट्रद, केस बंद, दा वर सा उपरानत आईरीश मन ता हरकोत ना


--- session_001_0006.txt ---
ॐ ॐ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ ौ

In [4]:
"""
Transcript Segment Matcher
Matches segmented transcript files to their corresponding lines in the full transcript
using fuzzy string matching to handle OCR errors and text variations.
"""

from difflib import SequenceMatcher
import re
from typing import List, Tuple, Dict
import json
import os
from zipfile import ZipFile
from xml.etree import ElementTree as ET


def normalize_text(text: str) -> str:
    """
    Collapse every run of whitespace into a single space and trim both ends.

    No other characters are altered: punctuation and symbols are kept as-is.
    """
    # split() with no argument breaks on whitespace runs and discards
    # leading/trailing whitespace; joining restores single-space separation.
    return ' '.join(text.split())


def fuzzy_match_score(str1: str, str2: str) -> float:
    """
    Return the difflib similarity ratio of the two strings.

    The result lies in [0, 1]; 1.0 means the strings are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()


def find_best_match(segment: str, transcript_lines: List[str],
                    min_threshold: float = 0.6) -> Tuple[int, float, str]:
    """
    Locate the transcript line that most resembles ``segment``.

    Both sides are whitespace-normalized before comparison. A line that
    contains the normalized segment verbatim is scored at least 0.85. On
    equal scores the earliest line wins.

    Args:
        segment: The segment text to match
        transcript_lines: List of lines from the full transcript
        min_threshold: Minimum similarity score to consider a match

    Returns:
        Tuple of (line_index, similarity_score, matched_line). When the best
        score is below ``min_threshold`` the index is -1 and the line is "";
        the best score seen is still returned.
    """
    needle = normalize_text(segment)
    top_score = 0
    top_index = -1
    top_line = ""

    for position, raw_line in enumerate(transcript_lines):
        candidate = normalize_text(raw_line)
        if candidate:  # blank lines can never be a match
            similarity = fuzzy_match_score(needle, candidate)

            # Containment counts as a strong (partial) match.
            if needle in candidate and similarity < 0.85:
                similarity = 0.85

            if similarity > top_score:
                top_score, top_index, top_line = similarity, position, raw_line

    if top_score >= min_threshold:
        return top_index, top_score, top_line
    return -1, top_score, ""


def _truncate_for_report(value: str, limit: int = 100) -> str:
    """Return ``value`` cut to ``limit`` characters, adding '...' when cut."""
    return value[:limit] + '...' if len(value) > limit else value


def match_segments_to_transcript(segments: Dict[str, str],
                                 full_transcript: str,
                                 min_threshold: float = 0.5) -> List[Dict]:
    """
    Match all segments to their corresponding lines in the full transcript.

    Segments are processed in filename order and each transcript line is
    assigned to at most one segment.

    Args:
        segments: Dictionary mapping segment filenames to segment text
        full_transcript: The complete transcript text
        min_threshold: Minimum similarity threshold for matching

    Returns:
        List of dictionaries (one per segment) with the keys
        'segment_file', 'segment_text', 'matched_line_index', 'matched_line',
        'similarity_score' and 'status' ('MATCHED', 'NO_MATCH' or
        'SKIPPED_EMPTY'). Texts of processed segments are truncated to 100
        characters for reporting.
    """
    # Split transcript into non-blank, stripped lines
    transcript_lines = [line.strip() for line in full_transcript.split('\n')
                        if line.strip()]

    results = []
    used_lines = set()  # indices of transcript lines already assigned

    # Process segments in filename order
    for segment_file, segment_text in sorted(segments.items()):
        # Skip segments whose normalized text is shorter than 3 characters
        if len(normalize_text(segment_text)) < 3:
            results.append({
                'segment_file': segment_file,
                'segment_text': segment_text,
                'matched_line_index': -1,
                'matched_line': '',
                'similarity_score': 0.0,
                'status': 'SKIPPED_EMPTY'
            })
            continue

        # Blank out EVERY line that is already taken before searching.
        # Blanking only the single colliding line (as was done before) let the
        # retry pick another used line, so two segments could share a line.
        # find_best_match() skips empty lines, and indices stay aligned with
        # transcript_lines because the list keeps its length.
        candidates = ["" if idx in used_lines else line
                      for idx, line in enumerate(transcript_lines)]
        line_idx, score, matched_line = find_best_match(
            segment_text, candidates, min_threshold
        )

        if line_idx != -1:
            used_lines.add(line_idx)
            status = 'MATCHED'
        else:
            status = 'NO_MATCH'

        results.append({
            'segment_file': segment_file,
            'segment_text': _truncate_for_report(segment_text),
            'matched_line_index': line_idx,
            'matched_line': _truncate_for_report(matched_line),
            'similarity_score': round(score, 3),
            'status': status
        })

    return results


def display_results(results: List[Dict]):
    """
    Display matching results in a readable format.

    Prints a summary (matched / attempted, where attempted excludes
    'SKIPPED_EMPTY' entries) followed by one section per result.

    Args:
        results: Result dictionaries as produced by
            match_segments_to_transcript().
    """
    print("\n" + "="*100)
    print("SEGMENT MATCHING RESULTS")
    print("="*100)

    matched_count = sum(1 for r in results if r['status'] == 'MATCHED')
    total_count = sum(1 for r in results if r['status'] != 'SKIPPED_EMPTY')

    print(f"\nSummary: {matched_count}/{total_count} segments matched successfully")
    # max(..., 1) avoids a ZeroDivisionError when there are no results or
    # every segment was skipped; the rate is then reported as 0.0%.
    print(f"Match rate: {matched_count/max(total_count, 1)*100:.1f}%\n")

    for result in results:
        print(f"\nSegment: {result['segment_file']}")
        print(f"Status: {result['status']}")
        print(f"Similarity Score: {result['similarity_score']}")
        if result['matched_line_index'] != -1:
            print(f"Matched Line Index: {result['matched_line_index']}")
        print(f"Segment Text: {result['segment_text']}")
        if result['matched_line']:
            print(f"Matched Line: {result['matched_line']}")
        print("-" * 100)


def load_segments_from_folder(folder_path: str) -> Dict[str, str]:
    """
    Load all segment files from the specified folder.

    Only names ending in ".txt" are read, in sorted order. Files that cannot
    be read or decoded as UTF-8 are reported and left out of the result.

    Args:
        folder_path: Path to the folder containing segment .txt files

    Returns:
        Dictionary mapping filenames to their stripped content. Empty when
        ``folder_path`` is missing or is not a directory.
    """
    segments = {}

    # isdir (not exists): a path that points at a regular file would
    # otherwise reach os.listdir() and raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found!")
        return segments

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, file_name)
        try:
            # Only the read itself can fail; keep the try body minimal.
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and carry on.
            print(f"✗ Error loading {file_name}: {e}")
        else:
            segments[file_name] = content
            print(f"✓ Loaded: {file_name}")

    print(f"\nTotal segments loaded: {len(segments)}\n")
    return segments


# Clark-notation prefix of the ODF "text" namespace (text:p, text:h, text:s, ...)
_ODT_TEXT_NS = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}'


def _odt_element_text(elem) -> str:
    """Flatten an ODF element to plain text, expanding whitespace elements.

    ODF encodes some whitespace as empty elements rather than characters:
    ``text:s`` (one or ``text:c`` spaces), ``text:tab`` and
    ``text:line-break``. Ignoring them glues the neighbouring words together.
    """
    pieces = [elem.text or '']
    for child in elem:
        if child.tag == _ODT_TEXT_NS + 's':
            repeat = child.get(_ODT_TEXT_NS + 'c', '1')
            pieces.append(' ' * (int(repeat) if repeat.isdecimal() else 1))
        elif child.tag == _ODT_TEXT_NS + 'tab':
            pieces.append('\t')
        elif child.tag == _ODT_TEXT_NS + 'line-break':
            pieces.append('\n')
        else:
            pieces.append(_odt_element_text(child))
        pieces.append(child.tail or '')
    return ''.join(pieces)


def load_odt_file(odt_path: str) -> str:
    """
    Load and extract text from an ODT (OpenDocument Text) file.

    Every non-blank ``text:p`` / ``text:h`` element contributes one entry;
    entries are joined with newlines.

    Args:
        odt_path: Path to the .odt file

    Returns:
        Extracted text content as a string, or "" if the file could not be
        opened or parsed (the error is printed, not raised).
    """
    try:
        # ODT files are zip archives; the document body lives in content.xml
        with ZipFile(odt_path, 'r') as odt_zip:
            content_xml = odt_zip.read('content.xml')

        root = ET.fromstring(content_xml)

        # Paragraphs and headings, matched by fully-qualified tag
        block_tags = (_ODT_TEXT_NS + 'p', _ODT_TEXT_NS + 'h')

        text_elements = []
        for elem in root.iter():
            if elem.tag in block_tags:
                text_content = _odt_element_text(elem).strip()
                if text_content:
                    text_elements.append(text_content)

        # Join all text with newlines
        full_text = '\n'.join(text_elements)

        print(f"✓ Successfully loaded ODT file")
        print(f"  Total characters: {len(full_text)}")
        print(f"  Total lines: {len(text_elements)}\n")

        return full_text

    except Exception as e:
        # Deliberate best-effort boundary: callers test for "" instead of
        # handling zip / XML / OS errors themselves.
        print(f"✗ Error loading ODT file: {e}")
        return ""


# Example usage
if __name__ == "__main__":
    # Path to your folder containing txt files
    folder_path = "/content/drive/MyDrive/dataset/whisper_segments/text"

    # Load all segments from folder
    print("Loading segments from folder...")
    print("="*100)
    segments = load_segments_from_folder(folder_path)

    if not segments:
        print("No segments loaded. Please check the folder path.")
        # raise SystemExit instead of exit(): exit() is a helper injected by
        # the site module for interactive sessions and is not meant for
        # programs.
        raise SystemExit(1)

    # Path to your ODT transcript file
    odt_file_path = "/content/drive/MyDrive/dataset/10 AUG PRIME_non_bold (1).odt"

    # Load transcript from ODT file
    print("Loading transcript from ODT file...")
    print("="*100)
    full_transcript = load_odt_file(odt_file_path)

    if not full_transcript:
        print("Error: Could not load transcript from ODT file.")
        raise SystemExit(1)

    # Perform matching with adjustable threshold
    print("Starting matching process...")
    print("="*100)
    results = match_segments_to_transcript(segments, full_transcript, min_threshold=0.5)

    # Display results
    display_results(results)

    # Save results to JSON (ensure_ascii=False keeps Devanagari text readable)
    output_file = 'matching_results.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n✓ Results saved to '{output_file}'")

Loading segments from folder...
✓ Loaded: session_001_0000.txt
✓ Loaded: session_001_0001.txt
✓ Loaded: session_001_0002.txt
✓ Loaded: session_001_0003.txt
✓ Loaded: session_001_0004.txt
✓ Loaded: session_001_0005.txt
✓ Loaded: session_001_0006.txt
✓ Loaded: session_001_0007.txt
✓ Loaded: session_001_0008.txt
✓ Loaded: session_001_0009.txt
✓ Loaded: session_001_0010.txt
✓ Loaded: session_001_0011.txt
✓ Loaded: session_001_0012.txt
✓ Loaded: session_001_0013.txt
✓ Loaded: session_001_0014.txt
✓ Loaded: session_001_0015.txt
✓ Loaded: session_001_0016.txt
✓ Loaded: session_001_0017.txt
✓ Loaded: session_001_0018.txt
✓ Loaded: session_001_0019.txt
✓ Loaded: session_001_0020.txt
✓ Loaded: session_001_0021.txt
✓ Loaded: session_001_0022.txt
✓ Loaded: session_001_0023.txt
✓ Loaded: session_001_0024.txt
✓ Loaded: session_001_0025.txt
✓ Loaded: session_001_0026.txt
✓ Loaded: session_001_0027.txt
✓ Loaded: session_001_0028.txt
✓ Loaded: session_001_0029.txt
✓ Loaded: session_001_0030.txt
✓ Loade

In [6]:
import os

# Destination folder: one .txt file per matched segment is written here by the
# loop at the end of this cell.
output_dir = "/content/drive/MyDrive/dataset/matched_segments"

# Create the folder (and any missing parents); exist_ok=True makes re-running
# the cell a no-op instead of an error.
os.makedirs(output_dir, exist_ok=True)

# Mapping of file_name → matched_text
segments = {
    "session_001_0000.txt": "नमस्कार पळोवया प्रुडंट खबरो",
    "session_001_0001.txt": "टॅंकरवाल्यांक ना धरबांद गोंयभरच्या वॉटर टॅंकरांचें ट्रान्स्पोर्ट डिपोर्टमेन्ट करतलो सेफ्टीऑडिट उदका पुरवणे उपरांत लोकांकडच्यान सय घेवपाची सिस्टम प्रुडंटाचे ऑपा वॉट टॅंकर एक्स्पोजेचो इम्पॅक्ट.",
    "session_001_0002.txt": "कळसाभंडुरा प्रकल्पाक केंद्राची परवानगी ना प्रॉजॅक्ट जाला म्हण्टा ती फट म्हादय बचाव अभियानान कर्नाटकाक सुप्रिम कोर्टांत केलें एक्स्पोज कोर्टान मागलें केंद्र कर्नाटकाकडच्यान एफिडॅव्हिट.",
    "session_001_0003.txt": "एक्टिव्हिस्टांचो कॉंग्रेसीक तेंको रायबंदरा पर्रीकारान कांयच कामां केल्लीं ना लायटउदकाच्यो कटकटी गिरीशाचो रायबंदरा प्रचार पर्रीकार खंय सगळ्यात भ्रष्ट मुख्यमंत्री",
    "session_001_0004.txt": "आयरीशाक जीतो मारता म्हूण दिल्ले धमकेचे केशींत विश्वजीतान हायकोर्टांत भल्लो देड लाख लीगल एड फंड",
    "session_001_0005.txt": "चार्जशीट रद्द केस बंद 10 वर्सां उपरांत आयरिश म्हण्टा हरकत ना.",
    "session_001_0007.txt": "आनी भारतांत फुटसालाक येतले बरे दिस ए.एफ.सी आनी ए.आय.एफ.एफ मेळून तयार करता फुटसाल डॅव्हलापमॅन्टाचो आराखडो.",
    "session_001_0008.txt": "एखाद्या सुवातीर उदका पुरवण केली काय टॅंकरवाल्यान थंयचे लोकांची सय घेवपाची आसता",
    "session_001_0009.txt": "आनी डिपार्टमेंटाक ट्रिपींचो रिपोर्ट दिवपाचो आसता.",
    "session_001_0010.txt": "मात टॅंकरवाल्यांकधरबांद ना म्हूण",
    "session_001_0011.txt": "प्रुडंटान ओपाची टॅंकर वॉटर सप्लाय एक्स्पोजे केल्लो.",
    "session_001_0013.txt": "कळसाभंडुरा प्रकल्पाचे काम कर्नाटकान केंद्राची परवानगी नासतना सुरु केलां ही गजाल म्हादय बचाव अभियानान सुप्रिम कोर्टांत एक्पोज केल्या.",
    "session_001_0014.txt": "कळसा बंडुरा प्रकल्पाचे काम जवळ जवळ पुराय जायत आयलां म्हूण कर्नाटकान कोर्टाक सांगलां.",
    "session_001_0015.txt": "मात काम जाल्लें ना आनी तें नेटान चल्लां म्हूण अभियानान कोर्टाक क्लियर केलें.",
    "session_001_0016.txt": "कर्नाटकान असो अर्ज केला आनी केंद्रान तांकां परवानगी दिल्या जाल्यार दोगांनीय ती कोर्टाक एफिडॅव्हिटाचेर दाखोवची पडटली.",
    "session_001_0017.txt": "रायबंदराचे एक्टिव्हिस्ट कॉंग्रेसीफाटल्यान.",
    "session_001_0018.txt": "मुख्यमंत्री पर्रीकारान रायबंदरा कांयच कामां केल्लीं ना.",
    "session_001_0019.txt": "रायबंदरा लोकांक लायटउदकाच्यो कटकटी.",
    "session_001_0020.txt": "हिंदुक मसुंडी सारकी ना.",
    "session_001_0021.txt": "आपल्या प्रचाराक लोकांचो बरो प्रदिसात मेळटा",
    "session_001_0022.txt": "म्हूण काँग्रेसीचो उमेदवार गिरीश चोडणकार मिडियाकडेन उलयलो.",
    "session_001_0024.txt": "काँग्रेस हावजांत मिडियामुखार पावलो.",
    "session_001_0026.txt": "विश्वजीता फंड डिपॉजिटूय केलो.",
    "session_001_0027.txt": "शाळांच्या मॅनेजमॅन्टांनी रिटायर स्टाफाच्या पॅन्शनचो बेगोबेग सॅटलमेंट करुंक जाय.",
    "session_001_0028.txt": "ना जाल्यार काद्यान तांका दंड म्हूण शिक्षण संचालक गजाजन भट उलयला.",
    "session_001_0029.txt": "एडमीशनावेळार भुरग्यांचे आर्धारकार्ड करुन घेयात.",
    "session_001_0030.txt": "तशें सर्कुलरुय हे पयली खंय खात्यान काडलां.",
    "session_001_0031.txt": "इलेक्शनाचोकोड सुरु जावचें पयली रिक्रुटमेंट जाला",
    "session_001_0032.txt": "तांच्योच अपॉयंटमेंट ऑर्डरी खंय रिझल्टा उपरांत दिवपाच्यो.",
    "session_001_0033.txt": "कोंबा रिंग रोडावेल्या 15 फामिलींक चवथी उपरांत हालोवचे",
    "session_001_0034.txt": "तांचें हावसींग बोर्डांत पुनर्वसन जावंक जाय म्हूण टीसीपी मंत्री विजय उलयला.",
    "session_001_0035.txt": "थंयचे फामीलींची फातोड्डेच्या आमदारासयत मडगांव कलॅक्टरेटांत बसका जाली.",
    "session_001_0036.txt": "रस्त्याचे काम जावंक जाय म्हूण पंदरशीभीतर तांकां हालोवपाचो कोर्टान आदेश दिला.",
    "session_001_0037.txt": "मात चवथ आनी फामिलींचो विचार करून कांय बदलांखातीर कोर्टाक विनवणी करपाचो विजयाचो विचार.",
    "session_001_0039.txt": "आशिल्ले सात आठ जाणूच. करंजाळे निदर्शनां करता म्हूण सांगून म्हापालिका पोलीस कलॅक्टराचें परमिशन घेतलें,",
    "session_001_0040.txt": "इलॅक्शन कमिशनाक सांगलेंच ना. निदर्शनां केलीं पणजे.",
    "session_001_0041.txt": "बीजेपीवाल्यांचे कंप्लेनीचेर पुलिसांनी तांकां बिडकी कवळूंक लायली.",
    "session_001_0042.txt": "गोंयात सायबर क्रायम वाडाटा. वेबसायटींचेर सॅक्स टुरिजमाचें प्रोमोशन चल्लां म्हूण गोवा वुमन्स फोरमान तिडक उक्तायली. शिंदळकेखातीर चलयां आनी बायलांची पुरवण करपी वॅबसाटीं चलतात.",
    "session_001_0043.txt": "असले प्रकार बंद जावंक जाय म्हूण गोवा वूमन्स फोरम मागता.",
    "session_001_0044.txt": "फोरमान मडगावां जागृती रॅली काडली.",
    "session_001_0045.txt": "दाबोळे केशव स्मृतीच्या अकरावेच्या भुरग्यांक हायर सेकंडरीचे तीन मजली बिल्डिंगेचे तेर्रासाचेर चडोवन टायल्स नितळ करूंक लायिल्ले प्रकरणाची शिक्षण खात्यान दखल घेतल्या.",
    "session_001_0046.txt": "शिक्षण संचालकान एडीईआय कडच्यान रिपोर्ट मागला.",
    "session_001_0047.txt": "रिपोर्ट आयलो काय हायर सेकंडरीच्या प्रिन्सिपलाक नोटीस वतली",
    "session_001_0048.txt": "म्हूण शिक्षण संचालक गजानन भट प्रुडंटाकडेन उलयलो.",
    "session_001_0049.txt": "सिटिझन जर्नलिस्टाचे व्हिडिओवेल्यान प्रुडंटान हो प्रकार एक्स्पोज केल्लो.",
    "session_001_0050.txt": "ऑल इंडिया फुटबॉल फॅडरेशन आतां फुटसाल डेव्हलाप करपाचेर लक्ष घालतलें.",
    "session_001_0051.txt": "ह्या कामांत आतां तांकां एशीयन फुटबॉल कन्फॅडरेशन आदार करतलें.",
    "session_001_0052.txt": "इन्फ्रास्ट्रक्चर कशें आसा कांय पळोवंक सद्या एएफसी आनी एआयएफएफाचें शिश्टमंडळ देशभर भोंवतले.",
    "session_001_0053.txt": "गोंयात ब्रेस्तारा इन्स्पेक्शन जालें.",
    "session_001_0055.txt": "मुरगांवचो बीजेपी आमदार मिलींद नायकान आंगवण पुराय केली",
    "session_001_0056.txt": "वास्को ते बोरये 28 किलोमिटर तो साई बाबाचें दर्शनाक चलत गेलो.",
    "session_001_0057.txt": "बरोबर धायेक कार्यकर्त्यांचीय वारी",
    "session_001_0058.txt": "मिलींदान सगल्यांक बरोबर घेवन फांतोडेर पाचंक वास्कोसावन बोरयेची वाट धल्ली.",
    "session_001_0059.txt": "इलॅक्शनाकडेन मिलिंद जिकल्यार बोरये वारीची साईबाबाक ताचे कार्य़कर्त्यान आंगवण केल्ली.",
    "session_001_0060.txt": "मिलिंद 143 मतांनी जिकलो",
    "session_001_0061.txt": "स्वतंत्रताय दीस तेंकलो आयकपाक येना उलोवंक कळना अश्या दिव्यांग लोकांखातीर आता सायन लॅग्वेजीन राष्ट्रगीत लाँच जालां.",
    "session_001_0062.txt": "ब्रदरहुड न्यु दिल्ली आनी डिसॅबीलीटी रायट्स असोसिएशन ऑफ गोवा",
    "session_001_0063.txt": "कला अकादमीक बरोबर घेवन सायन लॅग्वीजींतलो नॅशनल एन्थम व्हिडियो लाँच केलो.",
    "session_001_0064.txt": "अमिताभ बच्चनाक घेवन हें खास राष्ट्रगीत केलां.",
    "session_001_0065.txt": "हें बुलॅटिन हांगांच सोंपता",
    "session_001_0066.txt": "ह्यो खबरो तुमकां प्रुडंट वॉट्सॅप ट्विटर फेसबूक आनी प्रुडंट व्हॅबसायटीचेर लायव्ह मेळटात",
}

# Write each matched transcript to <output_dir>/<file_name>, overwriting any
# existing file.
# NOTE: the loop variable must not be called `text`. Notebook cells share one
# global namespace, and rebinding `text` to a str would clobber the
# `from odf import text` module that the ODT helpers in other cells use
# (e.g. `text.P`).
for file_name, matched_text in segments.items():
    file_path = os.path.join(output_dir, file_name)

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(matched_text)

print(f"✅ Created {len(segments)} files in folder: {output_dir}")


✅ Created 61 files in folder: /content/drive/MyDrive/dataset/matched_segments


In [5]:
import re

raw = r'''
file_name,matched_text
--- session_001_0000.txt ---,"नमस्कार पळोवया प्रुडंट खबरो"
--- session_001_0001.txt ---,"टॅंकरवाल्यांक ना धरबांद गोंयभरच्या वॉटर टॅंकरांचें ट्रान्स्पोर्ट डिपोर्टमेन्ट करतलो सेफ्टीऑडिट उदका पुरवणे उपरांत लोकांकडच्यान सय घेवपाची सिस्टम प्रुडंटाचे ऑपा वॉट टॅंकर एक्स्पोजेचो इम्पॅक्ट."
--- session_001_0002.txt ---,"कळसाभंडुरा प्रकल्पाक केंद्राची परवानगी ना प्रॉजॅक्ट जाला म्हण्टा ती फट म्हादय बचाव अभियानान कर्नाटकाक सुप्रिम कोर्टांत केलें एक्स्पोज कोर्टान मागलें केंद्र कर्नाटकाकडच्यान एफिडॅव्हिट."
--- session_001_0003.txt ---,"एक्टिव्हिस्टांचो कॉंग्रेसीक तेंको रायबंदरा पर्रीकारान कांयच कामां केल्लीं ना लायटउदकाच्यो कटकटी गिरीशाचो रायबंदरा प्रचार पर्रीकार खंय सगळ्यात भ्रष्ट मुख्यमंत्री"
--- session_001_0004.txt ---,"आयरीशाक जीतो मारता म्हूण दिल्ले धमकेचे केशींत विश्वजीतान हायकोर्टांत भल्लो देड लाख लीगल एड फंड"
--- session_001_0005.txt ---,"चार्जशीट रद्द केस बंद 10 वर्सां उपरांत आयरिश म्हण्टा हरकत ना."
--- session_001_0007.txt ---,"आनी भारतांत फुटसालाक येतले बरे दिस ए.एफ.सी आनी ए.आय.एफ.एफ मेळून तयार करता फुटसाल डॅव्हलापमॅन्टाचो आराखडो."
--- session_001_0008.txt ---,"एखाद्या सुवातीर उदका पुरवण केली काय टॅंकरवाल्यान थंयचे लोकांची सय घेवपाची आसता"
--- session_001_0009.txt ---,"आनी डिपार्टमेंटाक ट्रिपींचो रिपोर्ट दिवपाचो आसता."
--- session_001_0010.txt ---,"मात टॅंकरवाल्यांकधरबांद ना म्हूण"
--- session_001_0011.txt ---,"प्रुडंटान ओपाची टॅंकर वॉटर सप्लाय एक्स्पोजे केल्लो."
--- session_001_0013.txt ---,"कळसाभंडुरा प्रकल्पाचे काम कर्नाटकान केंद्राची परवानगी नासतना सुरु केलां ही गजाल म्हादय बचाव अभियानान सुप्रिम कोर्टांत एक्पोज केल्या."
--- session_001_0014.txt ---,"कळसा बंडुरा प्रकल्पाचे काम जवळ जवळ पुराय जायत आयलां म्हूण कर्नाटकान कोर्टाक सांगलां."
--- session_001_0015.txt ---,"मात काम जाल्लें ना आनी तें नेटान चल्लां म्हूण अभियानान कोर्टाक क्लियर केलें."
--- session_001_0016.txt ---,"कर्नाटकान असो अर्ज केला आनी केंद्रान तांकां परवानगी दिल्या जाल्यार दोगांनीय ती कोर्टाक एफिडॅव्हिटाचेर दाखोवची पडटली."
--- session_001_0017.txt ---,"रायबंदराचे एक्टिव्हिस्ट कॉंग्रेसीफाटल्यान."
--- session_001_0018.txt ---,"मुख्यमंत्री पर्रीकारान रायबंदरा कांयच कामां केल्लीं ना."
--- session_001_0019.txt ---,"रायबंदरा लोकांक लायटउदकाच्यो कटकटी."
--- session_001_0020.txt ---,"हिंदुक मसुंडी सारकी ना."
--- session_001_0021.txt ---,"आपल्या प्रचाराक लोकांचो बरो प्रदिसात मेळटा"
--- session_001_0022.txt ---,"म्हूण काँग्रेसीचो उमेदवार गिरीश चोडणकार मिडियाकडेन उलयलो."
--- session_001_0024.txt ---,"काँग्रेस हावजांत मिडियामुखार पावलो."
--- session_001_0026.txt ---,"विश्वजीता फंड डिपॉजिटूय केलो."
--- session_001_0027.txt ---,"शाळांच्या मॅनेजमॅन्टांनी रिटायर स्टाफाच्या पॅन्शनचो बेगोबेग सॅटलमेंट करुंक जाय."
--- session_001_0028.txt ---,"ना जाल्यार काद्यान तांका दंड म्हूण शिक्षण संचालक गजाजन भट उलयला."
--- session_001_0029.txt ---,"एडमीशनावेळार भुरग्यांचे आर्धारकार्ड करुन घेयात."
--- session_001_0030.txt ---,"तशें सर्कुलरुय हे पयली खंय खात्यान काडलां."
--- session_001_0031.txt ---,"इलेक्शनाचोकोड सुरु जावचें पयली रिक्रुटमेंट जाला"
--- session_001_0032.txt ---,"तांच्योच अपॉयंटमेंट ऑर्डरी खंय रिझल्टा उपरांत दिवपाच्यो."
--- session_001_0033.txt ---,"कोंबा रिंग रोडावेल्या 15 फामिलींक चवथी उपरांत हालोवचे"
--- session_001_0034.txt ---,"तांचें हावसींग बोर्डांत पुनर्वसन जावंक जाय म्हूण टीसीपी मंत्री विजय उलयला."
--- session_001_0035.txt ---,"थंयचे फामीलींची फातोड्डेच्या आमदारासयत मडगांव कलॅक्टरेटांत बसका जाली."
--- session_001_0036.txt ---,"रस्त्याचे काम जावंक जाय म्हूण पंदरशीभीतर तांकां हालोवपाचो कोर्टान आदेश दिला."
--- session_001_0037.txt ---,"मात चवथ आनी फामिलींचो विचार करून कांय बदलांखातीर कोर्टाक विनवणी करपाचो विजयाचो विचार."
--- session_001_0039.txt ---,"आशिल्ले सात आठ जाणूच. करंजाळे निदर्शनां करता म्हूण सांगून म्हापालिका पोलीस कलॅक्टराचें परमिशन घेतलें,"
--- session_001_0040.txt ---,"इलॅक्शन कमिशनाक सांगलेंच ना. निदर्शनां केलीं पणजे."
--- session_001_0041.txt ---,"बीजेपीवाल्यांचे कंप्लेनीचेर पुलिसांनी तांकां बिडकी कवळूंक लायली."
--- session_001_0042.txt ---,"गोंयात सायबर क्रायम वाडाटा. वेबसायटींचेर सॅक्स टुरिजमाचें प्रोमोशन चल्लां म्हूण गोवा वुमन्स फोरमान तिडक उक्तायली. शिंदळकेखातीर चलयां आनी बायलांची पुरवण करपी वॅबसाटीं चलतात."
--- session_001_0043.txt ---,"असले प्रकार बंद जावंक जाय म्हूण गोवा वूमन्स फोरम मागता."
--- session_001_0044.txt ---,"फोरमान मडगावां जागृती रॅली काडली."
--- session_001_0045.txt ---,"दाबोळे केशव स्मृतीच्या अकरावेच्या भुरग्यांक हायर सेकंडरीचे तीन मजली बिल्डिंगेचे तेर्रासाचेर चडोवन टायल्स नितळ करूंक लायिल्ले प्रकरणाची शिक्षण खात्यान दखल घेतल्या."
--- session_001_0046.txt ---,"शिक्षण संचालकान एडीईआय कडच्यान रिपोर्ट मागला."
--- session_001_0047.txt ---,"रिपोर्ट आयलो काय हायर सेकंडरीच्या प्रिन्सिपलाक नोटीस वतली"
--- session_001_0048.txt ---,"म्हूण शिक्षण संचालक गजानन भट प्रुडंटाकडेन उलयलो."
--- session_001_0049.txt ---,"सिटिझन जर्नलिस्टाचे व्हिडिओवेल्यान प्रुडंटान हो प्रकार एक्स्पोज केल्लो."
--- session_001_0050.txt ---,"ऑल इंडिया फुटबॉल फॅडरेशन आतां फुटसाल डेव्हलाप करपाचेर लक्ष घालतलें."
--- session_001_0051.txt ---,"ह्या कामांत आतां तांकां एशीयन फुटबॉल कन्फॅडरेशन आदार करतलें."
--- session_001_0052.txt ---,"इन्फ्रास्ट्रक्चर कशें आसा कांय पळोवंक सद्या एएफसी आनी एआयएफएफाचें शिश्टमंडळ देशभर भोंवतले."
--- session_001_0053.txt ---,"गोंयात ब्रेस्तारा इन्स्पेक्शन जालें."
--- session_001_0055.txt ---,"मुरगांवचो बीजेपी आमदार मिलींद नायकान आंगवण पुराय केली"
--- session_001_0056.txt ---,"वास्को ते बोरये 28 किलोमिटर तो साई बाबाचें दर्शनाक चलत गेलो."
--- session_001_0057.txt ---,"बरोबर धायेक कार्यकर्त्यांचीय वारी"
--- session_001_0058.txt ---,"मिलींदान सगल्यांक बरोबर घेवन फांतोडेर पाचंक वास्कोसावन बोरयेची वाट धल्ली."
--- session_001_0059.txt ---,"इलॅक्शनाकडेन मिलिंद जिकल्यार बोरये वारीची साईबाबाक ताचे कार्य़कर्त्यान आंगवण केल्ली."
--- session_001_0060.txt ---,"मिलिंद 143 मतांनी जिकलो"
--- session_001_0061.txt ---,"स्वतंत्रताय दीस तेंकलो आयकपाक येना उलोवंक कळना अश्या दिव्यांग लोकांखातीर आता सायन लॅग्वेजीन राष्ट्रगीत लाँच जालां."
--- session_001_0062.txt ---,"ब्रदरहुड न्यु दिल्ली आनी डिसॅबीलीटी रायट्स असोसिएशन ऑफ गोवा"
--- session_001_0063.txt ---,"कला अकादमीक बरोबर घेवन सायन लॅग्वीजींतलो नॅशनल एन्थम व्हिडियो लाँच केलो."
--- session_001_0064.txt ---,"अमिताभ बच्चनाक घेवन हें खास राष्ट्रगीत केलां."
--- session_001_0065.txt ---,"हें बुलॅटिन हांगांच सोंपता"
--- session_001_0066.txt ---,"ह्यो खबरो तुमकां प्रुडंट वॉट्सॅप ट्विटर फेसबूक आनी प्रुडंट व्हॅबसायटीचेर लायव्ह मेळटात"
'''

import json

# Each record in `raw` has the form:
#     --- session_001_0000.txt ---,"<transcript>"
# Group 1 captures the file name, group 2 the transcript between the quotes.
# The transcript group is non-greedy, so it ends at the first closing quote;
# DOTALL lets `.` match newlines, so a transcript may span several lines.
pattern = r"---\s*(session_[\d_]+\.txt)\s*---,\"(.+?)\""
matches = re.findall(pattern, raw, flags=re.DOTALL)

# Print the (file name, transcript) pairs as a dict literal, one entry per line.
print("{")
# Loop names avoid `text`: this notebook does `from odf import text`, and a
# module-level loop variable with that name would overwrite the import.
for segment_file, transcript in matches:
    # json.dumps emits a double-quoted literal with quotes, backslashes and
    # newlines escaped, so each printed entry stays a valid one-line string
    # literal; ensure_ascii=False keeps the Devanagari text readable.
    print(f'    "{segment_file}": {json.dumps(transcript, ensure_ascii=False)},')
print("}")


{
    "session_001_0000.txt": "नमस्कार पळोवया प्रुडंट खबरो",
    "session_001_0001.txt": "टॅंकरवाल्यांक ना धरबांद गोंयभरच्या वॉटर टॅंकरांचें ट्रान्स्पोर्ट डिपोर्टमेन्ट करतलो सेफ्टीऑडिट उदका पुरवणे उपरांत लोकांकडच्यान सय घेवपाची सिस्टम प्रुडंटाचे ऑपा वॉट टॅंकर एक्स्पोजेचो इम्पॅक्ट.",
    "session_001_0002.txt": "कळसाभंडुरा प्रकल्पाक केंद्राची परवानगी ना प्रॉजॅक्ट जाला म्हण्टा ती फट म्हादय बचाव अभियानान कर्नाटकाक सुप्रिम कोर्टांत केलें एक्स्पोज कोर्टान मागलें केंद्र कर्नाटकाकडच्यान एफिडॅव्हिट.",
    "session_001_0003.txt": "एक्टिव्हिस्टांचो कॉंग्रेसीक तेंको रायबंदरा पर्रीकारान कांयच कामां केल्लीं ना लायटउदकाच्यो कटकटी गिरीशाचो रायबंदरा प्रचार पर्रीकार खंय सगळ्यात भ्रष्ट मुख्यमंत्री",
    "session_001_0004.txt": "आयरीशाक जीतो मारता म्हूण दिल्ले धमकेचे केशींत विश्वजीतान हायकोर्टांत भल्लो देड लाख लीगल एड फंड",
    "session_001_0005.txt": "चार्जशीट रद्द केस बंद 10 वर्सां उपरांत आयरिश म्हण्टा हरकत ना.",
    "session_001_0007.txt": "आनी भारतांत फुटसालाक येतले बरे दिस ए.एफ.सी आनी ए.आय.एफ.एफ मेळ