In [None]:
# CELL 1 — FILENAME SCANNER SETUP (Imports + Helpers)

import json
import re
from pathlib import Path
from Surah_dict import SURAH_DICT

# Parse the Quran JSON file into QURAN_DATA; the code below iterates it as a
# sequence of surah records with "id" and "verses" keys.
with open("quran-no-tashkeel.json", "r", encoding="utf-8") as f:
    QURAN_DATA = json.load(f)

# Filled by the loop below: surah id key -> {"ar", "en", "fp"}.
QURAN_FINGERPRINT = {}
# Pattern consumed by clean(): one or more characters from the listed class
# (which also includes whitespace).
# NOTE(review): the non-ASCII characters in this class look mis-encoded —
# confirm the file's encoding before relying on what it strips.
NORM = re.compile(r'[Ÿã-ŸëŸíŸ±ÿ£ÿ•ÿ¢Ÿâÿ©\s]+')

def clean(text):
    """Return *text* with every run matched by the module-level ``NORM`` pattern removed."""
    stripped = NORM.sub('', text)
    return stripped

print("Building Quran fingerprint from your Surah_dict.py...")

# Index the Quran entries by zero-padded id once, instead of rescanning
# QURAN_DATA for every surah. setdefault keeps the first entry seen for an id,
# which is the entry a front-to-back scan would have selected.
_quran_by_id = {}
for s in QURAN_DATA:
    _quran_by_id.setdefault(f"{s['id']:03d}", s)

for sid in SURAH_DICT:
    q = _quran_by_id.get(sid)
    if q is None:
        # Name the offending id instead of failing with a bare StopIteration.
        raise KeyError(f"Surah id {sid!r} from SURAH_DICT has no entry in QURAN_DATA")
    verses = q["verses"]
    # Start at the second verse for every surah except ids 1 and 9; the
    # summary message below describes this as skipping the Bismillah.
    start = 1 if q["id"] not in [1, 9] else 0
    # Collect the words of up to 8 verses, then fingerprint the first 18 words.
    words = []
    for v in verses[start:start+8]:
        words.extend(v["text"].split())
    fp = clean(' '.join(words[:18]))

    QURAN_FINGERPRINT[sid] = {
        "ar": SURAH_DICT[sid]["ar"],
        "en": SURAH_DICT[sid]["en"][0],
        "fp": fp
    }

print(f"Ready: {len(QURAN_FINGERPRINT)} surahs loaded (Bismillah skipped where needed)\n")

In [None]:
# CELL 2 — Filename Pattern Matching & Surah Detection

"""
Setup for STRICT filename Surah detector.
Includes imports, normalization, and lookup tables.
Run this once per session.
"""

import re
from pathlib import Path
from rapidfuzz import fuzz, process
from Surah_dict import SURAH_DICT

# ============================================================================
# NORMALIZATION
# ============================================================================

def normalize_arabic(text):
    """Return *text* reduced to a canonical form for surah-name lookup.

    Steps, in order: drop the code points U+064B-U+0652, U+0670 and U+0640;
    fold the characters of the alif class to a single form; fold ta marbuta;
    fold alif maksura; drop the characters of the last class; strip
    surrounding whitespace.

    NOTE(review): the non-ASCII literals below look mis-encoded — confirm the
    file's encoding; they are kept exactly as found.
    """
    text = re.sub(r'[\u064B-\u0652\u0670\u0640]', '', text)
    text = re.sub(r'[ÿ£ÿ•ÿ¢ÿß]', 'ÿß', text)
    text = re.sub(r'ÿ©', 'Ÿá', text)
    # The pattern previously carried a stray ']' and therefore only matched
    # when the character was followed by a literal ']', i.e. practically never.
    text = re.sub(r'Ÿâ', 'Ÿä', text)
    text = re.sub(r'[ÿ°]', '', text)
    return text.strip()

def normalize_english(text):
    """Lower-case *text* and remove every hyphen, underscore and whitespace character."""
    lowered = text.lower()
    compact = re.sub(r'[-_\s]', '', lowered)
    return compact.strip()

# ============================================================================
# BUILD LOOKUP TABLES
# ============================================================================

# Normalised name / number -> surah id, filled from SURAH_DICT below.
ARABIC_LOOKUP = {}
ENGLISH_LOOKUP = {}
NUMERIC_LOOKUP = {}

for surah_id, data in SURAH_DICT.items():
    # One normalised Arabic name per surah.
    ar_norm = normalize_arabic(data["ar"])
    ARABIC_LOOKUP[ar_norm] = surah_id

    # Every English spelling listed for the surah.
    for en in data["en"]:
        en_norm = normalize_english(en)
        ENGLISH_LOOKUP[en_norm] = surah_id

    # Both the bare number ("3") and the id exactly as keyed in SURAH_DICT.
    NUMERIC_LOOKUP.update({str(int(surah_id)): surah_id, surah_id: surah_id})

# Candidate lists handed to the fuzzy matchers.
ALL_ARABIC_NAMES = list(ARABIC_LOOKUP)
ALL_ENGLISH_NAMES = list(ENGLISH_LOOKUP)

print(f"‚úì Loaded {len(SURAH_DICT)} surahs")
print(f"  Arabic names: {len(ARABIC_LOOKUP)}")
print(f"  English variants: {len(ENGLISH_LOOKUP)}\n")

# ============================================================================
# HELPER: Validate if text is a surah name (NOT a reciter)
# ============================================================================

def is_surah_name(name):
    """Return True when *name*, once normalised, is a key of either name lookup.

    Text containing any character in U+0600-U+06FF is normalised with
    ``normalize_arabic``; everything else with ``normalize_english``.
    Empty or ``None`` input yields False.
    """
    if not name:
        return False
    has_arabic = any('\u0600' <= ch <= '\u06FF' for ch in name)
    if has_arabic:
        key = normalize_arabic(name)
    else:
        key = normalize_english(name)
    return key in ARABIC_LOOKUP or key in ENGLISH_LOOKUP

# ============================================================================
# CASE DETECTORS
# ============================================================================

def extract_numeric_pattern(filename):
    """Return the surah id for the first stand-alone 1-3 digit number in *filename*.

    Only the first such number is examined; the result is ``None`` when there
    is no match or that number lies outside 1..114. Digits glued to an
    underscore or a letter are not stand-alone because of the ``\\b`` anchors.
    """
    found = re.search(r'\b(\d{1,3})\b', filename)
    if found is None:
        return None
    value = int(found.group(1))
    if value < 1 or value > 114:
        return None
    return NUMERIC_LOOKUP[str(value)]

def extract_reciter_pattern(filename):
    """Return ``(surah_id, reciter)`` parsed from a ``name_NNN`` or ``NNN_name`` filename.

    The two layouts are tried in that order with ``-`` or ``_`` as separator.
    A layout whose number falls outside 1..114 is ignored. When the name part
    is itself a surah name the reciter is reported as ``None``. Returns
    ``(None, None)`` when neither layout yields a usable number.
    """
    name_then_number = r'([A-Za-z\u0600-\u06FF]+)[-_](\d{1,3})'  # muzafar_003
    number_then_name = r'(\d{1,3})[-_]([A-Za-z\u0600-\u06FF]+)'  # 003_muzafar

    for regex in (name_then_number, number_then_name):
        hit = re.search(regex, filename)
        if not hit or len(hit.groups()) != 2:
            continue

        first, second = hit.groups()
        # Whichever captured group is numeric carries the surah number.
        if first.isdigit():
            number, reciter = int(first), second
        else:
            reciter, number = first, int(second)

        if not 1 <= number <= 114:
            continue

        # A surah name in the reciter slot is not a reciter.
        reciter_out = None if is_surah_name(reciter) else reciter.strip()
        return NUMERIC_LOOKUP[str(number)], reciter_out

    return None, None

# ============================================================================
# STRICT FUZZY MATCHERS
# ============================================================================

def fuzzy_match_arabic(text, threshold=85):
    """Fuzzy-match the Arabic tokens of *text* against the known Arabic surah names.

    Returns ``(surah_id, score)`` for the best-scoring token whose score is at
    least *threshold*, or ``(None, 0)``. Text containing any digit is rejected
    outright, and tokens shorter than 4 characters after normalisation are
    skipped as too short to match reliably.
    """
    if re.search(r'\d', text) is not None:
        return None, 0  # strict: never fuzzy-match when digits are present

    arabic_runs = re.findall(r'[\u0600-\u06FF]+', text)
    if not arabic_runs:
        return None, 0

    winner, top_score = None, 0
    for run in arabic_runs:
        candidate = normalize_arabic(run)
        if len(candidate) < 4:
            continue

        hit = process.extractOne(candidate, ALL_ARABIC_NAMES, scorer=fuzz.ratio)
        if not hit:
            continue
        matched_name, score = hit[0], hit[1]
        if score >= threshold and score > top_score:
            winner, top_score = ARABIC_LOOKUP[matched_name], score

    return winner, top_score

def fuzzy_match_english(text, threshold=85):
    """Fuzzy-match the Latin-letter tokens of *text* against the known English surah names.

    Returns ``(surah_id, score)`` for the best-scoring token whose score is at
    least *threshold*, or ``(None, 0)``. Text containing any digit is rejected
    outright, and tokens shorter than 3 characters after normalisation are
    skipped.
    """
    if re.search(r'\d', text) is not None:
        return None, 0  # strict: never fuzzy-match when digits are present

    latin_runs = re.findall(r'[A-Za-z]+', text)
    if not latin_runs:
        return None, 0

    winner, top_score = None, 0
    for run in latin_runs:
        candidate = normalize_english(run)
        if len(candidate) < 3:
            continue

        hit = process.extractOne(candidate, ALL_ENGLISH_NAMES, scorer=fuzz.ratio)
        if not hit:
            continue
        matched_name, score = hit[0], hit[1]
        if score >= threshold and score > top_score:
            winner, top_score = ENGLISH_LOOKUP[matched_name], score

    return winner, top_score

# ============================================================================
# CONFLICT DETECTION
# ============================================================================

def detect_conflict(filename):
    """Report whether the number and the name in *filename* point at different surahs.

    Returns ``(True, numeric_id, name_id)`` when the stem contains a valid
    stand-alone surah number AND a surah name (fuzzy-matched at threshold 80)
    that resolves to a different id; otherwise ``(False, None, None)``.
    """
    stem = Path(filename).stem

    num_id = extract_numeric_pattern(stem)
    if not num_id:
        return False, None, None

    # The fuzzy matchers reject any text containing a digit, and a stem that
    # produced num_id always contains one, so matching on the raw stem could
    # never find a name. Blank out the digits (keeping token boundaries)
    # before looking for a name.
    name_text = re.sub(r'\d+', ' ', stem)

    name_ar, _ = fuzzy_match_arabic(name_text, threshold=80)
    name_en, _ = fuzzy_match_english(name_text, threshold=80)
    name_id = name_ar or name_en

    if name_id and num_id != name_id:
        return True, num_id, name_id

    return False, None, None

In [None]:
# CELL 2-DEBUG — Verify Pattern Matching Setup

# Diagnostic cell: prints the state of the lookup tables and exercises
# is_surah_name() / extract_reciter_pattern() on a few sample inputs.
print("\n" + "="*70)
print("DEBUG: CELL 2 VERIFICATION")
print("="*70)

# 1. Report how many entries each lookup table holds.
print(f"\n‚úì Lookup Tables:")
print(f"   ARABIC_LOOKUP: {len(ARABIC_LOOKUP)} entries")
print(f"   ENGLISH_LOOKUP: {len(ENGLISH_LOOKUP)} entries")
print(f"   NUMERIC_LOOKUP: {len(NUMERIC_LOOKUP)} entries")

# 2. Report whether a GLOBAL_RECITER variable exists in the notebook namespace.
# NOTE(review): no visible cell assigns GLOBAL_RECITER — confirm where it is
# expected to be set, otherwise this always reports an error.
if 'GLOBAL_RECITER' in globals():
    print(f"\n‚úì GLOBAL_RECITER: '{GLOBAL_RECITER}'")
else:
    print(f"\n‚ùå ERROR: GLOBAL_RECITER not set!")

# 3. Compare is_surah_name() against the expected answer for each sample.
print(f"\n‚úì Testing is_surah_name():")
test_cases = [
    ("ÿßŸÑŸÅÿßÿ™ÿ≠ÿ©", True),
    ("Bakara", True),
    ("Muzafar", False),
    ("ŸÖÿ≠ŸÖÿØ", True),  # This is actually both a name AND a surah!
    ("AlAfasy", False)
]
for name, expected in test_cases:
    result = is_surah_name(name)
    status = "‚úì" if result == expected else "‚ùå"
    print(f"   {status} is_surah_name('{name}') = {result} (expected {expected})")

# 4. Show what extract_reciter_pattern() returns for sample filenames
#    (printed for manual inspection; nothing is asserted).
print(f"\n‚úì Testing extract_reciter_pattern():")
test_files = [
    "Muzafar_003.mp3",
    "ÿßŸÑŸÅÿßÿ™ÿ≠ÿ©_001.mp3",  # Surah name in the reciter slot: should not be reported as reciter
    "003_AlAfasy.mp3",
    "Bakara_002.mp3"    # Should reject Bakara as reciter
]
for fname in test_files:
    sid, reciter = extract_reciter_pattern(fname)
    print(f"   {fname}")
    print(f"      Surah: {sid}, Reciter: {reciter}")

print("\n" + "="*70)

In [None]:
# CELL 3 — FILENAME SCANNER EXECUTION (Matcher + Scan)

"""
Main execution for the STRICT filename Surah detector.
Uses the helpers defined in Cell 2.
Re-run this cell for new files.
"""

# ============================================================================
# MAIN MATCHER
# ============================================================================

def match_filename_to_surah(filename, threshold=85):
    """Resolve *filename* to ``(surah_id, confidence, method, reciter)``.

    Strategies are tried on the filename stem in priority order and the first
    hit wins:

    1. stand-alone number        -> confidence 100, method "numeric"
    2. reciter/number pattern    -> confidence 95,  method "reciter_pattern"
    3. Arabic fuzzy name match   -> fuzzy score,    method "arabic_fuzzy"
    4. English fuzzy name match  -> fuzzy score,    method "english_fuzzy"

    Only strategy 2 can supply a reciter. ``(None, 0, None, None)`` means no
    strategy matched.
    """
    stem = Path(filename).stem

    numeric_id = extract_numeric_pattern(stem)
    if numeric_id:
        return numeric_id, 100, "numeric", None

    pattern_id, reciter_name = extract_reciter_pattern(stem)
    if pattern_id:
        return pattern_id, 95, "reciter_pattern", reciter_name

    fuzzy_strategies = (
        (fuzzy_match_arabic, "arabic_fuzzy"),
        (fuzzy_match_english, "english_fuzzy"),
    )
    for matcher, label in fuzzy_strategies:
        fuzzy_id, fuzzy_score = matcher(stem, threshold)
        if fuzzy_id:
            return fuzzy_id, fuzzy_score, label, None

    return None, 0, None, None

# ============================================================================
# BATCH SCAN
# ============================================================================

def scan_audio_files(directory=".", threshold=85):
    """Scan *directory* for audio files and resolve each one to a surah.

    Files whose suffix (case-insensitive) is one of .mp3/.wav/.m4a/.flac/.ogg
    are processed in sorted order. A file is reported as unmatched when
    ``detect_conflict`` flags it or when ``match_filename_to_surah`` finds
    nothing. Progress is printed for every file.

    Returns ``(matched, unmatched)``: a list of result dicts (file, surah id,
    names, confidence, method, reciter) and a list of the unmatched paths.
    """
    audio_exts = {".mp3", ".wav", ".m4a", ".flac", ".ogg"}
    candidates = sorted(
        entry for entry in Path(directory).iterdir()
        if entry.suffix.lower() in audio_exts
    )

    matched = []
    unmatched = []

    banner = "="*80
    print(banner)
    print("STRICT SCAN: AUDIO FILES")
    print(banner, "\n")

    for audio in candidates:
        conflict, by_number, by_name = detect_conflict(audio.name)
        if conflict:
            print(f"‚ö†Ô∏è  {audio.name} ‚Äî CONFLICT: number={by_number} vs name={by_name}")
            unmatched.append(audio)
            continue

        surah_id, conf, method, reciter = match_filename_to_surah(audio.name, threshold)

        if not surah_id:
            print(f"‚ùå {audio.name}")
            print("   ‚Üí No match found (Whisper required)\n")
            unmatched.append(audio)
            continue

        data = SURAH_DICT[surah_id]
        matched.append({
            "file": audio,
            "surah_id": surah_id,
            "surah_name_ar": data["ar"],
            "surah_name_en": data["en"][0],
            "confidence": conf,
            "method": method,
            "reciter": reciter  # None when no reciter was extracted
        })

        emoji = "üéØ" if conf == 100 else "‚úì"
        print(f"{emoji} {audio.name}")
        print(f"   ‚Üí {surah_id} {data['ar']} ({data['en'][0]})")
        print(f"   Confidence: {conf}% | Method: {method}")
        if reciter:
            print(f"   Extracted reciter: {reciter}")
        print()

    print(banner)
    print(f"MATCHED: {len(matched)} | REQUIRE WHISPER: {len(unmatched)}")
    print(banner, "\n")

    return matched, unmatched

# ============================================================================
# EXECUTION
# ============================================================================

if __name__ == "__main__":
    matched, unmatched = scan_audio_files(threshold=85)

    # Split the matches: anything found by a deterministic method, or with a
    # confidence of at least 97, is trusted as-is; the rest is queued for
    # Whisper together with the unmatched files.
    HIGH_CONF_FILES = []      # trusted matches, consumed by the organising cell
    NEED_WHISPER_FILES = []   # matched, but not confidently enough

    for item in matched:
        if item["confidence"] < 97 and item["method"] not in ["numeric", "reciter_pattern"]:
            NEED_WHISPER_FILES.append(item["file"])
            continue

        HIGH_CONF_FILES.append(dict(
            path=str(item["file"]),
            surah_id=item["surah_id"],
            ar=item["surah_name_ar"],
            en=item["surah_name_en"],
            audio_file=item["file"].name,
            # English name with underscores, followed by the id without leading zeros.
            base_name=f"{item['surah_name_en'].replace(' ', '_')}_{item['surah_id'].lstrip('0')}",
            reciter=item["reciter"],
        ))

    WHISPER_REQUIRED_FILES = unmatched + NEED_WHISPER_FILES

    print(f"\nSKIP WHISPER ‚Üí {len(HIGH_CONF_FILES)} files (bakara.mp3, 002.mp3, muzafar_003.mp3, etc.)")
    print(f"NEED WHISPER ‚Üí {len(WHISPER_REQUIRED_FILES)} files\n")

In [None]:
# CELL 3-DEBUG — Verify File Scanning Results

# Diagnostic cell: summarises HIGH_CONF_FILES and WHISPER_REQUIRED_FILES if
# they exist in the notebook namespace, and reports an error line otherwise.
print("\n" + "="*70)
print("DEBUG: CELL 3 VERIFICATION")
print("="*70)

# 1. Summarise HIGH_CONF_FILES.
if 'HIGH_CONF_FILES' in globals():
    print(f"\n‚úì HIGH_CONF_FILES: {len(HIGH_CONF_FILES)} files")

    # Print details for at most the first three entries.
    for i, item in enumerate(HIGH_CONF_FILES[:3], 1):
        print(f"\n   {i}. {item['audio_file']}")
        print(f"      Surah: {item['surah_id']} - {item['ar']}")
        print(f"      Extracted reciter: {item.get('reciter', 'None')}")
        print(f"      Path: {item['path']}")

    if len(HIGH_CONF_FILES) > 3:
        print(f"\n   ... and {len(HIGH_CONF_FILES) - 3} more files")

    # Sanity check: no entry should carry a surah name in its reciter field.
    print(f"\n‚úì Checking for surah names incorrectly marked as reciters:")
    surah_as_reciter = [item for item in HIGH_CONF_FILES
                        if item.get('reciter') and is_surah_name(item['reciter'])]
    if surah_as_reciter:
        print(f"   ‚ùå FOUND {len(surah_as_reciter)} files with surah names as reciters!")
        for item in surah_as_reciter[:3]:
            print(f"      ‚Ä¢ {item['audio_file']} ‚Üí reciter='{item['reciter']}'")
    else:
        print(f"   ‚úì No surah names incorrectly marked as reciters")
else:
    print(f"\n‚ùå ERROR: HIGH_CONF_FILES not created!")

# 2. Summarise WHISPER_REQUIRED_FILES (at most the first five names are listed).
if 'WHISPER_REQUIRED_FILES' in globals():
    print(f"\n‚úì WHISPER_REQUIRED_FILES: {len(WHISPER_REQUIRED_FILES)} files")
    if WHISPER_REQUIRED_FILES:
        print(f"   Files needing Whisper:")
        for f in WHISPER_REQUIRED_FILES[:5]:
            print(f"      ‚Ä¢ {f.name}")
        if len(WHISPER_REQUIRED_FILES) > 5:
            print(f"      ... and {len(WHISPER_REQUIRED_FILES) - 5} more")
else:
    print(f"\n‚ùå ERROR: WHISPER_REQUIRED_FILES not created!")

print("\n" + "="*70)

In [None]:
# CELL 4 — FOLDER ORGANIZATION ONLY (NO WHISPER)

import shutil
from pathlib import Path

# One dict per file copied into the reciter folder by the loop further down;
# the transcription and clean-text cells iterate this list.
processed_surahs = []
# Destination paths that already existed and were therefore not copied again.
skipped_files = []

# ==============================================================
# HELPER: Check if extracted reciter is actually a surah name
# ==============================================================
def is_surah_name(name):
    """Return True when *name* equals, after normalisation, any Arabic or English surah name.

    Text containing a character in U+0600-U+06FF is normalised with
    ``normalize_arabic``, anything else with ``normalize_english``; the result
    is compared against every name in ``SURAH_DICT``. Empty or ``None`` input
    yields False.

    NOTE(review): this redefines the ``is_surah_name`` helper declared earlier
    in the notebook, scanning SURAH_DICT directly instead of the lookup tables.
    """
    if not name:
        return False

    if any('\u0600' <= c <= '\u06FF' for c in name):
        name_norm = normalize_arabic(name)
    else:
        name_norm = normalize_english(name)

    return any(
        normalize_arabic(data["ar"]) == name_norm
        or any(normalize_english(en) == name_norm for en in data["en"])
        for data in SURAH_DICT.values()
    )

# ==============================================================
# CHECK IF FILES EXIST
# ==============================================================
# Nothing to organise unless the scan produced at least one trusted match.
if 'HIGH_CONF_FILES' not in globals() or not HIGH_CONF_FILES:
    print("‚ùå No files matched! Run Cell 1.5B first.")
    raise RuntimeError("No HIGH_CONF_FILES found")

print(f"\n{'='*70}")
print(f"ORGANIZING {len(HIGH_CONF_FILES)} MATCHED FILES...")
print(f"{'='*70}\n")

# Step 1: pick ONE reciter name for the whole batch. Prefer the first reciter
# extracted from a filename that is not itself a surah name.
auto_reciter = None

for item in HIGH_CONF_FILES:
    if not item["reciter"] or is_surah_name(item["reciter"]):
        continue
    auto_reciter = item["reciter"].capitalize()
    print(f"‚úì Auto-detected reciter: {auto_reciter}\n")
    break

# No usable reciter in any filename: ask once; an empty answer means "Unknown".
if not auto_reciter:
    auto_reciter = (
        input("Reciter name for ALL files (press Enter for 'Unknown'): ").strip() or "Unknown"
    ).capitalize()
    print()

# All organised files go into a folder named after the reciter.
reciter_folder = Path(auto_reciter)
reciter_folder.mkdir(exist_ok=True)

print(f"‚Üí Saving to: {auto_reciter}/")
print(f"‚Üí Format: SurahName_000_{auto_reciter}.mp3\n")

# ==============================================================
# STEP 2: ORGANIZE ALL FILES (ONE LOOP, NO DUPLICATES)
# ==============================================================
# Step 2: copy every trusted file into the reciter folder under a uniform name.
processed_paths = set()  # source paths already copied in this run

for item in HIGH_CONF_FILES:
    if item["path"] in processed_paths:
        print(f"‚ö† SKIP (duplicate): {item['path']}")
        continue

    sid = item["surah_id"]

    # Flat layout: <Reciter>/<English_name>_<id>_<Reciter><original suffix>
    new_name = f"{item['en'].replace(' ', '_')}_{sid}_{auto_reciter}{Path(item['path']).suffix}"
    new_path = reciter_folder / new_name

    # Never overwrite a file that is already in place.
    if new_path.exists():
        skipped_files.append(str(new_path))
        print(f"‚ö† SKIP (exists): {new_path}")
        continue

    shutil.copy2(item["path"], new_path)
    processed_paths.add(item["path"])

    # Record the copy for the cells that run afterwards.
    processed_surahs.append(dict(
        sid=sid,
        ar=item["ar"],
        en=item["en"],
        organized_audio_path=str(new_path),
        audio_file=new_name,
        base_name=f"{item['en'].replace(' ', '_')}_{sid}_{auto_reciter}",
        path=item["path"],
        source="filename_high_confidence",
    ))

    print(f"‚úì {sid} {item['ar']} ‚Üí {new_name}")

# Summary of the run.
print(f"\n{'='*70}")
print(f"‚úì {len(processed_surahs)} files organized in: {reciter_folder}/")
if skipped_files:
    print(f"‚ö† Skipped {len(skipped_files)} existing files")
print(f"{'='*70}")
print(f"\n‚ú® Ready for Cell 2C")

In [None]:
# CELL 5 — FIXED: OpenAI Whisper + Duration Correct
import torch
import gc
import whisper
from pathlib import Path
import json
from tqdm import tqdm

# Release cached GPU memory and run a garbage-collection pass before loading the model.
torch.cuda.empty_cache()
gc.collect()
print(f"RTX 5070 clean start")

# The transcription loop works on the files recorded in processed_surahs.
if 'processed_surahs' not in globals() or not processed_surahs:
    raise RuntimeError("Run CELL 4 first!")

print("\n" + "="*80)
print("TRANSCRIPTION ‚Äî OPENAI WHISPER (RTX 5070 GUARANTEED)")
print("="*80)

# Load the "medium" Whisper model onto the CUDA device.
# NOTE(review): device is hard-coded to "cuda" — presumably this fails on a
# machine without a CUDA GPU; confirm that is acceptable.
model = whisper.load_model("medium", device="cuda")  # Medium = fast, stable
print("Model loaded")

# Transcribe every organised audio file and write one JSON file per surah
# containing metadata plus word-level timestamps.
for item in processed_surahs:
    audio_path = Path(item['organized_audio_path'])
    if not audio_path.exists():
        print(f"Missing: {audio_path}")
        continue

    print(f"\nSurah: {item['sid']} {item['ar']} | {audio_path.name}")

    result = model.transcribe(
        str(audio_path),
        language="ar",
        word_timestamps=True,
        no_speech_threshold=0.2,
        logprob_threshold=-1.0,  # Even more permissive
        compression_ratio_threshold=2.0,
        condition_on_previous_text=False  # Don't stop early
    )

    segments = result["segments"]

    # Flatten the per-segment word lists, converting seconds to milliseconds.
    words = []
    for seg in tqdm(segments, desc="Words"):
        for w in seg.get("words", []):
            words.append({
                "word": w["word"].strip(),
                "start_ms": int(w["start"] * 1000),
                "end_ms": int(w["end"] * 1000),
                "confidence": round(w["probability"], 4)
            })

    json_file = audio_path.parent / f"{item['base_name']}_DELETE.json"

    # Duration is taken as the end time of the last segment. When the
    # transcription yields no segments at all, indexing [-1] would raise
    # IndexError and abort the whole batch, so fall back to 0.0.
    duration_seconds = round(segments[-1]['end'], 2) if segments else 0.0

    metadata = {
        "surah_id": item['sid'],
        "surah_name_ar": item['ar'],
        "surah_name_en": item['en'],
        "audio_file": audio_path.name,
        "base_name": item['base_name'],
        "duration_seconds": duration_seconds,
        "total_words": len(words),
        "model": "openai-whisper medium"
    }

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump({"metadata": metadata, "words": words}, f, ensure_ascii=False, indent=2)

    print(f"   Words: {len(words):,} | Duration: {metadata['duration_seconds']/60:.1f} min | Saved: {json_file.name}")

    # Free GPU memory between files.
    torch.cuda.empty_cache()
    gc.collect()



In [None]:
# CELL 5.5 — Extract Clean Quran Text (CORRECT VERSION)

from pathlib import Path
import json

# Requires the list of organised files produced by the organising cell.
if 'processed_surahs' not in globals() or not processed_surahs:
    raise RuntimeError("‚ùå Run Cell 4 first!")

# Output folder: the parent of the first organised audio file, unless a
# GLOBAL_RECITER variable exists, in which case that name is used.
if 'GLOBAL_RECITER' not in globals():
    first_path = Path(processed_surahs[0]['organized_audio_path'])
    reciter_folder = first_path.parent
else:
    reciter_folder = Path(GLOBAL_RECITER)

print("\n" + "="*70)
print("EXTRACTING CLEAN QURAN TEXT")
print("="*70)

# Reference text the clean files are generated from.
with open("quran-no-tashkeel.json", "r", encoding="utf-8") as f:
    QURAN = json.load(f)

for item in processed_surahs:
    sid = item['sid']
    base_name = item['base_name']

    # Locate the surah by its integer id.
    surah_num = int(sid)
    surah_data = next((s for s in QURAN if s['id'] == surah_num), None)

    if not surah_data:
        print(f"‚ùå Surah {sid} not found!")
        continue

    # Write every verse of the surah, one "(verse id) text" line each.
    clean_txt = reciter_folder / f"{base_name}_CLEAN.txt"

    with open(clean_txt, 'w', encoding='utf-8') as f:
        f.writelines(
            f"({verse['id']}) {verse['text']}\n" for verse in surah_data['verses']
        )

    print(f"‚úì {sid} {item['ar']} ‚Üí {len(surah_data['verses'])} ayas")

print("\n" + "="*70)
print(f"‚úÖ Clean text files created!")
print("="*70)

In [None]:
# CELL 6 — ROBUST AYA ALIGNMENT WITH CONFIDENCE SCORING

from pathlib import Path
import json
import re
from rapidfuzz import fuzz
from typing import List, Dict, Tuple, Optional

print("AUTO ALIGNMENT: aya anchors + smart gap filling + confidence scoring\n")

# Module-level counters, both starting at zero.
# NOTE(review): presumably incremented by the alignment driver later in the
# file — not visible here; confirm.
aligned = 0
processed = 0

# ============================================================================
# NORMALIZATION
# ============================================================================
def normalize(text: str) -> str:
    """Return *text* in the canonical form used to compare Quran and Whisper words.

    Applies, in order: removal of diacritics (U+064B-U+0652, U+0670), alif
    folding, ta marbuta folding and alif maksura folding, then strips
    surrounding whitespace.
    """
    substitutions = (
        (r'[\u064B-\u0652\u0670]', ''),   # diacritics
        (r'[ÿ£ÿ•ÿ¢]', 'ÿß'),              # alif variants
        (r'ÿ©', 'Ÿá'),                    # ta marbuta
        (r'Ÿâ', 'Ÿä'),                    # alif maksura
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# ============================================================================
# ANCHOR MATCHING WITH CONFIDENCE - MULTI-WORD CONTEXT
# ============================================================================
def find_aya_anchor(
    aya_words: List[str],
    whisper_words: List[Dict],
    start_search_pos: int,
    search_window: int = 20
) -> Tuple[int, float]:
    """
    Locate where an aya most likely begins in the Whisper word list.

    The first (up to) three normalised aya words are compared, as one string,
    with equally sized runs of Whisper words starting at each position in
    ``[start_search_pos, start_search_pos + search_window)``. The position
    with the highest similarity wins; scanning stops early at the first
    position scoring 85 or more.

    Returns ``(position, confidence)`` with confidence on a 0-1 scale. With no
    aya words, or a start position beyond the transcript, the result is
    ``(start_search_pos, 0.0)``.
    """
    if not aya_words or start_search_pos >= len(whisper_words):
        return start_search_pos, 0.0

    total = len(whisper_words)
    span = min(3, len(aya_words))
    target = " ".join(normalize(word) for word in aya_words[:span])

    anchor = start_search_pos
    top_score = 0.0
    limit = min(start_search_pos + search_window, total)

    for idx in range(start_search_pos, limit):
        # Same-sized window of Whisper words (shorter near the end of the list).
        candidate = " ".join(
            normalize(whisper_words[j]["word"])
            for j in range(idx, min(idx + span, total))
        )
        score = fuzz.ratio(candidate, target)

        if score <= top_score:
            continue

        anchor, top_score = idx, score
        if score >= 85:
            # Good enough for a multi-word match; stop scanning.
            break

    return anchor, top_score / 100.0

# ============================================================================
# WORD ALIGNMENT WITH CONFIDENCE
# ============================================================================
def align_words_with_confidence(
    quran_words: List[str],
    whisper_words: List[Dict],
    whisper_start_pos: int
) -> Tuple[List[Dict], float, int]:
    """
    Pair Quran words one-to-one with consecutive Whisper words and score each pair.

    Starting at ``whisper_start_pos``, the i-th Quran word takes the timing of
    the i-th following Whisper word. Pairing stops when the Whisper words run
    out, so the result may be shorter than ``quran_words``.

    Each entry's confidence is 70% text similarity (after ``normalize``) plus
    30% Whisper's own word confidence (0.95 when the word carries none);
    ``matched`` is True when the text similarity exceeds 0.6.

    Returns ``(aligned_words, average_confidence, next_whisper_pos)`` where
    ``next_whisper_pos`` is the index of the first Whisper word not consumed.
    The average is 0.0 when nothing was aligned.
    """
    # Named aligned_words so it does not shadow the module-level `aligned` counter.
    aligned_words = []
    confidences = []
    whisper_pos = whisper_start_pos

    for quran_word in quran_words:
        if whisper_pos >= len(whisper_words):
            # Out of whisper words - the caller will need gap filling
            break

        whisper_word = whisper_words[whisper_pos]
        quran_norm = normalize(quran_word)
        whisper_norm = normalize(whisper_word["word"])

        # Text similarity of the pair on a 0-1 scale
        match_score = fuzz.ratio(quran_norm, whisper_norm) / 100.0

        # Blend with whisper's own confidence for the word
        whisper_confidence = whisper_word.get("confidence", 0.95)
        combined_confidence = (match_score * 0.7) + (whisper_confidence * 0.3)

        aligned_words.append({
            "word": quran_word,
            "start_ms": whisper_word["start_ms"],
            "end_ms": whisper_word["end_ms"],
            "confidence": round(combined_confidence, 4),
            "matched": match_score > 0.6  # Flag if direct match
        })

        confidences.append(combined_confidence)
        whisper_pos += 1

    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

    return aligned_words, avg_confidence, whisper_pos

# ============================================================================
# SMART GAP FILLING
# ============================================================================
def fill_missing_words(
    aya_words: List[str],
    aligned_words: List[Dict],
    whisper_words: List[Dict],
    next_whisper_pos: int
) -> List[Dict]:
    """
    Append estimated timings for the aya words that were never aligned.

    ``aligned_words`` is extended in place (and also returned) so that it
    ends up with one record per entry of ``aya_words``.

    * Some words aligned: the missing words continue from the end of the
      last aligned word. Their shared duration is the gap up to the next
      Whisper word split evenly, or the average aligned-word duration when
      there is no next word or no gap, and never less than 300 ms.
      Confidence is 0.30.
    * Nothing aligned: each word gets 600 ms, laid out so the block ends at
      the next Whisper word when one exists (clamped at 0), otherwise
      starting at 0. Confidence is 0.20.
    """
    leftover = aya_words[len(aligned_words):]
    if not leftover:
        return aligned_words

    has_next_word = next_whisper_pos < len(whisper_words)

    if not aligned_words:
        # Whole aya missing: fixed per-word duration, anchored backwards
        # from the next transcribed word when there is one.
        step = 600
        origin = 0
        if has_next_word:
            anchor = whisper_words[next_whisper_pos]["start_ms"]
            origin = max(0, anchor - step * len(leftover))
        score = 0.20
    else:
        # Continue right after the last word that did align.
        origin = aligned_words[-1]["end_ms"]
        spans = [w["end_ms"] - w["start_ms"] for w in aligned_words]
        mean_span = sum(spans) // len(spans) if spans else 500

        if has_next_word:
            budget = max(whisper_words[next_whisper_pos]["start_ms"] - origin, 0)
        else:
            budget = mean_span * len(leftover)

        step = budget // len(leftover) if budget > 0 else mean_span
        step = max(step, 300)  # Minimum 300ms per word
        score = 0.30

    for offset, word in enumerate(leftover):
        begin = origin + (step * offset)
        aligned_words.append({
            "word": word,
            "start_ms": begin,
            "end_ms": begin + step,
            "confidence": score,
            "matched": False
        })

    return aligned_words

# ============================================================================
# MAIN ALIGNMENT LOOP
# ============================================================================
# Align every "<base>_DELETE.json" Whisper transcription against its
# "<base>_CLEAN.txt" reference text and write "<base>_aligned.json" next to it.
# `aligned` counts files that already had an output; `processed` counts files
# whose output was actually written during this run.
for whisper_json in sorted(Path(".").rglob("*_DELETE.json")):
    aligned_json = whisper_json.with_name(
        whisper_json.stem.replace("_DELETE", "") + "_aligned.json"
    )
    
    # Never redo work: an existing output means this file is already aligned
    if aligned_json.exists():
        aligned += 1
        continue
    
    print(f"Aligning ‚Üí {whisper_json.name}")

    # Load whisper transcription
    try:
        with open(whisper_json, encoding="utf-8") as f:
            data = json.load(f)
            whisper_words = data.get("words", [])
            metadata = data.get("metadata", {})
    except Exception as e:
        print(f"  ‚ùå ERROR loading {whisper_json.name}: {e}")
        continue

    # Load clean Quran text
    clean_txt = whisper_json.with_name(
        whisper_json.stem.replace("_DELETE", "") + "_CLEAN.txt"
    )
    
    if not clean_txt.exists():
        print(f"  ‚ùå ERROR: {clean_txt.name} not found!")
        continue
    
    try:
        with open(clean_txt, encoding="utf-8") as f:
            # One aya per non-blank line; a leading "(N)" numbering is stripped
            clean_lines = [
                re.sub(r"^\(\d+\)\s*", "", l.strip()) 
                for l in f if l.strip()
            ]
    except Exception as e:
        print(f"  ‚ùå ERROR loading {clean_txt.name}: {e}")
        continue

    # Validate data
    if not whisper_words:
        print(f"  ‚ö†Ô∏è  WARNING: No whisper words found!")
        continue
    
    if not clean_lines:
        print(f"  ‚ö†Ô∏è  WARNING: No Quran text found!")
        continue

    # ========================================================================
    # ALIGN EACH AYA
    # ========================================================================
    ayas = []
    global_whisper_pos = 0  # Index of the next unconsumed whisper word
    low_confidence_ayas = []
    
    for aya_num, aya_text in enumerate(clean_lines, 1):
        aya_words = aya_text.split()
        
        if not aya_words:
            continue
        
        # STEP 0: Validate with previous aya's end boundary
        expected_min_start = 0
        if ayas:  # We have previous aya
            prev_end = ayas[-1]["end_ms"]
            expected_min_start = prev_end
            
            # Ensure we don't search before previous aya ended
            if global_whisper_pos < len(whisper_words):
                while (global_whisper_pos < len(whisper_words) and 
                       whisper_words[global_whisper_pos]["start_ms"] < expected_min_start):
                    global_whisper_pos += 1
        
        # STEP 1: Find anchor point for this aya
        # (anchor_confidence is returned but not used further in this loop)
        anchor_pos, anchor_confidence = find_aya_anchor(
            aya_words, 
            whisper_words, 
            global_whisper_pos,
            search_window=20
        )
        
        # STEP 2: Align words from anchor
        aligned_words, word_confidence, next_pos = align_words_with_confidence(
            aya_words,
            whisper_words,
            anchor_pos
        )
        
        # STEP 3: Fill any missing words
        aligned_words = fill_missing_words(
            aya_words,
            aligned_words,
            whisper_words,
            next_pos
        )
        
        # STEP 4: Calculate overall aya confidence
        word_confidences = [w["confidence"] for w in aligned_words]
        aya_confidence = sum(word_confidences) / len(word_confidences) if word_confidences else 0.0
        
        # Penalize unmatched words. NOTE: this counts every word whose
        # "matched" flag is False, i.e. gap-filled words AND directly aligned
        # words whose text similarity was too low.
        gap_filled_count = sum(1 for w in aligned_words if not w.get("matched", False))
        if gap_filled_count > 0:
            gap_penalty = gap_filled_count / len(aligned_words)
            aya_confidence *= (1.0 - (gap_penalty * 0.3))  # Reduce by up to 30%
        
        # STEP 5: Validate alignment quality
        if len(aligned_words) != len(aya_words):
            print(f"  ‚ö†Ô∏è  Aya {aya_num}: Word count mismatch! " +
                  f"Expected {len(aya_words)}, got {len(aligned_words)}")
            # Force correct count by trimming or padding
            if len(aligned_words) > len(aya_words):
                aligned_words = aligned_words[:len(aya_words)]
            else:
                # Should not happen after gap filling, but add safety
                while len(aligned_words) < len(aya_words):
                    last_end = aligned_words[-1]["end_ms"] if aligned_words else 0
                    aligned_words.append({
                        "word": aya_words[len(aligned_words)],
                        "start_ms": last_end,
                        "end_ms": last_end + 500,
                        "confidence": 0.10,
                        "matched": False
                    })
        
        # STEP 5.5: Lookahead validation (check next aya aligns properly)
        lookahead_confidence = 1.0
        if aya_num < len(clean_lines):  # Not the last aya
            # aya_num is 1-based, so it indexes the NEXT line of clean_lines
            next_aya_words = clean_lines[aya_num].split()
            if next_aya_words and next_pos < len(whisper_words):
                # Quick check: do the next aya's first two words appear
                # within the next 10 whisper positions?
                next_context = " ".join(normalize(w) for w in next_aya_words[:2])
                found_next = False
                for peek_pos in range(next_pos, min(next_pos + 10, len(whisper_words))):
                    peek_context = " ".join(
                        normalize(whisper_words[peek_pos + j]["word"]) 
                        for j in range(min(2, len(whisper_words) - peek_pos))
                    )
                    if fuzz.ratio(peek_context, next_context) > 75:
                        found_next = True
                        break
                
                if not found_next:
                    lookahead_confidence = 0.7  # Penalty if next aya doesn't align
                    print(f"  ‚ö†Ô∏è  Aya {aya_num}: Next aya boundary uncertain")
        
        # Apply lookahead penalty to aya confidence
        aya_confidence *= lookahead_confidence
        
        # STEP 6: Create aya record
        aya_record = {
            "aya_number": aya_num,
            "text": aya_text,
            "words": aligned_words,
            "start_ms": aligned_words[0]["start_ms"] if aligned_words else 0,
            "end_ms": aligned_words[-1]["end_ms"] if aligned_words else 0,
            "confidence": round(aya_confidence, 4),
            "word_count": len(aligned_words),
            "matched_words": sum(1 for w in aligned_words if w.get("matched", False)),
            "gap_filled_words": gap_filled_count
        }
        
        ayas.append(aya_record)
        
        # Track low confidence for review
        if aya_confidence < 0.7:
            low_confidence_ayas.append(aya_num)
        
        # STEP 7: Update global position
        global_whisper_pos = next_pos
    
    # ========================================================================
    # SAVE ALIGNED DATA
    # ========================================================================
    output_data = {
        "metadata": {
            **metadata,
            "alignment_version": "v2_robust",
            "total_ayas": len(ayas),
            "low_confidence_count": len(low_confidence_ayas),
            "average_confidence": round(
                sum(a["confidence"] for a in ayas) / len(ayas) if ayas else 0.0, 
                4
            )
        },
        "ayas": ayas,
        "low_confidence_ayas": low_confidence_ayas
    }
    
    with open(aligned_json, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    # Count the file only once its output exists, so files that failed to
    # load or validate are not reported as "Newly aligned".
    processed += 1

    # Print summary
    avg_conf = output_data["metadata"]["average_confidence"]
    print(f"    ‚úì {len(ayas)} ayas aligned | Avg confidence: {avg_conf:.2%}")
    
    if low_confidence_ayas:
        print(f"    ‚ö†Ô∏è  {len(low_confidence_ayas)} ayas need review: {low_confidence_ayas[:5]}" + 
              ("..." if len(low_confidence_ayas) > 5 else ""))

# ============================================================================
# CLEANUP
# ============================================================================
print(f"\n{'='*70}")
print("CLEANING UP...")
print('='*70)

# Remove the temporary Whisper transcriptions, but only those whose aligned
# output exists. A transcription whose alignment failed (missing text file,
# load error, empty data) is kept so the alignment can be retried without
# re-transcribing the audio.
deleted_count = 0
kept_count = 0
# Materialise the listing first so files are not unlinked while the
# directory tree is still being walked.
for delete_json in sorted(Path(".").rglob("*_DELETE.json")):
    aligned_output = delete_json.with_name(
        delete_json.stem.replace("_DELETE", "") + "_aligned.json"
    )
    if not aligned_output.exists():
        print(f"Kept {delete_json.name} (no aligned output yet)")
        kept_count += 1
        continue

    delete_json.unlink()
    print(f"‚úì Deleted {delete_json.name}")
    deleted_count += 1

print(f"\n‚úì Removed {deleted_count} temporary _DELETE.json files")
if kept_count:
    print(f"Kept {kept_count} _DELETE.json files that still need alignment")
print('='*70)

# ============================================================================
# FINAL SUMMARY
# ============================================================================
# Report how many files were skipped as already aligned, how many were
# counted in `processed` during this run, and the sum of both.
summary_lines = [
    "\nALIGNMENT COMPLETE",
    f"   Already aligned  : {aligned}",
    f"   Newly aligned    : {processed}",
    f"   Total processed  : {aligned + processed}",
    '=' * 70,
]
print("\n".join(summary_lines))

In [None]:
# CELL 6.5 — AUDIO VERIFICATION & RE-ALIGNMENT (LITERAL WHISPER CHECK)

"""
This cell takes low-confidence aligned ayas and LITERALLY re-transcribes 
the audio segment using Whisper to verify/correct the alignment.
"""

import torch
import gc
import whisper
from pathlib import Path
import json
from pydub import AudioSegment
import tempfile
from rapidfuzz import fuzz
import re

# ============================================================================
# CONFIG
# ============================================================================
CONFIDENCE_THRESHOLD = 0.7  # Re-verify ayas below this
MIN_AYA_DURATION_MS = 1000  # Skip very short segments (< 1 second)
VERIFICATION_MODEL = "base"  # Fast model for verification (base/small/medium)
# Use the GPU when one is available; otherwise fall back to the CPU instead
# of failing while loading the model.
VERIFICATION_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ============================================================================
# SETUP
# ============================================================================
# Release cached GPU memory left over from earlier cells before loading
torch.cuda.empty_cache()
gc.collect()

print("=" * 80)
print("AUDIO VERIFICATION ‚Äî LITERAL WHISPER RE-TRANSCRIPTION")
print("=" * 80)
print(f"Confidence threshold: {CONFIDENCE_THRESHOLD}")
print(f"Verification model: {VERIFICATION_MODEL}")
print(f"Device: {VERIFICATION_DEVICE}")
print()

# Load Whisper model
print("Loading Whisper model...")
model = whisper.load_model(VERIFICATION_MODEL, device=VERIFICATION_DEVICE)
print("‚úì Model loaded\n")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def normalize(text: str) -> str:
    """
    Normalize Arabic text for fuzzy comparison.

    Applies four regex substitutions in a fixed order and strips surrounding
    whitespace. The first removes the code points U+064B-U+0652 and U+0670
    (Arabic diacritic marks). The remaining three rewrite letter variants.
    NOTE(review): those three literals look mis-encoded in this copy of the
    file; confirm them against the original source.
    """
    substitutions = (
        (r'[\u064B-\u0652\u0670]', ''),
        (r'[ÿ£ÿ•ÿ¢]', 'ÿß'),
        (r'ÿ©', 'Ÿá'),
        (r'Ÿâ', 'Ÿä'),
    )
    # Order matters: each rule sees the output of the previous one
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def extract_audio_segment(audio_path: Path, start_ms: int, end_ms: int) -> Path:
    """
    Export the [start_ms, end_ms) slice of an audio file to a temporary WAV.

    Args:
        audio_path: Source audio file (any format pydub can open).
        start_ms: Slice start in milliseconds.
        end_ms: Slice end in milliseconds.

    Returns:
        Path of the temporary ".wav" file. The caller owns it and must
        delete it when done.
    """
    audio = AudioSegment.from_file(str(audio_path))
    segment = audio[start_ms:end_ms]
    
    # Reserve a unique temp path and close our own handle right away, so no
    # file descriptor is leaked and the export can reopen the path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = Path(temp_file.name)
    
    try:
        # export() hands back the open output file; close it explicitly
        exported = segment.export(str(temp_path), format="wav")
        exported.close()
    except Exception:
        # Do not leave a half-written temp file behind on failure
        if temp_path.exists():
            temp_path.unlink()
        raise
    
    return temp_path

def transcribe_segment(audio_path: Path) -> dict:
    """
    Run the module-level Whisper ``model`` on one audio file.

    Transcription is forced to Arabic, requests word-level timestamps and
    suppresses progress output. Returns the result of ``model.transcribe``
    unchanged.
    """
    options = {
        "language": "ar",
        "word_timestamps": True,
        "verbose": False,
    }
    return model.transcribe(str(audio_path), **options)

def compare_transcriptions(expected: str, actual: str) -> float:
    """
    Score how closely ``actual`` matches ``expected`` on a 0.0-1.0 scale.

    Both texts are passed through ``normalize`` and compared with
    ``fuzz.token_sort_ratio``, so word order does not affect the score.
    """
    score = fuzz.token_sort_ratio(normalize(expected), normalize(actual))
    return score / 100.0

def realign_aya_words(
    aya_words: list,
    whisper_result: dict,
    original_start_ms: int
) -> list:
    """
    Re-align aya words against a fresh Whisper transcription of the segment.

    Word timestamps in ``whisper_result`` are relative to the extracted
    segment, so ``original_start_ms`` is added to make them absolute. Aya
    words are paired with transcribed words by position; once the
    transcription runs out, each remaining word gets a 500 ms slot chained
    after the previous one (or starting at ``original_start_ms`` when
    nothing was transcribed) with confidence 0.30.

    Returns one record per aya word, each flagged ``"verified": True``.
    """
    # Flatten segment -> word into absolute-time records
    heard_words = [
        {
            "word": w["word"].strip(),
            "start_ms": int(w["start"] * 1000) + original_start_ms,  # Offset to absolute time
            "end_ms": int(w["end"] * 1000) + original_start_ms,
            "confidence": round(w.get("probability", 0.95), 4)
        }
        for seg in whisper_result.get("segments", [])
        for w in seg.get("words", [])
    ]

    records = []
    for index, quran_word in enumerate(aya_words):
        if index < len(heard_words):
            heard = heard_words[index]
            quran_norm = normalize(quran_word)
            heard_norm = normalize(heard["word"])

            # 70% text similarity + 30% transcriber confidence
            match_score = fuzz.ratio(quran_norm, heard_norm) / 100.0
            combined = (match_score * 0.7) + (heard["confidence"] * 0.3)

            records.append({
                "word": quran_word,
                "start_ms": heard["start_ms"],
                "end_ms": heard["end_ms"],
                "confidence": round(combined, 4),
                "matched": match_score > 0.6,
                "verified": True  # Flag that this was verified
            })
        else:
            # Transcription exhausted: estimate a 500 ms slot
            begin = records[-1]["end_ms"] if records else original_start_ms
            records.append({
                "word": quran_word,
                "start_ms": begin,
                "end_ms": begin + 500,
                "confidence": 0.30,
                "matched": False,
                "verified": True
            })

    return records

# ============================================================================
# MAIN VERIFICATION LOOP
# ============================================================================
# Run counters: files rewritten, ayas whose record changed, files with
# nothing to verify, and files that could not be processed at all.
verified_count = 0
improved_count = 0
skipped_count = 0
failed_count = 0

for aligned_json in sorted(Path(".").rglob("*_aligned.json")):
    print(f"\nProcessing: {aligned_json.name}")
    
    # Load aligned data
    try:
        with open(aligned_json, encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"  ‚ùå Error loading: {e}")
        failed_count += 1
        continue
    
    # Find corresponding audio file. The metadata may lack the entry, so
    # treat that as a per-file failure instead of aborting the whole run.
    audio_name = data.get("metadata", {}).get("audio_file")
    if not audio_name:
        print("  ERROR: metadata has no 'audio_file' entry")
        failed_count += 1
        continue
    
    audio_path = aligned_json.parent / audio_name
    if not audio_path.exists():
        print(f"  ‚ùå Audio file not found: {audio_path.name}")
        failed_count += 1
        continue
    
    # Find low-confidence ayas
    ayas = data.get("ayas", [])
    low_conf_ayas = [a for a in ayas if a.get("confidence", 1.0) < CONFIDENCE_THRESHOLD]
    
    if not low_conf_ayas:
        print(f"  ‚úì All ayas above threshold ({CONFIDENCE_THRESHOLD})")
        skipped_count += 1
        continue
    
    print(f"  Found {len(low_conf_ayas)} low-confidence ayas to verify")
    
    # Verify each low-confidence aya (the dicts are shared with `ayas`, so
    # edits made here are what gets saved)
    changes_made = False
    
    for aya in low_conf_ayas:
        aya_num = aya["aya_number"]
        aya_text = aya["text"]
        aya_words = aya_text.split()
        start_ms = aya["start_ms"]
        end_ms = aya["end_ms"]
        duration_ms = end_ms - start_ms
        old_confidence = aya["confidence"]
        
        # Skip very short segments
        if duration_ms < MIN_AYA_DURATION_MS:
            print(f"    Aya {aya_num}: Too short ({duration_ms}ms), skipping")
            continue
        
        print(f"    Aya {aya_num}: Conf={old_confidence:.2%}, Duration={duration_ms/1000:.1f}s")
        
        temp_audio = None
        try:
            # STEP 1: Extract audio segment
            temp_audio = extract_audio_segment(audio_path, start_ms, end_ms)
            
            # STEP 2: Re-transcribe with Whisper
            whisper_result = transcribe_segment(temp_audio)
            actual_text = whisper_result.get("text", "").strip()
            
            # STEP 3: Compare transcriptions
            similarity = compare_transcriptions(aya_text, actual_text)
            
            print(f"      Expected: {aya_text[:50]}...")
            print(f"      Got:      {actual_text[:50]}...")
            print(f"      Match:    {similarity:.2%}")
            
            # STEP 4: Decide action based on similarity
            if similarity < 0.5:
                # Very different - major alignment issue
                print(f"      ‚ö†Ô∏è  MAJOR MISMATCH - Re-aligning...")
                
                # Re-align words with new transcription
                new_aligned_words = realign_aya_words(
                    aya_words,
                    whisper_result,
                    start_ms
                )
                
                # Calculate new confidence
                new_confidences = [w["confidence"] for w in new_aligned_words]
                new_confidence = sum(new_confidences) / len(new_confidences)
                
                # Update aya; keep both boundaries in step with the new
                # words so start_ms/end_ms still describe the first/last word
                aya["words"] = new_aligned_words
                aya["confidence"] = round(new_confidence, 4)
                aya["start_ms"] = new_aligned_words[0]["start_ms"]
                aya["end_ms"] = new_aligned_words[-1]["end_ms"]
                aya["verification_status"] = "realigned"
                aya["similarity_before"] = round(similarity, 4)
                
                changes_made = True
                improved_count += 1
                
                print(f"      ‚úì Re-aligned: {old_confidence:.2%} ‚Üí {new_confidence:.2%}")
                
            elif similarity < 0.8:
                # Moderate match - boost confidence but keep timing
                print(f"      ‚ö†Ô∏è  Moderate match - adjusting confidence...")
                
                # Boost confidence based on similarity, capped at 0.95
                confidence_boost = similarity * 0.3  # Up to 30% boost
                new_confidence = min(old_confidence + confidence_boost, 0.95)
                
                aya["confidence"] = round(new_confidence, 4)
                aya["verification_status"] = "confidence_adjusted"
                aya["similarity_score"] = round(similarity, 4)
                
                changes_made = True
                improved_count += 1
                
                print(f"      ‚úì Adjusted: {old_confidence:.2%} ‚Üí {new_confidence:.2%}")
                
            else:
                # Good match - confirm alignment is correct
                print(f"      ‚úì VERIFIED - Alignment correct")
                
                aya["verification_status"] = "verified_correct"
                aya["similarity_score"] = round(similarity, 4)
                changes_made = True
            
        except Exception as e:
            print(f"      ‚ùå Verification failed: {e}")
            aya["verification_status"] = "verification_failed"
            aya["error"] = str(e)
        finally:
            # Always remove the temp segment, also when verification failed
            if temp_audio is not None and temp_audio.exists():
                temp_audio.unlink()
            
            # Clear CUDA cache
            torch.cuda.empty_cache()
            gc.collect()
    
    # Save updated data if changes were made
    if changes_made:
        # Update metadata
        data["metadata"]["verification_version"] = "v1_audio_check"
        data["metadata"]["verified_ayas"] = sum(
            1 for a in ayas if "verification_status" in a
        )
        
        # Recalculate average confidence
        confidences = [a["confidence"] for a in ayas]
        data["metadata"]["average_confidence"] = round(
            sum(confidences) / len(confidences), 4
        )
        
        # Update low confidence list
        data["low_confidence_ayas"] = [
            a["aya_number"] for a in ayas 
            if a.get("confidence", 1.0) < CONFIDENCE_THRESHOLD
        ]
        data["metadata"]["low_confidence_count"] = len(data["low_confidence_ayas"])
        
        # Save
        with open(aligned_json, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        
        verified_count += 1
        print(f"  ‚úì Updated and saved")

# ============================================================================
# SUMMARY
# ============================================================================
# Print the run totals, then release the Whisper model and cached GPU memory.
rule = "=" * 80
print("\n" + rule)
print("VERIFICATION COMPLETE")
print(rule)
for label, count in (
    ("Files verified and updated: ", verified_count),
    ("Ayas improved/corrected:    ", improved_count),
    ("Files skipped (all good):   ", skipped_count),
    ("Files failed:               ", failed_count),
):
    print(f"{label}{count}")
print(rule)

# Cleanup
del model
torch.cuda.empty_cache()
gc.collect()
print("\n‚úì GPU memory cleared")

In [None]:
# CELL 6.5 enhancement — AUDIO VERIFICATION WITH WHISPERX + FINE-TUNE

import torch
import gc
import whisperx
from pathlib import Path
import json
from pydub import AudioSegment
import tempfile
from rapidfuzz import fuzz
import re

# ============================================================================
# CONFIG
# ============================================================================
# Ayas whose stored confidence is below this value are re-verified.
CONFIDENCE_THRESHOLD = 0.7
# Ayas whose aligned span is shorter than this (in ms) are skipped.
MIN_AYA_DURATION_MS = 1000
# Device passed to model loading and to whisperx.align.
DEVICE = "cuda"
# Passed as compute_type to whisperx.load_model.
COMPUTE_TYPE = "float16"
# Passed as batch_size to model.transcribe.
BATCH_SIZE = 16
# Passed to whisperx.load_model.
# NOTE(review): confirm this model id is in a format whisperx.load_model accepts.
MODEL_NAME = "tarteel-ai/whisper-base-ar-quran"
# NOTE(review): this constant is not referenced anywhere else in this cell;
# load_align_model below is called with language_code="ar" only.
ALIGN_MODEL = "WAV2VEC2_ASR_BASE_960H"  # Or suitable AR model

# ============================================================================
# SETUP
# ============================================================================
# Release cached GPU memory left over from earlier cells before loading models.
torch.cuda.empty_cache()
gc.collect()
print("=" * 80)
print("AUDIO VERIFICATION ‚Äî WHISPERX WITH QURAN FINE-TUNE")
print("=" * 80)
print(f"Threshold: {CONFIDENCE_THRESHOLD} | Model: {MODEL_NAME}")

# Load WhisperX model (used by transcribe_segment for transcription)
model = whisperx.load_model(MODEL_NAME, DEVICE, compute_type=COMPUTE_TYPE)

# Load alignment model (auto for AR); both returned values are passed to
# whisperx.align in transcribe_segment
align_model, align_metadata = whisperx.load_align_model(language_code="ar", device=DEVICE)

print("‚úì Models loaded\n")

# ============================================================================
# HELPERS
# ============================================================================
def normalize(text: str) -> str:
    """
    Normalize Arabic text for fuzzy comparison.

    Runs four regex substitutions in order, then strips surrounding
    whitespace. The first drops the code points U+064B-U+0652 and U+0670
    (Arabic diacritic marks); the other three rewrite letter variants.
    NOTE(review): those three literals look mis-encoded in this copy of the
    file; confirm them against the original source.
    """
    rules = [
        (r'[\u064B-\u0652\u0670]', ''),
        (r'[ÿ£ÿ•ÿ¢]', 'ÿß'),
        (r'ÿ©', 'Ÿá'),
        (r'Ÿâ', 'Ÿä'),
    ]
    cleaned = text
    # Sequential on purpose: every rule works on the previous rule's output
    for regex, substitute in rules:
        cleaned = re.sub(regex, substitute, cleaned)
    return cleaned.strip()

def extract_audio_segment(audio_path: Path, start_ms: int, end_ms: int) -> AudioSegment:
    """Load ``audio_path`` and return its [start_ms, end_ms) slice in memory."""
    full_recording = AudioSegment.from_file(str(audio_path))
    clip = full_recording[start_ms:end_ms]
    return clip

def transcribe_segment(audio: AudioSegment) -> dict:
    """
    Transcribe an in-memory audio clip with WhisperX and force-align it.

    The clip is written to a temporary WAV only so that
    ``whisperx.load_audio`` can read it; transcription and alignment then
    work on the loaded samples.

    Returns:
        The dict produced by ``whisperx.align``, with a "text" entry added
        (the segment texts joined by spaces) when the aligner did not supply
        one. Callers read ``result.get("text")`` to compare against the
        expected aya; without this entry the comparison would always see an
        empty string.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav") as temp_file:
        # export() returns the open output file; close it so the data is
        # flushed before load_audio reads the path
        audio.export(temp_file.name, format="wav").close()
        audio_data = whisperx.load_audio(temp_file.name)
    
    # Transcribe
    result = model.transcribe(
        audio_data,
        batch_size=BATCH_SIZE,
        language="ar"
    )
    
    # Align
    aligned = whisperx.align(
        result["segments"],
        align_model,
        align_metadata,
        audio_data,
        DEVICE,
        return_char_alignments=False
    )
    
    # Provide the full transcript under "text" for callers that expect it
    if "text" not in aligned:
        aligned["text"] = " ".join(
            seg.get("text", "").strip()
            for seg in aligned.get("segments", [])
        ).strip()
    
    return aligned

def compare_transcriptions(expected: str, actual: str) -> float:
    """Return the 0.0-1.0 token-sort similarity of the two normalized texts."""
    expected_norm = normalize(expected)
    actual_norm = normalize(actual)
    percent = fuzz.token_sort_ratio(expected_norm, actual_norm)
    return percent / 100.0

def realign_aya_words(
    aya_words: list,
    whisper_result: dict,
    original_start_ms: int
) -> list:
    """
    Re-align aya words against a fresh WhisperX transcription of the segment.

    Word timestamps in ``whisper_result`` are relative to the extracted
    segment, so ``original_start_ms`` is added to make them absolute.
    Transcribed words that carry no "start"/"end" (the aligner could not
    place them) are ignored. Aya words are then paired with the remaining
    transcribed words by position; once those run out, each further aya word
    gets a 500 ms slot chained after the previous one (or starting at
    ``original_start_ms`` when nothing usable was transcribed) with
    confidence 0.30.

    Returns one record per aya word, each flagged ``"verified": True``.
    """
    whisper_words = []
    for seg in whisper_result.get("segments", []):
        for w in seg.get("words", []):
            # Words without timestamps cannot anchor anything; skip them
            # instead of failing on the missing keys.
            if "start" not in w or "end" not in w:
                continue
            # Per-word confidence is read from "score", falling back to
            # "probability", then to a default.
            word_conf = w.get("score", w.get("probability", 0.95))
            whisper_words.append({
                "word": w["word"].strip(),
                "start_ms": int(w["start"] * 1000) + original_start_ms,
                "end_ms": int(w["end"] * 1000) + original_start_ms,
                "confidence": round(word_conf, 4)
            })
    
    aligned = []
    
    for index, quran_word in enumerate(aya_words):
        if index < len(whisper_words):
            whisper_word = whisper_words[index]
            match_score = fuzz.ratio(normalize(quran_word), normalize(whisper_word["word"])) / 100.0
            # 70% text similarity + 30% aligner confidence
            combined_conf = (match_score * 0.7) + (whisper_word["confidence"] * 0.3)
            
            aligned.append({
                "word": quran_word,
                "start_ms": whisper_word["start_ms"],
                "end_ms": whisper_word["end_ms"],
                "confidence": round(combined_conf, 4),
                "matched": match_score > 0.6,
                "verified": True
            })
        else:
            # Transcription exhausted: estimate a 500 ms slot
            last_end = aligned[-1]["end_ms"] if aligned else original_start_ms
            aligned.append({
                "word": quran_word,
                "start_ms": last_end,
                "end_ms": last_end + 500,
                "confidence": 0.30,
                "matched": False,
                "verified": True
            })
    
    return aligned

# ============================================================================
# MAIN LOOP
# ============================================================================
# Run counters: files rewritten, ayas whose record changed, files with
# nothing to verify, and files that could not be processed at all.
verified_count = improved_count = skipped_count = failed_count = 0

for aligned_json in sorted(Path(".").rglob("*_aligned.json")):
    print(f"\nProcessing: {aligned_json.name}")
    
    # One unreadable file must not abort the whole run
    try:
        with open(aligned_json, encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f" ERROR loading: {e}")
        failed_count += 1
        continue
    
    # The metadata may lack the audio entry; count that as a failed file
    audio_name = data.get("metadata", {}).get("audio_file")
    if not audio_name:
        print(" ERROR: metadata has no 'audio_file' entry")
        failed_count += 1
        continue
    
    audio_path = aligned_json.parent / audio_name
    if not audio_path.exists():
        print(f" ‚ùå Audio not found")
        failed_count += 1
        continue
    
    ayas = data.get("ayas", [])
    low_conf_ayas = [a for a in ayas if a.get("confidence", 1.0) < CONFIDENCE_THRESHOLD]
    
    if not low_conf_ayas:
        print(" ‚úì All good")
        skipped_count += 1
        continue
    
    print(f" Verifying {len(low_conf_ayas)} ayas")
    changes_made = False
    
    # The dicts in low_conf_ayas are shared with `ayas`, so edits made here
    # are what gets saved below
    for aya in low_conf_ayas:
        aya_num = aya["aya_number"]
        aya_text = aya["text"]
        aya_words = aya_text.split()
        start_ms = aya["start_ms"]
        end_ms = aya["end_ms"]
        duration_ms = end_ms - start_ms
        old_conf = aya["confidence"]
        
        # Skip segments too short to verify
        if duration_ms < MIN_AYA_DURATION_MS:
            continue
        
        print(f" Aya {aya_num}: Conf={old_conf:.2%}, Dur={duration_ms/1000:.1f}s")
        
        try:
            audio_seg = extract_audio_segment(audio_path, start_ms, end_ms)
            whisper_result = transcribe_segment(audio_seg)
            actual_text = whisper_result.get("text", "").strip()
            
            similarity = compare_transcriptions(aya_text, actual_text)
            print(f" Match: {similarity:.2%}")
            
            if similarity < 0.5:
                # Major mismatch: rebuild the word timings from the fresh
                # transcription
                new_words = realign_aya_words(aya_words, whisper_result, start_ms)
                new_confs = [w["confidence"] for w in new_words]
                new_conf = sum(new_confs) / len(new_confs)
                
                # Keep both boundaries in step with the new words so
                # start_ms/end_ms still describe the first/last word
                aya["words"] = new_words
                aya["confidence"] = round(new_conf, 4)
                aya["start_ms"] = new_words[0]["start_ms"]
                aya["end_ms"] = new_words[-1]["end_ms"]
                aya["verification_status"] = "realigned"
                aya["similarity_before"] = round(similarity, 4)
                
                changes_made = True
                improved_count += 1
                print(f" ‚úì Realigned: {old_conf:.2%} ‚Üí {new_conf:.2%}")
            
            elif similarity < 0.8:
                # Moderate match: keep timings, raise confidence (cap 0.95)
                boost = similarity * 0.3
                new_conf = min(old_conf + boost, 0.95)
                
                aya["confidence"] = round(new_conf, 4)
                aya["verification_status"] = "adjusted"
                aya["similarity_score"] = round(similarity, 4)
                
                changes_made = True
                improved_count += 1
                print(f" ‚úì Adjusted: {old_conf:.2%} ‚Üí {new_conf:.2%}")
            
            else:
                # Good match: only record that the aya was verified
                aya["verification_status"] = "verified"
                aya["similarity_score"] = round(similarity, 4)
                changes_made = True
            
            torch.cuda.empty_cache()
            gc.collect()
        
        except Exception as e:
            print(f" ‚ùå Failed: {e}")
            aya["verification_status"] = "failed"
            aya["error"] = str(e)
            continue
    
    if changes_made:
        data["metadata"]["verification_version"] = "whisperx_quran"
        data["metadata"]["verified_ayas"] = sum(1 for a in ayas if "verification_status" in a)
        
        # Recompute the file-level aggregates from the updated ayas
        confs = [a["confidence"] for a in ayas]
        data["metadata"]["average_confidence"] = round(sum(confs) / len(confs), 4)
        
        data["low_confidence_ayas"] = [a["aya_number"] for a in ayas if a.get("confidence", 1.0) < CONFIDENCE_THRESHOLD]
        data["metadata"]["low_confidence_count"] = len(data["low_confidence_ayas"])
        
        with open(aligned_json, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        
        verified_count += 1
        print(" ‚úì Saved")

# ============================================================================
# SUMMARY
# ============================================================================
# Print the run totals on one line, then free both models and the GPU cache.
rule = "=" * 80
totals = (
    f"Verified: {verified_count} | Improved: {improved_count} | "
    f"Skipped: {skipped_count} | Failed: {failed_count}"
)
print("\n" + rule)
print(totals)
print(rule)

# Cleanup
del model, align_model
torch.cuda.empty_cache()
gc.collect()
print("‚úì GPU cleared")

In [None]:
# CELL 7 — Generate HTML Viewer

from pathlib import Path
import json

# Local to this cell: file names and word text are untrusted as far as HTML is
# concerned, so they are escaped before being placed in the page.
from html import escape as html_escape

# Extension -> MIME type for the <source> element. The registered type is not
# always "audio/<extension>" (".mp3" is "audio/mpeg", ".m4a" is "audio/mp4"),
# and browsers may skip a <source> whose type they do not recognise.
AUDIO_MIME_TYPES = {
    '.mp3': 'audio/mpeg',
    '.wav': 'audio/wav',
    '.m4a': 'audio/mp4',
}

# Static tail of every viewer page: closes the aya list and adds the script for
# click-to-seek and playback highlighting. It contains no per-file data, so it
# is built once instead of once per file.
VIEWER_FOOTER = """        </div>
    </div>
    <script>
        const audio = document.getElementById('audio');
        const ayas = document.querySelectorAll('.aya');
        
        // Click aya to play
        ayas.forEach(aya => {
            aya.addEventListener('click', () => {
                const start = parseInt(aya.dataset.start);
                audio.currentTime = start / 1000;
                audio.play();
            });
        });
        
        // Highlight during playback
        audio.addEventListener('timeupdate', () => {
            const currentMs = audio.currentTime * 1000;
            
            ayas.forEach(aya => {
                const start = parseInt(aya.dataset.start);
                const end = parseInt(aya.dataset.end);
                
                if (currentMs >= start && currentMs <= end) {
                    aya.classList.add('active');
                    
                    // Highlight words
                    const words = aya.querySelectorAll('.word');
                    words.forEach(word => {
                        const wStart = parseInt(word.dataset.start);
                        const wEnd = parseInt(word.dataset.end);
                        
                        if (currentMs >= wStart && currentMs <= wEnd) {
                            word.classList.add('highlight');
                        } else {
                            word.classList.remove('highlight');
                        }
                    });
                } else {
                    aya.classList.remove('active');
                    aya.querySelectorAll('.word').forEach(w => w.classList.remove('highlight'));
                }
            });
        });
    </script>
</body>
</html>"""


def find_audio_file(folder, base):
    """Return the audio file in *folder* named ``<base>.<ext>``, or None.

    Only extensions listed in AUDIO_MIME_TYPES count (compared
    case-insensitively). Every sibling is inspected, so an unrelated file such
    as ``<base>.txt`` can no longer hide the audio file, and the candidates are
    sorted so the choice is the same on every run.
    """
    prefix = f"{base}."
    candidates = sorted(
        p for p in folder.iterdir()
        if p.is_file()
        and p.name.startswith(prefix)
        and p.suffix.lower() in AUDIO_MIME_TYPES
    )
    return candidates[0] if candidates else None


def render_viewer_html(base, audio_file, ayas):
    """Build the complete viewer page for one surah and return it as a string.

    base       -- file stem without "_aligned"; used for the title and heading
    audio_file -- Path of the audio file; only its name and suffix are used
    ayas       -- iterable of dicts with "aya_number", "start_ms", "end_ms" and
                  "words" (each word a dict with "word", "start_ms", "end_ms")

    Every interpolated value is HTML-escaped. Raises KeyError if an aya or a
    word lacks one of the keys listed above.
    """
    page_title = html_escape(base)
    heading = html_escape(base.replace('_', ' ').title())
    audio_src = html_escape(audio_file.name)
    suffix = audio_file.suffix.lower()
    # Unknown extensions keep the former "audio/<extension>" guess.
    audio_type = html_escape(AUDIO_MIME_TYPES.get(suffix, f"audio/{suffix[1:]}"))

    parts = [f"""<!DOCTYPE html>
<html dir="rtl" lang="ar">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{page_title}</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{
            font-family: 'Amiri', 'Scheherazade', serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 2rem;
        }}
        .container {{
            max-width: 900px;
            margin: 0 auto;
            background: white;
            border-radius: 16px;
            box-shadow: 0 20px 60px rgba(0,0,0,0.3);
            overflow: hidden;
        }}
        .header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 2rem;
            text-align: center;
        }}
        .header h1 {{ font-size: 2rem; margin-bottom: 0.5rem; }}
        .audio-player {{
            padding: 1.5rem;
            background: #f8f9fa;
            border-bottom: 1px solid #e9ecef;
        }}
        audio {{
            width: 100%;
            height: 40px;
        }}
        .ayas {{
            padding: 2rem;
        }}
        .aya {{
            font-size: 1.8rem;
            line-height: 3rem;
            margin-bottom: 2rem;
            padding: 1.5rem;
            border-radius: 12px;
            transition: all 0.3s ease;
            cursor: pointer;
        }}
        .aya:hover {{
            background: #f8f9fa;
        }}
        .aya.active {{
            background: #667eea;
            color: white;
            box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
        }}
        .word {{
            display: inline-block;
            padding: 0.2rem 0.4rem;
            margin: 0 0.2rem;
            border-radius: 6px;
            transition: all 0.2s ease;
        }}
        .aya.active .word.highlight {{
            background: rgba(255, 255, 255, 0.3);
            transform: scale(1.05);
        }}
        .aya-number {{
            display: inline-block;
            width: 2rem;
            height: 2rem;
            line-height: 2rem;
            text-align: center;
            background: #667eea;
            color: white;
            border-radius: 50%;
            font-size: 1rem;
            margin-left: 0.5rem;
        }}
        .aya.active .aya-number {{
            background: white;
            color: #667eea;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>{heading}</h1>
        </div>
        <div class="audio-player">
            <audio id="audio" controls>
                <source src="{audio_src}" type="{audio_type}">
            </audio>
        </div>
        <div class="ayas">
"""]

    for aya in ayas:
        # One <span> per word; the data-* attributes drive the highlighting script.
        words_html = "".join(
            '<span class="word" data-start="{}" data-end="{}">{}</span>'.format(
                html_escape(str(w["start_ms"])),
                html_escape(str(w["end_ms"])),
                html_escape(str(w["word"])),
            )
            for w in aya["words"]
        )
        aya_start = html_escape(str(aya["start_ms"]))
        aya_end = html_escape(str(aya["end_ms"]))
        aya_number = html_escape(str(aya["aya_number"]))
        parts.append(
            f'            <div class="aya" data-start="{aya_start}" data-end="{aya_end}">\n'
            f'                <span class="aya-number">{aya_number}</span>\n'
            f'                {words_html}\n'
            f'            </div>\n'
        )

    parts.append(VIEWER_FOOTER)
    return "".join(parts)


# Write one "<name>_aligned.html" next to every "<name>_aligned.json" that has
# a matching audio file in the same folder; files without audio are skipped.
for aligned_json in Path(".").rglob("*_aligned.json"):
    base = aligned_json.stem.replace("_aligned", "")
    audio_file = find_audio_file(aligned_json.parent, base)

    if audio_file is None:
        continue

    with open(aligned_json, encoding='utf-8') as f:
        data = json.load(f)

    html_file = aligned_json.with_suffix('.html')
    with open(html_file, 'w', encoding='utf-8') as f:
        # A file without an "ayas" key yields an empty viewer instead of
        # aborting the whole run (same tolerance as the API and dashboard cells).
        f.write(render_viewer_html(base, audio_file, data.get('ayas', [])))

    print(f"‚úì {html_file.name}")

print("\n‚úÖ HTML viewers created!")

In [None]:
# deleted_count = 0
# for delete_json in Path(".").rglob("*_DELETE.json"):
#     delete_json.unlink()
#     print(f"‚úì Deleted {delete_json.name}")
#     deleted_count += 1

# print(f"\n‚úì Removed {deleted_count} temporary _DELETE.json files")
# print('='*70)

In [None]:
# CELL 8 ‚Äî FastAPI Backend for Quran Alignment Data

import json
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pathlib import Path
from typing import List, Optional

# Application object; the route functions below register themselves on it.
app = FastAPI(title="Quran Alignment API", version="1.0.0")

# Add CORS middleware
# Any origin may call the API. Credentials are switched off because a wildcard
# origin must not be combined with credentialed (cookie / Authorization)
# requests, and none of the endpoints in this cell use cookies or authentication.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

class Word(BaseModel):
    """Schema for one word entry: its text and its start/end times.

    NOTE(review): no endpoint visible in this cell references this model
    (for example through ``response_model``) -- confirm whether it is used.
    """
    word: str  # text of the word
    start_ms: int  # start time; milliseconds, going by the field name
    end_ms: int  # end time; milliseconds, going by the field name

class Aya(BaseModel):
    """Schema for one aya: its number, text, word entries and time span.

    NOTE(review): no endpoint visible in this cell references this model
    (for example through ``response_model``) -- confirm whether it is used.
    """
    aya_number: int  # number of the aya
    text: str  # text of the aya
    words: List[Word]  # word entries belonging to this aya
    start_ms: int  # start time; milliseconds, going by the field name
    end_ms: int  # end time; milliseconds, going by the field name

class QuranAlignment(BaseModel):
    """Schema for a whole alignment document: a list of Aya entries.

    NOTE(review): the JSON files also carry other top-level keys elsewhere in
    this notebook (e.g. "metadata"); this model declares only "ayas".
    """
    ayas: List[Aya]  # the aya entries of the document

# In-memory store of every aligned file, keyed by the file stem without the
# "_aligned" part. Filled by load_aligned_files() and read by the endpoints.
aligned_data = {}

def load_aligned_files():
    """(Re)load every ``*_aligned.json`` below the current directory into ``aligned_data``.

    Files are read in sorted path order. A file that cannot be read or parsed
    is reported and skipped, so one bad file does not stop the others from
    loading. When two files share the same stem the later one wins, as before,
    but the collision is now reported instead of passing silently.

    The shared dict is updated in place only after all files have been read:
    existing references to ``aligned_data`` stay valid and never observe a
    half-loaded state.
    """
    loaded = {}
    for json_file in sorted(Path(".").rglob("*_aligned.json")):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Error loading {json_file}: {e}")
            continue

        surah_name = json_file.stem.replace('_aligned', '')
        if surah_name in loaded:
            print(f"Warning: {json_file} replaces an earlier file also named '{surah_name}'")
        loaded[surah_name] = data

    aligned_data.clear()
    aligned_data.update(loaded)

@app.on_event('startup')
async def startup_event():
    """Startup hook: fill ``aligned_data`` by calling load_aligned_files()."""
    load_aligned_files()

@app.get("/")
async def root():
    """Landing endpoint: return a short hint pointing clients at /surahs."""
    greeting = "Quran Alignment API - Use /surahs to list available surahs"
    return {"message": greeting}

@app.get("/surahs")
async def get_surahs():
    """Return the names of all loaded surahs under the "surahs" key."""
    names = [*aligned_data]
    return {"surahs": names}

@app.get("/surah/{surah_name}")
async def get_surah(surah_name: str):
    """Return the stored alignment document for *surah_name*.

    Responds with HTTP 404 ("Surah not found") when no such surah is loaded.
    """
    try:
        return aligned_data[surah_name]
    except KeyError:
        raise HTTPException(status_code=404, detail="Surah not found") from None

@app.get("/surah/{surah_name}/aya/{aya_number}")
async def get_aya(surah_name: str, aya_number: int):
    """Return one aya of a loaded surah, selected by its ``aya_number`` field.

    Responds with HTTP 404 when the surah is not loaded ("Surah not found") or
    when it has no such aya ("Aya not found").
    """
    if surah_name not in aligned_data:
        raise HTTPException(status_code=404, detail="Surah not found")

    ayas = aligned_data[surah_name].get("ayas", [])

    # Match on the stored number rather than the list position, so a file that
    # covers only part of a surah (or lists ayas out of order) still returns
    # the aya that was actually asked for.
    for aya in ayas:
        if isinstance(aya, dict) and aya.get("aya_number") == aya_number:
            return aya

    # Files whose entries carry no "aya_number" at all keep the former
    # positional behaviour (1-based index into the list).
    has_numbers = any(isinstance(a, dict) and "aya_number" in a for a in ayas)
    if not has_numbers and 1 <= aya_number <= len(ayas):
        return ayas[aya_number - 1]

    raise HTTPException(status_code=404, detail="Aya not found")

# Run with: uvicorn.run(app, host="127.0.0.1", port=8000)
# (reload=True only works when the app is passed as an import string such as "module:app", not as an object)

In [None]:
# CELL 9 ‚Äî Beautiful Glassmorphism Dashboard with Streamlit

import streamlit as st
import json
from pathlib import Path

# Page-level settings: browser-tab title and icon, wide layout
_PAGE_SETTINGS = {
    "page_title": " Quran Alignment Dashboard",
    "page_icon": "üìñ",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet for the glassmorphism look: card containers, timestamp chips,
# the Amiri web font and right-to-left Arabic text
_DASHBOARD_CSS = """
<style>
    @import url('https://fonts.googleapis.com/css2?family=Amiri:wght@400;700&display=swap');
    
    .glass-card {
        background: rgba(255, 255, 255, 0.15);
        backdrop-filter: blur(10px);
        border-radius: 16px;
        border: 1px solid rgba(255, 255, 255, 0.18);
        padding: 20px;
        margin: 10px 0;
        box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37);
    }
    
    .aya-card {
        background: rgba(255, 255, 255, 0.1);
        border-radius: 12px;
        padding: 15px;
        margin: 10px 0;
        border: 1px solid rgba(255, 255, 255, 0.1);
    }
    
    .word-timestamp {
        background: rgba(102, 126, 234, 0.2);
        border-radius: 6px;
        padding: 2px 6px;
        margin: 0 2px;
        font-size: 0.8em;
    }
    
    body {
        font-family: 'Amiri', 'Scheherazade', serif !important;
    }
    
    .arabic-text {
        font-size: 1.8rem !important;
        line-height: 2.5rem !important;
        text-align: right !important;
        direction: rtl !important;
    }
</style>
"""
st.markdown(_DASHBOARD_CSS, unsafe_allow_html=True)

# Centered page heading
_PAGE_HEADING = "<h1 style='text-align: center; color: white;'>üìñ Quran Alignment Dashboard</h1>"
st.markdown(_PAGE_HEADING, unsafe_allow_html=True)

# Load all aligned files (sorted, so the selector order is the same on every rerun)
aligned_files = sorted(Path(".").rglob("*_aligned.json"))

if not aligned_files:
    st.error("‚ùå No aligned Quran files found! Run Cell 6 first.")
    st.stop()

# Sidebar for selection
with st.sidebar:
    st.markdown('<div class="glass-card">', unsafe_allow_html=True)
    st.header("üìã Surah Selection")

    # Map display names to file paths. Files in different folders can produce
    # the same display name; such a name gets the file's path appended so every
    # file stays selectable instead of silently replacing an earlier entry.
    surah_options = {}
    for f in aligned_files:
        display_name = f.stem.replace('_aligned', '').replace('_', ' ').title()
        if display_name in surah_options:
            display_name = f"{display_name} ({f.as_posix()})"
        surah_options[display_name] = f

    selected_surah = st.selectbox(
        "Choose a Surah:",
        options=list(surah_options),
    )

    selected_file = surah_options[selected_surah]
    st.success(f"Selected: {selected_file.name}")
    st.markdown('</div>', unsafe_allow_html=True)

# Local to this cell: text taken from file names and JSON is escaped before it
# is placed inside markup rendered with unsafe_allow_html=True.
from html import escape as html_escape

# Main content
col1, col2 = st.columns([2, 1])

with col1:
    st.markdown(
        f'<div class="glass-card"><h2>üìñ {html_escape(selected_surah)}</h2></div>',
        unsafe_allow_html=True,
    )

    # Load and display the selected surah
    with open(selected_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    ayas = data.get('ayas', [])

    for aya in ayas:
        with st.container():
            aya_label = html_escape(str(aya['aya_number']))
            aya_text = html_escape(str(aya['text']))
            aya_start_s = aya['start_ms'] // 1000
            aya_end_s = aya['end_ms'] // 1000
            st.markdown(f'''
            <div class="aya-card">
                <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
                    <span style="background: rgba(102, 126, 234, 0.3); padding: 4px 12px; border-radius: 20px; font-weight: bold;">
                        ÿ¢Ÿäÿ© {aya_label}
                    </span>
                    <span class="word-timestamp">
                        {aya_start_s}s - {aya_end_s}s
                    </span>
                </div>
                <div class="arabic-text">
                    {aya_text}
                </div>
            </div>
            ''', unsafe_allow_html=True)

            # Show word-by-word breakdown.
            # st.expander must be entered with `with`: the object it returns is
            # always truthy, so `if st.expander(...)` drew an empty expander and
            # rendered the word list outside of it.
            with st.expander(f"üîç Words for Aya {aya['aya_number']}"):
                word_chips = []
                for word_data in aya['words']:
                    word_text = html_escape(str(word_data['word']))
                    word_start_s = word_data['start_ms'] // 1000
                    word_end_s = word_data['end_ms'] // 1000
                    word_chips.append(f'''
                    <span class="word-timestamp" style="margin: 2px;">
                        {word_text} ({word_start_s}s-{word_end_s}s)
                    </span>
                    ''')
                words_html = "".join(word_chips)
                st.markdown(f'<div style="direction: rtl; text-align: right;">{words_html}</div>', unsafe_allow_html=True)

# Right-hand column: summary numbers for the selected surah, then the list of
# every aligned file that was found
with col2:
    st.markdown('<div class="glass-card"><h3>üìä Statistics</h3></div>', unsafe_allow_html=True)

    total_ayas = len(ayas)
    total_words = 0
    duration_ms = 0
    for entry in ayas:
        total_words += len(entry['words'])
        duration_ms += entry['end_ms'] - entry['start_ms']
    total_duration = duration_ms / 1000  # in seconds

    duration_label = f"{total_duration/60:.1f} min"
    for label, value in (("ayas", total_ayas), ("Words", total_words), ("Duration", duration_label)):
        st.metric(label, value)

    st.markdown('<div class="glass-card"><h3>üìÅ Available Surahs</h3></div>', unsafe_allow_html=True)
    for f in aligned_files:
        stem = f.stem.replace('_aligned', '')
        name = stem.replace('_', ' ').title()
        st.write(f"üìÑ {name}")

# Page footer: a divider followed by a centered caption
for footer_html in (
    "<hr style='margin: 30px 0;'>",
    "<p style='text-align: center; color: rgba(255,255,255,0.6);'>‚ú® Quran Alignment Dashboard ‚ú®</p>",
):
    st.markdown(footer_html, unsafe_allow_html=True)