In [16]:
# BLOCK 1: AUTO-DETECT FILES + GPU CHECK
import os
import json
from pathlib import Path
from rapidfuzz import fuzz
import re
import torch

print("="*60)
print("QURAN WORD-BY-WORD ALIGNMENT")
print("="*60)

# GPU Check
print("\n=== SYSTEM CHECK ===")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_mem = torch.cuda.get_device_properties(0).total_memory / 1024**3
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    free = total_mem - reserved
    
    print(f"Total GPU Memory: {total_mem:.2f}GB")
    print(f"Currently Used: {allocated:.2f}GB")
    print(f"Reserved: {reserved:.2f}GB")
    print(f"Free: {free:.2f}GB")
    
    if free < 4.0:
        print("‚ö† WARNING: Less than 4GB free! May need CPU mode.")
    else:
        print("‚úì Sufficient GPU memory available")
    
    # Clear any existing allocations
    torch.cuda.empty_cache()
    print("Cleared CUDA cache")
else:
    print("No GPU detected - will use CPU")

# Find files
print("\n=== FILE DETECTION ===")
audio_files = list(Path(".").glob("*.mp3")) + list(Path(".").glob("*.wav"))
if not audio_files:
    raise FileNotFoundError("No audio file found!")
audio_file = str(audio_files[0])

text_files = list(Path(".").glob("*.txt"))
if not text_files:
    raise FileNotFoundError("No text file found!")
text_file = str(text_files[0])

base_name = Path(audio_file).stem
output_json = f"{base_name}_timings.json"
whisper_raw = f"{base_name}_whisper_raw.json"

print(f"‚úì Audio: {audio_file}")
print(f"‚úì Text: {text_file}")
print(f"\nWill create:")
print(f"  - {whisper_raw}")
print(f"  - {output_json}")

# Check audio file size
audio_size = os.path.getsize(audio_file) / 1024**2
print(f"\nAudio file size: {audio_size:.2f}MB")

# Estimate processing requirements
import librosa
y, sr = librosa.load(audio_file, sr=16000, duration=1)  # Just load 1s to check
print(f"Audio sample rate: {sr}Hz")
del y  # Clean up

print("\n" + "="*60)

QURAN WORD-BY-WORD ALIGNMENT

=== SYSTEM CHECK ===
GPU: NVIDIA GeForce RTX 4080
Total GPU Memory: 15.99GB
Currently Used: 0.00GB
Reserved: 0.00GB
Free: 15.99GB
‚úì Sufficient GPU memory available
Cleared CUDA cache

=== FILE DETECTION ===
‚úì Audio: bakara.mp3
‚úì Text: baqara.txt

Will create:
  - bakara_whisper_raw.json
  - bakara_timings.json

Audio file size: 3.10MB
Audio sample rate: 16000Hz



In [17]:
# BLOCK 2: WHISPER TRANSCRIPTION WITH PROGRESS & GPU MONITORING
print("\n" + "="*60)
print("STEP 1: WHISPER TRANSCRIPTION")
print("="*60)

from faster_whisper import WhisperModel
from tqdm import tqdm
import torch

# GPU Monitoring function
def print_gpu_usage():
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.memory_allocated() / 1024**3
        gpu_max = torch.cuda.max_memory_allocated() / 1024**3
        gpu_reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"  GPU Memory: {gpu_mem:.2f}GB used | {gpu_max:.2f}GB peak | {gpu_reserved:.2f}GB reserved")
    else:
        print("  CPU mode (no GPU)")

if os.path.exists(whisper_raw):
    print(f"\n‚úì Using existing (APPROVED): {whisper_raw}")
    print("   (Delete this file to re-transcribe)")
    with open(whisper_raw, 'r', encoding='utf-8') as f:
        whisper_words = json.load(f)
else:
    print(f"\n‚ö†Ô∏è  {whisper_raw} not found - will transcribe")
    print("\nLoading Whisper model...")
    
    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("Cleared CUDA cache")
    
    print_gpu_usage()
    
    # Use smaller model or CPU if GPU memory is limited
    try:
        model = WhisperModel("large-v2", device="cuda", compute_type="float16")
        print("Loaded large-v2 model on GPU")
    except Exception as e:
        print(f"GPU failed ({e}), falling back to CPU...")
        model = WhisperModel("large-v2", device="cpu", compute_type="int8")
    
    print_gpu_usage()
    
    print(f"\nTranscribing: {audio_file}")
    print("This may take several minutes...")
    
    # Get audio duration for progress bar
    import librosa
    y, sr = librosa.load(audio_file, sr=16000)
    duration = len(y) / sr
    print(f"Audio duration: {duration:.1f}s")
    
    # Transcribe with progress tracking
    segments_generator, info = model.transcribe(
        audio_file,
        language="ar",
        word_timestamps=True,
        beam_size=5,
        best_of=5,
        temperature=0.0
    )
    
    print(f"Detected language: {info.language} (probability: {info.language_probability:.2f})")
    
    # Process segments with progress bar
    whisper_words = []
    segment_count = 0
    
    with tqdm(total=int(duration), desc="Transcribing", unit="s") as pbar:
        for segment in segments_generator:
            segment_count += 1
            for word in segment.words:
                whisper_words.append({
                    "word": word.word.strip(),
                    "start_ms": int(word.start * 1000),
                    "end_ms": int(word.end * 1000)
                })
            
            # Update progress bar
            pbar.update(int(segment.end - segment.start))
            
            # Show GPU usage every 10 segments
            if segment_count % 10 == 0:
                print_gpu_usage()
    
    print(f"\nProcessed {segment_count} segments, found {len(whisper_words)} words")
    print_gpu_usage()
    
    # Save
    with open(whisper_raw, 'w', encoding='utf-8') as f:
        json.dump(whisper_words, f, ensure_ascii=False, indent=2)
    print(f"‚úì Saved: {whisper_raw}")
    
    # Clear GPU memory after transcription
    if torch.cuda.is_available():
        del model
        torch.cuda.empty_cache()
        print("\nCleared GPU memory")
        print_gpu_usage()

print(f"\nüìä Whisper: {len(whisper_words)} words")


STEP 1: WHISPER TRANSCRIPTION

‚úì Using existing (APPROVED): bakara_whisper_raw.json
   (Delete this file to re-transcribe)

üìä Whisper: 150 words


In [18]:
# BLOCK 3: ANCHOR-BASED ALIGNMENT
print("\n" + "="*60)
print("STEP 2: ANCHOR-BASED ALIGNMENT")
print("="*60)

def normalize_arabic(text):
    """Strip everything except base letters"""
    text = re.sub(r'[^\u0621-\u063A\u0641-\u064A]', '', text)
    text = text.replace('ÿ£', 'ÿß').replace('ÿ•', 'ÿß').replace('ÿ¢', 'ÿß').replace('Ÿ±', 'ÿß')
    text = text.replace('ÿ©', 'Ÿá').replace('Ÿâ', 'Ÿä').replace('ÿ§', 'Ÿà').replace('ÿ¶', 'Ÿä')
    return text

with open(text_file, 'r', encoding='utf-8') as f:
    quran_text = f.read()

quran_text = re.sub(r'\s*\(\d+\)\s*', ' ', quran_text)
all_quran_words = quran_text.split()
word_limit = len(whisper_words) + 50
quran_words = all_quran_words[:word_limit]

whisper_normalized = [normalize_arabic(w['word']) for w in whisper_words]
quran_normalized = [normalize_arabic(w) for w in quran_words]

# Skip Basmala
whisper_start = 0
basmala_norm = [normalize_arabic(w) for w in ['ÿ®ÿ≥ŸÖ', 'ÿßŸÑŸÑŸá', 'ÿßŸÑÿ±ÿ≠ŸÖŸÜ', 'ÿßŸÑÿ±ÿ≠ŸäŸÖ']]
if len(whisper_words) >= 4:
    if [whisper_normalized[i] for i in range(4)] == basmala_norm:
        if normalize_arabic(quran_words[0]) != basmala_norm[0]:
            whisper_start = 4
            print("‚úì Skipped Basmala")

print(f"\nQuran: {len(quran_words)} words | Whisper: {len(whisper_words)} words")

# STEP 1: Find ALL possible matches with scores
print("\nüîç Finding all possible matches...")
matches = []  # (q_idx, w_idx, num_combined, score)

for q_idx in range(len(quran_words)):
    q_norm = quran_normalized[q_idx]
    best_for_this_q = None
    
    for w_idx in range(whisper_start, len(whisper_words)):
        for num_combine in range(1, min(7, len(whisper_words) - w_idx + 1)):
            combined = "".join(whisper_normalized[w_idx:w_idx + num_combine])
            score = fuzz.ratio(q_norm, combined)
            
            if not best_for_this_q or score > best_for_this_q[3]:
                best_for_this_q = (q_idx, w_idx, num_combine, score)
    
    if best_for_this_q:
        matches.append(best_for_this_q)

# STEP 2: Find ANCHORS (high confidence matches)
print("\n‚öì Finding anchor points...")
anchors = []
for q_idx, w_idx, num_combine, score in matches:
    if score >= 80:  # High confidence
        anchors.append((q_idx, w_idx, num_combine, score))

anchors.sort(key=lambda x: x[0])  # Sort by Quran index
print(f"Found {len(anchors)} anchor points (‚â•80% confidence)")

# Show anchors
for i in range(min(5, len(anchors))):
    q_idx, w_idx, num_combine, score = anchors[i]
    print(f"  Anchor {i}: Q[{q_idx}]='{quran_words[q_idx]}' ‚Üî W[{w_idx}] (score: {score:.0f}%)")

# STEP 3: Interpolate between anchors
print("\nüîó Interpolating between anchors...")
aligned = []

if not anchors:
    print("‚ùå No anchors found! Using best guesses...")
    # Fallback: use all matches
    for q_idx, w_idx, num_combine, score in matches:
        if w_idx + num_combine - 1 < len(whisper_words):
            aligned.append({
                "word": quran_words[q_idx],
                "start_ms": whisper_words[w_idx]["start_ms"],
                "end_ms": whisper_words[w_idx + num_combine - 1]["end_ms"]
            })
else:
    # Add start anchor (beginning)
    anchors.insert(0, (0, whisper_start, 1, 100))
    # Add end anchor
    anchors.append((len(quran_words), len(whisper_words), 1, 100))
    
    # Process each segment between anchors
    for i in range(len(anchors) - 1):
        anchor1_q, anchor1_w, _, _ = anchors[i]
        anchor2_q, anchor2_w, _, _ = anchors[i + 1]
        
        quran_gap = anchor2_q - anchor1_q
        whisper_gap = anchor2_w - anchor1_w
        
        print(f"  Segment {i}: Q[{anchor1_q}‚Üí{anchor2_q}] ‚Üî W[{anchor1_w}‚Üí{anchor2_w}]")
        
        if quran_gap == 0:
            continue
        
        # Proportionally distribute Whisper words
        for j in range(quran_gap):
            q_idx = anchor1_q + j
            if q_idx >= len(quran_words):
                break
            
            # Calculate proportional Whisper position
            ratio = j / quran_gap if quran_gap > 0 else 0
            w_start_idx = anchor1_w + int(ratio * whisper_gap)
            w_end_idx = anchor1_w + int((j + 1) / quran_gap * whisper_gap)
            
            # Ensure valid indices
            w_start_idx = max(whisper_start, min(w_start_idx, len(whisper_words) - 1))
            w_end_idx = max(w_start_idx, min(w_end_idx, len(whisper_words) - 1))
            
            if w_start_idx < len(whisper_words) and w_end_idx < len(whisper_words):
                aligned.append({
                    "word": quran_words[q_idx],
                    "start_ms": whisper_words[w_start_idx]["start_ms"],
                    "end_ms": whisper_words[w_end_idx]["end_ms"]
                })

print(f"\n‚úÖ Aligned {len(aligned)} words")

# Show samples
print("\nüìä Sample alignments:")
for i in [0, 1, 2, len(aligned)//4, len(aligned)//2, len(aligned)*3//4, len(aligned)-2, len(aligned)-1]:
    if 0 <= i < len(aligned):
        w = aligned[i]
        dur = w['end_ms'] - w['start_ms']
        print(f"  [{i}] '{w['word']}' {w['start_ms']}ms to {w['end_ms']}ms ({dur}ms)")


STEP 2: ANCHOR-BASED ALIGNMENT
‚úì Skipped Basmala

Quran: 200 words | Whisper: 150 words

üîç Finding all possible matches...

‚öì Finding anchor points...
Found 163 anchor points (‚â•80% confidence)
  Anchor 0: Q[0]='ÿßŸÑŸìŸÖŸì' ‚Üî W[47] (score: 86%)
  Anchor 1: Q[1]='ÿ∞ŸéŸ∞ŸÑŸêŸÉŸé' ‚Üî W[6] (score: 100%)
  Anchor 2: Q[2]='Ÿ±ŸÑ€°ŸÉŸêÿ™ŸéŸ∞ÿ®Ÿè' ‚Üî W[7] (score: 80%)
  Anchor 3: Q[3]='ŸÑŸéÿß' ‚Üî W[8] (score: 100%)
  Anchor 4: Q[4]='ÿ±ŸéŸä€°ÿ®Ÿé€õ' ‚Üî W[9] (score: 100%)

üîó Interpolating between anchors...
  Segment 0: Q[0‚Üí0] ‚Üî W[4‚Üí47]
  Segment 1: Q[0‚Üí1] ‚Üî W[47‚Üí6]
  Segment 2: Q[1‚Üí2] ‚Üî W[6‚Üí7]
  Segment 3: Q[2‚Üí3] ‚Üî W[7‚Üí8]
  Segment 4: Q[3‚Üí4] ‚Üî W[8‚Üí9]
  Segment 5: Q[4‚Üí5] ‚Üî W[9‚Üí10]
  Segment 6: Q[5‚Üí6] ‚Üî W[10‚Üí11]
  Segment 7: Q[6‚Üí7] ‚Üî W[11‚Üí12]
  Segment 8: Q[7‚Üí8] ‚Üî W[12‚Üí13]
  Segment 9: Q[8‚Üí9] ‚Üî W[13‚Üí14]
  Segment 10: Q[9‚Üí10] ‚Üî W[14‚Üí15]
  Segment 11: Q[10‚Üí11] ‚Üî W[15‚Üí16]
  Segment 12: Q[11‚Üí13] ‚Üî W[16‚Üí18]


In [19]:
# BLOCK 4: SAVE OUTPUT
print("\n" + "="*60)
print("STEP 3: SAVING")
print("="*60)

with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(aligned, f, ensure_ascii=False, indent=2)

print(f"\nSaved: {output_json}")
print(f"\nDONE! Use in HTML:")
print(f"  1. Upload audio: {audio_file}")
print(f"  2. Upload JSON: {output_json}")
print(f"  3. Upload text: {text_file}")


STEP 3: SAVING

Saved: bakara_timings.json

DONE! Use in HTML:
  1. Upload audio: bakara.mp3
  2. Upload JSON: bakara_timings.json
  3. Upload text: baqara.txt
