In [None]:
# remove csv files from tafsir folder
import os
from pathlib import Path

base_folder = Path('tafsir')
for auth_folder in base_folder.iterdir():
    for csv_file in auth_folder.glob("*.csv"):
        os.remove(csv_file)

In [1]:
# Extract tafsir text from json file and save it to a text file
import os
import json

base_dir = "tafsir"

for author in os.listdir(base_dir):
    author_dir = os.path.join(base_dir, author)
    if not os.path.isdir(author_dir):
        continue
    for filename in os.listdir(author_dir):
        if filename.endswith(".json"):
            surah_number = filename.replace(".json", "")
            json_path = os.path.join(author_dir, filename)
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            for item in data:
                ayah_number = item["ayah_number"]
                txt_path = os.path.join(author_dir, f"{surah_number}_{ayah_number}.txt")
                tafsir_text = item.pop("tafsir_text", "")
                if tafsir_text:
                    with open(txt_path, "w", encoding="utf-8") as txt_file:
                        txt_file.write(tafsir_text)
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(data, f, ensure_ascii=False, indent=2)

In [None]:
# check if tafsir text has starting index and ending index then remove it
import os

base_dir = "tafsir"

for author in os.listdir(base_dir):
    author_dir = os.path.join(base_dir, author)
    if not os.path.isdir(author_dir):
        continue
    for filename in os.listdir(author_dir):
        if filename.endswith(".txt"):
            txt_path = os.path.join(author_dir, filename)
            with open(txt_path, "r", encoding="utf-8") as f:
                content = f.read()
            if content.find("﴿") != -1:
                print(f"Processing {txt_path}")
                # remove everything before the first occurrence of "﴿"
                content = content[content.find("﴿"):]
            with open(txt_path, "w", encoding="utf-8") as f:
                f.write(content)

In [22]:
# check if tafsir text in json file has starting index and ending index then remove it
import os
import json

base_dir = "tafsir"

for author in os.listdir(base_dir):
    author_dir = os.path.join(base_dir, author)
    if not os.path.isdir(author_dir):
        continue
    for filename in os.listdir(author_dir):
        if filename.endswith(".json"):
            json_path = os.path.join(author_dir, filename)
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)
            if isinstance(json_content, list):
                for item in json_content:
                    tafsir_text = item.get("tafsir_text", "")
                    if tafsir_text:
                        if tafsir_text.find("﴿") != -1:
                            # remove everything before the first occurrence of "﴿"
                            tafsir_text = tafsir_text[tafsir_text.find("﴿"):]
                        item["tafsir_text"] = tafsir_text
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(json_content, f, ensure_ascii=False, indent=2)

In [None]:
# split long text by words then create chunks of acceptable characters per api request
with open("000 - INTRODUCTION TO AL-QURAN-01_chunk_01.txt", "r", encoding="utf-8") as f:
    text = f.read()

words = text.split()
print(f"Total characters: {len(text)}")
print(f"Total words: {len(words)}")

sentences = []
i = 0
while i < len(words):
    sentence = []
    while i < len(words) and len(" ".join(sentence + words[i:i+1])) <= 3000:
        sentence.append(words[i])
        i += 1
    sentences.append(" ".join(sentence))

print(f"Total sentences: {len(sentences)}")

for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence[:50]}... ({len(sentence)} characters)")


In [3]:
# Translate Urdu Lectures to English (single file)
from translate import TafsirTranslator

translator = TafsirTranslator()
result = translator.translate_file("surah1-lecture.txt", "surah1-lecture_en.txt", 'ur')

2025-07-18 15:18:24,434 - INFO - Multi-language Tafsir Translator initialized
2025-07-18 15:18:24,467 - INFO - Read 19473 characters from surah1-lecture.txt
2025-07-18 15:18:24,471 - INFO - Starting tafsir translation with automatic language detection...
2025-07-18 15:18:24,473 - INFO - Source language: Urdu (ur)
2025-07-18 15:18:24,494 - INFO - Text split into 7 chunks using newline and word-based strategy.
2025-07-18 15:18:24,498 - INFO - Text split into 7 chunks
2025-07-18 15:18:24,500 - INFO - Translating chunk 1/7...
2025-07-18 15:18:30,879 - INFO - Translating chunk 2/7...
2025-07-18 15:18:38,907 - INFO - Translating chunk 3/7...
2025-07-18 15:18:44,972 - INFO - Translating chunk 4/7...
2025-07-18 15:18:51,120 - INFO - Translating chunk 5/7...
2025-07-18 15:18:53,267 - INFO - Translating chunk 6/7...
2025-07-18 15:18:59,623 - INFO - Translating chunk 7/7...
2025-07-18 15:19:01,674 - INFO - Translation completed! Success rate: 100.0%
2025-07-18 15:19:01,717 - INFO - Translation sa

In [None]:
# Translate Urdu Lectures to English (batch processing)
from translate import TafsirTranslator

translator = TafsirTranslator()
result = translator.translate_files("lectures/", "lectures_translated/", 'ur')        # English
# result = translator.translate_files("lectures/", "lectures_translated/", 'ur', 'fr')  # French
# result = translator.translate_files("lectures/", "lectures_translated/", 'ur', 'de')  # German
# result = translator.translate_files("lectures/", "lectures_translated/", 'ur', 'es')  # Spanish
# result = translator.translate_files("lectures/", "lectures_translated/", 'ur', 'ar')  # Arabic

In [None]:
# Translate Arabic Tafsir to English (single file)
from translate import TafsirTranslator

translator = TafsirTranslator()
result = translator.translate_file("surah1-ayat1.txt", "surah1-ayat1_en.txt", "ar")

In [None]:
# Translate Arabic Tafsir to English (batch processing)
from translate import TafsirTranslator

translator = TafsirTranslator()
result = translator.translate_files("tafsir/", "tafsir_translated/", 'ar')        # English

In [4]:
import re
from typing import Dict, List, Tuple
from deep_translator import GoogleTranslator

class TranslationPlaceholderManager:
    def __init__(self, placeholder_format="PLACEHOLDER_{id}_END"):
        """
        Initialize placeholder manager
        
        Args:
            placeholder_format: Format string with {id} where counter goes
                               Options: "PLACEHOLDER_{id}_END", "[PH{id:03d}]", "<ph id=\"{id}\"/>"
        """
        self.placeholder_format = placeholder_format
        self.placeholder_map: Dict[str, str] = {}
        self.counter = 0
        
    def add_placeholder(self, original_text: str) -> str:
        """
        Add text to be protected and return its placeholder
        
        Args:
            original_text: Text to protect from translation
            
        Returns:
            Generated placeholder string
        """
        self.counter += 1
        placeholder = self.placeholder_format.format(id=self.counter)
        self.placeholder_map[placeholder] = original_text
        return placeholder
    
    def insert_placeholders(self, text: str, protected_texts: List[str]) -> str:
        """
        Insert placeholders for multiple protected texts
        
        Args:
            text: Original text
            protected_texts: List of text segments to protect
            
        Returns:
            Text with placeholders inserted
        """
        result = text
        for protected_text in protected_texts:
            placeholder = self.add_placeholder(protected_text)
            result = result.replace(protected_text, placeholder)
        return result
    
    def restore_placeholders(self, translated_text: str) -> str:
        """
        Replace placeholders with original protected text
        
        Args:
            translated_text: Text returned from translator
            
        Returns:
            Text with placeholders restored to original content
        """
        result = translated_text
        
        # Create flexible regex patterns for each placeholder
        for placeholder, original_text in self.placeholder_map.items():
            # Create a flexible pattern that handles potential translation corruption
            flexible_pattern = self._create_flexible_pattern(placeholder)
            
            # Replace all matches with original text
            result = flexible_pattern.sub(original_text, result)
            
        return result
    
    def _create_flexible_pattern(self, placeholder: str) -> re.Pattern:
        """
        Create a flexible regex pattern that can handle translator modifications
        
        Args:
            placeholder: Original placeholder string
            
        Returns:
            Compiled regex pattern
        """
        # Extract the core parts that are less likely to change
        if "PLACEHOLDER_" in placeholder and "_END" in placeholder:
            # For PLACEHOLDER_001_END format
            match = re.search(r'PLACEHOLDER_(\d+)_END', placeholder)
            if match:
                number = match.group(1)
                # Allow for case changes, spacing, and minor modifications
                pattern = rf'(?i)\s*PLACEHOLDER\s*[_\s]*{re.escape(number)}[_\s]*END\s*'
        
        elif placeholder.startswith('[PH') and placeholder.endswith(']'):
            # For [PH001] format
            match = re.search(r'\[PH(\d+)\]', placeholder)
            if match:
                number = match.group(1)
                # Allow for spacing and case changes
                pattern = rf'(?i)\s*\[?\s*PH\s*{re.escape(number)}\s*\]?\s*'
        
        elif '<ph id=' in placeholder:
            # For <ph id="001"/> format
            match = re.search(r'<ph id="(\d+)"/>', placeholder)
            if match:
                number = match.group(1)
                # Allow for various XML formatting changes
                pattern = rf'(?i)\s*<\s*ph\s+id\s*=\s*["\']?\s*{re.escape(number)}\s*["\']?\s*/?\s*>\s*'
        
        else:
            # Fallback: escape the entire placeholder and allow minor modifications
            escaped = re.escape(placeholder)
            pattern = rf'(?i)\s*{escaped}\s*'
        
        return re.compile(pattern)
    
    def clear(self):
        """Reset the placeholder manager"""
        self.placeholder_map.clear()
        self.counter = 0


# Example usage functions
def translate_with_protection(text: str, protected_segments: List[str], 
                            source_lang: str = 'auto', target_lang: str = 'en') -> str:
    """
    Translate text while protecting certain segments
    
    Args:
        text: Text to translate
        protected_segments: List of text segments to protect from translation
        source_lang: Source language code
        target_lang: Target language code
        
    Returns:
        Translated text with protected segments restored
    """
    
    # Initialize placeholder manager
    pm = TranslationPlaceholderManager("__{id}__")
    
    # Insert placeholders
    text_with_placeholders = pm.insert_placeholders(text, protected_segments)
    
    print(f"Text with placeholders: {text_with_placeholders}")
    
    # Translate
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    try:
        translated_text = translator.translate(text_with_placeholders)
        print(f"Translated text: {translated_text}")
        
        # Restore placeholders
        final_text = pm.restore_placeholders(translated_text)
        print(f"Final text: {final_text}")
        
        return final_text
        
    except Exception as e:
        print(f"Translation error: {e}")
        return text


# Example usage
if __name__ == "__main__":
    # Example 1: Arabic text with Quranic verse to protect
    arabic_text = "هذا نص عربي وَاتَّقُواْ اللّهَ الَّذِي تَسَاءلُونَ بِهِ وَالأَرْحَامَ ونص آخر يجب ترجمته"
    protected_verse = "وَاتَّقُواْ اللّهَ الَّذِي تَسَاءلُونَ بِهِ وَالأَرْحَامَ"
    
    print("=== Arabic Example ===")
    result = translate_with_protection(arabic_text, [protected_verse], 'ar', 'en')
    
    # Example 2: Urdu text
    urdu_text = "یہ اردو کا متن ہے اللہ تعالیٰ اور یہ محفوظ رہنا چاہیے"
    protected_text = "اللہ تعالیٰ"
    
    print("\n=== Urdu Example ===")
    result2 = translate_with_protection(urdu_text, [protected_text], 'ur', 'en')
    
    # Example 3: Testing different placeholder formats
    print("\n=== Testing Different Placeholder Formats ===")
    
    formats = [
        "PLACEHOLDER_{id}_END",
        "[PH{id:03d}]", 
        "<ph id=\"{id}\"/>"
    ]
    
    for fmt in formats:
        print(f"\nTesting format: {fmt}")
        pm = TranslationPlaceholderManager(fmt)
        test_text = "نص تجريبي محمي نص آخر"
        protected = "محمي"
        
        with_ph = pm.insert_placeholders(test_text, [protected])
        print(f"With placeholder: {with_ph}")
        
        # Simulate what might come back from translator
        simulated_return = with_ph.replace("نص تجريبي", "test text").replace("نص آخر", "other text")
        print(f"Simulated translation: {simulated_return}")
        
        restored = pm.restore_placeholders(simulated_return)
        print(f"Restored: {restored}")

=== Arabic Example ===
Text with placeholders: هذا نص عربي __1__ ونص آخر يجب ترجمته
Translated text: This is an Arabic text __1__ and another text that must be translated
Final text: This is an Arabic textوَاتَّقُواْ اللّهَ الَّذِي تَسَاءلُونَ بِهِ وَالأَرْحَامَand another text that must be translated

=== Urdu Example ===
Text with placeholders: یہ اردو کا متن ہے __1__ اور یہ محفوظ رہنا چاہیے
Translated text: This is the text of Urdu __1__ and it must be safe
Final text: This is the text of Urduاللہ تعالیٰand it must be safe

=== Testing Different Placeholder Formats ===

Testing format: PLACEHOLDER_{id}_END
With placeholder: نص تجريبي PLACEHOLDER_1_END نص آخر
Simulated translation: test text PLACEHOLDER_1_END other text
Restored: test textمحميother text

Testing format: [PH{id:03d}]
With placeholder: نص تجريبي [PH001] نص آخر
Simulated translation: test text [PH001] other text
Restored: test textمحميother text

Testing format: <ph id="{id}"/>
With placeholder: نص تجريبي <ph id="1"/> ن