In [1]:
import pandas as pd
import os

# --- Configuration ---
# Paths to the two sides of the parallel corpus. The loader below reads both
# files in lockstep, so line N of one file is treated as the counterpart of
# line N of the other. Relative paths are resolved against the working directory.
english_file_path = 'english.txt'
urdu_file_path = 'urdu.txt'

# --- Function to Load and Preprocess Data with Debugging ---
def load_parallel_corpus_debug(eng_path, urd_path):
    """Load a line-aligned English/Urdu corpus, printing diagnostics along the way.

    Both files are read in lockstep as UTF-8. Each line is stripped; lines that
    are blank after stripping are dropped. Any sign that the two files are not
    aligned line-for-line makes the function refuse to build a DataFrame:

    * one file has more lines than the other,
    * a line is blank in only one of the two files (even when such lines cancel
      out and the final counts happen to match, the pairs would be shifted).

    Args:
        eng_path: Path to the English text file, one sentence per line.
        urd_path: Path to the Urdu text file, one sentence per line.

    Returns:
        A pandas DataFrame with columns ``'english'`` and ``'urdu'`` holding the
        aligned sentence pairs, or ``None`` when a file is missing, the files
        are misaligned, or an unexpected error occurs (details are printed).
    """
    if not os.path.exists(eng_path):
        print(f"Error: English file not found at '{eng_path}'")
        return None
    if not os.path.exists(urd_path):
        print(f"Error: Urdu file not found at '{urd_path}'")
        return None

    english_sentences = []
    urdu_sentences = []
    line_num = 0
    max_lines = 0  # Line count of the longer file (for the report only)
    length_mismatch = False  # True when one file ran out before the other
    validity_mismatches = 0  # Lines that were blank in exactly one file

    try:
        print(f"Reading and comparing lines from '{eng_path}' and '{urd_path}'...")
        with open(eng_path, 'r', encoding='utf-8') as f_eng, \
             open(urd_path, 'r', encoding='utf-8') as f_urd:

            while True:
                line_num += 1
                eng_line_raw = f_eng.readline()
                urd_line_raw = f_urd.readline()

                # --- End of File Handling ---
                # readline() returns '' only at EOF; a blank line is '\n'.
                if not eng_line_raw and not urd_line_raw:
                    print(f"Both files finished. Processed {line_num - 1} lines.")
                    max_lines = line_num - 1
                    break

                if not eng_line_raw or not urd_line_raw:
                    length_mismatch = True
                    if not eng_line_raw:
                        ended, longer = "English", "Urdu"
                        longer_file, extra_line = f_urd, urd_line_raw
                    else:
                        ended, longer = "Urdu", "English"
                        longer_file, extra_line = f_eng, eng_line_raw
                    print(f"{ended} file ended at line {line_num - 1}, but {longer} file continues.")
                    print(f"  Extra {longer} line {line_num}: {extra_line.strip()!r}")
                    # Drain the longer file so the report shows its full length.
                    max_lines = line_num
                    while longer_file.readline():
                        max_lines += 1
                    break

                # --- Process Lines ---
                eng_sentence = eng_line_raw.strip()
                urd_sentence = urd_line_raw.strip()

                is_eng_valid = bool(eng_sentence)
                is_urd_valid = bool(urd_sentence)

                # A line that is blank on one side only breaks the alignment.
                if is_eng_valid != is_urd_valid:
                    validity_mismatches += 1
                    print(f"WARNING: Mismatch in validity at line {line_num}:")
                    print(f"  English Raw: {eng_line_raw!r} -> Stripped: {eng_sentence!r} (Valid: {is_eng_valid})")
                    print(f"  Urdu Raw   : {urd_line_raw!r} -> Stripped: {urd_sentence!r} (Valid: {is_urd_valid})")

                # Collect each side independently so the counts printed below
                # still show how far apart the two files are.
                if is_eng_valid:
                    english_sentences.append(eng_sentence)
                if is_urd_valid:
                    urdu_sentences.append(urd_sentence)

        print("-" * 20)
        print(f"Finished reading. Raw lines processed/compared: {max_lines}")
        print(f"Filtered English sentences collected: {len(english_sentences)}")
        print(f"Filtered Urdu sentences collected: {len(urdu_sentences)}")
        print("-" * 20)

        # --- Validation ---
        if length_mismatch:
            # Pairs were appended in lockstep up to the break, so the counts can
            # be equal here; the explicit flag is what catches the truncation.
            print("Error: The two files have different numbers of lines; the corpus is not aligned.")
            return None
        if len(english_sentences) != len(urdu_sentences):
            print(f"Error: Final sentence counts mismatch after filtering!")
            return None
        if validity_mismatches:
            print(f"Error: {validity_mismatches} line(s) were blank in only one file; "
                  "sentence counts match but the pairs would be misaligned.")
            return None

        # --- Create DataFrame ---
        df = pd.DataFrame({
            'english': english_sentences,
            'urdu': urdu_sentences
        })
        print(f"Successfully loaded {len(df)} aligned sentence pairs.")
        return df

    except Exception as e:
        print(f"An unexpected error occurred during processing: {e}")
        return None

# --- Load the Data ---
parallel_df = load_parallel_corpus_debug(english_file_path, urdu_file_path)

# --- Display Sample Data ---
if parallel_df is not None:
    print("\n--- Sample Data (First 5 pairs) ---")
    print(parallel_df.head())
    print("\n--- Dataset Info ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    parallel_df.info()
else:
    print("\nFailed to load the dataset due to mismatches or errors.")
    print("Please check the WARNINGS/ERRORS above and inspect the data files.")

Reading and comparing lines from 'english.txt' and 'urdu.txt'...
Both files finished. Processed 24524 lines.
--------------------
Finished reading. Raw lines processed/compared: 24524
Filtered English sentences collected: 24524
Filtered Urdu sentences collected: 24524
--------------------
Successfully loaded 24524 aligned sentence pairs.

--- Sample Data (First 5 pairs) ---
                english                       urdu
0   is zain your nephew      زین تمہارا بھتیجا ہے۔
1  i wish youd trust me  کاش تم مجھ پر بھروسہ کرتے
2      did he touch you      کیا اس نے آپ کو چھوا؟
3      its part of life         اس کی زندگی کا حصہ
4        zain isnt ugly        زین بدصورت نہیں ہے۔

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24524 entries, 0 to 24523
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  24524 non-null  object
 1   urdu     24524 non-null  object
dtypes: object(2)
memory usage: 383.3+ 

In [2]:
import pandas as pd
import os

# --- Configuration ---
# Paths to the two sides of the parallel corpus. The loader below reads both
# files in lockstep, so line N of one file is treated as the counterpart of
# line N of the other. Relative paths are resolved against the working directory.
english_file_path = 'english.txt'
urdu_file_path = 'urdu.txt'

# --- Function to Load and Preprocess Data with Debugging ---
def load_parallel_corpus_debug(eng_path, urd_path):
    """Load a line-aligned English/Urdu corpus, printing diagnostics along the way.

    Both files are read in lockstep as UTF-8. Each line is stripped; lines that
    are blank after stripping are dropped. Any sign that the two files are not
    aligned line-for-line makes the function refuse to build a DataFrame:

    * one file has more lines than the other,
    * a line is blank in only one of the two files (even when such lines cancel
      out and the final counts happen to match, the pairs would be shifted).

    Args:
        eng_path: Path to the English text file, one sentence per line.
        urd_path: Path to the Urdu text file, one sentence per line.

    Returns:
        A pandas DataFrame with columns ``'english'`` and ``'urdu'`` holding the
        aligned sentence pairs, or ``None`` when a file is missing, the files
        are misaligned, or an unexpected error occurs (details are printed).
    """
    if not os.path.exists(eng_path):
        print(f"Error: English file not found at '{eng_path}'")
        return None
    if not os.path.exists(urd_path):
        print(f"Error: Urdu file not found at '{urd_path}'")
        return None

    english_sentences = []
    urdu_sentences = []
    line_num = 0
    max_lines = 0  # Line count of the longer file (for the report only)
    length_mismatch = False  # True when one file ran out before the other
    validity_mismatches = 0  # Lines that were blank in exactly one file

    try:
        print(f"Reading and comparing lines from '{eng_path}' and '{urd_path}'...")
        with open(eng_path, 'r', encoding='utf-8') as f_eng, \
             open(urd_path, 'r', encoding='utf-8') as f_urd:

            while True:
                line_num += 1
                eng_line_raw = f_eng.readline()
                urd_line_raw = f_urd.readline()

                # --- End of File Handling ---
                # readline() returns '' only at EOF; a blank line is '\n'.
                if not eng_line_raw and not urd_line_raw:
                    print(f"Both files finished. Processed {line_num - 1} lines.")
                    max_lines = line_num - 1
                    break

                if not eng_line_raw or not urd_line_raw:
                    length_mismatch = True
                    if not eng_line_raw:
                        ended, longer = "English", "Urdu"
                        longer_file, extra_line = f_urd, urd_line_raw
                    else:
                        ended, longer = "Urdu", "English"
                        longer_file, extra_line = f_eng, eng_line_raw
                    print(f"{ended} file ended at line {line_num - 1}, but {longer} file continues.")
                    print(f"  Extra {longer} line {line_num}: {extra_line.strip()!r}")
                    # Drain the longer file so the report shows its full length.
                    max_lines = line_num
                    while longer_file.readline():
                        max_lines += 1
                    break

                # --- Process Lines ---
                eng_sentence = eng_line_raw.strip()
                urd_sentence = urd_line_raw.strip()

                is_eng_valid = bool(eng_sentence)
                is_urd_valid = bool(urd_sentence)

                # A line that is blank on one side only breaks the alignment.
                if is_eng_valid != is_urd_valid:
                    validity_mismatches += 1
                    print(f"WARNING: Mismatch in validity at line {line_num}:")
                    print(f"  English Raw: {eng_line_raw!r} -> Stripped: {eng_sentence!r} (Valid: {is_eng_valid})")
                    print(f"  Urdu Raw   : {urd_line_raw!r} -> Stripped: {urd_sentence!r} (Valid: {is_urd_valid})")

                # Collect each side independently so the counts printed below
                # still show how far apart the two files are.
                if is_eng_valid:
                    english_sentences.append(eng_sentence)
                if is_urd_valid:
                    urdu_sentences.append(urd_sentence)

        print("-" * 20)
        print(f"Finished reading. Raw lines processed/compared: {max_lines}")
        print(f"Filtered English sentences collected: {len(english_sentences)}")
        print(f"Filtered Urdu sentences collected: {len(urdu_sentences)}")
        print("-" * 20)

        # --- Validation ---
        if length_mismatch:
            # Pairs were appended in lockstep up to the break, so the counts can
            # be equal here; the explicit flag is what catches the truncation.
            print("Error: The two files have different numbers of lines; the corpus is not aligned.")
            return None
        if len(english_sentences) != len(urdu_sentences):
            print(f"Error: Final sentence counts mismatch after filtering!")
            return None
        if validity_mismatches:
            print(f"Error: {validity_mismatches} line(s) were blank in only one file; "
                  "sentence counts match but the pairs would be misaligned.")
            return None

        # --- Create DataFrame ---
        df = pd.DataFrame({
            'english': english_sentences,
            'urdu': urdu_sentences
        })
        print(f"Successfully loaded {len(df)} aligned sentence pairs.")
        return df

    except Exception as e:
        print(f"An unexpected error occurred during processing: {e}")
        return None

# --- Load the Data ---
parallel_df = load_parallel_corpus_debug(english_file_path, urdu_file_path)

# --- Display Sample Data ---
if parallel_df is not None:
    print("\n--- Sample Data (First 5 pairs) ---")
    print(parallel_df.head())
    print("\n--- Dataset Info ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    parallel_df.info()
else:
    print("\nFailed to load the dataset due to mismatches or errors.")
    print("Please check the WARNINGS/ERRORS above and inspect the data files.")

Reading and comparing lines from 'english.txt' and 'urdu.txt'...
Both files finished. Processed 24524 lines.
--------------------
Finished reading. Raw lines processed/compared: 24524
Filtered English sentences collected: 24524
Filtered Urdu sentences collected: 24524
--------------------
Successfully loaded 24524 aligned sentence pairs.

--- Sample Data (First 5 pairs) ---
                english                       urdu
0   is zain your nephew      زین تمہارا بھتیجا ہے۔
1  i wish youd trust me  کاش تم مجھ پر بھروسہ کرتے
2      did he touch you      کیا اس نے آپ کو چھوا؟
3      its part of life         اس کی زندگی کا حصہ
4        zain isnt ugly        زین بدصورت نہیں ہے۔

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24524 entries, 0 to 24523
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  24524 non-null  object
 1   urdu     24524 non-null  object
dtypes: object(2)
memory usage: 383.3+ 

In [3]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
parallel_df

Unnamed: 0,english,urdu
0,is zain your nephew,زین تمہارا بھتیجا ہے۔
1,i wish youd trust me,کاش تم مجھ پر بھروسہ کرتے
2,did he touch you,کیا اس نے آپ کو چھوا؟
3,its part of life,اس کی زندگی کا حصہ
4,zain isnt ugly,زین بدصورت نہیں ہے۔
...,...,...
24519,i am in a hurry today,میں آج جلدی میں ہوں۔
24520,take this medicine,یہ دوا لے لو
24521,this is the case,یہ معاملہ ہے
24522,zains tipsy,زین ٹپسی


In [4]:
from BackTranslation import BackTranslation

# Initialize translator (create only one instance)
trans = BackTranslation()

# Round trip: Urdu source -> English pivot (tmp) -> back to Urdu.
result = trans.translate("ہیلو ، آپ کیسے ہیں؟", src="ur", tmp="en")
print(f"Original: {result.source_text}")
print(f"Urdu Back-Translation: {result.result_text}")
# tran_text holds the intermediate (pivot-language) text, which is English
# here, so it must not be labelled as Urdu.
print(f"English Translation: {result.tran_text}")


Original: ہیلو ، آپ کیسے ہیں؟
Urdu Back-Translation: ہیلو ، آپ کیسے ہیں؟
Urdu Translation: Hello, how are you?


In [5]:
import requests
import zipfile
import io
import os
import shutil

# URL for the repository's master branch as a zip file
zip_url = "https://github.com/zeerakahmed/makhzan/archive/refs/heads/master.zip"

# Download the zip file
print("Downloading repository zip file...")
# Without a timeout requests waits indefinitely on a stalled connection.
response = requests.get(zip_url, timeout=60)
if response.status_code == 200:
    print("Download successful!")

    # Temporary extraction directory; always removed again in the finally block.
    temp_dir = "makhzan_temp"
    try:
        # The archive is held in memory; the context manager closes it after extraction.
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
            zip_file.extractall(temp_dir)

        # Path to the text folder within the extracted directory
        extracted_folder = os.path.join(temp_dir, "makhzan-master", "text")

        # Destination directory for the text folder
        destination = "text"

        if os.path.exists(extracted_folder):
            # Replace any previous copy so the move cannot nest inside it.
            if os.path.exists(destination):
                shutil.rmtree(destination)
            shutil.move(extracted_folder, destination)
            print(f"Text folder has been extracted to: {destination}")
        else:
            print("Text folder not found in the repository.")
    finally:
        # Clean up even when extraction or the move failed; ignore_errors keeps
        # a missing temp directory from masking the original exception.
        shutil.rmtree(temp_dir, ignore_errors=True)
else:
    print("Failed to download the repository zip file. Status code:", response.status_code)


Downloading repository zip file...
Download successful!
Text folder has been extracted to: text


In [6]:
import os
from lxml import etree
from tqdm.auto import tqdm
import re
import sys
import pysbd # Using pysbd

# --- Configuration ---
# Directory that is scanned for numerically named .xml documents.
# Ensure this path is correct for your environment (e.g. Colab) where the data was uploaded/mounted.
XML_DIR = "text"

# --- Function to extract text from relevant XML elements ---
def extract_text_from_element(element):
    """Return the element's text content as one whitespace-normalised string.

    Every text node yielded by ``element.itertext()`` is trimmed and has each
    run of whitespace squeezed to a single space. Fragments that end up empty
    are dropped and the remaining ones are joined with single spaces.
    """
    squeezed = (
        re.sub(r'\s+', ' ', fragment.strip()).strip()
        for fragment in element.itertext()
    )
    return " ".join(piece for piece in squeezed if piece)

# --- Main Parsing and Sentence Extraction Logic ---
# Accumulators filled by the processing loop and read by the summary.
all_urdu_sentences, processed_files, failed_files = [], 0, []

print(f"Starting XML processing in directory: {XML_DIR}")

# Nothing can be done without the input directory, so stop right away.
if not os.path.isdir(XML_DIR):
    print(f"Error: Directory not found - {XML_DIR}")
    sys.exit(1)

try:
    # Keep only names of the form '<digits>.xml' and order them by that number.
    numbered_xml = []
    for entry in os.listdir(XML_DIR):
        if entry.endswith('.xml') and entry[:-4].isdigit():
            numbered_xml.append(entry)
    xml_files = sorted(numbered_xml, key=lambda name: int(name[:-4]))
    print(f"Found {len(xml_files)} XML files matching the pattern.")
except FileNotFoundError:
     print(f"Error: Cannot list files in directory - {XML_DIR}. Check the path.")
     xml_files = []
except Exception as listing_error:
     print(f"An unexpected error occurred while listing files: {listing_error}")
     xml_files = []

# Build the pysbd segmenter for Urdu once, up front.
# clean=False: pysbd's optional text-cleaning pass is not requested.
try:
    seg = pysbd.Segmenter(language="ur", clean=False)
    print("Initialized pysbd sentence segmenter for Urdu.")
except Exception as init_error:
     print(f"Error initializing pysbd: {init_error}")
     print("Make sure 'pysbd' is installed correctly.")
     sys.exit(1)  # Segmentation is impossible without the segmenter


for filename in tqdm(xml_files, desc="Processing XML Files"):
    filepath = os.path.join(XML_DIR, filename)
    try:
        # recover=True lets lxml keep going past minor markup errors.
        parser = etree.XMLParser(recover=True, encoding='utf-8')
        tree = etree.parse(filepath, parser=parser)
        root = tree.getroot()

        # Locate the first <body> element anywhere below the root.
        body = root.find('.//body')
        if body is None:
            # Record the file rather than dropping it silently, so the
            # failed/skipped count in the summary accounts for it.
            failed_files.append(filename + " (no <body> element)")
            continue

        file_text_parts = []
        for p_element in body.xpath('.//p'):
            # Whitespace-normalised text of the paragraph, including nested tags.
            paragraph_text = extract_text_from_element(p_element)

            # Skip paragraphs that contain an Arabic annotation and have no
            # direct text of their own (likely just a foreign-language quote).
            is_only_foreign_annotation = (
                p_element.xpath('.//annotation[@lang="ar"]')
                and not ''.join(p_element.xpath('./text()')).strip()
            )

            if paragraph_text and not is_only_foreign_annotation:
                file_text_parts.append(paragraph_text)

        # Paragraphs are joined with a space; pysbd finds the sentence breaks.
        full_doc_text = " ".join(file_text_parts)

        if full_doc_text:
            for sent in seg.segment(full_doc_text):
                cleaned_sent = sent.strip()
                # Minimum length of 4 characters; this also rejects empty strings.
                if len(cleaned_sent) > 3:
                    all_urdu_sentences.append(cleaned_sent)
        processed_files += 1

    except etree.XMLSyntaxError as e:
        # More specific error for bad XML
        print(f"\nXML Syntax Error processing {filename}: {e}")
        failed_files.append(filename + " (XML Syntax Error)")
    except Exception as e:
        # General catch-all for other errors (pysbd, file reading, etc.)
        print(f"\nAn unexpected error occurred processing {filename}: {e}")
        # Include exception type for better debugging
        failed_files.append(filename + f" ({type(e).__name__}: {e})")


# --- Final Summary ---
total_sentences = len(all_urdu_sentences)

print("\n--- Processing Summary ---")
print(f"Total files attempted: {len(xml_files)}")
print(f"Files successfully processed (found body & no errors): {processed_files}")
print(f"Total Urdu sentences extracted: {total_sentences}")
if failed_files:
    # Only the count is reported; inspect failed_files for per-file details.
    print(f"Files failed or skipped due to errors: {len(failed_files)}")


# --- Display Sample Sentences ---
print("\n--- Sample Extracted Sentences (First 10) ---")
if not all_urdu_sentences:
    print("No sentences were extracted.")
else:
    for position, sentence in enumerate(all_urdu_sentences[:10], start=1):
        print(f"{position}: {sentence}")

print("\n--- Sample Extracted Sentences (Last 10) ---")
if total_sentences == 0:
    print("No sentences were extracted.")
else:
    if total_sentences < 10:
        print("(Showing all extracted sentences as there are less than 10)")
    # 1-based numbering that matches each sentence's position in the full list.
    tail_start = max(total_sentences - 10, 0)
    for position, sentence in enumerate(all_urdu_sentences[tail_start:], start=tail_start + 1):
        print(f"{position}: {sentence}")


# --- Save sentences to a file ---
# One sentence per line, UTF-8 encoded; an existing file is overwritten.
output_file = "extracted_urdu_monolingual_pysbd.txt"
print(f"\nSaving extracted sentences to {output_file}...")
try:
    with open(output_file, "w", encoding="utf-8") as out_handle:
        out_handle.writelines(sentence + "\n" for sentence in all_urdu_sentences)
    print(f"Sentences saved successfully to {output_file}")
except Exception as save_error:
    print(f"Error saving sentences: {save_error}")

# You can now use the file 'extracted_urdu_monolingual_pysbd.txt' for back-translation

  from .autonotebook import tqdm as notebook_tqdm


Starting XML processing in directory: text
Found 6315 XML files matching the pattern.
Initialized pysbd sentence segmenter for Urdu.


Processing XML Files:   0%|          | 0/6315 [00:00<?, ?it/s]

Processing XML Files: 100%|██████████| 6315/6315 [01:05<00:00, 96.81it/s] 



--- Processing Summary ---
Total files attempted: 6315
Files successfully processed (found body & no errors): 6315
Total Urdu sentences extracted: 215343

--- Sample Extracted Sentences (First 10) ---
1: بنگلہ دیش کی عدالتِ عالیہ نے طلاق کے ایک مقدمے کا فیصلہ کرتے ہوئے علما کے فتووں کو غیر قانونی قرار دیا ہے۔
2: عدالت نے پارلیمنٹ سے یہ درخواست کی ہے کہ وہ جلد ایسا قانون وضع کرے کہ جس کے بعد فتویٰ بازی قابلِ دست اندازیِ پولیس جرم بن جائے۔
3: بنگلہ دیش کے علما نے اس فیصلے پر بھر پور ردِ عمل ظاہرکرتے ہوئے اس کے خلاف ملک گیر تحریک چلانے کا اعلان کیا ہے۔
4: اس ضمن میں علما کی ایک تنظیم ”اسلامک یونٹی الائنس“ نے متعلقہ ججوں کو مرتد یعنی دین سے منحرف اور دائرۂ اسلام سے خارج قرار دیا ہے۔
5: فتوے کا لفظ دو موقعوں پر استعمال ہوتا ہے۔
6: ایک اس موقع پر جب کوئی صاحبِ علم شریعت کے کسی مئلے کے بارے میں اپنی رائے پیش کرتا ہے۔
7: دوسرے اس موقع پر جب کوئی عالمِ دین کسی خاص واقعے کے حوالے سے اپنا قانونی فیصلہ صادر کرتا ہے۔
8: ایک عرصے سے ہمارے علما کے ہاں اس دوسرے موقعِ استعمال کا غلبہ ہو گیا ہے۔
9: اس 

In [7]:
# Code you have to focus on:
import pandas as pd
import os
from BackTranslation import BackTranslation
from tqdm.auto import tqdm # For progress bar
import time # To add delays if needed
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import random # <-- Added for sampling

In [8]:

urdu_monolingual_file = "extracted_urdu_monolingual_pysbd.txt" # Input: Urdu sentences, read one per line
output_csv_file = "urdu_english_backtranslated_parallel_sampled.csv" # Output: CSV of Urdu / English pairs
MAX_WORKERS = 4 # Maximum number of concurrent translation threads
SAMPLE_SIZE = 10000 # Number of sentences to sample for translation


In [9]:
# --- 1. Load Urdu Sentences ---
print(f"Loading Urdu sentences from: {urdu_monolingual_file}")
if not os.path.exists(urdu_monolingual_file):
    # Raise rather than call exit(): inside a notebook kernel exit() requests a
    # kernel shutdown, which throws away all state. An exception stops the run
    # just as well and leaves a traceback.
    raise FileNotFoundError(f"Input file not found at '{urdu_monolingual_file}'")

Loading Urdu sentences from: extracted_urdu_monolingual_pysbd.txt


In [10]:
# Read every non-blank line (stripped) into a temporary full list first.
urdu_sentences_from_file_full = []
try:
    with open(urdu_monolingual_file, 'r', encoding='utf-8') as f:
        urdu_sentences_from_file_full = [
            stripped for stripped in (line.strip() for line in f) if stripped
        ]
    print(f"Successfully loaded {len(urdu_sentences_from_file_full)} non-empty Urdu sentences.")
except Exception as e:
    print(f"Error reading file {urdu_monolingual_file}: {e}")
    # Re-raise instead of exit(): in a notebook exit() requests a kernel
    # shutdown, whereas an exception stops the run and keeps the traceback.
    raise

if not urdu_sentences_from_file_full:
    raise ValueError("No sentences found in the input file.")

Successfully loaded 215343 non-empty Urdu sentences.


In [11]:
# --- Sampling Logic ---
# Work on a fixed-seed random subset when more than SAMPLE_SIZE sentences were
# loaded; otherwise use the loaded list as it is.
total_loaded = len(urdu_sentences_from_file_full)
if total_loaded <= SAMPLE_SIZE:
    print(f"Number of sentences ({total_loaded}) is less than or equal to SAMPLE_SIZE ({SAMPLE_SIZE}). Processing all loaded sentences.")
    urdu_sentences_to_process = urdu_sentences_from_file_full
else:
    print(f"Sampling {SAMPLE_SIZE} sentences from the loaded {total_loaded} sentences...")
    random.seed(42)  # Fixed seed so the same subset is drawn on every run
    urdu_sentences_to_process = random.sample(urdu_sentences_from_file_full, SAMPLE_SIZE)
    print(f"Proceeding with {len(urdu_sentences_to_process)} sampled sentences.")
# --- End of Sampling Logic ---

Sampling 10000 sentences from the loaded 215343 sentences...
Proceeding with 10000 sampled sentences.


In [12]:
# --- 2. Initialize BackTranslator ---
# Initialize *once* outside the loop/worker function
print("Initializing BackTranslation object...")
try:
    # NOTE(review): the worker threads share this instance; it is assumed, not
    # verified here, that BackTranslation tolerates concurrent translate() calls.
    trans = BackTranslation()
    print("BackTranslation initialized.")
except Exception as e:
    print(f"Error initializing BackTranslation: {e}")
    print("Please ensure the 'BackTranslation' library and its dependencies are installed correctly.")
    # Re-raise instead of exit(): in a notebook exit() requests a kernel
    # shutdown, whereas an exception stops the run and keeps the traceback.
    raise

Initializing BackTranslation object...
BackTranslation initialized.


In [13]:
# --- 3. Define Worker Function ---
def translate_sentence(sentence, max_retries=0, retry_delay=0.5):
    """Translate one Urdu sentence to English via the shared ``trans`` object.

    The call is best-effort: any exception raised by the translator, and any
    empty/whitespace-only result, counts as a failure for this sentence only
    and is never propagated to the caller.

    Args:
        sentence: The Urdu sentence to translate.
        max_retries: Extra attempts after the first failure. The default of 0
            means a single attempt.
        retry_delay: Base pause in seconds before a retry; it doubles after
            each failed attempt.

    Returns:
        A tuple ``(sentence, english_translation)`` where the translation is
        stripped of surrounding whitespace, or ``(sentence, None)`` on failure.
    """
    # A blank sentence cannot produce a usable translation; skip the remote call.
    if isinstance(sentence, str) and not sentence.strip():
        return sentence, None

    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            # Urdu -> English (tmp) -> Urdu; only the English pivot text is kept.
            result = trans.translate(sentence, src='ur', tmp='en')
            english_translation = result.tran_text
            if english_translation and english_translation.strip():
                return sentence, english_translation.strip()
        except Exception:
            # Deliberately quiet: per-sentence logging is too noisy when many
            # workers run in parallel. The failure is reported via the None below.
            pass

        if attempt + 1 < attempts:
            time.sleep(retry_delay * (2 ** attempt))

    return sentence, None

In [None]:
# CODE YOU HAVE TO FOCUS ON:
import csv # <-- Import the csv module
import os  # <-- Make sure os is imported (likely already is)

# --- (Previous code for loading, sampling, initializing BackTranslation, and defining translate_sentence remains the same) ---

# --- Configuration (ensure output_csv_file is defined) ---
# output_csv_file = "urdu_english_backtranslated_parallel_sampled.csv" # Should be defined earlier

# --- 4-6. Iterate, Translate (in Parallel), Store (Incrementally), and Handle Errors ---
successful_translations = 0 # Counter for successful translations
failed_translations = 0     # Counter for failed translations

# The CSV is opened in append mode, so a re-run would otherwise translate every
# sentence again and append duplicate rows. Collect the Urdu sentences that are
# already stored and skip them.
file_has_content = os.path.exists(output_csv_file) and os.path.getsize(output_csv_file) > 0
is_empty = not file_has_content
already_translated = set()
if file_has_content:
    with open(output_csv_file, 'r', newline='', encoding='utf-8') as existing_csv:
        for row in csv.reader(existing_csv):
            if row and row != ['urdu', 'english']:  # ignore blank rows and the header
                already_translated.add(row[0])

pending_sentences = [s for s in urdu_sentences_to_process if s not in already_translated]
skipped_existing = len(urdu_sentences_to_process) - len(pending_sentences)

print(f"\nStarting parallel back-translation for {len(pending_sentences)} sentences...")
if skipped_existing:
    print(f"Skipping {skipped_existing} sentences already present in {output_csv_file}.")
print(f"Using up to {MAX_WORKERS} concurrent workers.")
print(f"Appending results incrementally to: {output_csv_file}")

# Threads are used because the work is I/O-bound (remote translation calls).
# The CSV file is opened before the executor starts and stays open throughout.
try:
    with open(output_csv_file, 'a', newline='', encoding='utf-8') as csvfile, \
         ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

        csv_writer = csv.writer(csvfile)

        # Write header only if the file is new/empty
        if is_empty:
            csv_writer.writerow(['urdu', 'english'])
            print("Output file is new or empty. Writing header row.")

        # executor.map yields results lazily, in the same order as the input
        # list (not in completion order).
        results = executor.map(translate_sentence, pending_sentences)

        print("Processing and appending translations...")
        for original, translation in tqdm(results, total=len(pending_sentences), desc="Translating"):
            if translation is not None:
                csv_writer.writerow([original, translation])
                # Flush per row so finished work survives a killed kernel;
                # otherwise rows sit in the write buffer until the file closes.
                csvfile.flush()
                successful_translations += 1
            else:
                failed_translations += 1
        # Leaving the with-block closes the file and waits for the executor.

except Exception as e:
    print(f"\nAn error occurred during the translation process: {e}")
    print("Progress up to this point might be saved in the CSV file.")

# --- Post-Processing Summary ---
print("\n--- Translation Summary ---")
print(f"Successfully translated pairs appended to CSV: {successful_translations}")
print(f"Sentences failed during translation: {failed_translations}")
if skipped_existing:
    print(f"Sentences skipped (already in CSV): {skipped_existing}")
print(f"Total sentences attempted (sampled): {len(urdu_sentences_to_process)}") # Report based on sampled size

# No DataFrame is built here: the CSV written above is the result.

if successful_translations == 0 and failed_translations > 0:
     print("\nNo sentences were successfully translated and saved.")
     print("Check for persistent errors like API rate limits, network issues, or configuration problems.")
elif successful_translations == 0 and failed_translations == 0:
     if skipped_existing:
         print("\nNothing left to translate: every sampled sentence is already in the CSV.")
     else:
         print("\nNo sentences were processed (input list might have been empty).")
else:
     print(f"\nData has been incrementally saved to: {output_csv_file}")


print("\nProcessing finished.")


Starting parallel back-translation for 10000 sentences...
Using up to 4 concurrent workers.
Appending results incrementally to: urdu_english_backtranslated_parallel_sampled.csv
Output file is new or empty. Writing header row.
Processing and appending translations...


Translating:   2%|▏         | 206/10000 [1:40:10<79:22:23, 29.18s/it]
