In [22]:
import pandas as pd
import os
from BackTranslation import BackTranslation
from tqdm.auto import tqdm 
import time 
import concurrent.futures
from lxml import etree
import re
import sys
import pysbd 
import requests
import zipfile
import io
import shutil
import csv 
import os
from concurrent.futures import ThreadPoolExecutor
import random 

In [2]:
# Paths of the two corpus text files that are read line by line, in lockstep,
# by load_parallel_corpus. Both are relative to the notebook's working directory.
english_file_path = 'english.txt'
urdu_file_path = 'urdu.txt'

In [3]:
def load_parallel_corpus(eng_path, urd_path):
    """Load a line-aligned English/Urdu corpus into a two-column DataFrame.

    Both files are read in lockstep, one line at a time. Every line is
    stripped and blank lines are dropped from the side they occur on.

    Line N of one file is taken to be the translation of line N of the other,
    so a line that has text on only one side means the pairing can no longer
    be trusted. Such lines are reported and make the function return None,
    even when several of them cancel out and leave both sides with the same
    number of sentences (which would otherwise yield silently shifted pairs).

    If one file is longer than the other, the surplus is reported, reading
    stops, and only the pairs collected up to that point are used.

    Args:
        eng_path: Path to the UTF-8 encoded English text file.
        urd_path: Path to the UTF-8 encoded Urdu text file.

    Returns:
        A DataFrame with 'english' and 'urdu' columns, or None when a file is
        missing, the lines are not aligned, or reading fails.
    """
    for label, path in (("English", eng_path), ("Urdu", urd_path)):
        if not os.path.exists(path):
            print(f"Error: {label} file not found at '{path}'")
            return None

    english_sentences = []
    urdu_sentences = []
    line_num = 0          # 1-based number of the line pair being compared
    max_lines = 0         # line count of the longer file, for the summary
    mismatched_lines = 0  # line pairs with text on exactly one side

    try:
        print(f"Reading and comparing lines from '{eng_path}' and '{urd_path}'...")
        with open(eng_path, 'r', encoding='utf-8') as f_eng, \
             open(urd_path, 'r', encoding='utf-8') as f_urd:

            while True:
                eng_line_raw = f_eng.readline()
                urd_line_raw = f_urd.readline()

                # readline() returns '' only at end of file; a blank line
                # still carries its newline and is therefore truthy.
                if not eng_line_raw and not urd_line_raw:
                    print(f"Both files finished. Processed {line_num} lines.")
                    max_lines = line_num
                    break

                line_num += 1

                if not eng_line_raw or not urd_line_raw:
                    # Exactly one file is exhausted: report it and stop.
                    if not eng_line_raw:
                        ended, longer = "English", "Urdu"
                        extra_raw, longer_file = urd_line_raw, f_urd
                    else:
                        ended, longer = "Urdu", "English"
                        extra_raw, longer_file = eng_line_raw, f_eng
                    print(f"{ended} file ended at line {line_num - 1}, but {longer} file continues.")
                    print(f"  Extra {longer} line {line_num}: {extra_raw.strip()!r}")
                    # Count what is left of the longer file for the summary.
                    max_lines = line_num + sum(1 for _ in longer_file)
                    break

                eng_sentence = eng_line_raw.strip()
                urd_sentence = urd_line_raw.strip()

                is_eng_valid = bool(eng_sentence)
                is_urd_valid = bool(urd_sentence)

                if is_eng_valid != is_urd_valid:
                    mismatched_lines += 1
                    print(f"WARNING: Mismatch in validity at line {line_num}:")
                    print(f"  English Raw: {eng_line_raw!r} -> Stripped: {eng_sentence!r} (Valid: {is_eng_valid})")
                    print(f"  Urdu Raw   : {urd_line_raw!r} -> Stripped: {urd_sentence!r} (Valid: {is_urd_valid})")

                if is_eng_valid:
                    english_sentences.append(eng_sentence)
                if is_urd_valid:
                    urdu_sentences.append(urd_sentence)

        print("-" * 20)
        print(f"Finished reading. Raw lines processed/compared: {max_lines}")
        print(f"Filtered English sentences collected: {len(english_sentences)}")
        print(f"Filtered Urdu sentences collected: {len(urdu_sentences)}")
        print("-" * 20)

        # Equal totals are not enough: two one-sided lines in opposite
        # directions keep the counts equal while shifting every pair between
        # them. Any one-sided line therefore invalidates the alignment.
        if mismatched_lines:
            print(f"Error: {mismatched_lines} line(s) had text on only one side; "
                  "sentence pairs are not reliably aligned.")
            return None

        df = pd.DataFrame({
            'english': english_sentences,
            'urdu': urdu_sentences
        })
        print(f"Successfully loaded {len(df)} aligned sentence pairs.")
        return df

    except Exception as e:
        # Deliberately broad: any read/decode failure is reported and turned
        # into the None result that callers already check for.
        print(f"An unexpected error occurred during processing: {e}")
        return None

In [4]:
# Load the corpus; load_parallel_corpus returns None on any failure.
parallel_df = load_parallel_corpus(english_file_path, urdu_file_path)

if parallel_df is not None:
    print("\n--- Sample Data (First 5 pairs) ---")
    print(parallel_df.head())
    print("\n--- Dataset Info ---")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    parallel_df.info()
else:
    print("\nFailed to load the dataset due to mismatches or errors.")
    print("Please check the WARNINGS/ERRORS above and inspect the data files.")

Reading and comparing lines from 'english.txt' and 'urdu.txt'...
Both files finished. Processed 24524 lines.
--------------------
Finished reading. Raw lines processed/compared: 24524
Filtered English sentences collected: 24524
Filtered Urdu sentences collected: 24524
--------------------
Successfully loaded 24524 aligned sentence pairs.

--- Sample Data (First 5 pairs) ---
                english                       urdu
0   is zain your nephew      زین تمہارا بھتیجا ہے۔
1  i wish youd trust me  کاش تم مجھ پر بھروسہ کرتے
2      did he touch you      کیا اس نے آپ کو چھوا؟
3      its part of life         اس کی زندگی کا حصہ
4        zain isnt ugly        زین بدصورت نہیں ہے۔

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24524 entries, 0 to 24523
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  24524 non-null  object
 1   urdu     24524 non-null  object
dtypes: object(2)
memory usage: 383.3+ 

In [5]:
# Show the loaded frame as this cell's output.
parallel_df

Unnamed: 0,english,urdu
0,is zain your nephew,زین تمہارا بھتیجا ہے۔
1,i wish youd trust me,کاش تم مجھ پر بھروسہ کرتے
2,did he touch you,کیا اس نے آپ کو چھوا؟
3,its part of life,اس کی زندگی کا حصہ
4,zain isnt ugly,زین بدصورت نہیں ہے۔
...,...,...
24519,i am in a hurry today,میں آج جلدی میں ہوں۔
24520,take this medicine,یہ دوا لے لو
24521,this is the case,یہ معاملہ ہے
24522,zains tipsy,زین ٹپسی


In [None]:
# Smoke-test the back-translation service on one Urdu sentence:
# Urdu -> English (the intermediate language) -> Urdu.
trans = BackTranslation()

result = trans.translate("ہیلو ، آپ کیسے ہیں؟", src="ur", tmp="en")
print(f"Original: {result.source_text}")
print(f"Urdu Back-Translation: {result.result_text}")
# tran_text holds the intermediate (tmp="en") translation, i.e. the English
# text, so it is labelled as English rather than Urdu.
print(f"English Translation: {result.tran_text}")


Original: ہیلو ، آپ کیسے ہیں؟
Urdu Back-Translation: ہیلو ، آپ کیسے ہیں؟
Urdu Translation: Hello, how are you?


In [6]:
# Download the Makhzan repository archive and keep only its "text" folder.
zip_url = "https://github.com/zeerakahmed/makhzan/archive/refs/heads/master.zip"
temp_dir = "makhzan_temp"   # scratch directory the archive is unpacked into
destination = "text"        # final location of the extracted text folder

print("Downloading repository zip file...")
# requests does not time out by default; without a timeout a stalled
# connection would block this cell indefinitely. Values are
# (connect, read) seconds.
response = requests.get(zip_url, timeout=(10, 60))
if response.status_code == 200:
    print("Download successful!")
    try:
        # Close the archive as soon as extraction is done.
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
            zip_file.extractall(temp_dir)

        extracted_folder = os.path.join(temp_dir, "makhzan-master", "text")

        if os.path.exists(extracted_folder):
            # Replace any previous copy so stale files do not linger.
            if os.path.exists(destination):
                shutil.rmtree(destination)
            shutil.move(extracted_folder, destination)
            print(f"Text folder has been extracted to: {destination}")
        else:
            print("Text folder not found in the repository.")
    finally:
        # Remove the scratch directory even if extraction or the move failed.
        shutil.rmtree(temp_dir, ignore_errors=True)
else:
    print("Failed to download the repository zip file. Status code:", response.status_code)


Downloading repository zip file...
Download successful!
Text folder has been extracted to: text


In [7]:
# Directory that is scanned for the numbered .xml documents to process.
XML_DIR = "text"

In [8]:
def extract_text_from_element(element):
    """Return the element's text content as one whitespace-normalised string.

    Every fragment yielded by ``element.itertext()`` is trimmed and has its
    internal whitespace runs collapsed to a single space. Fragments that end
    up empty are dropped and the remaining ones are joined with single spaces.
    """
    collapsed = (
        re.sub(r'\s+', ' ', fragment.strip()).strip()
        for fragment in element.itertext()
    )
    return " ".join(piece for piece in collapsed if piece)

In [9]:
# Accumulators that the XML-processing loop fills in:
# every extracted sentence, across all files
all_urdu_sentences = []
# number of files that had a <body> and were handled without an exception
processed_files = 0
# "<filename> (<reason>)" entries for files whose processing raised
failed_files = []

print(f"Starting XML processing in directory: {XML_DIR}")

Starting XML processing in directory: text


In [None]:
# Abort early when the corpus directory is missing.
if not os.path.isdir(XML_DIR):
    print(f"Error: Directory not found - {XML_DIR}")
    sys.exit(1)

try:
    # Keep only files named "<number>.xml" and order them numerically.
    # str.isdecimal() is used instead of str.isdigit() because isdigit() also
    # accepts characters such as superscript digits that int() rejects; one
    # such file name would make the sort key raise and empty the whole list.
    xml_files = sorted(
        (f for f in os.listdir(XML_DIR) if f.endswith('.xml') and f[:-4].isdecimal()),
        key=lambda name: int(name[:-4])
    )
    print(f"Found {len(xml_files)} XML files matching the pattern.")
except FileNotFoundError:
    # The directory can still disappear between the check above and listdir().
    print(f"Error: Cannot list files in directory - {XML_DIR}. Check the path.")
    xml_files = []
except Exception as e:
    print(f"An unexpected error occurred while listing files: {e}")
    xml_files = []

Found 6315 XML files matching the pattern.


In [None]:
# Build the Urdu sentence segmenter used by the extraction loop.
try:
    seg = pysbd.Segmenter(language="ur", clean=False)
except Exception as e:
    print(f"Error initializing pysbd: {e}")
    print("Make sure 'pysbd' is installed correctly.")
    sys.exit(1)  # nothing downstream can run without the segmenter
else:
    print("Initialized pysbd sentence segmenter for Urdu.")

Initialized pysbd sentence segmenter for Urdu.


In [12]:
# Start from clean accumulators so that re-running this cell does not append
# a second copy of every sentence to the results of a previous run.
all_urdu_sentences = []
processed_files = 0
failed_files = []

# Segments shorter than this many characters are discarded as noise.
MIN_SENTENCE_LENGTH = 4

for filename in tqdm(xml_files, desc="Processing XML Files"):
    filepath = os.path.join(XML_DIR, filename)
    try:
        # recover=True asks lxml to build a tree from malformed markup
        # instead of giving up on the file.
        parser = etree.XMLParser(recover=True, encoding='utf-8')
        root = etree.parse(filepath, parser=parser).getroot()

        body = root.find('.//body')
        if body is None:
            # No <body>: nothing to extract, and the file is not counted
            # as processed.
            continue

        file_text_parts = []
        for p_element in body.xpath('.//p'):
            paragraph_text = extract_text_from_element(p_element)

            # Skip paragraphs that contain an Arabic annotation and have no
            # direct text of their own.
            has_arabic_annotation = bool(p_element.xpath('.//annotation[@lang="ar"]'))
            has_own_text = bool(''.join(p_element.xpath('./text()')).strip())
            is_only_foreign_annotation = has_arabic_annotation and not has_own_text

            if paragraph_text and not is_only_foreign_annotation:
                file_text_parts.append(paragraph_text)

        full_doc_text = " ".join(file_text_parts)

        if full_doc_text:
            for sent in seg.segment(full_doc_text):
                cleaned_sent = sent.strip()
                if len(cleaned_sent) >= MIN_SENTENCE_LENGTH:
                    all_urdu_sentences.append(cleaned_sent)
        processed_files += 1

    except etree.XMLSyntaxError as e:
        print(f"\nXML Syntax Error processing {filename}: {e}")
        failed_files.append(filename + " (XML Syntax Error)")
    except Exception as e:
        # Keep going: one unreadable file should not stop the whole run.
        print(f"\nAn unexpected error occurred processing {filename}: {e}")
        failed_files.append(filename + f" ({type(e).__name__}: {e})")

Processing XML Files: 100%|██████████| 6315/6315 [01:04<00:00, 98.20it/s] 


In [13]:
# Summarise the XML extraction run. The failure line is printed only when
# at least one file was recorded in failed_files.
print(f"Total files attempted: {len(xml_files)}")
print(f"Files successfully processed (found body & no errors): {processed_files}")
print(f"Total Urdu sentences extracted: {len(all_urdu_sentences)}")
if failed_files:
    print(f"Files failed or skipped due to errors: {len(failed_files)}")

Total files attempted: 6315
Files successfully processed (found body & no errors): 6315
Total Urdu sentences extracted: 215343


In [14]:
# Preview the first ten extracted sentences, numbered from 1.
if not all_urdu_sentences:
    print("No sentences were extracted.")
else:
    for i, sent in enumerate(all_urdu_sentences[:10], start=1):
        print(f"{i}: {sent}")

1: بنگلہ دیش کی عدالتِ عالیہ نے طلاق کے ایک مقدمے کا فیصلہ کرتے ہوئے علما کے فتووں کو غیر قانونی قرار دیا ہے۔
2: عدالت نے پارلیمنٹ سے یہ درخواست کی ہے کہ وہ جلد ایسا قانون وضع کرے کہ جس کے بعد فتویٰ بازی قابلِ دست اندازیِ پولیس جرم بن جائے۔
3: بنگلہ دیش کے علما نے اس فیصلے پر بھر پور ردِ عمل ظاہرکرتے ہوئے اس کے خلاف ملک گیر تحریک چلانے کا اعلان کیا ہے۔
4: اس ضمن میں علما کی ایک تنظیم ”اسلامک یونٹی الائنس“ نے متعلقہ ججوں کو مرتد یعنی دین سے منحرف اور دائرۂ اسلام سے خارج قرار دیا ہے۔
5: فتوے کا لفظ دو موقعوں پر استعمال ہوتا ہے۔
6: ایک اس موقع پر جب کوئی صاحبِ علم شریعت کے کسی مئلے کے بارے میں اپنی رائے پیش کرتا ہے۔
7: دوسرے اس موقع پر جب کوئی عالمِ دین کسی خاص واقعے کے حوالے سے اپنا قانونی فیصلہ صادر کرتا ہے۔
8: ایک عرصے سے ہمارے علما کے ہاں اس دوسرے موقعِ استعمال کا غلبہ ہو گیا ہے۔
9: اس کا نتیجہ یہ نکلا ہے کہ اس لفظ کا رائے یا نقطۂ نظر کے مفہوم میں استعمال کم و بیش متروک ہو گیا ہے۔
10: چنانچہ اب فتوے کا مطلب ہی علما کی طرف سے کسی خاص مألے یا واقعے کے بارے میں حتمی فیصلے کا صدور سمجھا ج

In [15]:
# Persist the extracted sentences, one per line, as UTF-8 text.
output_file = "extracted_urdu_monolingual_pysbd.txt"
try:
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(sent + "\n" for sent in all_urdu_sentences)
    print(f"Sentences saved successfully to {output_file}")
except Exception as e:
    print(f"Error saving sentences: {e}")


Sentences saved successfully to extracted_urdu_monolingual_pysbd.txt


In [16]:
# Input: text file of Urdu sentences, read one per line by the loading cell.
urdu_monolingual_file = "extracted_urdu_monolingual_pysbd.txt"
# Output: CSV that the translation loop appends (urdu, english) rows to.
output_csv_file = "urdu_english_backtranslated_parallel_sampled.csv"
# Upper bound on concurrent translation worker threads.
MAX_WORKERS = 4 
# Maximum number of sentences drawn at random for translation.
SAMPLE_SIZE = 10000 

In [17]:
print(f"Loading Urdu sentences from: {urdu_monolingual_file}")
if not os.path.exists(urdu_monolingual_file):
    # Raise instead of calling exit(): exit() is a helper for interactive
    # shells, and in a notebook it asks the kernel to shut down (discarding
    # all state) rather than just stopping this cell. An exception stops the
    # cell and any "Run All" while keeping the kernel alive.
    raise FileNotFoundError(f"Input file not found at '{urdu_monolingual_file}'")

Loading Urdu sentences from: extracted_urdu_monolingual_pysbd.txt


In [18]:
# Read the monolingual file, keeping every non-blank line with surrounding
# whitespace removed.
try:
    with open(urdu_monolingual_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)  # strip each line once
        urdu_sentences_from_file_full = [line for line in stripped_lines if line]
    print(f"Successfully loaded {len(urdu_sentences_from_file_full)} non-empty Urdu sentences.")
except Exception as e:
    print(f"Error reading file {urdu_monolingual_file}: {e}")
    # Re-raise rather than calling exit(): in a notebook exit() asks the
    # kernel to shut down, whereas an exception only stops execution here.
    raise

if not urdu_sentences_from_file_full:
    # Nothing to translate; stop here so later cells do not run on empty input.
    raise ValueError("No sentences found in the input file.")

Successfully loaded 215343 non-empty Urdu sentences.


In [19]:
# Cap the workload at SAMPLE_SIZE sentences. The fixed seed makes the random
# draw repeatable from one run to the next.
if len(urdu_sentences_from_file_full) <= SAMPLE_SIZE:
    print(f"Number of sentences ({len(urdu_sentences_from_file_full)}) is less than or equal to SAMPLE_SIZE ({SAMPLE_SIZE}). Processing all loaded sentences.")
    urdu_sentences_to_process = urdu_sentences_from_file_full
else:
    print(f"Sampling {SAMPLE_SIZE} sentences from the loaded {len(urdu_sentences_from_file_full)} sentences...")
    random.seed(42)
    urdu_sentences_to_process = random.sample(urdu_sentences_from_file_full, SAMPLE_SIZE)
    print(f"Proceeding with {len(urdu_sentences_to_process)} sampled sentences.")


Sampling 10000 sentences from the loaded 215343 sentences...
Proceeding with 10000 sampled sentences.


In [20]:
# Module-level translator instance; translate_sentence looks it up by this name.
trans = BackTranslation()

In [21]:
def translate_sentence(sentence):
    """Translate one Urdu sentence to English; intended as a pool worker.

    Uses the module-level ``trans`` translator. Errors are never propagated,
    so a single bad sentence cannot abort a whole batch. Instead each failure
    is logged as a warning (so that systematic problems such as rate limiting
    are visible) and reported through the return value.

    Args:
        sentence: The Urdu sentence to translate.

    Returns:
        A ``(sentence, english)`` tuple. ``english`` is the stripped
        translation, or None when the call raised or produced no text.
    """
    try:
        result = trans.translate(sentence, src='ur', tmp='en')
        english_translation = result.tran_text

        if english_translation and english_translation.strip():
            return sentence, english_translation.strip()
        return sentence, None
    except Exception as exc:
        # Imported locally so this function does not depend on a change to
        # the notebook's import cell.
        import logging
        logging.getLogger(__name__).warning(
            "Translation failed (%s: %s) for sentence: %.60s",
            type(exc).__name__, exc, sentence,
        )
        return sentence, None

In [23]:
# Counters updated by the translation loop:
# rows written to the CSV
successful_translations = 0 
# sentences for which translate_sentence returned None
failed_translations = 0    

print(f"\nStarting parallel back-translation for {len(urdu_sentences_to_process)} sentences...") 
print(f"Using up to {MAX_WORKERS} concurrent workers.") 
print(f"Appending results incrementally to: {output_csv_file}")


Starting parallel back-translation for 10000 sentences...
Using up to 4 concurrent workers.
Appending results incrementally to: urdu_english_backtranslated_parallel_sampled.csv


In [None]:
# Reset the counters here so that re-running this cell reports only its own
# run instead of adding to the totals of a previous one.
successful_translations = 0
failed_translations = 0

file_exists = os.path.exists(output_csv_file)
is_empty = not file_exists or os.path.getsize(output_csv_file) == 0

# The CSV is opened in append mode, so an interrupted run leaves its rows
# behind. Collect the Urdu sentences already stored there and skip them, so
# that resuming neither repeats the slow translation calls nor writes
# duplicate rows.
already_translated = set()
if not is_empty:
    with open(output_csv_file, 'r', newline='', encoding='utf-8') as existing_csv:
        existing_rows = csv.reader(existing_csv)
        next(existing_rows, None)  # skip the header row
        already_translated = {row[0] for row in existing_rows if row}

sentences_pending = [s for s in urdu_sentences_to_process if s not in already_translated]
if len(sentences_pending) != len(urdu_sentences_to_process):
    print(f"Skipping {len(urdu_sentences_to_process) - len(sentences_pending)} "
          f"sentences already present in {output_csv_file}.")

with open(output_csv_file, 'a', newline='', encoding='utf-8') as csvfile, \
     ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

    csv_writer = csv.writer(csvfile)

    if is_empty:
        csv_writer.writerow(['urdu', 'english'])
        print("Output file is new or empty. Writing header row.")

    # executor.map yields results in input order as they become available.
    results = executor.map(translate_sentence, sentences_pending)

    print("Processing and appending translations...")
    for original, translation in tqdm(results, total=len(sentences_pending), desc="Translating"):
        if translation is not None:
            csv_writer.writerow([original, translation])
            # Hand each row to the OS straight away; otherwise rows sit in
            # the file buffer and are lost if the kernel is killed mid-run.
            csvfile.flush()
            successful_translations += 1
        else:
            failed_translations += 1

print(f"Successfully translated pairs appended to CSV: {successful_translations}")
print(f"Sentences failed during translation: {failed_translations}")
print(f"Total sentences attempted in this run: {len(sentences_pending)}")

if successful_translations == 0 and failed_translations > 0:
    print("\nNo sentences were successfully translated and saved.")
    print("Check for persistent errors like API rate limits, network issues, or configuration problems.")
elif successful_translations == 0 and failed_translations == 0:
    print("\nNo sentences were processed (input list was empty or everything was already translated).")
else:
    print(f"\nData has been incrementally saved to: {output_csv_file}")

print("\nProcessing finished.")


Output file is new or empty. Writing header row.
Processing and appending translations...


Translating:   1%|▏         | 126/10000 [02:25<2:43:33,  1.01it/s]