In [1]:
from pathlib import Path

In [2]:
data_path = Path("data/raw/parallel-n")
files = list(data_path.glob("*"))[:5]
files

[PosixPath('data/raw/parallel-n/IITB.en-hi.en'),
 PosixPath('data/raw/parallel-n/IITB.en-hi.hi')]

In [3]:
en_file = "data/raw/parallel-n/IITB.en-hi.en"
hi_file = "data/raw/parallel-n/IITB.en-hi.hi"

with open(en_file, encoding = "utf-8") as f_en, open(hi_file, encoding = "utf-8") as f_hi:
    for i in range(10):
        en_line = f_en.readline().strip()
        hi_line = f_hi.readline().strip()
        print(f"{i+1}. EN: {en_line}")
        print(f"   HI: {hi_line}")
        print("-" * 50)

1. EN: Give your application an accessibility workout
   HI: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
--------------------------------------------------
2. EN: Accerciser Accessibility Explorer
   HI: एक्सेर्साइसर पहुंचनीयता अन्वेषक
--------------------------------------------------
3. EN: The default plugin layout for the bottom panel
   HI: निचले पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
4. EN: The default plugin layout for the top panel
   HI: ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
5. EN: A list of plugins that are disabled by default
   HI: उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है
--------------------------------------------------
6. EN: Highlight duration
   HI: अवधि को हाइलाइट रकें
--------------------------------------------------
7. EN: The duration of the highlight box when selecting accessible nodes
   HI: पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की

In [4]:
import spacy

In [5]:
#NER ALONE DIDN'T WORK

In [5]:
nlp = spacy.load("en_core_web_sm")
sentence = "Accerciser Accessibility Explorer"
doc = nlp(sentence)
for ent in doc.ents:
    print(ent.text, ent.label_)

In [7]:
#COMBINING NER AND HEURISTICS (HYBRID)

In [6]:
# Load English NLP pipeline
nlp = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "lemmatizer"])

en_file = Path("data/raw/parallel-n/IITB.en-hi.en")

extracted_names = set()

MAX_LINES = 10000

with open(en_file, encoding="utf-8") as f:
    lines = []
    for i, line in enumerate(f):
        if i >= MAX_LINES:
            break
        line = line.strip()
        if line:
            lines.append(line)

print(f"Processing {len(lines)} sentences...")

# Process in batches
for doc in nlp.pipe(lines, batch_size = 500):
    # Part 1: NER
    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG", "PRODUCT", "GPE"}:
            extracted_names.add(ent.text)

    # Part 2: Heuristics
    for token in doc:
        if(
            token.text[0].isupper() and
            token.is_alpha and
            len(token.text) > 2
        ):
            extracted_names.add(token.text)

print("Done")
print("Total extracted names: ", len(extracted_names))
list(extracted_names)[:20]

Processing 10000 sentences...
Done
Total extracted names:  1227


['Incremental',
 'Programs',
 'Hyperlink',
 'The New BSD License See the COPYING and',
 'Elevator',
 'Details:%',
 'Law',
 'Addrcheck',
 'Due',
 'Macro',
 'Tools',
 'Library',
 'User',
 'Jamestown',
 'Skip',
 'Asked',
 'New View',
 'Mine',
 'Specify',
 'Athena']

In [7]:
import json

with open("data/processed/extracted_names_en_10k.json", "w", encoding = "utf-8") as f:
    json.dump(sorted(extracted_names), f, ensure_ascii = False, indent = 2)

In [8]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from difflib import SequenceMatcher

In [None]:
# ==========================================================
# BASELINE ALIGNMENT PIPELINE (DISABLED – NOT USED IN RESULTS)
# ==========================================================
# nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])
# ...
# json.dump(aligned_pairs, f, ensure_ascii=False, indent=2)

In [81]:
# nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])

# def romanize_hi(word):
#     return transliterate(word, sanscript.DEVANAGARI, sanscript.ITRANS)

# def similarity(a,b):
#     return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# en_file = Path("data/raw/parallel-n/IITB.en-hi.en")
# hi_file = Path("data/raw/parallel-n/IITB.en-hi.hi")

# MAX_LINES = 10000

# en_lines, hi_lines = [], []

# with open(en_file, encoding = "utf-8") as f_en , open(hi_file, encoding = "utf-8") as f_hi:
#     for i, (en, hi) in enumerate(zip(f_en, f_hi)):
#         if i >= MAX_LINES:
#             break
#         en_lines.append(en.strip())
#         hi_lines.append(hi.strip())

# aligned_pairs = []

# for en_sent, hi_sent in zip(en_lines, hi_lines):
#     doc = nlp(en_sent)

#     en_names = set()

#     for ent in doc.ents:
#         if ent.label_ in {"PERSON", "ORG", "PRODUCT", "GPE"}:
#             en_names.add(ent.text)

#     for token in doc:
#         if token.text[0].isupper() and token.is_alpha and len(token.text) > 2:
#             en_names.add(token.text)

#     if not en_names:
#         continue

#     hi_tokens = hi_sent.split()

#     for en_name in en_names:
#         best_match = None
#         best_score = 0.0

#         for hi_word in hi_tokens:
#             if len(hi_word) < 2:
#                 continue

#             hi_roman = romanize_hi(hi_word)
#             score = similarity(en_name, hi_roman)

#             if score > best_score:
#                 best_score = score
#                 best_match = hi_word

#         if best_score >= 0.45:
#             aligned_pairs.append((en_name, best_match, best_score))

# for pair in aligned_pairs[:20]:
#     print(pair)

('Highlight', 'हाइलाइट', 0.5882352941176471)
('Highlight', 'हाइलाइट', 0.5882352941176471)
('Highlight', 'हाइलाइट', 0.5882352941176471)
('API', 'एपीआई', 0.5)
('Hide', 'छिपाएं', 0.5454545454545454)
('Console', 'कन्सोल', 0.5714285714285714)
('IPython', 'आईपाइथन', 0.5882352941176471)
('Monitor', 'मानिटर', 0.6666666666666666)
('Monitor', 'मानिटर', 0.6666666666666666)
('Highlight', 'हाइलाइट', 0.5882352941176471)
('WIDGET', 'विडजेट', 0.5714285714285714)
('Alpha', 'अल्फा', 1.0)
('Des', 'डेस्कटोप', 0.5)
('Position', 'स्थिति', 0.5714285714285714)
('Offset', 'ओफसेट', 0.5714285714285714)
('Name', 'नाम', 0.75)
('URI', 'यूआरआई', 0.6)
('Plugin', 'प्लग-इन', 0.6666666666666666)
('Native', 'वतनी', 0.5)
('LDTP', 'एलडीटीपी', 0.6153846153846154)


In [82]:
# import json

# with open("data/processed/aligned_pairs_10k.json", "w", encoding = "utf-8") as f:
#     json.dump(aligned_pairs, f, ensure_ascii = False, indent = 2)

In [9]:
import json

with open("data/processed/aligned_pairs_10k.json", encoding = "utf-8") as f:
    aligned_pairs = json.load(f)

In [10]:
deduped = {}

for en, hi, score in aligned_pairs:
    key = (en.lower(), hi)

    if key not in deduped or score > deduped[key][2]:
        deduped[key] = (en, hi, score)

In [11]:
deduped_pairs = list(deduped.values())

In [12]:
COMMON_ENGLISH = {
    "the", "this", "that", "these", "those", 
    "data", "report", "give", "allow", "show", "hide", 
    "name", "position", "default", "settings", "file", 
    "view", "edit", "help", "use", "using", "used", 
    "set", "get", "add", "remove"
}

In [13]:
COMMON_HINDI = {
    "से", "में", "का", "की", "के", "को", "पर",
    "है", "था", "थी", "थे", "असमर्थ", "संग्रहित",
    "लिए", "द्वारा", "साथ", "बिना"
}

In [14]:
filtered_pairs = []

for en, hi, score in deduped_pairs:
    if en.lower() in COMMON_ENGLISH:
        continue
    filtered_pairs.append((en, hi, score))

print("Before: ", len(aligned_pairs))
print("After dedup: ", len(deduped_pairs))
print("After filtering: ", len(filtered_pairs))

for pair in filtered_pairs[:20]:
    print(pair)

Before:  3581
After dedup:  667
After filtering:  631
('Highlight', 'हाइलाइट', 0.5882352941176471)
('API', 'एपीआई', 0.5)
('Console', 'कन्सोल', 0.5714285714285714)
('IPython', 'आईपाइथन', 0.5882352941176471)
('Monitor', 'मानिटर', 0.6666666666666666)
('WIDGET', 'विडजेट', 0.5714285714285714)
('Alpha', 'अल्फा', 1.0)
('Des', 'डेस्कटोप', 0.5)
('Offset', 'ओफसेट', 0.5714285714285714)
('URI', 'यूआरआई', 0.6)
('Plugin', 'प्लग-इन', 0.6666666666666666)
('Native', 'वतनी', 0.5)
('LDTP', 'एलडीटीपी', 0.6153846153846154)
('Recorder', 'रेकोर्डर', 0.7058823529411765)
('Creates', 'करता', 0.46153846153846156)
('num1', '(num1)', 0.8)
('Accerciser', 'इसे', 0.46153846153846156)
('Changes', 'आएंगे।', 0.46153846153846156)
('Node', '(नोड)', 0.6)
('Alt', 'आल्ट', 0.8571428571428571)


In [15]:
with open("data/processed/aligned_pairs_10k_clean.json", "w", encoding = "utf-8") as f:
    json.dump(filtered_pairs, f, ensure_ascii = False, indent = 2)

In [None]:
#ALIGNMENT

In [20]:
import re

SIM_THRESHOLD = 0.5

COMMON_ENGLISH = {
    "the", "this", "that", "these", "those", 
    "data", "report", "give", "allow", "show", "hide", 
    "name", "position", "default", "settings", "file", 
    "view", "edit", "help", "use", "using", "used", 
    "set", "get", "add", "remove"
}

SEMANTIC_IDENTIFIER_WORDS = {
    "name", "display", "value", "default", "file",
    "user", "setting", "config", "option", "property",
    "area", "reset", "mode", "type", "level", "status"
}

COMMON_HINDI = {
    "से", "में", "का", "की", "के", "को", "पर",
    "है", "था", "थी", "थे", "असमर्थ", "संग्रहित",
    "लिए", "द्वारा", "साथ", "बिना"
}

DEMONYM_TRANSLATIONS = {
    "अमेरिकी", "ब्रिटिश", "भारतीय", "चीनी", "जापानी"
}

BAD_ENGLISH = {
    "american", "british", "baltic", "hungarian",
    "european", "african", "asian", "arab"
}

ENGLISH_DEMONYMS = {
    "america", "american", "turkish", "british", "french",
    "german", "italian", "spanish", "chinese", "japanese",
    "indian", "european", "african", "asian"
}

DIRECTIONAL_ADJECTIVES = {
    "western", "eastern", "northern", "southern",
    "central", "global", "local"
}
    
nlp = spacy.load("en_core_web_sm", disable = ["parser", "lemmatizer"])

def is_title_like(doc):
    """
    Heuristic: sentence is title-like if it has no verb
    """
    return not any(token.pos_ == "VERB" for token in doc)

def is_devanagari(word):
    """
    Returns true if all characters in the word are Devanagari
    """
    return all('\u0900' <= ch <= '\u097F' for ch in word)

def normalize_hindi(word):
    # Keep only Devanagari characters
    word = re.sub(r"[^\u0900-\u097F]", "", word)

    # Always remove trailing visarga
    word = word.rstrip("ः")

    return word

def is_plural_english(word):
    return word.lower().endswith("s") and len(word) > 4

def split_camel_case(word):
    return re.findall(r"[A-Z][a-z]*", word)

def is_bad_short_english(word):
    return len(word) <= 2

def is_acronym(word):
    return word.isupper() and len(word) >= 3

def looks_like_translation(en, hi):
    """
    Returns true if hindi word is likely a semantic translation
    rather than a phonetic transliteration.
    """
    hi_roman = romanize_hi(hi)
    if len(hi_roman) <= 3 and len(en) >= 4:
        return True
    if abs(len(en) - len(hi_roman)) > 4:
        return True
    return False

def is_native_hindi_translation(hi_word):
    """
    Detects short native Hindi words likely to be translations
    rather than transliterations.
    """
    return len(hi_word) <= 3
    
def romanize_hi(word):
    try:
        return transliterate(word, sanscript.DEVANAGARI, sanscript.ITRANS)
    except Exception:
        return ""

def similarity(a,b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

en_file = Path("data/raw/parallel-n/IITB.en-hi.en")
hi_file = Path("data/raw/parallel-n/IITB.en-hi.hi")

print("Starting full dataset alignment (streaming mode)...")

aligned_pairs = set()
last_logged = 0
total_sentences = 0

with open(en_file, encoding = "utf-8") as f_en, open(hi_file, encoding = "utf-8") as f_hi:
    for en_sent, hi_sent in zip(f_en, f_hi):
        en_sent = en_sent.strip()
        hi_sent = hi_sent.strip()

        total_sentences += 1

        if total_sentences % 100000 == 0:
            print("Sentences Processed: ", total_sentences)

        if not en_sent or not hi_sent:
            continue

        doc = nlp(en_sent)
        title_like = is_title_like(doc)

        en_names = set()

        # -------- English name extraction --------
        for ent in doc.ents:
            if ent.label_ in {"PERSON", "ORG", "PRODUCT", "GPE"}:
                en_names.add(ent.text)

        for token in doc:
            if (
                token.text[0].isupper()
                and token.is_alpha
                and len(token.text) > 2
                and token.text.lower() not in COMMON_ENGLISH
                and (token.i != 0 or title_like)
            ):
                raw_parts = re.split(r"[\/]", token.text)
                for part in raw_parts:
                    camel_parts = split_camel_case(part)
                    for p in camel_parts:
                        if p.lower() not in COMMON_ENGLISH and len(p) > 2:
                            en_names.add(p)

        if not en_names:
            continue

        hi_tokens = hi_sent.split()

        # -------- Alignment logic --------
        for en_name in en_names:
            if any(
                part.lower() in SEMANTIC_IDENTIFIER_WORDS
                for part in split_camel_case(en_name)
            ):
                continue
            if en_name.lower() in COMMON_ENGLISH:
                continue
            if en_name.lower() in BAD_ENGLISH:
                continue
            if en_name.lower() in ENGLISH_DEMONYMS:
                continue
            if en_name.lower() in DIRECTIONAL_ADJECTIVES:
                continue
            parts = en_name.split()
            if len(parts) > 1 and len(parts[-1]) <= 2:
                continue
            if is_bad_short_english(en_name):
                continue
            if is_plural_english(en_name):
                continue
            if en_name.lower() in {"read", "write", "open", "close", "set", "get"}:
                continue
            if is_acronym(en_name) and en_name.lower() not in {"imap", "ldap", "smtp", "http"}:
                continue

            best_match = None
            best_score = 0.0

            for hi_word in hi_tokens:
                hi_word = normalize_hindi(hi_word)
                
                if hi_word in {"पथ", "मार्ग", "नाम", "स्थिति", "संख्या"}:
                    continue
                if " " in hi_word:
                    continue
                if not hi_word:
                    continue
                if not is_devanagari(hi_word):
                    continue
                if hi_word in COMMON_HINDI:
                    continue
                if hi_word in DEMONYM_TRANSLATIONS:
                    continue
                if len(hi_word) > 15:
                    continue

                hi_roman = romanize_hi(hi_word)

                if (
                    len(hi_word) <= 3
                    and similarity(en_name, hi_roman) > 0.8
                ):
                    continue

                if looks_like_translation(en_name, hi_word):
                    continue
                    
                if len(hi_roman) > len(en_name) * 1.6:
                    continue

                if hi_roman in en_name.lower() or en_name.lower() in hi_roman:
                    if abs(len(en_name) - len(hi_roman)) > 3:
                        continue
                    
                sim = similarity(en_name, hi_roman)
                if sim < 0.6:
                    continue

                if sim > best_score:
                    best_score = sim
                    best_match = hi_word

            if best_score >= SIM_THRESHOLD:
                aligned_pairs.add((
                    en_name,
                    best_match,
                    round(best_score, 3)
                ))

        if len(aligned_pairs) // 500 > last_logged:
            last_logged = len(aligned_pairs) // 500
            print("Aligned pairs so far:", len(aligned_pairs))

aligned_pairs = list(aligned_pairs)

print("Total aligned pairs: ", len(aligned_pairs))

with open("data/processed/aligned_pairs_full.json", "w", encoding = "utf-8") as f:
    json.dump(aligned_pairs, f, ensure_ascii = False, indent = 2)

print("Full dataset alignment completed")
print("Total aligned pairs: ", len(aligned_pairs))

Starting full dataset alignment (streaming mode)...
Aligned pairs so far: 500
Sentences Processed:  100000
Aligned pairs so far: 1000
Aligned pairs so far: 1500
Aligned pairs so far: 2000
Aligned pairs so far: 2501
Aligned pairs so far: 3000
Aligned pairs so far: 3500
Aligned pairs so far: 4000
Aligned pairs so far: 4500
Aligned pairs so far: 5000
Aligned pairs so far: 5500
Aligned pairs so far: 6000
Aligned pairs so far: 6500
Aligned pairs so far: 7000
Aligned pairs so far: 7500
Aligned pairs so far: 8000
Sentences Processed:  200000
Aligned pairs so far: 8501
Aligned pairs so far: 9000
Aligned pairs so far: 9500
Aligned pairs so far: 10000
Aligned pairs so far: 10500
Aligned pairs so far: 11000
Aligned pairs so far: 11500
Sentences Processed:  300000
Aligned pairs so far: 12000
Aligned pairs so far: 12500
Sentences Processed:  400000
Aligned pairs so far: 13000
Aligned pairs so far: 13503
Aligned pairs so far: 14007
Aligned pairs so far: 14501
Aligned pairs so far: 15000
Aligned pair

In [21]:
aligned_pairs 

[('Laik', 'लगीं', 0.667),
 ('Tundra', 'टुण्ड्रा', 1.0),
 ('Lakshminath', 'लक्ष्मीनाथ', 0.957),
 ('xiva', 'निवास', 0.6),
 ('Sanjskrit', 'संस्कृत', 0.737),
 ('Amartya', 'अमर्त्य', 1.0),
 ('Bhagat', 'विभाग', 0.769),
 ('Maurya', 'अनुरूपा', 0.615),
 ('Hath', 'हालत', 0.6),
 ('Kashinath', 'काशीनाथ', 0.947),
 ('Marigaon', 'मारीगाँव', 0.778),
 ('Vietnamese', 'वियेतनामी', 0.7),
 ('Ishvara', 'ईश्वर', 1.0),
 ('Phani', 'रोशनी', 0.667),
 ('Dasaundha', 'दसौंधा', 0.889),
 ('Radha - Krishna', 'राधाकृष्णन', 0.8),
 ('Siddarth', 'सिद्धार्थ', 0.889),
 ('Ashvadhama', 'अश्वत्थामा', 0.857),
 ('Maksmulr', 'मैक्समूलर', 0.8),
 ('Bharat', 'बाहर', 0.833),
 ('Mardhekar', 'मर्ढेकर', 0.947),
 ('Rsabhanatha', 'ऋषभनाथ', 0.88),
 ('Trifid', 'ट्रिफिड', 0.714),
 ('Aitrai', 'ऐतरेय', 0.714),
 ('Ashtanga', 'अष्टांग', 0.875),
 ('Duggar', 'डोगरा', 0.667),
 ('Greville', 'ग्रेविल्ले', 1.0),
 ('Sabha', 'सहयोगी', 0.615),
 ('Raima Sen', 'राइमा', 0.714),
 ('Ramma', 'रात', 0.667),
 ('Bahadur', 'बहुत', 0.615),
 ('Changanacherry', 'चंगन

In [22]:
import json
from collections import Counter

with open("data/processed/aligned_pairs_full.json", encoding = "utf-8") as f:
    pairs = json.load(f)

english_names = [en for en, _, _ in pairs]

print("Total aligned pairs: ", len(pairs))
print("Unique English names: ", len(set(english_names)))

print("\nTop 15 most frequent English names:")
for name, count in Counter(english_names).most_common(15):
    print(name, count)

Total aligned pairs:  122092
Unique English names:  74648

Top 15 most frequent English names:
India 406
State 164
Sabha 161
Rama 150
Asia 150
Bhavan 117
China 117
Shah 108
Allah 107
Khan 95
Chandra 86
Islam 77
Shiva 72
Arabia 71
Chairman 68


In [None]:
#LOAD ALIGNED PAIRS AND GROUP BY ENGLISH NAMES

In [31]:
import json
from collections import defaultdict

with open("data/processed/aligned_pairs_full.json", encoding = "utf-8") as f:
    aligned_pairs = json.load(f)

name_groups = defaultdict(list)

for en, hi, score in aligned_pairs:
    clean_en = en.strip()
    if "/" in clean_en:
        continue
    name_groups[clean_en].append(hi)

print(f"Total unique English names: {len(name_groups)}")

Total unique English names: 74490


In [None]:
#ROMANIZE HINDI VARIANTS

In [24]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

def romanize_hi(word):
    try:
        return transliterate(word, sanscript.DEVANAGARI, sanscript.ITRANS)
    except:
        return ""

In [None]:
#CLUSTER HINDI VARIANTS

In [25]:
from difflib import SequenceMatcher

def similarity(a,b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def cluster_variants(hi_variants, threshold = 0.7):
    clusters = []

    for word in hi_variants:
        roman = romanize_hi(word)
        placed = False

        for cluster in clusters:
            rep_roman = cluster["roman"]
            if similarity(roman, rep_roman) >= threshold:
                cluster["variants"].append(word)
                placed = True
                break

        if not placed:
            clusters.append({
                "canonical": word,
                "roman": roman,
                "variants": [word]
            })
    return clusters

In [None]:
#APPLY CLUSTERING TO ALL NAMES

In [26]:
BAD_ENGLISH = {
    "this", "that", "these", "those", 
    "data", "report", "name", "set",
    "get", "add", "remove"
}

In [27]:
BAD_ENGLISH.update({
    "american", "fast", "slow", "new", "old", "good", "bad",
    "open", "close", "high", "low", "western", "eastern", 
    "northern", "southern", "central", "global", "local"
})

In [28]:
def looks_like_translation(en, hi):
    """
    Returns true if hindi word is likely a semantic translation
    rather than a phonetic transliteration.
    """
    hi_roman = romanize_hi(hi)
    if len(hi_roman) <= 3 and len(en) >= 4:
        return True
    if abs(len(en) - len(hi_roman)) > 4:
        return True
    return False

def is_native_hindi_translation(hi_word):
    """
    Detects short native Hindi words likely to be translations
    rather than transliterations.
    """
    return len(hi_word) <= 3

In [29]:
canonical_clusters = {}

for en_name, hi_variants in name_groups.items():

    # Clean & normalize
    unique_variants = list(
        set(normalize_hindi(v) for v in hi_variants if normalize_hindi(v))
    )

    if not unique_variants:
        continue

    clusters = cluster_variants(unique_variants)
    final_clusters = []

    for cluster in clusters:
        variants = cluster["variants"]

        # Step 1: remove clear translations early
        phonetic_variants = [
            v for v in variants
            if similarity(en_name, romanize_hi(v)) >= 0.65
            and not looks_like_translation(en_name, v)
            and not is_native_hindi_translation(v)
        ]

        # Step 2: fallback ONLY if nothing survived
        if phonetic_variants:
            candidates = phonetic_variants
        else:
            candidates = sorted(
                variants,
                key=lambda v: similarity(en_name, romanize_hi(v)),
                reverse=True
            )[:1]

        # Step 3: choose canonical
        canonical = min(
            candidates,
            key=lambda v: (
                -similarity(en_name, romanize_hi(v)),
                len(v)
            )
        )

        final_clusters.append({
            "canonical": canonical,
            "variants": variants
        })
        
    if not final_clusters:
        continue
        
    canonical_clusters[en_name] = final_clusters

In [None]:
#RESULTS

In [30]:
for en, clusters in list(canonical_clusters.items())[:5]:
    print(f"\n English Name: {en}")
    for c in clusters:
        print("  Canonical:", c["canonical"])
        print("  Variants:", c["variants"])


 English Name: Scoring
  Canonical: स्कोरिंग
  Variants: ['स्कोरिंग']

 English Name: Binding
  Canonical: बाइंडिंग
  Variants: ['बाइंडिंग']

 English Name: Camera
  Canonical: कैमरा
  Variants: ['कैमरा']

 English Name: Radio
  Canonical: रेडियो
  Variants: ['रेडियो']

 English Name: Transcode
  Canonical: ट्रांसकोड
  Variants: ['ट्रांसकोड']


In [None]:
#FILTERED IN VSCODE USING FILE filter_aligned_pairs.py

In [None]:
#CHECKING aligned_pairs_filtered.json file for other transliteration issues

In [51]:
import json
import random

with open("data/processed/aligned_pairs_filtered.json", encoding = "utf-8") as f:
    pairs = json.load(f)

for p in random.sample(pairs, 30):
    print(p)

['Panchal', 'पांचाल', 0.8]
['Barrackpur', 'बराकपुर', 0.8]
['Iqamat', 'क़यामत', 0.714]
['Socialist', 'सोशालिस्ट', 0.737]
['Brush', 'ब्रश', 0.727]
['Sambhav', 'संभव', 0.933]
['Krishnamoorthi', 'कृष्णमूर्ति', 0.815]
['Sai', 'साही', 0.857]
['Mark Twain', 'मार्कट्वेन', 0.6]
['Norman', 'नारमन', 0.714]
['Chitrakoot', 'चित्रकूट', 0.8]
['Chhau', 'चौकी', 0.727]
['Gurtu', 'गुर्टु', 1.0]
['Prachar', 'प्रचार', 0.933]
['Tilaya', 'तिलाया', 1.0]
['Halva', 'हलवाह', 0.769]
['Gune', 'गुणे', 1.0]
['Ravindran', 'रवीन्द्रन', 0.947]
['Acarya', 'आचार्यो', 0.769]
['Iqbal', 'इक़बाल', 0.833]
['Surjit', 'सुरजीत', 0.857]
['Syiemlieh', 'सिम्लिह', 0.75]
['Erivan', 'जेरेवन', 0.714]
['Krishna Avatar', 'कृष्णावतार', 0.857]
['Dione', 'डैओने', 0.909]
['Farkka', 'फरक्का', 0.714]
['Testament', 'न्यूटेस्टामेन्ट', 0.818]
['Ramendrasundar', 'रामेंद्रसुंदर', 0.828]
['Pattnayak', 'पटनायक', 0.842]
['Sandani', 'सांडनी', 0.857]


In [None]:
#NOW CHECKING THE FILE WITH HIGHEST SCORES FOR OTHER TRANSLITERATION ISSUES

In [54]:
import json, random

with open("data/processed/aligned_pairs_high_conf.json", encoding = "utf-8") as f:
    data = json.load(f)

for p in random.sample(data, 40):
    print(p)

['Valaraju', 'वलराजु', 1.0]
['Samayapuram', 'समयपुरम्', 1.0]
['Poduval', 'पोडुवाल', 0.933]
['Farhan', 'फ़रहान', 0.857]
['Latrobe', 'लाट्रोब', 0.857]
['Lalitaditya', 'ललितादित्य', 1.0]
['Akash', 'आकाश', 0.909]
['Chaiyan', 'छैंया', 0.857]
['Prabhath', 'प्रभात', 0.875]
['Kakavali', 'काकवली', 1.0]
['Harp', 'हार्प', 0.889]
['Essebsi', 'एसेब्सी', 0.923]
['Rohtak', 'रोहतक', 0.857]
['Dhania', 'धनिया', 0.923]
['Gadiara', 'गडियारा', 0.933]
['Arch', 'आर्च', 0.889]
['Kapalbhati', 'कपालभाती', 0.952]
['Chetanya', 'चेतन्य', 1.0]
['Paraguarí', 'पारागुआरी', 0.889]
['Amargun', 'अमरगुन', 0.875]
['Nimroz', 'निमरोज़', 0.857]
['Teprikardr', 'टेपरिकार्डर', 0.87]
['Brahmana', 'ब्रह्मा', 0.857]
['Rupaniji', 'रुपाणी', 0.857]
['Mangueshi', 'मंगुएशी', 0.889]
['Krishnadayal', 'कृष्णदयाल', 0.923]
['Islamia', 'इस्लामिया', 0.933]
['Bhutabali', 'भूतबलि', 1.0]
['Bilal', 'बिलाल', 0.909]
['Bhatiyari', 'भटियारी', 1.0]
['Malti', 'मालती', 0.909]
['Mbandaka', 'माबान्दाका', 0.941]
['Bhattoji', 'भट्टोजी', 1.0]
['Lakshmilahari'

In [55]:
from collections import Counter

english = [en for en, _, _ in data]

print("Filtered aligned pairs: ", len(data))
print("Unique english names: ", len(set(english)))

for name, c in Counter(english).most_common(15):
    print(name, c)  

Filtered aligned pairs:  41044
Unique english names:  33418
Chennai 26
Maharashtra 23
Mumbai 19
Rajasthan 16
Kashmir 15
Vanuatu 12
Hindustani 12
Hindi 12
Brahma 12
Madurai 12
Kirgizia 11
Visakhapatnam 11
Kirghizia 11
Brahman 11
Bangladesh 10
