In [1]:
from pathlib import Path

In [2]:
data_path = Path("data/raw/parallel-n")
files = list(data_path.glob("*"))[:5]
files

[PosixPath('data/raw/parallel-n/IITB.en-hi.en'),
 PosixPath('data/raw/parallel-n/IITB.en-hi.hi')]

In [3]:
en_file = "data/raw/parallel-n/IITB.en-hi.en"
hi_file = "data/raw/parallel-n/IITB.en-hi.hi"

with open(en_file, encoding = "utf-8") as f_en, open(hi_file, encoding = "utf-8") as f_hi:
    for i in range(10):
        en_line = f_en.readline().strip()
        hi_line = f_hi.readline().strip()
        print(f"{i+1}. EN: {en_line}")
        print(f"   HI: {hi_line}")
        print("-" * 50)

1. EN: Give your application an accessibility workout
   HI: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
--------------------------------------------------
2. EN: Accerciser Accessibility Explorer
   HI: एक्सेर्साइसर पहुंचनीयता अन्वेषक
--------------------------------------------------
3. EN: The default plugin layout for the bottom panel
   HI: निचले पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
4. EN: The default plugin layout for the top panel
   HI: ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
5. EN: A list of plugins that are disabled by default
   HI: उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है
--------------------------------------------------
6. EN: Highlight duration
   HI: अवधि को हाइलाइट रकें
--------------------------------------------------
7. EN: The duration of the highlight box when selecting accessible nodes
   HI: पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की

In [4]:
import spacy

In [5]:
#NER ALONE DIDN'T WORK

In [5]:
nlp = spacy.load("en_core_web_sm")
sentence = "Accerciser Accessibility Explorer"
doc = nlp(sentence)
for ent in doc.ents:
    print(ent.text, ent.label_)

In [7]:
#COMBINING NER AND HEURISTICS (HYBRID)

In [6]:
# Load English NLP pipeline
nlp = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "lemmatizer"])

en_file = Path("data/raw/parallel-n/IITB.en-hi.en")

extracted_names = set()

MAX_LINES = 10000

with open(en_file, encoding="utf-8") as f:
    lines = []
    for i, line in enumerate(f):
        if i >= MAX_LINES:
            break
        line = line.strip()
        if line:
            lines.append(line)

print(f"Processing {len(lines)} sentences...")

# Process in batches
for doc in nlp.pipe(lines, batch_size = 500):
    # Part 1: NER
    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG", "PRODUCT", "GPE"}:
            extracted_names.add(ent.text)

    # Part 2: Heuristics
    for token in doc:
        if(
            token.text[0].isupper() and
            token.is_alpha and
            len(token.text) > 2
        ):
            extracted_names.add(token.text)

print("Done")
print("Total extracted names: ", len(extracted_names))
list(extracted_names)[:20]

Processing 10000 sentences...
Done
Total extracted names:  1227


['Alpha',
 'Run / Continue',
 'Tommy',
 'Goto Line',
 'Freecell',
 'Loading',
 'High',
 'Java',
 'CVS Options',
 'Are',
 'Carlo',
 'Traditional',
 'Urgent',
 'Load Log',
 'Royal East',
 'Anjuta',
 'Four',
 'Direction',
 'SymbolDb',
 'Shell Script File']

In [7]:
import json

with open("data/processed/extracted_names_en_10k.json", "w", encoding = "utf-8") as f:
    json.dump(sorted(extracted_names), f, ensure_ascii = False, indent = 2)

In [8]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from difflib import SequenceMatcher

In [11]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])

def romanize_hi(word):
    return transliterate(word, sanscript.DEVANAGARI, sanscript.ITRANS)

def similarity(a,b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

en_file = Path("data/raw/parallel-n/IITB.en-hi.en")
hi_file = Path("data/raw/parallel-n/IITB.en-hi.hi")

MAX_LINES = 10000

en_lines, hi_lines = [], []

with open(en_file, encoding = "utf-8") as f_en , open(hi_file, encoding = "utf-8") as f_hi:
    for i, (en, hi) in enumerate(zip(f_en, f_hi)):
        if i >= MAX_LINES:
            break
        en_lines.append(en.strip())
        hi_lines.append(hi.strip())

aligned_pairs = []

for en_sent, hi_sent in zip(en_lines, hi_lines):
    doc = nlp(en_sent)

    en_names = set()

    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG", "PRODUCT", "GPE"}:
            en_names.add(ent.text)

    for token in doc:
        if token.text[0].isupper() and token.is_alpha and len(token.text) > 2:
            en_names.add(token.text)

    if not en_names:
        continue

    hi_tokens = hi_sent.split()

    for en_name in en_names:
        best_match = None
        best_score = 0.0

        for hi_word in hi_tokens:
            if len(hi_word) < 2:
                continue

            hi_roman = romanize_hi(hi_word)
            score = similarity(en_name, hi_roman)

            if score > best_score:
                best_score = score
                best_match = hi_word

        if best_score >= 0.45:
            aligned_pairs.append((en_name, best_match, best_score))

for pair in aligned_pairs[:20]:
    print(pair)

('Highlight', 'हाइलाइट', 0.5882352941176471)
('Highlight', 'हाइलाइट', 0.5882352941176471)
('Highlight', 'हाइलाइट', 0.5882352941176471)
('API', 'एपीआई', 0.5)
('Hide', 'छिपाएं', 0.5454545454545454)
('IPython', 'आईपाइथन', 0.5882352941176471)
('Console', 'कन्सोल', 0.5714285714285714)
('Monitor', 'मानिटर', 0.6666666666666666)
('Monitor', 'मानिटर', 0.6666666666666666)
('Highlight', 'हाइलाइट', 0.5882352941176471)
('WIDGET', 'विडजेट', 0.5714285714285714)
('Alpha', 'अल्फा', 1.0)
('Des', 'डेस्कटोप', 0.5)
('Position', 'स्थिति', 0.5714285714285714)
('Offset', 'ओफसेट', 0.5714285714285714)
('Name', 'नाम', 0.75)
('URI', 'यूआरआई', 0.6)
('Plugin', 'प्लग-इन', 0.6666666666666666)
('Native', 'वतनी', 0.5)
('LDTP', 'एलडीटीपी', 0.6153846153846154)


In [12]:
import json

with open("data/processed/aligned_pairs_10k.json", "w", encoding = "utf-8") as f:
    json.dump(aligned_pairs, f, ensure_ascii = False, indent = 2)

In [13]:
import json

with open("data/processed/aligned_pairs_10k.json", encoding = "utf-8") as f:
    aligned_pairs = json.load(f)

In [14]:
deduped = {}

for en, hi, score in aligned_pairs:
    key = (en.lower(), hi)

    if key not in deduped or score > deduped[key][2]:
        deduped[key] = (en, hi, score)

In [15]:
deduped_pairs = list(deduped.values())

In [26]:
COMMON_ENGLISH = {
    "the", "give", "allow", "show", "hide", "name", "position",
    "default", "settings", "file", "view", "edit", "help",
    "use", "using", "used", "set", "get", "add", "remove"
}

In [27]:
filtered_pairs = []

for en, hi, score in deduped_pairs:
    if en.lower() in COMMON_ENGLISH:
        continue
    filtered_pairs.append((en, hi, score))

print("Before: ", len(aligned_pairs))
print("After dedup: ", len(deduped_pairs))
print("After filtering: ", len(filtered_pairs))

for pair in filtered_pairs[:20]:
    print(pair)

Before:  0
After dedup:  667
After filtering:  645
('Highlight', 'हाइलाइट', 0.5882352941176471)
('API', 'एपीआई', 0.5)
('IPython', 'आईपाइथन', 0.5882352941176471)
('Console', 'कन्सोल', 0.5714285714285714)
('Monitor', 'मानिटर', 0.6666666666666666)
('WIDGET', 'विडजेट', 0.5714285714285714)
('Alpha', 'अल्फा', 1.0)
('Des', 'डेस्कटोप', 0.5)
('Offset', 'ओफसेट', 0.5714285714285714)
('URI', 'यूआरआई', 0.6)
('Plugin', 'प्लग-इन', 0.6666666666666666)
('Native', 'वतनी', 0.5)
('LDTP', 'एलडीटीपी', 0.6153846153846154)
('Recorder', 'रेकोर्डर', 0.7058823529411765)
('Creates', 'करता', 0.46153846153846156)
('num1', '(num1)', 0.8)
('Accerciser', 'इसे', 0.46153846153846156)
('Changes', 'आएंगे।', 0.46153846153846156)
('Node', '(नोड)', 0.6)
('Alt', 'आल्ट', 0.8571428571428571)


In [28]:
with open("data/processed/aligned_pairs_10k_clean.json", "w", encoding = "utf-8") as f:
    json.dump(filtered_pairs, f, ensure_ascii = False, indent = 2)

In [35]:
MAX_LINES = 50000
SIM_THRESHOLD = 0.5

COMMON_ENGLISH = {
    "the", "give", "allow", "show", "hide", "name", "position",
    "default", "settings", "file", "view", "edit", "help",
    "use", "using", "used", "set", "get", "add", "remove"
}

nlp = spacy.load("en_core_web_sm", disable = ["parser", "lemmatizer"])

def is_title_like(doc):
    """
    Heuristic: sentence is title-like if it has no verb
    """
    return not any(token.pos_ == "VERB" for token in doc)

def romanize_hi(word):
    try:
        return transliterate(word, sanscript.DEVANAGARI, sanscript.ITRANS)
    except Exception:
        return ""

def similarity(a,b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

en_file = Path("data/raw/parallel-n/IITB.en-hi.en")
hi_file = Path("data/raw/parallel-n/IITB.en-hi.hi")

en_lines, hi_lines = [], []

with open(en_file, encoding = "utf-8") as f_en, open(hi_file, encoding = "utf-8") as f_hi:
    for i, (en, hi) in enumerate(zip(f_en, f_hi)):
        if i >= MAX_LINES:
            break
        en_lines.append(en.strip())
        hi_lines.append(hi.strip())

print(f"Loaded {len(en_lines)} sentence pairs")

aligned_pairs = set()

for en_sent, hi_sent in zip(en_lines, hi_lines):
    doc = nlp(en_sent)
    title_like = is_title_like(doc)

    en_names = set()

    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG", "PRODUCT", "GPE"}:
            en_names.add(ent.text)
    
    for token in doc:
        if(token.text[0].isupper() and token.is_alpha and len(token.text) > 2 and token.text.lower() not in COMMON_ENGLISH and (token.i != 0 or title_like)):
            en_names.add(token.text)

    if not en_names:
        continue
        
    if len(aligned_pairs) == 0:
        print("EN SENT: ", en_sent)
        print("EN NAMES: ", en_names)
        print("HI SENT: ", hi_sent)
        print("-" * 50)

    hi_tokens = hi_sent.split()

    for en_name in en_names:
        if en_name.lower() in COMMON_ENGLISH:
            continue

        best_match = None
        best_score = 0.0

        for hi_word in hi_tokens:
            hi_roman = romanize_hi(hi_word)

            if abs(len(en_name) - len(hi_roman)) > 6:
                continue
                
            score = similarity(en_name, hi_roman)

            if score > best_score:
                best_score = score
                best_match = hi_word

        if best_score >= SIM_THRESHOLD:
            aligned_pairs.add((
                en_name,
                best_match,
                round(best_score, 3)
            ))

aligned_pairs = list(aligned_pairs)

print("Total aligned pairs: ", len(aligned_pairs))

with open("data/processed/aligned_pairs_50k.json", "w", encoding = "utf-8") as f:
    json.dump(aligned_pairs, f, ensure_ascii = False, indent = 2)

print("Saved aligned_pairs_50k.json")

Loaded 50000 sentence pairs
EN SENT:  Accerciser Accessibility Explorer
EN NAMES:  {'Accessibility', 'Accerciser', 'Explorer'}
HI SENT:  एक्सेर्साइसर पहुंचनीयता अन्वेषक
--------------------------------------------------
EN SENT:  Highlight duration
EN NAMES:  {'Highlight'}
HI SENT:  अवधि को हाइलाइट रकें
--------------------------------------------------
Total aligned pairs:  1482
Saved aligned_pairs_50k.json


In [36]:
aligned_pairs

[('Quit Anjuta', 'Anjuta', 0.706),
 ('PURPOSE', 'PURPOSE.', 0.933),
 ('Reset', 'रिसेट', 0.727),
 ('Fontconfig', 'फ़ॉन्टकॉन्फ़िग', 0.583),
 ('Demo', 'डेमो', 1.0),
 ('JHBuild', 'बिल्ड', 0.667),
 ('Mail', 'नई', 0.571),
 ('Tasks', 'अक्षर', 0.5),
 ('XDG', '"XDG', 0.857),
 ('Drag Axis', 'ड्रैग', 0.667),
 ('Pager', 'पेजर', 0.545),
 ('Profile', 'प्रोफाइल', 0.625),
 ('See', 'See', 1.0),
 ('Simulation', '(सिमुलेशन)', 0.609),
 ('Mail', 'रही', 0.5),
 ('Menus', 'मेनू', 0.889),
 ('GPU', 'GPU', 1.0),
 ('USA', 'इस', 0.667),
 ('DOS EOL', 'EOL', 0.6),
 ('All', 'काल', 0.571),
 ('Zebra', 'जेब्रा', 0.8),
 ('Baltic', 'बाल्टिक', 0.769),
 ('Arguments', 'आर्गुमेंट्स', 0.842),
 ('About', 'अंजूटा', 0.545),
 ('Log', 'Log', 1.0),
 ('URI', 'URI', 1.0),
 ('Finnish', 'फिनिश', 0.667),
 ('LDAP', 'LDAP', 1.0),
 ('Transcode', 'ट्रांसकोड', 0.632),
 ('Domain Name', 'डोमेन', 0.588),
 ('Refresh', 'रिफ्रेश', 0.625),
 ('Hebrew', 'हिब्रू', 0.545),
 ('Checksum', 'चेकसम', 0.706),
 ('ccc - analyzer', 'ccc-analyzer:', 0.889),
 ('Dj