In [1]:
from pathlib import Path

In [2]:
data_path = Path("data/raw/parallel-n")
files = list(data_path.glob("*"))[:5]
files

[PosixPath('data/raw/parallel-n/IITB.en-hi.en'),
 PosixPath('data/raw/parallel-n/IITB.en-hi.hi')]

In [3]:
en_file = "data/raw/parallel-n/IITB.en-hi.en"
hi_file = "data/raw/parallel-n/IITB.en-hi.hi"

with open(en_file, encoding = "utf-8") as f_en, open(hi_file, encoding = "utf-8") as f_hi:
    for i in range(10):
        en_line = f_en.readline().strip()
        hi_line = f_hi.readline().strip()
        print(f"{i+1}. EN: {en_line}")
        print(f"   HI: {hi_line}")
        print("-" * 50)

1. EN: Give your application an accessibility workout
   HI: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
--------------------------------------------------
2. EN: Accerciser Accessibility Explorer
   HI: एक्सेर्साइसर पहुंचनीयता अन्वेषक
--------------------------------------------------
3. EN: The default plugin layout for the bottom panel
   HI: निचले पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
4. EN: The default plugin layout for the top panel
   HI: ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
--------------------------------------------------
5. EN: A list of plugins that are disabled by default
   HI: उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है
--------------------------------------------------
6. EN: Highlight duration
   HI: अवधि को हाइलाइट रकें
--------------------------------------------------
7. EN: The duration of the highlight box when selecting accessible nodes
   HI: पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की

In [4]:
import spacy

In [5]:
#NER ALONE DIDN'T WORK

In [5]:
nlp = spacy.load("en_core_web_sm")
sentence = "Accerciser Accessibility Explorer"
doc = nlp(sentence)
for ent in doc.ents:
    print(ent.text, ent.label_)

In [7]:
#COMBINING NER AND HEURISTICS (HYBRID)

In [6]:
# Load English NLP pipeline
nlp = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "lemmatizer"])

en_file = Path("data/raw/parallel-n/IITB.en-hi.en")

extracted_names = set()

MAX_LINES = 10000

with open(en_file, encoding="utf-8") as f:
    lines = []
    for i, line in enumerate(f):
        if i >= MAX_LINES:
            break
        line = line.strip()
        if line:
            lines.append(line)

print(f"Processing {len(lines)} sentences...")

# Process in batches
for doc in nlp.pipe(lines, batch_size = 500):
    # Part 1: NER
    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG", "PRODUCT", "GPE"}:
            extracted_names.add(ent.text)

    # Part 2: Heuristics
    for token in doc:
        if(
            token.text[0].isupper() and
            token.is_alpha and
            len(token.text) > 2
        ):
            extracted_names.add(token.text)

print("Done")
print("Total extracted names: ", len(extracted_names))
list(extracted_names)[:20]

Processing 10000 sentences...
Done
Total extracted names:  1227


['Crea',
 'Options',
 'Run',
 'Creates',
 'Spider',
 'Maximum',
 'Consistency',
 'Profile Name',
 'Debugging',
 'Base',
 'NAME',
 'Gold',
 'Yukon',
 'jack of diamonds',
 'Baltic',
 'Designer',
 'Romanian',
 'Todo List Preferences',
 'YOFF',
 'Text']

In [7]:
import json

with open("data/processed/extracted_names_en_10k.json", "w", encoding = "utf-8") as f:
    json.dump(sorted(extracted_names), f, ensure_ascii = False, indent = 2)

In [9]:
import sys
print(sys.executable)

/opt/miniconda3/bin/python


In [10]:
import sys
!{sys.executable} -m pip install indic-transliteration

Collecting indic-transliteration
  Using cached indic_transliteration-2.3.75-py3-none-any.whl.metadata (1.4 kB)
Collecting backports.functools-lru-cache (from indic-transliteration)
  Using cached backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting regex (from indic-transliteration)
  Using cached regex-2025.11.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting roman (from indic-transliteration)
  Using cached roman-5.2-py3-none-any.whl.metadata (4.3 kB)
Using cached indic_transliteration-2.3.75-py3-none-any.whl (159 kB)
Using cached backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl (6.7 kB)
Using cached regex-2025.11.3-cp313-cp313-macosx_11_0_arm64.whl (288 kB)
Using cached roman-5.2-py3-none-any.whl (6.0 kB)
Installing collected packages: roman, regex, backports.functools-lru-cache, indic-transliteration
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [indic-transliteration]
[1A[2KSuccessfully 

In [1]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from difflib import SequenceMatcher

In [2]:
import indic_transliteration
print(indic_transliteration.__file__)

/opt/miniconda3/lib/python3.13/site-packages/indic_transliteration/__init__.py
