In [111]:
import os
os.chdir("/Users/jasleenkaur/Desktop/translit-consistency")

In [112]:
from g2p_en import G2p
import re

In [113]:
import warnings
warnings.filterwarnings("ignore")

In [114]:
g2p = G2p()

In [115]:
import json, random

with open("data/processed/aligned_pairs_high_conf.json", encoding="utf-8") as f:
    raw_pairs = json.load(f)

pairs = [(en.lower(), hi) for en, hi, _ in raw_pairs]

random.seed(42)
random.shuffle(pairs)

n = len(pairs)
train = pairs[:int(0.8 * n)]
val   = pairs[int(0.8 * n):int(0.9 * n)]
test  = pairs[int(0.9 * n):]

In [116]:
VALID_CLUSTERS = {
    "क्", "ख्", "ग्", "घ्",
    "च्", "ज्", "ट्", "ठ्", "ड्", "ढ्",
    "त्", "थ्", "द्", "ध्",
    "प्", "फ्", "ब्", "भ्",
    "श्", "ष्", "स्", "ह्"
}

In [117]:
CONSONANTS = {
    "B": "ब",
    "BH": "भ",

    "CH": "च",

    "D": "द",
    "DH": "ध",

    "F": "फ",
    "G": "ग",
    "GH": "घ",

    "HH": "ह",
    "JH": "झ",

    "K": "क",
    "KH": "ख",

    "L": "ल",
    "M": "म",
    "N": "न",
    "NG": "ङ",

    "P": "प",
    "PH": "फ",

    "R": "र",
    "S": "स",
    "SH": "श",

    "T": "त",
    "TH": "थ",

    "V": "व",
    "W": "व",
    "Y": "य",
    "Z": "ज",
}

In [118]:
VOWEL_MATRAS = {
    "AA": "ा",
    "AE": "ै",
    "AH": "अ",   # schwa fallback
    "AO": "ो",
    "EH": "े",
    "IH": "ि",
    "IY": "ी",
    "UH": "ु",
    "UW": "ू",
    "OW": "ो"
}

In [119]:
FULL_VOWELS = {
    "AA": "आ",
    "AE": "ऐ",
    "AH": "अ",
    "AO": "ओ",
    "EH": "ए",
    "IH": "इ",
    "IY": "ई",
    "UH": "उ",
    "UW": "ऊ",
    "OW": "ओ"
}

In [120]:
def phonemes_to_hindi(phonemes):
    out = []
    pending = None

    for p in phonemes:
        if p in CONSONANTS:
            if pending:
                out.append(pending)
            pending = CONSONANTS[p]

        elif p in VOWEL_MATRAS:
            if p == "EH":
                if pending:
                # consonant + EH → short i (दिल्ली, गिरि)
                    out.append(pending + "ि")
                else:
                # word-initial EH → ए (Betula → बेटुला)
                    out.append("ए")
                pending = None
            else:
                if pending:
                    out.append(pending + VOWEL_MATRAS[p])
                    pending = None
                else:
                    out.append(FULL_VOWELS[p])

    if pending:
        out.append(pending + "्")

    return "".join(out)

In [121]:
def clean_phonemes(phonemes):
    cleaned = []
    for p in phonemes:
        p = re.sub(r"\d", "", p)
        if p.isalpha():
            cleaned.append(p)
    return cleaned

In [122]:
def normalize_hindi_skeleton(text):
    rules = [
        ("ङग", "ंग"),   # बङगलर → बंगलर
        ("नद", "ंद"),   # चनदर → चंदर
        ("शत", "ष्ट"),  # महशतर → महष्ट
        ("झश", "जश"),   # रझशन → रजशन (rare but safe)
    ]

    for a, b in rules:
        text = text.replace(a, b)

    return text

In [123]:
def restore_hindi_structure(text):
    # --- Nasal assimilation (phonotactically safe)
    text = text.replace("ङग", "ंग")
    text = text.replace("नद", "ंद")
    text = text.replace("नद्र", "ंद्र")

    # --- Conjunct normalization
    text = text.replace("षटर", "ष्ट्र")
    text = text.replace("षट्र", "ष्ट्र")
    text = text.replace("ष्टर", "ष्ट्र")

    # --- Vishnu-type cluster (safe phonetic rule)
    text = text.replace("सष्ण", "ष्ण")
    text = text.replace("सनव", "ष्णव")
    text = text.replace("ङह", "ंघ")

    return text

In [124]:
def schwa_cleanup(text):
    # remove inherent schwa before matras
    text = re.sub(r"([क-ह])अ([ािीुूेो])", r"\1\2", text)

    # remove trailing schwa
    if text.endswith("अ"):
        text = text[:-1]

    return text

In [125]:
def nukta_fix(text):
    text = text.replace("फ", "फ़")
    text = text.replace("ज", "ज़")
    text = text.replace("ड", "ड़")
    text = text.replace("ढ", "ढ़")
    return text

In [126]:
def schwa_deletion(text):
    out = []
    i = 0
    while i < len(text):
        if (
            i+2 < len(text)
            and text[i] >= "क" and text[i] <= "ह"
            and text[i+1] == "अ"
            and text[i+2] >= "क" and text[i+2] <= "ह"
        ):
            # delete schwa ONLY if valid conjunct follows
            if text[i+2] + "्" in VALID_CLUSTERS:
                out.append(text[i])
                i += 2
                continue
        out.append(text[i])
        i += 1
    return "".join(out)

In [127]:
def anusvara_fix(text):
    # Convert nasal before stops → anusvara
    text = re.sub(r"न([क-घच-झट-ढत-धप-भ])", r"ं\1", text)
    text = re.sub(r"म([क-घच-झट-ढत-धप-भ])", r"ं\1", text)
    return text

In [128]:
def cluster_fixes(text):
    fixes = {
        "पर": "प्र",
        "तर": "त्र",
        "गर": "ग्र",
        "सव": "स्व",
        "शन": "श्न",
        "कष": "क्ष",
    }
    for a, b in fixes.items():
        text = text.replace(a, b)
    return text

In [129]:
def gemination_fix(en, hi):
    en = en.lower()

    SAFE_GEMINATES = {
        "ll": "ल",
        "tt": "त",
        "pp": "प",
        "kk": "क",
        "mm": "म"
    }

    for dbl, dev in SAFE_GEMINATES.items():
        if dbl in en:
            # apply ONLY once and only if already present
            hi = hi.replace(dev, dev + "्" + dev, 1)

    return hi

In [130]:
def trim_final_vowel_noise(hi):
    if len(hi) > 4 and hi.endswith(("अ",)):
        hi = hi[:-1]
    return hi

def vowel_echo_fix(hi):
    if len(hi) > 5:
        hi = hi.replace("िरि", "ि")
        hi = hi.replace("रीरी", "री")
        hi = hi.replace("लीली", "ली")
    return hi

In [131]:
SUFFIX_FIXES = {
    "pur": "पुर",
    "puri": "पुरी",
    "nagar": "नगर",
    "giri": "गिरी",
    "dham": "धाम",
    "gram": "ग्राम",
}

def safe_suffix_restore(en, hi):
    SUFFIXES = {
        "pur": "पुर",
        "puri": "पुरी",
        "nagar": "नगर",
        "giri": "गिरी",
        "gram": "ग्राम",
    }

    for suf, dev in SUFFIXES.items():
        if en.endswith(suf):
            if not hi.endswith(dev) and len(hi) <= len(dev) + 3:
                hi = hi[:-len(dev)] + dev if len(hi) >= len(dev) else hi + dev
    return hi

In [132]:
def final_vowel_length_restore(en, hi):
    en = en.lower()
    if en.endswith("i") and hi.endswith("ि"):
        hi = hi[:-1] + "ी"
    if en.endswith("u") and hi.endswith("ु"):
        hi = hi[:-1] + "ू"
    return hi

In [95]:
# def recover_missing_vowel(en, hi):
#     if en.endswith("i") and hi.endswith("ि"):
#         return hi[:-1] + "ी"
#     if en.endswith("a") and not hi.endswith("ा"):
#         return hi + "ा"
#     return hi

In [96]:
# def trim_overgeneration(hi):
#     if hi.endswith("्"):
#         hi = hi[:-1]
#     if len(hi) >= 2 and hi[-1] == hi[-2]:
#         hi = hi[:-1]
#     return hi

In [97]:
# def recover_long_vowels(en, hi):
#     # aa → ा
#     if en.endswith("a") and not hi.endswith("ा"):
#         hi += "ा"

#     # oo / u → ऊ
#     if en.endswith(("u", "oo")) and hi.endswith("ु"):
#         hi = hi[:-1] + "ू"

#     return hi

In [98]:
# def trim_trailing_consonant(hi):
#     if len(hi) >= 2 and hi[-1] in "यरल" and hi[-2] not in "ािीुूेो":
#         hi = hi[:-1]
#     return hi

In [133]:
def trim_trailing_junk(hi):
    # Remove trailing halant
    if hi.endswith("्"):
        hi = hi[:-1]

    # Remove trailing consonant without vowel
    if len(hi) >= 2 and hi[-1] in "यरलन" and hi[-2] not in "ािीुूेो":
        hi = hi[:-1]

    return hi

In [134]:
def collapse_repetition(hi):
    hi = hi.replace("रीरी", "री")
    hi = hi.replace("लीली", "ली")
    hi = hi.replace("विि", "वि")
    hi = hi.replace("िरि", "ि")
    hi = hi.replace("नन", "न")
    hi = hi.replace("लल", "ल")
    return hi

In [135]:
def transliterate_p2g(word):
    phonemes = clean_phonemes(g2p(word))
    hi = phonemes_to_hindi(phonemes)

    hi = normalize_hindi_skeleton(hi)
    hi = restore_hindi_structure(hi)

    hi = schwa_cleanup(hi)
    hi = schwa_deletion(hi)

    hi = anusvara_fix(hi)
    hi = cluster_fixes(hi)
    hi = gemination_fix(word, hi)

    hi = nukta_fix(hi)

    hi = vowel_echo_fix(hi)
    hi = trim_final_vowel_noise(hi)
    hi = final_vowel_length_restore(word, hi)

    # hi = suffix_restore(word.lower(), hi)

    # hi = recover_missing_vowel(word, hi)
    # hi = recover_long_vowels(word, hi)   
    # hi = trim_overgeneration(hi)
    # hi = trim_trailing_consonant(hi) 

    hi = collapse_repetition(hi)      # NEW
    hi = safe_suffix_restore(word, hi)  # NEW
    hi = trim_trailing_junk(hi)

    # 1️⃣ Remove trailing halant
    if hi.endswith("्"):
        hi = hi[:-1]

    # 2️⃣ Prevent over-generation of final consonant
    if len(hi) >= 2 and hi[-1] in "यरलन" and hi[-2] not in "ािीुूेो":
        hi = hi[:-1]

    # 3️⃣ Final vowel length restore (safe)
    word_l = word.lower()
    if word_l.endswith("i") and hi.endswith("ि"):
        hi = hi[:-1] + "ी"
    if word_l.endswith("u") and hi.endswith("ु"):
        hi = hi[:-1] + "ू"
    return hi

In [136]:
tests = [
    "Delhi",
    "Kolkata",
    "Bangalore",
    "Rajasthan",
    "Chandrakala",
    "Vishnupuram",
    "Maharashtra",
    "Kaveri"
]

for w in tests:
    print(w, "→", transliterate_p2g(w))

Delhi → दिली
Kolkata → कोलकात
Bangalore → बैंगअलोर
Rajasthan → रझशअ
Chandrakala → चैंदरकाल
Vishnupuram → विसनूवबाम
Maharashtra → माहाष्ट
Kaveri → काविरी


In [137]:
import json
import random

with open("data/processed/aligned_pairs_high_conf.json", encoding = "utf-8") as f:
    raw_pairs = json.load(f)

print("Total: ", len(raw_pairs))

Total:  41044


In [138]:
import random

sample = random.sample(raw_pairs, 200)
for en, hi, _ in sample[:20]:
    print(en, "→", transliterate_p2g(en), "| gold:", hi)

Manika → मैनिक | gold: मानिक
Shreyasi → शरैसी | gold: श्रेयसी
dhyanamudra → दअंफ़यांदार | gold: ध्यानमुद्रा
Past → पैसत | gold: पास्ट
Hundi → हअंदी | gold: हुन्डी
Pingala → पिंगाल | gold: पिंगला
Kavipriya → कअविप्री | gold: कविप्रिया
Maiti → ंती | gold: मैती
Bhaona → बोआ | gold: भाओना
Dhanraj → दान | gold: धनराज
Baskasht → बैसक्ष्ट | gold: बकाश्त
Pattamal → पैत्तअमअ | gold: पट्टम्माल
Vilappakkam → विलैप्पक्कअम | gold: विलप्पक्कम
Cebuano → चिबूआनो | gold: सेबुआनो
Likud → लिकद | gold: लिकुड
Namaskar → नअमैसकार | gold: नमस्कार।
Nurul → नुरअ | gold: नूरुल
Harishankar → हारिशाङक | gold: हरिशंकर
Bujumburo → बयूझअंबिरो | gold: बुजुम्बुरो
Nuristan → नुरसतअ | gold: नूरिस्तान


In [139]:
words = [
    "Benami", "Giri", "Delhi", "Sahid",
    "Kaveri", "Kanakprabha", "Sukhaphaa"
]

for w in words:
    print(w, "→", transliterate_p2g(w))

Benami → बिनअमी
Giri → गिरी
Delhi → दिली
Sahid → सैहिद
Kaveri → काविरी
Kanakprabha → कांकानरफ़
Sukhaphaa → सूकाफ़


In [140]:
def char_accuracy(pred, gold):
    m = min(len(pred), len(gold))
    correct = sum(pred[i] == gold[i] for i in range(m))
    return correct / max(len(gold), 1)

acc = []
for en, hi, _ in raw_pairs[:1000]:
    pred = transliterate_p2g(en)
    acc.append(char_accuracy(pred, hi))

print("Improved P2G char accuracy: ", sum(acc) / len(acc))

Improved P2G char accuracy:  0.3215187978687977


In [141]:
def error_type(pred, gold):
    if len(pred) < len(gold):
        return "UNDER"
    if len(pred) > len(gold):
        return "OVER"
    if "्" in pred and "्" not in gold:
        return "HALANT"
    return "OTHER"

In [142]:
def evaluate_system(data, predict_fn, limit=None):
    scores = []

    for i, (en, hi) in enumerate(data):
        if limit and i >= limit:
            break

        pred = predict_fn(en)
        scores.append(char_accuracy(pred, hi))

    return sum(scores) / len(scores)

In [143]:
from collections import Counter

def analyze_p2g_errors(data, limit=1000):
    counter = Counter()
    for i, (en, hi) in enumerate(data):
        if i >= limit:
            break
        pred = transliterate_p2g(en)
        counter[error_type(pred, hi)] += 1
    total = sum(counter.values())
    return {k: round(v/total*100,2) for k, v in counter.items()}

In [144]:
print("P2G Val:",
      round(evaluate_system(val, transliterate_p2g, 2000) * 100, 2), "%")

print("P2G Errors:",
      analyze_p2g_errors(val))

P2G Val: 34.14 %
P2G Errors: {'UNDER': 50.4, 'OTHER': 27.6, 'OVER': 21.6, 'HALANT': 0.4}
