In [26]:
# encoding: utf-8
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import re

# Characters accepted as valid SLP1 output, kept as three separate lists.
# Vowels (swaras).
swaras = list("aAiIuUeEoOfFxX")
# Consonants (vyanjanas); one string per row of the table.
vyanjanas = list(
    "kKgGN"
    "cCjJY"
    "wWqQR"
    "tTdDn"
    "pPbBm"
    "yrlvSzshL|"
)
# Remaining signs and marks that are allowed to survive filtering.
others = list("HZVM~/\\^'")

# Complete whitelist: vowels first, then consonants, then the other marks.
slp1charlist = [*swaras, *vyanjanas, *others]

# Function to remove unnecessary symbols from Devanagari text
def read_devnagri_text(text):
    """Strip unwanted symbols, Latin letters and digits from Devanagari text.

    Removal happens in this order:

    1. zero-width characters, the byte-order mark, the soft hyphen and the
       hyphen / dash characters;
    2. a colon, but only when it has a space on both sides (the two spaces
       are removed with it);
    3. quotes, punctuation, brackets, dandas and the pipe character;
    4. ASCII letters, ASCII digits and Devanagari digits.

    Leading and trailing whitespace is stripped from the result. Because
    step 2 runs before step 3, a spaced colon that only appears after a
    step-3 character has been deleted is left in place.
    """
    removed_before_colon = ('\u200c', '\u200d', '\ufeff', '\xad', '-', '–', '—')
    removed_after_colon = (
        '’', '“', '”',
        '!', '%', '&', '*', '+', '?', '/', ';', '{', '}', '=', ',', '\'', '"',
        '.', '॥', ')', '(', '[', ']', '<', '>', '`', '‘', '#', '।', '|',
    )

    # Deleting single characters is order-independent, so each group can be
    # dropped in one translate() pass; only the multi-character ' : ' needs
    # its own replace() between the two groups.
    text = text.translate({ord(ch): None for ch in removed_before_colon})
    text = text.replace(' : ', '')
    text = text.translate({ord(ch): None for ch in removed_after_colon})

    # Drop English letters plus English and Devanagari digits.
    without_alnum = re.sub('[0-9a-zA-Z०-९]', '', text)
    return without_alnum.strip()

# Function to remove characters not in SLP1
def remove_nonslp1_chars(word):
    """Return ``word`` with every character absent from ``slp1charlist`` dropped.

    The relative order of the surviving characters is unchanged.
    """
    return ''.join(filter(lambda ch: ch in slp1charlist, word))

# Function to process the Sandhi dataset
def get_sandhi_dataset(datafile, outputfile):
    """Convert a sandhi dataset to SLP1 and write the converted pairs to a file.

    Each usable input line has the form ``joined => first + second``. Lines
    without ``=>`` are skipped, as are lines whose right-hand side does not
    split into exactly two parts on ``+``. Every part is cleaned with
    ``read_devnagri_text``, transliterated from Devanagari to SLP1 and
    filtered with ``remove_nonslp1_chars``. An entry is kept only when all
    three SLP1 strings are non-empty.

    For every kept entry two lines are written to ``outputfile`` (the SLP1
    form, then the cleaned Devanagari form), followed by a blank line.

    Args:
        datafile: Path of the UTF-8 input file.
        outputfile: Path of the UTF-8 file to (over)write.

    Returns:
        A tuple ``(word1list, word2list, outputlist)`` of parallel lists
        holding the SLP1 first words, second words and joined forms.
    """

    def clean_and_convert(raw):
        # Returns (cleaned Devanagari text, filtered SLP1 transliteration).
        devanagari = read_devnagri_text(raw.strip())
        slp1 = transliterate(devanagari, sanscript.DEVANAGARI, sanscript.SLP1)
        return devanagari, remove_nonslp1_chars(slp1)

    # The input is read completely before the output file is opened.
    with open(datafile, encoding="utf-8") as source:
        records = source.read().splitlines()

    first_words, second_words, joined_words = [], [], []

    with open(outputfile, "w", encoding="utf-8") as sink:
        for record in records:
            if '=>' not in record:
                continue

            sides = record.split('=>')
            components = sides[1].split('+')
            if len(components) != 2:
                continue

            word1, slp1word1 = clean_and_convert(components[0])
            word2, slp1word2 = clean_and_convert(components[1])
            expected, slp1expected = clean_and_convert(sides[0])

            # Skip entries where any part became empty after cleaning.
            if not (slp1word1 and slp1word2 and slp1expected):
                continue

            first_words.append(slp1word1)
            second_words.append(slp1word2)
            joined_words.append(slp1expected)

            # SLP1 line first, then the Devanagari line and a blank separator.
            sink.write(f"{slp1expected} => {slp1word1} + {slp1word2}\n")
            sink.write(f"{expected} => {word1} + {word2}\n\n")

    return first_words, second_words, joined_words

# Function to get processed data and save output to a file
def get_xy_data(datafile, outputfile):
    """Run ``get_sandhi_dataset`` and report where the output was written.

    Args:
        datafile: Path of the input dataset.
        outputfile: Path of the file that receives the converted entries.

    Returns:
        The three lists returned by ``get_sandhi_dataset``, as a tuple.
    """
    first_words, second_words, joined_words = get_sandhi_dataset(
        datafile, outputfile
    )
    print(f"Processed data has been saved to {outputfile}")
    return first_words, second_words, joined_words

# Input: the sandhi dataset (the file must be present in the working directory).
datafile = "sandhiset.txt"
# Output: destination for the converted entries.
outputfile = "converted_sandhi_output.txt"

# Convert the dataset; the processed entries are written to `outputfile`.
get_xy_data(datafile=datafile, outputfile=outputfile)


Processed data has been saved to converted_sandhi_output.txt


(['yasmAt',
  'saH',
  'svapitA',
  'sUryodBavaH',
  'SItaH',
  'na',
  'svaprakfteH',
  'vEvaSyataH',
  'kaH',
  'na',
  'tat',
  'SarIre',
  'nizprARaH',
  'putraH',
  'vayasA',
  'vartamAnAH',
  'anyAnizwacikIrzuH',
  'saH',
  'nAsikAm',
  'kftrimatAm',
  'vAcyaH',
  'vartamAnaH',
  'yaH',
  'saH',
  'jagati',
  'jyAyAMsaH',
  'jyezWAH',
  'syuH',
  "ko'pi",
  'pUrtiH',
  'AdO',
  'saduHKAH',
  'ataH',
  'kasyacit',
  'duHKinaH',
  'tat',
  'sparSaH',
  'eva',
  'sAraH',
  'BUyaH',
  'nEva',
  'BAgyaSAlI',
  'SrIH',
  'kaYcit',
  'bahiH',
  'tat',
  'bahiH',
  'susTirAH',
  'yat',
  'tat',
  'suhft',
  'sahft',
  'SAntaH',
  'kalahaH',
  'prakftiniyama',
  'yataH',
  'DEryAt',
  'na',
  'bahiH',
  'gurUBaktAH',
  'vidvacCradDAlavaH',
  'vedavyAsaH',
  'bfhat',
  'saH',
  'Bavet',
  'DaninaH',
  'yaSaH',
  'tizWet',
  'yat',
  'kAmakroDAdayaH',
  'mftAH',
  'saH',
  'praSAntaH',
  'bahiH',
  'janaH',
  'vfttilABaH',
  'mftaH',
  'suKam',
  'saH',
  'yataH',
  'kasyacit',
  'Bavati',
