In [1]:
import html
import re
from enum import Enum

In [2]:
# Path of the hand-copied dictionary text that the following cells read and parse.
# NOTE(review): this is an absolute, machine-specific path.
direct_copy_file = '/home/kuugu/Projects/manner/backend/data.txt'

# Notebook-level statistics, keyed by name (filled in by later cells).
metadata_dict = dict()

In [3]:
# Read the whole transcription into memory. The text mixes Kannada and English,
# so decode it explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open(direct_copy_file, 'r', encoding='utf-8') as f:
    data_str = f.read()


In [4]:
# Split the raw text on the page markers ("----P" followed by three digits)
# and trim surrounding whitespace from every resulting chunk.
page_strs = [chunk.strip() for chunk in re.split('----P[0-9][0-9][0-9]', data_str)]

# Progress = share of chunks that contain any text at all, as a percentage
# rounded to two decimals.
filled_chunk_count = sum(1 for chunk in page_strs if len(chunk) != 0)
progress_percent = round(100 * filled_chunk_count / len(page_strs), 2)

metadata_dict['project_progress_pct'] = progress_percent

print(metadata_dict['project_progress_pct'])

1.29


In [5]:
# parsing rules: 
# [R1] first line should be entirely in kannada, they are all kannada words 
# [R2] first line contains the word, if multiple forms are present they are comma separated 
# [R3] if the word is a verb, it might have the root word. root word is always in (brackets)
# [R4] second line can be the type of word (verb, noun, etc..), if it's not, it is the start of the meaning
# [R5] there cannot be only one line, this means there's no meaning provided 

# Lookup from the abbreviated word-type marker to its spelled-out name.
# NOTE(review): presumably these are the markers that may appear on the second
# line of an entry (rule R4 above) -- nothing in this cell reads the table yet.
word_type = {
    's.': 'substantive', 
    'v.a.': 'verb active', 
    'v.n.': 'verb neuter', 
    'vlg.': 'vulgar word', 
    'fg.': 'figurative', 
}

class ParseOptions(Enum): 
    """Selects how `parse_word` processes an entry."""
    # Not implemented yet: parse_word raises NotImplementedError for this option.
    EXPAND = 1 
    # Delegates to parse_word_no_expand; this is parse_word's default.
    NO_EXPAND = 2 

class ParseOutputType(Enum): 
    """Outcome tag stored under 'parse_output_type' in every parse result dict."""
    # The entry text was blank.
    EMPTY = 1 
    # The entry has a single (Kannada) line, i.e. a headword without a meaning (rule R5).
    NO_DATA = 2 
    # The entry has a single line that failed the is_kannada check (rule R1).
    NOT_KANNADA = 3  
    # The entry has a headword line plus at least one further line.
    PARSE_SUCCESS = 4 

def is_kannada(word):
    """Return True when `word` consists only of Kannada characters.

    A character is accepted when its code point lies in the Unicode Kannada
    block (U+0C80..U+0CFF) or when it is a space or a round bracket (brackets
    enclose the optional root word, rule R3).

    Args:
        word: the text to check.

    Returns:
        True if every character is accepted (vacuously True for an empty
        string), otherwise False.
    """
    for ch in word:
        # Compare the code point, not the character itself: ordering a str
        # against an int raises TypeError in Python 3.
        if not ((0x0C80 <= ord(ch) <= 0x0CFF) or (ch in (' ', ')', '('))):
            return False
    return True

def flatten_list(nested_list: list):
    """Collapse an arbitrarily nested list into a single flat list.

    Elements whose type is exactly `list` are expanded recursively, in order;
    every other element is copied through unchanged. Empty lists contribute
    nothing to the result.

    Args:
        nested_list: a list that may contain further lists at any depth.

    Returns:
        A new flat list; the input is not modified.
    """
    flat = []

    for item in nested_list:
        if type(item) == list:
            flat.extend(flatten_list(item))
        else:
            flat.append(item)

    return flat


def get_kannada_words(line) -> list[str]:
    """Extract the headword forms from the first line of an entry.

    The line is split on whitespace and commas (rule R2). When more than one
    token is present and the last one opens with '(', that token is the root
    word (rule R3) and is left out of the result.

    Args:
        line: the first line of a dictionary entry.

    Returns:
        The headword forms in order of appearance; an empty list for a blank
        line.
    """
    # NOTE (kuugu): I am assuming the data is correct here
    #               we need to build the CFG checker later

    tokens = [tok for chunk in line.split() for tok in chunk.split(',') if len(tok) > 0]

    # The root word is optional, so only drop the final token when it really
    # is a bracketed root. Without one, every token is a headword form (the
    # previous code returned an empty list in that case, losing the entry).
    if len(tokens) > 1 and tokens[-1][0] == '(':
        return tokens[:-1]

    return tokens

def parse_word_no_expand(word_data:str) -> list:
    """Parse one raw dictionary entry without expanding its meaning text.

    Args:
        word_data: the text of a single entry; the first line holds the
            headword(s), the remaining lines hold the meaning.

    Returns:
        A list of result dicts, each tagged with 'parse_output_type':
        - blank entry: one dict tagged EMPTY;
        - single-line entry: one dict carrying the line as 'word', tagged
          NOT_KANNADA (R1) or NO_DATA (R5);
        - otherwise: one PARSE_SUCCESS dict per headword returned by
          get_kannada_words, each sharing the same 'data' (all lines after
          the first, joined with newlines).
    """
    stripped_lines = [raw_line.strip() for raw_line in word_data.strip().split('\n')]
    headline, body_lines = stripped_lines[0], stripped_lines[1:]

    if not body_lines:
        # Only the headword line exists: nothing to parse beyond classifying it.
        if headline == '':
            return [{'parse_output_type': ParseOutputType.EMPTY}]

        if is_kannada(headline):
            outcome = ParseOutputType.NO_DATA  # R5
        else:
            outcome = ParseOutputType.NOT_KANNADA  # R1
        return [{'word': headline, 'parse_output_type': outcome}]

    meaning_text = '\n'.join(body_lines)

    return [
        {
            'word': headword,
            'parse_output_type': ParseOutputType.PARSE_SUCCESS,
            'data': meaning_text,
        }
        for headword in get_kannada_words(headline)
    ]

def parse_word(word_data, parse_type=ParseOptions.NO_EXPAND):
    """Parse one raw dictionary entry according to `parse_type`.

    Args:
        word_data: the text of a single entry.
        parse_type: a ParseOptions member selecting the parsing strategy.

    Returns:
        The list of result dicts produced by parse_word_no_expand for
        ParseOptions.NO_EXPAND.

    Raises:
        NotImplementedError: for ParseOptions.EXPAND, which is not written yet.
        ValueError: for any other `parse_type` when the entry is not blank.
    """
    if parse_type == ParseOptions.EXPAND:
        raise NotImplementedError

    if parse_type == ParseOptions.NO_EXPAND:
        return parse_word_no_expand(word_data)

    # Only reached for an unrecognised parse_type. Blank input keeps its
    # historical EMPTY result.
    if word_data.strip() == '':
        return [{'parse_output_type': ParseOutputType.EMPTY}]

    # Fail loudly rather than falling off the end and returning None, which
    # callers iterating over the result would only notice much later.
    raise ValueError('unsupported parse_type: {!r}'.format(parse_type))

In [6]:
def test_flatten_list():
    """flatten_list must unwrap every nesting level, keep order and drop empty lists."""
    nested_input = [1, [[[1, 2, [3, 4]], "hello"]], "hi", 3, 4, []]
    expected_flat = [1, 1, 2, 3, 4, 'hello', 'hi', 3, 4]

    assert flatten_list(nested_input) == expected_flat

def test_get_kannada_words():
    """Two comma-separated forms followed by a bracketed root yield just the two forms."""
    headwords = get_kannada_words('ಅಂಗೋಪಾಂಗ,ಪಾಂಗ (ಅಂಗ-ಉಪ-ಅಂಗ)')

    assert len(headwords) == 2
    assert headwords == ['ಅಂಗೋಪಾಂಗ', 'ಪಾಂಗ']

# Run the inline checks; an AssertionError here means a helper above regressed.
for _check in (test_flatten_list, test_get_kannada_words):
    _check()

In [7]:
# Display the chunk at index 1 of the split text to eyeball the raw entry format.
page_strs[1]

'ಆ\nThe first letter of the alphabet\n2. an interjection of surprise\n3. when prefixed to nouns derived from Sanskrit beginning with a consontant, it denotes negation, as ಸತ್ಯ truth, ಅಸತ್ಯ untruth; before a vowel ಆ becomes ಅನ್, as ಅಂತ end, ಅನಂತ endless. \n\nಅಂಕ \ns.\nFight, war, battle, combat. \n2. cock-fight \n3. a mark, sign. \n--ದಾಯೆ a combatant, wrestler. \n--ದ ಬಾಕಿಯಾರ್ a field or place for cock-fighting. \n--ಕಾದುನಿ to fight, to wrestle.\n\nಅಂಕಣ \ns. \nThe space between two cross beams or pillars (supporting a roof.)\n2. a quadrate, square\n3. a small room, compartment. \n--ಪಾಡುನಿ to upset.\n\nಅಂಕಾಯನ \ns. \nA temple feast; see ಆಯನ.\n\nಅಂಕಿತ  \ns. \nA mark.  \nadj.\nmarked. \n--ಪಾಡುನಿ to affix one’s signature to any writing. \n2. to dedicate a work.\n3. to put a mark.\n\nಅಂಕುಡೊಂಕು \ns. \nCrookedness. \nadj. \nbent, crooked.\n\nಅಂಕುಶ  \ns. \nA goad, hook used to drive an elephant with.\n\nಅಂಕಿಪುನಿ (ಅಂಕಿ) \nv.a.\nTo cipher, count, reckon (in writing).\n\nಅಂಕೆ \ns. \nA numerical figur

In [8]:
# Flat lookup: first line of each entry (headword line) -> remaining lines
# joined with newlines. A headword line seen again overwrites the earlier one.
raw_word_dict = {}

for page_str in page_strs:
    # Entries are separated by a blank line; also accept a "blank" line that
    # still carries stray spaces or tabs, which a plain '\n\n' split misses.
    words_data = re.split(r'\n\s*\n', page_str)

    for word_data in words_data:
        entry_lines = word_data.strip().split('\n')
        word, meaning = entry_lines[0].strip(), '\n'.join(entry_lines[1:])

        # Empty pages and trailing separators produce blank entries; skip
        # them instead of storing a meaningless '' key.
        if not word:
            continue

        raw_word_dict[word] = meaning
        

In [12]:
# sample html file

# Write one static HTML page per starting letter. Each page lists every entry
# of raw_word_dict whose headword line begins with that letter, followed by
# its meaning text.
for curr_letter in ['ಔ', 'ಅಂ', 'ಅಃ']:
    html_parts = ['''
        <!doctype html>
        <html lang="en">
        <head>
            <meta charset="UTF-8" />
            <title>manner</title>
        </head>
        <body>
        <a href="..">back to main</a>
        <br>
    ''']

    for word, meaning in raw_word_dict.items():
        # startswith() is False for an empty key, so blank headwords are skipped.
        if word.startswith(curr_letter):
            # Escape both fields so that &, < or > in the dictionary text is
            # shown literally instead of being interpreted as markup.
            # <h3> replaces <h>, which is not an HTML element.
            html_parts.append('''
                <h3>{}</h3>
                <p>{}</p>
                <br><br>
            '''.format(html.escape(word), html.escape(meaning)))

    html_parts.append('''
        </body>
        </html>
    ''')

    html_file_content = ''.join(html_parts)

    # The page declares charset UTF-8, so write it as UTF-8 whatever the
    # platform default encoding is.
    with open('../pages/'+curr_letter+'.html', 'w', encoding='utf-8') as f:
        f.write(html_file_content)