In [1]:
import pandas as pd
import numpy as np
import os
import re
import json as js
from pathlib import Path
from tqdm import tqdm
from ast import literal_eval

In [2]:
# Directory names for the dictionary lists (the "[Full]" one presumably holds the complete set).
# NOTE(review): neither name is referenced in the visible cells of this notebook — confirm they are still needed.
directory_kamus = "Daftar Kamus Analisis Machine Readable"
directory_kamus_full = "[Full] Daftar Kamus Ekstraksi"

### Algoritma One Entry Corpus ###

In [3]:
# Additional helper algorithms
# Abbreviations (presumably part-of-speech tags) that the helper predicates below
# look for at the end of an entry.
POS = ["v","a","n","pron","adv","num","p"]

def is_contain_bold_and_italic(font):
    """Tell whether the font names seen so far include both a bold and an italic one.

    `font` is iterated once; each item is lower-cased and searched for the
    substrings "bold" and "italic". The two hits may come from different items.
    Returns True as soon as both have been seen, otherwise False.
    """
    seen_bold = False
    seen_italic = False
    for name in font:
        lowered = name.lower()
        seen_bold = seen_bold or ("bold" in lowered)
        seen_italic = seen_italic or ("italic" in lowered)
        if seen_bold and seen_italic:
            return True
    return False

def is_last_fonem(s): # currently only handles phonemes written as /../ and [...]
    """Return True when the text form of `s` ends with ']' or '/'."""
    # One pattern with a character class replaces the two separate checks;
    # the regex (rather than str.endswith) keeps the same '.'/'$' newline behaviour.
    return re.match(r'^.*[\]\/]$', str(s)) is not None

def is_start_fonem(s): # currently only handles phonemes written as /../ and [...]
    """Return True when the text form of `s` begins with '[' or '/'."""
    return str(s).startswith(("[", "/"))

def is_bold_contains_POS(s):
    """Return True when the stripped text ends with a one-character POS tag.

    For text longer than one character the tag must be preceded by whitespace
    (e.g. "kata v"); a one-character text only needs to be a member of POS.
    Empty or whitespace-only input returns False instead of raising IndexError.

    Note: only the final character is compared against POS, so multi-letter
    tags in POS are never matched by this check.
    """
    kata = s.strip()

    if not kata:  # nothing left after stripping: there is no last character to inspect
        return False

    if len(kata) > 1:
        return is_contain_only_whitespaces(kata[-2]) and (kata[-1] in POS)

    return kata[-1] in POS

def is_contain_only_whitespaces(s):
    """Return True when the text form of `s` is empty or consists solely of whitespace."""
    return re.fullmatch(r'\s*', str(s)) is not None

def is_end_entri(s):
    """Return True when `s` is exactly one of the entry-ending symbols ';', ',' or ':'."""
    return s in (";", ",", ":")

In [4]:
# Algorithm for joining entries that were split apart
def join_seperate_entry(data):
    """Merge consecutive entries that sit on the same text line.

    Starting from entry i, the following entries are appended to it for as
    long as the y0 of the next entry's first token (rounded to 2 decimals)
    lies within +/-1% of the y0 of the last token joined so far, and the
    current `is_padanan_lema` flag is not 1.

    Args:
        data: dict of parallel lists with the keys "Entri",
            "entry_font_size_pos", "posisi_entry", "page" and
            "is_padanan_lema". Each "entry_font_size_pos" item is a list of
            tokens whose last field is a position with y0 at index 1.

    Returns:
        dict with the same five keys holding the joined entries. A joined
        entry's page becomes ``[first_page, next_page]`` when the joined
        pieces come from different pages.

    Everything is read from `data`; the function no longer depends on the
    notebook-level globals `entries` and `page`. The caller's token lists are
    not mutated.
    """
    result_entry = []
    result_entry_with_font_size_pos = []
    result_posisi = []
    result_page = []
    is_padanan_lema = []

    total = len(data["Entri"])

    i = 0
    while i < total:
        if i == total - 1:  # the last entry has nothing after it to join with
            result_entry.append(data["Entri"][i])
            result_entry_with_font_size_pos.append(data["entry_font_size_pos"][i])
            result_posisi.append(data["posisi_entry"][i])
            result_page.append(data["page"][i])
            is_padanan_lema.append(data["is_padanan_lema"][i])
            break

        joined_entry = data["Entri"][i]
        # Copy so that extending the joined run never alters the caller's list.
        joined_entry_with_font_size_pos = list(data["entry_font_size_pos"][i])
        curr_posisi = data["posisi_entry"][i]
        curr_page = data["page"][i]
        curr_pl = data["is_padanan_lema"][i]

        curr_y0 = joined_entry_with_font_size_pos[-1][-1][1]
        batas_atas = curr_y0 + (curr_y0*(1/100)) # 1% tolerance
        batas_bawah = curr_y0 - (curr_y0*(1/100)) # 1% tolerance

        # i < total - 1 here, so i + 1 is always a valid index.
        nxt = i + 1
        nxt_y0 = round(data["entry_font_size_pos"][nxt][0][-1][1],2)
        while batas_bawah <= nxt_y0 <= batas_atas:
            if curr_pl == 1: break

            joined_entry = joined_entry + " " + data["Entri"][nxt]
            joined_entry_with_font_size_pos.extend(data["entry_font_size_pos"][nxt])

            if isinstance(data["page"][nxt],list):
                curr_page = data["page"][nxt]
            elif isinstance(curr_page,list):
                pass  # already a cross-page marker; keep it
            elif curr_page != data["page"][nxt]:
                curr_page = [curr_page, data["page"][nxt]]

            nxt += 1

            if nxt == total: break

            nxt_y0 = round(data["entry_font_size_pos"][nxt][0][-1][1],2)

            curr_y0 = joined_entry_with_font_size_pos[-1][-1][1]
            # NOTE(review): this takes the flag of the next, not-yet-joined entry,
            # and that value is what gets stored for the joined entry — confirm intent.
            curr_pl = data["is_padanan_lema"][nxt]
            batas_atas = curr_y0 + (curr_y0*(1/100)) # 1% tolerance
            batas_bawah = curr_y0 - (curr_y0*(1/100)) # 1% tolerance

        result_entry.append(joined_entry)
        result_entry_with_font_size_pos.append(joined_entry_with_font_size_pos)
        result_posisi.append(curr_posisi)
        result_page.append(curr_page)
        is_padanan_lema.append(curr_pl)
        i = nxt

    return {
        "Entri":result_entry,
        "entry_font_size_pos":result_entry_with_font_size_pos,
        "posisi_entry":result_posisi,
        "page":result_page,
        "is_padanan_lema":is_padanan_lema
    }

In [5]:
# categorize entries, to tell main entries apart from sub entries
def categorize_main_entry(posisi, pages, lema):
    """Flag every entry as a main entry (1) or a sub entry (0) from its x position.

    Args:
        posisi: per-entry positions; only index 0 of each (used as x0) is read.
        pages: per-entry page value; a list value marks an entry that spans pages.
        lema: per-entry flags; a value of 1 forces the pattern "1 then 0" for
            that entry and the one following it on the same page.

    Returns:
        list of 0/1 flags appended in entry order.
    """
    output = []
    
    i = 0; j = 0
    while i < len(pages):
        if isinstance(pages[i], list): # cross-page entry case
            # NOTE(review): when i == 0 the index i-1 wraps around to the last
            # element (and output[i-1] would fail on an empty list) — confirm a
            # cross-page entry can never be first.
            prev_posisi_x0 = posisi[i-1][0]
            
            # Within 3 units of the previous entry's x0: reuse the previous flag.
            if abs(posisi[i][0] - prev_posisi_x0) <= 3:
                output.append(output[i-1])
                
            else:
                batas_atas = round(prev_posisi_x0 + (prev_posisi_x0 * (2/100)),2) # 2% tolerance
                batas_kolom = 2*batas_atas
                
                # Indented relative to the previous entry but still below
                # batas_kolom: treat as a sub entry.
                if posisi[i][0] > batas_atas and posisi[i][0] < batas_kolom:
                    output.append(0)
                    
                else:
                    output.append(1)
                    
            i += 1; j += 1
        
        else:   
            posisi_by_page = []; lema_by_page = []; curr_page = pages[j] 
        
            while curr_page == pages[i]: # group the entries by page
                posisi_by_page.append(posisi[j][0])
                lema_by_page.append(lema[j])
                
                j += 1
                
                if j > len(pages) - 1: break
                    
                curr_page = pages[j]
                
            sorted_posisi = sorted(posisi_by_page) # sort ascending
            
            # k walks the page's entries, l indexes into the sorted x0 values
            i = j; k = 0; l = 0 # advance i past this page's group
            while k < len(posisi_by_page):
                if lema_by_page[k] == 1:
                    if k == len(posisi_by_page)-1:
                        output.append(1); break
                    else:
                        # flagged entry is main, the entry right after it is sub
                        output.append(1)
                        output.append(0)
                        k += 2
                else:
                    if abs(posisi_by_page[k] - sorted_posisi[l]) > 3:
                        output.append(0); k += 1 # x0 does not match the expected sorted position

                    else:
                        output.append(1) # first index after the header or page number
                        batas_atas = round(posisi_by_page[k] + (posisi_by_page[k] * (2/100)),2) # 2% tolerance
                        batas_kolom = 2*batas_atas

                        m = k + 1
                        if m > len(posisi_by_page) - 1: break

                        # Every following entry indented between batas_atas and
                        # batas_kolom is a sub entry of the main entry just added.
                        nxt_posisi = posisi_by_page[m]
                        while nxt_posisi > batas_atas and nxt_posisi < batas_kolom:
                            output.append(0); m += 1

                            if m > len(posisi_by_page) - 1: 
                                break 

                            nxt_posisi = posisi_by_page[m]

                        k = m
                        # NOTE(review): the l update looks like "next sorted x0 while
                        # below batas_kolom, otherwise jump to index m" — confirm.
                        if nxt_posisi < batas_kolom:
                            l += 1
                        else:
                            l = m
                
                
    return output 

In [217]:
# split main entries (head entries) that are still joined together
def seperate_joined_entry(data, kategori):
    """Split main entries that still contain more than one head entry.

    Args:
        data: dict of parallel lists with the keys "Entri",
            "entry_font_size_pos", "posisi_entry" and "page".
        kategori: per-entry flags; 0 means the entry is passed through
            unchanged, any other value means it is scanned for split points.

    Returns:
        dict with the keys "Entri", "entry_font_size_pos", "posisi_entry" and
        "page" (no "is_padanan_lema" key is produced).
    """
    result_entries = []
    result_entries_with_font_size_pos = []
    result_posisi = []
    result_page = []
    
    # Accumulators for the piece currently being built.
    entry = []
    entry_with_font_size_pos = []
    posisi_dummy = None;
    
    i = 0
    while i < len(data["Entri"]):
        
        if len(data["entry_font_size_pos"][i]) < 2: # only 1 token or none: pass through
            result_entries.append(data["Entri"][i])
            result_entries_with_font_size_pos.append(data["entry_font_size_pos"][i])
            result_posisi.append(data["posisi_entry"][i])
            result_page.append(data["page"][i])
            entry = []; entry_with_font_size_pos = []; posisi_dummy = None
        
        else:
            posisi = data["posisi_entry"][i]
            batas_atas = round(posisi[0] + (posisi[0] * (1/100)),2)

            detail = data["entry_font_size_pos"][i][0] # handle index 0

            # Seed the accumulators with the first token.
            entry.append(detail[0].strip())
            entry_with_font_size_pos.append(detail)
            posisi_dummy = detail[-1]

            if (kategori[i] == 0): 
                # Pass the entry through untouched and drop the seeded accumulators.
                result_entries.append(data["Entri"][i])
                result_entries_with_font_size_pos.append(data["entry_font_size_pos"][i])
                result_posisi.append(data["posisi_entry"][i])
                result_page.append(data["page"][i])
                entry = []; entry_with_font_size_pos = []; posisi_dummy = None

            else: # split head entries that are joined together
                batas_bawah = round(posisi[0] - (posisi[0] * (1/100)),2) # 1% tolerance

                for j in range(1,len(data["entry_font_size_pos"][i])):
                    detail = data["entry_font_size_pos"][i][j]; 
                    posisi_x0 = round(float(detail[-1][0]))

                    # A token whose x0 falls back within 1% of the entry's x0
                    # starts a new entry.
                    if batas_bawah <= posisi_x0 <= batas_atas: # split the entry here
                        joined_entry = (" ").join(entry)
                        result_entries.append(joined_entry)
                        result_entries_with_font_size_pos.append(entry_with_font_size_pos)
                        result_posisi.append(posisi_dummy)
                        result_page.append(data["page"][i])

                        entry = []; entry_with_font_size_pos = []
                        entry.append(detail[0].strip())
                        entry_with_font_size_pos.append(detail)
                        posisi_dummy = detail[-1]

                    else:
                        entry.append(detail[0].strip())
                        entry_with_font_size_pos.append(detail)
        
        # Flush whatever is left in the accumulators for this input entry.
        if entry != []:
            joined_entry = (" ").join(entry)
            result_entries.append(joined_entry)
            result_entries_with_font_size_pos.append(entry_with_font_size_pos)
            result_posisi.append(posisi_dummy)
            result_page.append(data["page"][i])
            entry = []; entry_with_font_size_pos = []; posisi_dummy = None

        i += 1
    
    return {
        "Entri":result_entries,
        "entry_font_size_pos":result_entries_with_font_size_pos,
        "posisi_entry":result_posisi,
        "page":result_page
    }

In [219]:
# split off precategorial (prakategorial) entries
def seperate_prakategorial(data):
    """Split an entry in two at its first comma when the part before it looks like a head.

    The split happens only when the last token before the comma has a font
    name containing "bold" or is itself a member of POS. Otherwise the entry
    is copied through unchanged.

    Args:
        data: dict of parallel lists with the keys "Entri",
            "entry_font_size_pos", "posisi_entry" and "page".

    Returns:
        dict with the same four keys; both halves of a split entry reuse the
        original entry's page and posisi_entry.
    """
    result = {
        "Entri":[],
        "entry_font_size_pos":[],
        "posisi_entry":[],
        "page":[]
    }
    
    for i in range(len(data["Entri"])):
        txt = data["Entri"][i]
        split_txt = txt.strip().split(",",1)
        
        if len(split_txt) < 2 or txt[-1] == ",": # no comma at all, or the comma is at the end
            result['Entri'].append(txt)
            result['entry_font_size_pos'].append(data['entry_font_size_pos'][i])
            result['page'].append(data['page'][i])
            result['posisi_entry'].append(data['posisi_entry'][i])
        
        else:
            frst_entri = split_txt[0].strip().split(" ")
            sec_entri = split_txt[1].strip().split(" ")
            
            for j in range(len(frst_entri)):
                frst_entri[j] = frst_entri[j].strip()
            
            for k in range(len(sec_entri)):
                sec_entri[k] = sec_entri[k].strip()
                
            if len(frst_entri) <= 2 and (frst_entri[0] == "" or frst_entri[0] == ","): # the comma is at the start of the entry
                result['Entri'].append(txt)
                result['entry_font_size_pos'].append(data['entry_font_size_pos'][i])
                result['page'].append(data['page'][i])
                result['posisi_entry'].append(data['posisi_entry'][i])
            
            else:
                # Token details for the words before the comma.
                inf_frst_entri = data['entry_font_size_pos'][i][:len(frst_entri)]
                
                if "bold" in inf_frst_entri[-1][1].lower() or frst_entri[-1] in POS:
                    if (len(frst_entri) + len(sec_entri)) == len(data['entry_font_size_pos'][i]): # comma is attached to a word
                        frst_entri[-1] = frst_entri[-1] + ","
                        inf_sec_entri = data['entry_font_size_pos'][i][len(frst_entri):]

                    else: # comma is a token of its own
                        frst_entri.append(",")
                        # NOTE(review): len(frst_entri) already counts the "," appended
                        # above, so the extra +1 pulls one more token into the first
                        # half — confirm this is not an off-by-one.
                        inf_frst_entri = data['entry_font_size_pos'][i][:len(frst_entri)+1]
                        inf_sec_entri = data['entry_font_size_pos'][i][len(frst_entri)+1:]
                        
                    # first entry
                    result['Entri'].append(" ".join(frst_entri))
                    result['entry_font_size_pos'].append(inf_frst_entri)
                    result['page'].append(data['page'][i])
                    result['posisi_entry'].append(data['posisi_entry'][i])

                    # second entry
                    result['Entri'].append(" ".join(sec_entri))
                    result['entry_font_size_pos'].append(inf_sec_entri)
                    result['page'].append(data['page'][i])
                    result['posisi_entry'].append(data['posisi_entry'][i])

                else: # splitting at this comma is not valid
                    result['Entri'].append(txt)
                    result['entry_font_size_pos'].append(data['entry_font_size_pos'][i])
                    result['page'].append(data['page'][i])
                    result['posisi_entry'].append(data['posisi_entry'][i])
                
                
    return result

In [220]:
# algorithm that cleans phonemes out of the entries
def clean_entry(data):
    """Remove phoneme notations from entries and re-home stray symbols.

    Runs three passes:
      1. Drop whitespace-only entries; strip ``[...]`` and ``/.../`` spans from
         each remaining entry's text and from its token list.
      2. For each pair of neighbouring entries, move trailing symbol-only
         tokens of the previous entry to the front of the current one, and move
         a leading ';', ',' or ':' token of the current entry to the end of
         the previous one.
      3. Recompute posisi_entry from the position of each entry's first token.

    Args:
        data: dict of parallel lists with the keys "Entri",
            "entry_font_size_pos", "posisi_entry" and "page".

    Returns:
        dict with the same four keys.
    """
    result = {
        "Entri":[],
        "entry_font_size_pos":[],
        "posisi_entry":[],
        "page":[]
    }
    
    for i in range(len(data["Entri"])): # remove phonemes
        txt = data["Entri"][i] # entry text
        
        if not is_contain_only_whitespaces(txt):
            
            entry_font_size_pos = data["entry_font_size_pos"][i]
            txt = re.sub(r'\[.*?\]',"",txt)
            entry_font_size_pos = clean_entry_font_size_paranthesis(entry_font_size_pos)

            txt = re.sub(r'\/.*?\/',"",txt)
            entry_font_size_pos = clean_entry_font_size_slash(entry_font_size_pos)

            clean = re.sub(' +', ' ', txt) ## collapse runs of spaces
            result["Entri"].append(clean.strip())
            result["entry_font_size_pos"].append(entry_font_size_pos)

            result['posisi_entry'].append(data['posisi_entry'][i])
            result['page'].append(data['page'][i])
    
    for j in range(1,len(result['Entri'])): # fix symbols
        # Symbols taken off the end of entry j-1 (collected last-to-first).
        array_simbol = []; array_simbol_font_size_pos = []
        
        prev_txt_split = result["Entri"][j-1].split(" ")
        prev_entri_font_size_pos = result['entry_font_size_pos'][j-1]
        
        # strip every trailing symbol from the previous entry, except entry-ending ones (; , :)
        while (prev_txt_split[-1] != "") and (not is_end_entri(prev_txt_split[-1][-1])):
            # A token that starts or ends with a letter/digit is real text: stop.
            if (prev_txt_split[-1][0].isalnum()) or (prev_txt_split[-1][-1].isalnum()): 
                break
                
            else:
                if (prev_txt_split==[] or prev_entri_font_size_pos == []):break
                
                array_simbol.append(prev_txt_split[-1])
                array_simbol_font_size_pos.append(prev_entri_font_size_pos[-1])
                del prev_txt_split[-1]
                del prev_entri_font_size_pos[-1]
                
                result["Entri"][j-1] = " ".join(prev_txt_split)
                result['entry_font_size_pos'][j-1] = prev_entri_font_size_pos
            
            # Nothing left to inspect; avoids indexing an empty list in the loop condition.
            if (prev_txt_split==[] or prev_entri_font_size_pos == []):break
        
        txt_split = result['Entri'][j].split(" ")
        if is_end_entri(txt_split[0]): 
            # A leading end symbol belongs to the previous entry; it is appended
            # to the text without a space.
            result['Entri'][j-1] = result['Entri'][j-1] + txt_split[0]
            result['entry_font_size_pos'][j-1].append(result['entry_font_size_pos'][j][0])
            
            del txt_split[0]
            result['entry_font_size_pos'][j] = result['entry_font_size_pos'][j][1:]
            result['Entri'][j] = " ".join(txt_split)
        
        if array_simbol != []:
            # Prepend the symbols removed from the previous entry to this one.
            new_entry = []
            new_entry.extend(array_simbol)
            new_entry.extend(txt_split)
            result['Entri'][j] = " ".join(new_entry)
            
            new_entry_font_size_pos = []
            new_entry_font_size_pos.extend(array_simbol_font_size_pos)
            new_entry_font_size_pos.extend(result['entry_font_size_pos'][j])
            result['entry_font_size_pos'][j] = new_entry_font_size_pos    
    
    # Tokens may have moved, so take each entry's position from its first token.
    for l in range(len(result['entry_font_size_pos'])):
        if result['entry_font_size_pos'][l] != []:
            result['posisi_entry'][l] = result['entry_font_size_pos'][l][0][-1]
        
    return result

In [221]:
def clean_entry_font_size_paranthesis(data):
    """Remove ``[...]`` phoneme spans from a list of tokens.

    Args:
        data: list of tokens shaped ``[text, font, size, position]``.

    Returns:
        A new list of tokens. A span closed inside one token is cut out of that
        token's text (the token is dropped if nothing remains). A span that
        opens in one token and closes in a later one removes everything in
        between, keeping the text before '[' and the text after ']'. If no
        closing ']' is found, the remaining tokens are kept unchanged.

    The text kept after ']' now carries the font, size AND position of the
    token it came from; previously it was given the position of the opening
    token.
    """
    clean_data = []
    i = 0

    while i < len(data):
        txt = data[i][0]
        if re.match(r'^.*\[.*?\].*$',str(txt)): ## case ...[..]... (closed in the same token)
            clean = re.sub(r'\[.*?\]',"",txt)
            if clean != "":
                clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])
            i += 1
        elif re.match(r'^.*\[.*',str(txt)): ## case ...[... (opened but not closed here)
            nxt = i+1
            if nxt > len(data)-1: # i is the last index, nothing can close the span
                clean_data.append(data[i])
                break

            nxt_txt = data[nxt][0]
            while not re.match(r'^.*\].*$',str(nxt_txt)): # look for the token holding "...]..."
                nxt += 1
                if nxt > len(data)-1: break
                nxt_txt = data[nxt][0]

            if nxt > len(data)-1: # no closing "]" found: keep the rest untouched
                for k in range(i,nxt):
                    clean_data.append(data[k])
                break
            else:
                ## keep the text in front of the opening "["
                clean = txt.split("[",1)[0]
                if clean != "":
                    clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])

                ## keep the text behind the closing "]", with that token's own details
                clean_nxt = nxt_txt.split("]",1)[1]
                if clean_nxt != "":
                    clean_data.append([clean_nxt.strip(),data[nxt][1],data[nxt][2],data[nxt][3]])

                i = nxt+1
        else:
            clean_data.append(data[i])
            i += 1

    return clean_data


def clean_entry_font_size_slash(data):
    """Remove ``/.../`` phoneme spans from a list of tokens.

    Args:
        data: list of tokens shaped ``[text, font, size, position]``.

    Returns:
        A new list of tokens. A span closed inside one token is cut out of that
        token's text (the token is dropped if nothing remains). A span that
        opens in one token and closes in a later one removes everything in
        between, keeping the text before the first '/' and the text after the
        closing '/'. If no closing '/' is found, the remaining tokens are kept
        unchanged.

    The text kept after the closing '/' now carries the font, size AND
    position of the token it came from; previously it was given the position
    of the opening token.
    """
    clean_data = []
    i = 0

    while i < len(data):
        txt = data[i][0]
        if re.match(r'^.*\/.*?\/.*$',str(txt)): ## case .../../... (closed in the same token)
            clean = re.sub(r'\/.*?\/',"",txt)
            if clean != "":
                clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])
            i += 1
        elif re.match(r'^.*\/.*',str(txt)): ## case .../... (opened but not closed here)
            nxt = i+1
            if nxt > len(data)-1: # i is the last index, nothing can close the span
                clean_data.append(data[i])
                break

            nxt_txt = data[nxt][0]
            while not re.match(r'^.*\/.*$',str(nxt_txt)): # look for the token holding ".../..."
                nxt += 1
                if nxt > len(data)-1: break
                nxt_txt = data[nxt][0]

            if nxt > len(data)-1: # no closing "/" found: keep the rest untouched
                for k in range(i,nxt):
                    clean_data.append(data[k])
                break
            else:
                ## keep the text in front of the opening "/"
                clean = txt.split("/",1)[0]
                if clean != "":
                    clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])

                ## keep the text behind the closing "/", with that token's own details
                clean_nxt = nxt_txt.split("/",1)[1]
                if clean_nxt != "":
                    clean_data.append([clean_nxt.strip(),data[nxt][1],data[nxt][2],data[nxt][3]])

                i = nxt+1
        else:
            clean_data.append(data[i])
            i += 1

    return clean_data

In [222]:
def fix_page(pages):
    """Renumber page values so they run consecutively starting from 1.

    The first item always becomes 1. A list item (cross-page entry) becomes
    ``[n, n + 1]`` and advances the counter. A plain item directly after a
    list item keeps the current counter; otherwise the counter advances only
    when the value differs from the previous item.
    """
    renumbered = []
    counter = 1

    for idx, current in enumerate(pages):
        if idx == 0:
            renumbered.append(counter)
            continue

        if isinstance(current, list):
            renumbered.append([counter, counter + 1])
            counter += 1
            continue

        previous = pages[idx - 1]
        # A new page starts only when the previous item was a plain value that differs.
        if not isinstance(previous, list) and not (current == previous):
            counter += 1
        renumbered.append(counter)

    return renumbered

In [223]:
def categorize_prakategorial(entries):
    """Return one 0/1 flag per entry text.

    An entry gets 1 when it ends with a comma, or when its last character is a
    member of POS and the character before it is whitespace. Empty and
    one-character entries always get 0.
    """
    def _flag(text):
        # Too short to carry a trailing comma or a "<space><tag>" suffix.
        if text == "" or len(text)==1:
            return 0
        if re.match(r'.*\,$',str(text)):
            return 1
        if is_contain_only_whitespaces(text[-2]) and (text[-1] in POS):
            return 1
        return 0

    return [_flag(text) for text in entries]

> Main Program

In [234]:
def build_corpus_one_entry_by_font_pos(data):
    """Run the position-based refinement pipeline on one dictionary's entries.

    Steps: join entries split across the same line, classify main/sub entries,
    split joined main entries, clean phonemes and stray symbols, split at
    commas, then recompute the "is_padanan_lema" flags.

    Args:
        data: dict of parallel lists with the keys "Entri",
            "entry_font_size_pos", "posisi_entry", "page" and
            "is_padanan_lema".

    Returns:
        dict with the keys "Entri", "entry_font_size_pos", "posisi_entry",
        "page" and "is_padanan_lema".
    """
    # first stage, font/position based approach
    result = join_seperate_entry(data)
    kategorisasi = categorize_main_entry(result["posisi_entry"], result["page"], result["is_padanan_lema"])
    result = seperate_joined_entry(result, kategorisasi)
    # cross-page fixing is currently disabled:
#   result = fix_cross_page_entry(result)

    clean_result = clean_entry(result)
    # Feed the cleaned entries forward; passing `result` here used to throw
    # away everything clean_entry had produced.
    clean_result = seperate_prakategorial(clean_result)
    clean_result["is_padanan_lema"] = categorize_prakategorial(clean_result["Entri"])
    return clean_result

### Main Program (JSON) ###

In [235]:
directory_CSV = "CSV One Entry JSON With Font Approach"
directory_hasil = "CSV One Entry JSON With Font + Posisi Approach"

# Process every font-approach CSV in directory_CSV and write the refined
# result next to it in directory_hasil with a "-posisi.csv" suffix.
for filename in tqdm(os.listdir(directory_CSV)):
    print("====" + filename + "====")

    # Load the file and discard rows with missing values.
    kamus = pd.read_csv(directory_CSV + "/" + filename)
    kamus = kamus.dropna().reset_index(drop=True)

    # NOTE(review): `entries` and `page` are kept as module-level names on
    # purpose — other cells may read them as globals; confirm before renaming.
    entries = kamus["Entri"].values.tolist()

    # The list-valued columns are stored as Python literals in the CSV.
    entries_font_size_pos = [literal_eval(raw) for raw in kamus["entry_font_size_pos"].values.tolist()]
    posisi_entry = [literal_eval(raw) for raw in kamus["posisi_entry"].values.tolist()]
    page = [
        int(raw) if isinstance(raw, int) else literal_eval(raw)
        for raw in kamus["page"].values.tolist()
    ]

    input_data = {
        "Entri":entries,
        "entry_font_size_pos":entries_font_size_pos,
        "posisi_entry":posisi_entry,
        "page":page,
        "is_padanan_lema":kamus["is_padanan_lema"].values.tolist()
    }

    CSV_res = build_corpus_one_entry_by_font_pos(input_data)

    # Drop empty entries first, then any rows with missing values.
    result_csv = pd.DataFrame.from_dict(CSV_res)
    result_csv = result_csv[result_csv["Entri"] != ""].reset_index(drop=True)
    result_csv = result_csv.dropna().reset_index(drop=True)

    new_filename = os.path.splitext(filename)[0]
    result_csv.to_csv(directory_hasil + "/" + new_filename + "-posisi.csv",index=False)

 56%|██████████████████████████████████████████████▋                                     | 5/9 [00:00<00:00, 41.83it/s]

====41. Kamus Bahasa Indonesia-Bali A-K (1997)-page_58_59-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====43. Kamus Bahasa Indonesia-Bahasa Minangkabau I (1994)-page_23_24-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====54. Kamus Bahasa Indonesia Mentawai (1998)-page_23_24-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====57. Kamus Bahasa Bugis-Indonesia (1977)-page_20_21-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====61. Kamus Banjar-Indonesia (1977)-page_51_52-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====68. Kamus Dwibahasa Bahasa Talaud - Bahasa Indonesia (2018)-page_19_20-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====77. Kamus Samawa-Indonesia Edisi 2 (2017)-page_27_28-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indonesia (2017)-page_17_18-hasil-ekstraksi-one_entry_from_JSON-font.csv====
====91. Kamus Simalungun - Indonesia (edisi kedua) (2015)-page_48_49-hasil-ekstraksi-one_entry_from_JSON-font.csv==

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 41.35it/s]


### Main Program Full Kamus (JSON)

In [236]:
directory_CSV = "[Full] CSV One Entry JSON With Font Approach"
directory_hasil = "[Full] CSV One Entry JSON With Font + Posisi Approach"

# Same processing as the sample run, applied to the files in the "[Full]"
# input directory; results get a "-posisi.csv" suffix in directory_hasil.
for filename in tqdm(os.listdir(directory_CSV)):
    print("====" + filename + "====")

    # Load the file and discard rows with missing values.
    kamus = pd.read_csv(directory_CSV + "/" + filename)
    kamus = kamus.dropna().reset_index(drop=True)

    # NOTE(review): `entries` and `page` are kept as module-level names on
    # purpose — other cells may read them as globals; confirm before renaming.
    entries = kamus["Entri"].values.tolist()

    # The list-valued columns are stored as Python literals in the CSV.
    entries_font_size_pos = [literal_eval(raw) for raw in kamus["entry_font_size_pos"].values.tolist()]
    posisi_entry = [literal_eval(raw) for raw in kamus["posisi_entry"].values.tolist()]
    page = [
        int(raw) if isinstance(raw, int) else literal_eval(raw)
        for raw in kamus["page"].values.tolist()
    ]

    input_data = {
        "Entri":entries,
        "entry_font_size_pos":entries_font_size_pos,
        "posisi_entry":posisi_entry,
        "page":page,
        "is_padanan_lema":kamus["is_padanan_lema"].values.tolist()
    }

    CSV_res = build_corpus_one_entry_by_font_pos(input_data)

    # Drop empty entries first, then any rows with missing values.
    result_csv = pd.DataFrame.from_dict(CSV_res)
    result_csv = result_csv[result_csv["Entri"] != ""].reset_index(drop=True)
    result_csv = result_csv.dropna().reset_index(drop=True)

    new_filename = os.path.splitext(filename)[0]
    result_csv.to_csv(directory_hasil + "/" + new_filename + "-posisi.csv",index=False)

  0%|                                                                                           | 0/48 [00:00<?, ?it/s]

====10. Kamus Bahasa Indonesia-Dayak Deah Edisi I (2013)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


  2%|█▋                                                                                 | 1/48 [00:02<01:42,  2.19s/it]

====12. Kamus Bahasa Indonesia-Kaidipang L-Z (2000)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


  4%|███▍                                                                               | 2/48 [00:06<02:41,  3.52s/it]

====14. Kamus Bahasa Indonesia-Bahasa Minangkabau II (1994)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


  6%|█████▏                                                                             | 3/48 [00:11<03:09,  4.20s/it]

====15. Kamus Bahasa Indonesia-Pasir (1997)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


  8%|██████▉                                                                            | 4/48 [00:14<02:46,  3.78s/it]

====16. Kamus Bahasa Indonesia Karo A-K (1998)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 10%|████████▋                                                                          | 5/48 [00:19<02:58,  4.14s/it]

====17. Kamus Melayu Makasar-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 12%|██████████▍                                                                        | 6/48 [00:21<02:15,  3.24s/it]

====18. Kamus Bahasa Jawa-Bahasa Indonesia I (1993)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 15%|████████████                                                                       | 7/48 [00:24<02:12,  3.22s/it]

====19. Kamus Bahasa Indoensia-Melayu Riau (1997)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 17%|█████████████▊                                                                     | 8/48 [00:28<02:16,  3.40s/it]

====2. Kamus Melayu-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 19%|███████████████▌                                                                   | 9/48 [00:30<02:04,  3.18s/it]

====20. Kamus Bahasa Melayu Ambon-Indonesia (1998)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 21%|█████████████████                                                                 | 10/48 [00:31<01:36,  2.54s/it]

====21. Kamus Bahasa Indonesia-Sentani A-K (1999)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 23%|██████████████████▊                                                               | 11/48 [00:33<01:22,  2.22s/it]

====23. Kamus Dwibahasa Dayak Ngaju-Indonesia (2013)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 25%|████████████████████▌                                                             | 12/48 [00:34<01:12,  2.03s/it]

====24. Kamus Minangkabau-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 27%|██████████████████████▏                                                           | 13/48 [00:38<01:22,  2.35s/it]

====26. Kamus Bahasa Indonesia-Bahasa Tonsea II (1996)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 29%|███████████████████████▉                                                          | 14/48 [00:39<01:06,  1.96s/it]

====27. Kamus Bahasa Indonesia-Saluan (2012)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 31%|█████████████████████████▋                                                        | 15/48 [00:40<00:57,  1.74s/it]

====28. Kamus Bahasa Kutai-Bahasa Indonesia (2013)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 33%|███████████████████████████▎                                                      | 16/48 [00:44<01:15,  2.35s/it]

====29. Kata Tetun Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 35%|█████████████████████████████                                                     | 17/48 [00:45<01:03,  2.05s/it]

====31. Kamus Sumbawa-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 38%|██████████████████████████████▊                                                   | 18/48 [00:47<01:01,  2.06s/it]

====32. Kamus Melayu Langkat-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 40%|████████████████████████████████▍                                                 | 19/48 [00:49<00:56,  1.95s/it]

====33. Kamus Wolio Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 42%|██████████████████████████████████▏                                               | 20/48 [00:51<00:54,  1.94s/it]

====34. Kamus Bahasa Indonesia-Bali L-Z (1998)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 44%|███████████████████████████████████▉                                              | 21/48 [00:54<01:06,  2.45s/it]

====36. Kamus Bahasa Indonesia-Kulawi (2012)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 46%|█████████████████████████████████████▌                                            | 22/48 [00:57<01:05,  2.51s/it]

====38. Kamus Bahasa Indonesia-Karo L-Z (1999)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 48%|███████████████████████████████████████▎                                          | 23/48 [01:04<01:33,  3.74s/it]

====4. Kamus Bahasa Indonesia-Jambi A-K (1998)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 50%|█████████████████████████████████████████                                         | 24/48 [01:05<01:16,  3.17s/it]

====41. Kamus Bahasa Indonesia-Bali A-K (1997)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 52%|██████████████████████████████████████████▋                                       | 25/48 [01:09<01:18,  3.40s/it]

====42. Kamus Bahasa Indonesia-Bahasa Sunda II (1993)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 54%|████████████████████████████████████████████▍                                     | 26/48 [01:14<01:24,  3.82s/it]

====44. Kamus Melayu Deli-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 56%|██████████████████████████████████████████████▏                                   | 27/48 [01:15<01:04,  3.05s/it]

====46. Kamus Bahasa Banjar Dialek Hulu-Indonesia (2008)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 58%|███████████████████████████████████████████████▊                                  | 28/48 [01:21<01:16,  3.82s/it]

====5. Kamus Bahasa Indonesia-Bahasa Tonsea I (1996)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 60%|█████████████████████████████████████████████████▌                                | 29/48 [01:22<00:56,  2.98s/it]

====51. Kamus Bahasa Bali Kuno-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 62%|███████████████████████████████████████████████████▎                              | 30/48 [01:23<00:43,  2.40s/it]

====52. Kamus Ogan-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 65%|████████████████████████████████████████████████████▉                             | 31/48 [01:25<00:40,  2.38s/it]

====54. Kamus Bahasa Indonesia Mentawai (1998)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 67%|██████████████████████████████████████████████████████▋                           | 32/48 [01:27<00:32,  2.01s/it]

====55. Kamus Bahasa Indonesia Bakumpai I (1995)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 69%|████████████████████████████████████████████████████████▍                         | 33/48 [01:29<00:31,  2.07s/it]

====56. Kamus Lampung-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 71%|██████████████████████████████████████████████████████████                        | 34/48 [01:31<00:28,  2.06s/it]

====58. Kamus Melayu Ketapang-Indonesia A-M (2010)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 73%|███████████████████████████████████████████████████████████▊                      | 35/48 [01:32<00:25,  1.96s/it]

====60. Kamus Sunda-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 75%|█████████████████████████████████████████████████████████████▌                    | 36/48 [01:37<00:34,  2.86s/it]

====63. Kamus Bahasa Indonesia-Lampung Dialek A (1999)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 77%|███████████████████████████████████████████████████████████████▏                  | 37/48 [01:40<00:31,  2.83s/it]

====66. Kamus Melayu Bali-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 79%|████████████████████████████████████████████████████████████████▉                 | 38/48 [01:41<00:23,  2.34s/it]

====68. Kamus Dwibahasa Bahasa Talaud - Bahasa Indonesia (2018)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 81%|██████████████████████████████████████████████████████████████████▋               | 39/48 [01:44<00:21,  2.35s/it]

====71. Kamus dwibahasa Bugis-Indonesia (2017)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 83%|████████████████████████████████████████████████████████████████████▎             | 40/48 [01:44<00:14,  1.76s/it]

====78. Kamus Tolaki-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 85%|██████████████████████████████████████████████████████████████████████            | 41/48 [01:46<00:11,  1.69s/it]

====8. Kamus Indonesia-Angkola (1995)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 88%|███████████████████████████████████████████████████████████████████████▊          | 42/48 [01:47<00:08,  1.48s/it]

====84. Kamus Bahasa Biak - Indonesia (1977) -hasil-ekstraksi-one_entry_from_JSON-font.csv====


 90%|█████████████████████████████████████████████████████████████████████████▍        | 43/48 [01:47<00:06,  1.23s/it]

====85. Kamus Tondano-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 92%|███████████████████████████████████████████████████████████████████████████▏      | 44/48 [01:50<00:07,  1.78s/it]

====87. Kamus Bahasa Indonesia-Kaidipang (A-K) (1999)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 94%|████████████████████████████████████████████████████████████████████████████▉     | 45/48 [01:56<00:08,  2.85s/it]

====89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indonesia (2017)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 96%|██████████████████████████████████████████████████████████████████████████████▌   | 46/48 [01:56<00:04,  2.14s/it]

====9. Kamus Manado-Indonesia (1985)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 47/48 [01:58<00:01,  1.94s/it]

====91. Kamus Simalungun - Indonesia (edisi kedua) (2015)-hasil-ekstraksi-one_entry_from_JSON-font.csv====


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [02:01<00:00,  2.54s/it]


### Main Program (XML) ###

In [17]:
# Build the "one entry" corpus for every per-dictionary CSV produced by the XML
# extraction, and write one result CSV per dictionary into `directory_hasil`.
directory_CSV = "CSV XML All Information"
directory_hasil = "CSV One Entry XML With Font + Posisi Approach"

# Without the output folder every to_csv call would fail and each dictionary
# would be misreported as "Kamus Gagal".
os.makedirs(directory_hasil, exist_ok=True)

# Skip sub-directories (e.g. .ipynb_checkpoints): read_csv cannot open them.
csv_filenames = [
    name for name in os.listdir(directory_CSV)
    if os.path.isfile(os.path.join(directory_CSV, name))
]

for filename in tqdm(csv_filenames):
    data = pd.read_csv(os.path.join(directory_CSV, filename))
    data = data.dropna()
    data = data.reset_index(drop=True)

    # Force every token to str. The previous loop only rebound its loop
    # variable, so the list itself was never converted.
    input_texts = [str(text) for text in data["kata"].values.tolist()]
    input_fonts = data["font"].values.tolist()

    # TODO (still needs to be fixed): the CSV only provides a single (x, y)
    # point per token, so the 4-tuple position repeats that point twice
    # instead of describing a real bounding box.
    list_x = data["x"].values.tolist()
    list_y = data["y"].values.tolist()
    input_posisi = [(x, y, x, y) for x, y in zip(list_x, list_y)]

    input_pages = data["page"].values.tolist()
    new_filename = os.path.splitext(filename)[0]

    # Only dictionaries whose font list contains both a bold and an italic
    # font are processed by the font-based approach.
    if is_contain_bold_and_italic(input_fonts):
        print("====" + new_filename + "====")
        try:
            entry, entry_with_font, entry_with_posisi, posisi_entry, page_entry = build_corpus_one_entry_by_font_from_XML(
                input_texts,
                input_fonts,
                input_posisi,
                input_pages
            )
            CSV_res = {
                "Entri": entry,
                "Entri with Font": entry_with_font,
                "Entri with Posisi": entry_with_posisi,
                "Posisi": posisi_entry,
                "Page": page_entry
            }

            result_csv = pd.DataFrame.from_dict(CSV_res)
            result_csv.to_csv(
                os.path.join(directory_hasil, new_filename + "-one_entry_from_XML_font_posisi.csv"),
                index=False
            )
        except Exception as exc:
            # Best effort: keep processing the remaining dictionaries, but
            # report why this one failed instead of hiding the error.
            print("==== Kamus Gagal ====")
            print(new_filename)
            print(type(exc).__name__ + ": " + str(exc))

 83%|████████████████████████████████████████████████████████████████████▎             | 15/18 [00:00<00:00, 62.44it/s]

====41. Kamus Bahasa Indonesia-Bali A-K (1997)-page_58_59_kata====
====43. Kamus Bahasa Indonesia-Bahasa Minangkabau I (1994)-page_23_24_kata====
====45. Kamus Bahasa Indonesia-Bahasa Gayo II (1996)-page_56_57_kata====
====54. Kamus Bahasa Indonesia Mentawai (1998)-page_23_24_kata====
====57. Kamus Bahasa Bugis-Indonesia (1977)-page_20_21_kata====
====61. Kamus Banjar-Indonesia (1977)-page_51_52_kata====
====68. Kamus Dwibahasa Bahasa Talaud - Bahasa Indonesia (2018)-page_19_20_kata====
====77. Kamus Samawa-Indonesia Edisi 2 (2017)-page_27_28_kata====


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 60.71it/s]

====89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indonesia (2017)-page_17_18_kata====
====91. Kamus Simalungun - Indonesia (edisi kedua) (2015)-page_48_49_kata====





### Cek Kamus ###

In [13]:
# Load one extracted dictionary (JSON font + position approach) for manual inspection.
target = "CSV One Entry JSON With Font + Posisi Approach/"
kamus_filename = "43. Kamus Bahasa Indonesia-Bahasa Minangkabau I (1994)-page_23_24-hasil-ekstraksi-one_entry_from_JSON-font-posisi.csv"
kamus = pd.read_csv(f"{target}{kamus_filename}")

In [14]:
# Show every row and the full, untruncated value of every column.
# option_context restores the previous option values when the block exits
# (even if display raises), instead of resetting every "display.*" option
# to its default as pd.reset_option("display") did.
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(kamus)

Unnamed: 0,Entri,entry_font_size_pos,posisi_entry,page
0,23,"[['23', 'times-roman', 10.97, [186.94, 29.39, 200.02, 44.73]]]","[186.94, 29.39, 200.02, 44.73]",1
1,anutan n 1 conto elok; ikutan; tuladan: guru hendaknya dapat menjadi -- murid-muridnya gum handaknyo dapek manjadi conto elok dek murik-muriknyo; 2 kayakinan diikuik,"[['anutan', 'times-bold', 11.21, [30.5, 56.89, 66.02, 72.82]], ['n', 'times-bolditalic', 11.43, [69.84, 58.42, 76.0, 72.67]], ['1', 'times-bolditalic', 11.43, [82.17, 58.42, 82.16, 72.67]], ['conto', 'times-roman', 11.67, [92.15, 57.0, 116.05, 72.34]], ['elok;', 'times-roman', 11.67, [139.96, 57.0, 154.29, 72.34]], ['ikutan;', 'times-roman', 11.67, [168.65, 57.0, 187.76, 72.34]], ['tuladan:', 'times-roman', 11.67, [206.9, 57.0, 226.01, 72.34]], ['guru', 'times-italic', 11.73, [250.55, 58.92, 272.67, 72.97]], ['hendaknya', 'times-italic', 11.73, [278.21, 58.92, 322.45, 72.97]], ['dapat', 'times-italic', 11.73, [333.52, 58.92, 350.11, 72.97]], ['menjadi', 'times-italic', 11.44, [50.14, 71.17, 85.82, 85.22]], ['--', 'times-italic', 11.44, [90.92, 71.17, 96.01, 85.22]], ['murid-muridnya', 'times-italic', 11.44, [106.21, 71.17, 167.37, 85.22]], ['gum', 'times-roman', 11.56, [181.19, 70.45, 197.29, 85.79]], ['handaknyo', 'times-roman', 11.56, [202.66, 70.45, 245.59, 85.79]], ['dapek', 'times-roman', 11.56, [256.33, 70.45, 272.42, 85.79]], ['manjadi', 'times-roman', 11.56, [288.53, 70.45, 309.99, 85.79]], ['conto', 'times-roman', 11.56, [331.47, 70.45, 336.82, 85.79]], ['elok', 'times-roman', 11.38, [69.8, 82.95, 84.36, 98.29]], ['dek', 'times-roman', 11.38, [94.06, 82.95, 98.92, 98.29]], ['murik-muriknyo;', 'times-roman', 11.38, [113.47, 82.95, 171.7, 98.29]], ['2', 'times-roman', 11.38, [191.1, 82.95, 176.55, 98.29]], ['kayakinan', 'times-roman', 11.38, [200.8, 82.95, 220.22, 98.29]], ['diikuik', 'times-roman', 11.38, [268.73, 82.95, 268.74, 98.29]]]","[30.5, 56.89, 66.02, 72.82]",1
2,a.nyam. v,"[['a.nyam.', 'times-bold', 11.21, [30.69, 104.89, 70.45, 120.82]], ['v', 'times-italic', 9.96, [72.7, 105.96, 78.69, 120.01]]]","[30.69, 104.89, 70.45, 120.82]",1
3,menganyam v maanyam: dia pandai == daun kelapa inyo pandai maanyam daun karambia;,"[['menganyam', 'times-bold', 11.21, [82.09, 104.89, 141.87, 120.82]], ['v', 'times-italic', 10.22, [144.25, 105.96, 150.55, 120.01]], ['maanyam:', 'times-roman', 11.09, [153.34, 105.0, 200.57, 120.34]], ['dia', 'times-italic', 11.78, [203.75, 106.67, 220.16, 120.72]], ['pandai', 'times-italic', 11.78, [225.62, 106.67, 252.97, 120.72]], ['==', 'times-italic', 11.78, [263.9, 106.67, 263.91, 120.72]], ['daun', 'times-italic', 11.78, [280.31, 106.67, 285.78, 120.72]], ['kelapa', 'times-italic', 11.78, [307.65, 106.67, 318.59, 120.72]], ['inyo', 'times-roman', 11.03, [337.19, 105.75, 358.65, 121.09]], ['pandai', 'times-roman', 11.27, [50.64, 117.75, 81.47, 133.09]], ['maanyam', 'times-roman', 11.27, [86.61, 117.75, 117.44, 133.09]], ['daun', 'times-roman', 11.27, [127.72, 117.75, 137.99, 133.09]], ['karambia;', 'times-roman', 11.27, [153.41, 117.75, 184.23, 133.09]]]","[82.09, 104.89, 141.87, 120.82]",1
4,anyaman n dianyam; anyaman = =nya rapi sekali anyamannyo rancak bana,"[['anyaman', 'times-bold', 11.15, [50.64, 130.55, 95.99, 146.48]], ['n', 'times-italic', 10.48, [98.15, 131.87, 105.31, 145.92]], ['dianyam;', 'times-roman', 11.33, [128.26, 130.45, 164.96, 145.79]], ['anyaman', 'times-roman', 11.33, [175.44, 130.45, 201.66, 145.79]], ['=', 'times-roman', 11.33, [217.38, 130.45, 206.9, 145.79]], ['=nya', 'times-italic', 11.5, [222.69, 132.12, 242.44, 146.17]], ['rapi', 'times-italic', 11.5, [247.37, 132.12, 262.19, 146.17]], ['sekali', 'times-italic', 11.5, [272.05, 132.12, 291.81, 146.17]], ['anyamannyo', 'times-roman', 11.15, [300.94, 131.39, 358.88, 146.73]], ['rancak', 'times-roman', 11.27, [50.89, 143.89, 81.32, 159.23]], ['bana', 'times-roman', 11.27, [86.39, 143.89, 101.61, 159.23]]]","[50.64, 130.55, 95.99, 146.48]",1
5,"a.pa.bi.Ia p apobilo; bib: --Tuhan mengizinkan tahun depan saya in gin naik haji apobilo Tuhan maizinan, taun muko ambo taragak naiak haji","[['a.pa.bi.Ia', 'times-bold', 11.03, [30.69, 166.1, 75.96, 182.03]], ['p', 'times-roman', 11.67, [76.79, 166.0, 81.59, 181.34]], ['apobilo;', 'times-roman', 11.67, [86.4, 166.0, 120.02, 181.34]], ['bib:', 'times-roman', 11.67, [129.64, 166.0, 139.24, 181.34]], ['--Tuhan', 'times-italic', 11.27, [151.44, 167.62, 184.94, 181.67]], ['mengizinkan', 'times-italic', 11.27, [189.72, 167.62, 237.58, 181.67]], ['tahun', 'times-italic', 11.27, [247.14, 167.62, 261.51, 181.67]], ['depan', 'times-italic', 11.27, [275.85, 167.62, 285.44, 181.67]], ['saya', 'times-italic', 11.27, [304.56, 167.62, 304.58, 181.67]], ['in', 'times-italic', 11.27, [328.49, 167.62, 314.15, 181.67]], ['gin', 'times-italic', 11.27, [342.85, 167.62, 328.51, 181.67]], ['naik', 'times-italic', 11.56, [50.89, 180.81, 70.39, 194.86]], ['haji', 'times-italic', 11.56, [75.27, 180.81, 89.89, 194.86]], ['apobilo', 'times-roman', 11.5, [96.25, 179.64, 131.55, 194.98]], ['Tuhan', 'times-roman', 11.5, [136.59, 179.64, 156.76, 194.98]], ['maizinan,', 'times-roman', 11.5, [166.85, 179.64, 202.14, 194.98]], ['taun', 'times-roman', 11.5, [217.28, 179.64, 222.31, 194.98]], ['muko', 'times-roman', 11.5, [242.49, 179.64, 242.48, 194.98]], ['ambo', 'times-roman', 11.5, [267.7, 179.64, 262.65, 194.98]], ['taragak', 'times-roman', 11.5, [292.91, 179.64, 297.95, 194.98]], ['naiak', 'times-roman', 11.5, [333.25, 179.64, 323.16, 194.98]], ['haji', 'times-roman', 10.97, [50.64, 192.59, 68.95, 207.93]]]","[30.69, 166.1, 75.96, 182.03]",1
6,"a.pa.la.gi p apolai; labiah-labiah lai: disuruh saja mau, -- diberi upah disuruah sajo amuah, apolai diagiah upah","[['a.pa.la.gi', 'times-roman', 11.44, [30.94, 214.95, 73.59, 230.29]], ['p', 'times-roman', 11.44, [77.85, 214.95, 77.85, 230.29]], ['apolai;', 'times-roman', 11.44, [86.38, 214.95, 107.7, 230.29]], ['labiah-labiah', 'times-roman', 11.44, [120.5, 214.95, 163.14, 230.29]], ['lai:', 'times-roman', 11.44, [180.2, 214.95, 180.2, 230.29]], ['disuruh', 'times-italic', 11.78, [201.59, 216.56, 236.59, 230.61]], ['saja', 'times-italic', 11.78, [241.59, 216.56, 256.59, 230.61]], ['mau,', 'times-italic', 11.78, [266.59, 216.56, 276.59, 230.61]], ['--', 'times-italic', 11.78, [291.59, 216.56, 286.59, 230.61]], ['diberi', 'times-italic', 11.78, [306.59, 216.56, 316.59, 230.61]], ['upah', 'times-italic', 11.78, [341.59, 216.56, 336.59, 230.61]], ['disuruah', 'times-roman', 11.38, [50.89, 227.89, 88.76, 243.23]], ['sajo', 'times-roman', 11.38, [93.5, 227.89, 107.7, 243.23]], ['amuah,', 'times-roman', 11.38, [117.17, 227.89, 136.11, 243.23]], ['apolai', 'times-roman', 11.38, [150.31, 227.89, 164.52, 243.23]], ['diagiah', 'times-roman', 11.38, [183.45, 227.89, 197.66, 243.23]], ['upah', 'times-roman', 11.38, [221.32, 227.89, 216.6, 243.23]]]","[30.94, 214.95, 73.59, 230.29]",1
7,a.pa .tis a indak acuah; indak paduli: tidak baik bersikap -- terhadap lingkungan indak elok basikap indak acuah taadok lingkungan,"[['a.pa', 'times-roman', 11.61, [30.94, 250.2, 50.15, 265.54]], ['.tis', 'times-roman', 11.21, [51.1, 250.2, 66.89, 265.54]], ['a', 'times-italic', 11.56, [69.84, 251.37, 78.55, 265.42]], ['indak', 'times-roman', 11.5, [80.9, 250.0, 104.86, 265.34]], ['acuah;', 'times-roman', 11.5, [109.65, 250.0, 133.61, 265.34]], ['indak', 'times-roman', 11.5, [143.19, 250.0, 157.57, 265.34]], ['paduli:', 'times-roman', 11.5, [171.94, 250.0, 191.11, 265.34]], ['tidak', 'times-italic', 11.67, [210.0, 251.37, 234.25, 265.42]], ['baik', 'times-italic', 11.67, [239.1, 251.37, 253.65, 265.42]], ['bersikap', 'times-italic', 11.67, [263.35, 251.37, 292.45, 265.42]], ['--', 'times-italic', 11.67, [307.0, 251.37, 302.15, 265.42]], ['terhadap', 'times-italic', 11.67, [321.55, 251.37, 340.95, 265.42]], ['lingkungan', 'times-italic', 11.15, [51.1, 264.37, 102.45, 278.42]], ['indak', 'times-roman', 11.33, [104.4, 262.95, 128.18, 278.29]], ['elok', 'times-roman', 11.33, [132.94, 262.95, 147.21, 278.29]], ['basikap', 'times-roman', 11.33, [156.72, 262.95, 180.51, 278.29]], ['indak', 'times-roman', 11.33, [194.78, 262.95, 204.29, 278.29]], ['acuah', 'times-roman', 11.33, [223.32, 262.95, 228.07, 278.29]], ['taadok', 'times-roman', 11.33, [251.86, 262.95, 256.61, 278.29]], ['lingkungan', 'times-roman', 11.33, [285.16, 262.95, 304.18, 278.29]]]","[30.94, 250.2, 50.15, 265.54]",1
8,1 a.pel apel/ v wajik maikuti upacara rasmi (basipaik kamiliteran): --bendera wajik datang dl upacara bandera,"[['1', 'times-roman', 5.28, [31.44, 290.77, 33.59, 299.44]], ['a.pel', 'times-roman', 11.44, [34.29, 285.5, 56.55, 300.84]], ['apel/', 'times-roman', 11.95, [61.0, 285.5, 83.26, 300.84]], ['v', 'times-italic', 9.62, [95.04, 334.67, 100.63, 348.72]], ['wajik', 'times-roman', 11.78, [104.65, 333.5, 129.44, 348.84]], ['maikuti', 'times-roman', 11.78, [134.4, 333.5, 164.15, 348.84]], ['upacara', 'times-roman', 11.78, [174.06, 333.5, 198.86, 348.84]], ['rasmi', 'times-roman', 11.78, [213.72, 333.5, 223.65, 348.84]], ['(basipaik', 'times-roman', 11.78, [243.47, 333.5, 268.27, 348.84]], ['kamiliteran):', 'times-roman', 11.78, [293.05, 333.5, 332.72, 348.84]], ['--bendera', 'times-italic', 11.33, [51.1, 347.62, 97.87, 361.67]], ['wajik', 'times-roman', 11.33, [98.65, 346.2, 122.38, 361.54]], ['datang', 'times-roman', 11.33, [127.13, 346.2, 150.86, 361.54]], ['dl', 'times-roman', 11.33, [160.35, 346.2, 160.35, 361.54]], ['upacara', 'times-roman', 11.33, [174.59, 346.2, 193.57, 361.54]], ['bandera', 'times-roman', 11.33, [212.56, 346.2, 226.79, 361.54]]]","[31.44, 290.77, 33.59, 299.44]",1
9,ap.li.ka.si n 1 jaik aplikasi; jaik tempe: taplak meja itu diberi hiasan -- aleh meja tu diagiah iasan jaik tempe; 2 panggunaan; panerapan: apa yg disampaikan dl rapat perlu --nya apo disampaian dl rapek paralu aplikasinyo ap.re.sLa.si /apresiasi! n 1 kasadaran taadok nilai-nilai seni dan budayo; 2 panilaian (tau maaragoi) taadok sasuatu; 3 kanaikan nilai barang km arago pasanyo naiak atau parnintaan barang batambah Ap.ril n bulan kaampek tuan Masihi (tigo puluah an) ap.ri.o.ri adv anyo balandasan tiori indak mancaliak dan manyali- diaki kaadaan sabananyo; apriori: kita tidak boleh bersikap -- awak indak buliah basikap apriori 24,"[['ap.li.ka.si', 'times-bold', 11.09, [31.44, 368.64, 77.8, 384.57]], ['n', 'times-bolditalic', 10.72, [80.15, 369.96, 85.57, 384.21]], ['1', 'times-bolditalic', 10.72, [90.99, 369.96, 90.99, 384.21]], ['jaik', 'times-roman', 11.33, [97.9, 368.29, 115.25, 383.63]], ['aplikasi;', 'times-roman', 11.33, [119.59, 368.29, 154.29, 383.63]], ['jaik', 'times-roman', 11.33, [162.97, 368.29, 171.64, 383.63]], ['tempe:', 'times-roman', 11.33, [184.66, 368.29, 197.67, 383.63]], ['taplak', 'times-italic', 11.5, [214.09, 369.71, 240.98, 383.76]], ['meja', 'times-italic', 11.5, [245.47, 369.71, 258.91, 383.76]], ['itu', 'times-italic', 11.5, [267.88, 369.71, 272.36, 383.76]], ['diberi', 'times-italic', 11.5, [285.81, 369.71, 299.25, 383.76]], ['hiasan', 'times-italic', 11.5, [317.19, 369.71, 326.14, 383.76]], ['--', 'times-italic', 11.5, [348.57, 369.71, 335.1, 383.76]], ['aleh', 'times-roman', 11.21, [51.35, 381.75, 69.46, 397.09]], ['meja', 'times-roman', 11.21, [73.99, 381.75, 87.57, 397.09]], ['tu', 'times-roman', 11.21, [96.63, 381.75, 96.62, 397.09]], ['diagiah', 'times-roman', 11.21, [110.21, 381.75, 128.31, 397.09]], ['iasan', 'times-roman', 11.21, [146.43, 381.75, 150.95, 397.09]], ['jaik', 'times-roman', 11.21, [173.59, 381.75, 169.06, 397.09]], ['tempe;', 'times-roman', 11.21, [196.23, 381.75, 196.22, 397.09]], ['2', 'times-roman', 11.21, 
[227.92, 381.75, 200.75, 397.09]], ['panggunaan;', 'times-roman', 11.21, [236.97, 381.75, 250.55, 397.09]], ['panerapan:', 'times-roman', 11.21, [291.3, 381.75, 295.82, 397.09]], ['apa', 'times-italic', 11.56, [340.3, 383.17, 360.63, 397.22]], ['yg', 'times-italic', 11.61, [50.39, 395.62, 60.26, 409.67]], ['disampaikan', 'times-italic', 11.61, [65.19, 395.62, 114.53, 409.67]], ['dl', 'times-italic', 11.61, [124.39, 395.62, 124.4, 409.67]], ['rapat', 'times-italic', 11.61, [139.19, 395.62, 149.07, 409.67]], ['perlu', 'times-italic', 11.61, [168.79, 395.62, 173.74, 409.67]], ['--nya', 'times-italic', 11.61, [198.39, 395.62, 198.41, 409.67]], ['apo', 'times-roman', 11.5, [223.9, 394.7, 238.84, 410.04]], ['disampaian', 'times-roman', 11.5, [263.74, 394.7, 303.58, 410.04]], ['dl', 'times-roman', 11.5, [318.52, 394.7, 313.54, 410.04]], ['rapek', 'times-roman', 11.5, [333.46, 394.7, 338.44, 410.04]], ['paralu', 'times-roman', 11.27, [51.1, 407.39, 79.1, 422.73]], ['aplikasinyo', 'times-roman', 11.27, [83.77, 407.39, 130.44, 422.73]], ['ap.re.sLa.si', 'times-roman', 11.33, [31.69, 429.5, 85.59, 444.84]], ['/apresiasi!', 'times-roman', 11.27, [83.5, 430.0, 134.39, 445.34]], ['n', 'times-italic', 11.09, [136.09, 430.67, 141.44, 444.72]], ['1', 'times-italic', 11.09, [146.78, 430.67, 146.79, 444.72]], ['kasadaran', 'times-roman', 11.27, [153.84, 429.5, 194.31, 444.84]], ['taadok', 'times-roman', 11.27, [198.81, 429.5, 221.29, 444.84]], ['nilai-nilai', 'times-roman', 11.27, [230.29, 429.5, 270.76, 444.84]], ['seni', 'times-roman', 11.27, [284.25, 429.5, 288.75, 444.84]], ['dan', 'times-roman', 11.27, [306.73, 429.5, 302.24, 444.84]], ['budayo;', 'times-roman', 11.27, [324.72, 429.5, 333.72, 444.84]], ['2', 'times-roman', 11.44, [51.35, 442.95, 55.97, 458.29]], ['panilaian', 'times-roman', 11.44, [60.59, 442.95, 97.53, 458.29]], ['(tau', 'times-roman', 11.44, [106.77, 442.95, 116.0, 458.29]], ['maaragoi)', 'times-roman', 11.44, [129.86, 442.95, 157.56, 458.29]], ['taadok', 
'times-roman', 11.44, [176.04, 442.95, 185.27, 458.29]], ['sasuatu;', 'times-roman', 11.44, [208.37, 442.95, 222.22, 458.29]], ['3', 'times-roman', 11.44, [249.93, 442.95, 226.84, 458.29]], ['kanaikan', 'times-roman', 11.44, [259.17, 442.95, 263.79, 458.29]], ['nilai', 'times-roman', 11.44, [300.73, 442.95, 286.88, 458.29]], ['barang', 'times-roman', 11.44, [328.44, 442.95, 314.59, 458.29]], ['km', 'times-roman', 11.38, [51.35, 455.64, 61.13, 470.98]], ['arago', 'times-roman', 11.38, [66.03, 455.64, 85.59, 470.98]], ['pasanyo', 'times-roman', 11.38, [95.38, 455.64, 119.83, 470.98]], ['naiak', 'times-roman', 11.38, [134.51, 455.64, 144.29, 470.98]], ['atau', 'times-roman', 11.38, [163.86, 455.64, 163.86, 470.98]], ['parnintaan', 'times-roman', 11.38, [188.32, 455.64, 212.78, 470.98]], ['barang', 'times-roman', 11.38, [242.13, 455.64, 242.13, 470.98]], ['batambah', 'times-roman', 11.38, [276.37, 455.64, 281.26, 470.98]], ['Ap.ril', 'times-roman', 11.44, [31.89, 478.0, 61.62, 493.34]], ['n', 'times-italic', 10.73, [63.6, 479.37, 71.1, 493.42]], ['bulan', 'times-roman', 11.38, [72.7, 478.0, 96.81, 493.34]], ['kaampek', 'times-roman', 11.38, [101.63, 478.0, 130.56, 493.34]], ['tuan', 'times-roman', 11.38, [140.21, 478.0, 149.85, 493.34]], ['Masihi', 'times-roman', 11.38, [164.32, 478.0, 178.78, 493.34]], ['(tigo', 'times-roman', 11.38, [198.07, 478.0, 202.89, 493.34]], ['puluah', 'times-roman', 11.38, [227.0, 478.0, 231.82, 493.34]], ['an)', 'times-roman', 11.38, [260.75, 478.0, 246.29, 493.34]], ['ap.ri.o.ri', 'times-roman', 11.61, [31.69, 500.54, 74.69, 515.88]], ['adv', 'times-italic', 11.03, [77.04, 501.46, 94.96, 515.51]], ['anyo', 'times-roman', 11.44, [97.7, 500.54, 116.74, 515.88]], ['balandasan', 'times-roman', 11.44, [121.5, 500.54, 164.34, 515.88]], ['tiori', 'times-roman', 11.44, [173.86, 500.54, 188.14, 515.88]], ['indak', 'times-roman', 11.44, [221.46, 500.54, 226.22, 515.88]], ['mancaliak', 'times-roman', 11.44, [250.02, 500.54, 269.06, 515.88]], ['dan', 
'times-roman', 11.44, [297.62, 500.54, 283.34, 515.88]], ['manyali-', 'times-roman', 11.44, [316.66, 500.54, 321.42, 515.88]], ['diaki', 'times-roman', 11.44, [51.1, 514.0, 74.81, 529.34]], ['kaadaan', 'times-roman', 11.44, [79.56, 514.0, 108.01, 529.34]], ['sabananyo;', 'times-roman', 11.44, [136.47, 514.0, 169.67, 529.34]], ['apriori:', 'times-roman', 11.44, [188.64, 514.0, 207.61, 529.34]], ['kita', 'times-italic', 11.56, [230.9, 515.17, 248.87, 529.22]], ['tidak', 'times-italic', 11.56, [253.37, 515.17, 271.34, 529.22]], ['boleh', 'times-italic', 11.56, [280.33, 515.17, 293.81, 529.22]], ['bersikap', 'times-italic', 11.56, [307.29, 515.17, 329.76, 529.22]], ['--', 'times-italic', 11.56, [347.73, 515.17, 338.75, 529.22]], ['awak', 'times-roman', 11.33, [51.6, 526.69, 70.26, 542.03]], ['indak', 'times-roman', 11.33, [74.93, 526.69, 93.59, 542.03]], ['buliah', 'times-roman', 11.33, [102.92, 526.69, 121.58, 542.03]], ['basikap', 'times-roman', 11.33, [135.58, 526.69, 154.24, 542.03]], ['apriori', 'times-roman', 11.33, [172.91, 526.69, 186.9, 542.03]], ['24', 'times-roman', 10.85, [198.25, 30.6, 211.04, 45.94]]]","[31.44, 368.64, 77.8, 384.57]","[1, 2]"
