In [1]:
import pandas as pd
import numpy as np
import os
import re
import json as js
from pathlib import Path
from tqdm import tqdm
from ast import literal_eval

In [2]:
directory_kamus = "Daftar Kamus Analisis Machine Readable"
directory_kamus_full = "[Full] Daftar Kamus Ekstraksi"

### Algoritma One Entry Corpus ###

In [3]:
POS = ["v","a","n","pron","adv","num","p"]

In [4]:
# Algoritma Tambahan
def is_contain_bold_and_italic(font):
    contains_bold = False; contains_italic = False
    for i in font:
        if "bold" in i.lower(): contains_bold = True
        if "italic" in i.lower(): contains_italic = True
        if contains_bold == True and contains_italic == True: return True
    return False

def is_last_fonem(s): # baru dapat handle fonem (/../) dan ([...])
    if re.match(r'^.*\]$',str(s)): return True
    if re.match(r'^.*\/$',str(s)): return True
    return False

def is_start_fonem(s): # baru dapat handle fonem (/../) dan ([...])
    if re.match(r'^\[.*',str(s)): return True
    if re.match(r'^\/.*',str(s)): return True
    return False

def is_bold_contains_POS(s):
    kata = s.strip()
    
    if len(kata) > 1:
        if is_contain_only_whitespaces(kata[-2]) and (kata[-1] in POS): return True
    else:
        if (kata[-1] in POS): return True
    
    return False

def is_contain_only_whitespaces(s):
    if re.match(r'^\s*$', str(s)): return True
    return False

def is_end_entri(s):
    symbol = [";",",",":"]
    if s in symbol:
        return True
    else:
        return False

In [74]:
# make entry by font
def make_entry_by_font(data):
    result = {
        "Entri":[],
        "entry_font_size_pos":[],
        "posisi_entry":[],
        "page":[]
    }
    
    entry = []; entry_with_font_size_pos = []; pos_dummy = None; page_dummy = None
    
    for ind in data.index:
        txt = data["text"][ind]
        size = data["size"][ind]
        size = round(size,2)
        fnt = data["font"][ind].lower()
        x0 = round(data["x0"][ind],2)
        y0 = round(data["y0"][ind],2)
        x1 = round(data["x1"][ind],2)
        y1 = round(data["y1"][ind],2)
        pos = [x0,y0,x1,y1]
        page = data["page"][ind]
        
        if "bold" in fnt and entry == []: # start entry
            entry.append(txt)
            entry_with_font_size_pos.append([txt,fnt,size,[x0,y0,x1,y1]])
            pos_dummy = pos
            page_dummy = page
            
        elif "bold" in fnt and entry != []:
            prev_txt = data["text"][ind-1].strip()
            prev_fnt = data["font"][ind-1].lower()
            entry_result = " ".join(entry).strip()
            
            if "bold" in prev_fnt and not txt[0].isdigit() and is_bold_contains_POS(entry_result): # handle prakategorial tanpa koma
                result["Entri"].append(entry_result)
                result["entry_font_size_pos"].append(entry_with_font_size_pos)
                result["posisi_entry"].append(pos_dummy)
                
                if page == page_dummy: 
                    result["page"].append(page_dummy)
                else:
                    result["page"].append([page_dummy,page])
                    
                entry = []; entry_with_font_size_pos = []; pos_dummy = None; page_dummy = None
                entry.append(txt) # mulai entry baru
                entry_with_font_size_pos.append([txt,fnt,size,[x0,y0,x1,y1]])
                pos_dummy = pos
                page_dummy = page
                
            elif "bold" in prev_fnt and (txt[0].isdigit() or not is_bold_contains_POS(entry_result)): # handle kata bold yang terpisah
                entry.append(txt) 
                entry_with_font_size_pos.append([txt,fnt,size,[x0,y0,x1,y1]])
                
            elif "bold" not in prev_fnt and (txt[0].isdigit() or is_start_fonem(txt)): # polisemi dan fonem bold
                entry.append(txt) 
                entry_with_font_size_pos.append([txt,fnt,size,[x0,y0,x1,y1]])
                
            else: 
                result["Entri"].append(entry_result)
                result["entry_font_size_pos"].append(entry_with_font_size_pos)
                result["posisi_entry"].append(pos_dummy)
                
                if page == page_dummy: 
                    result["page"].append(page_dummy)
                else:
                    result["page"].append([page_dummy,page])
                    
                entry = []; entry_with_font_size_pos = []; pos_dummy = None; page_dummy = None
                entry.append(txt) # mulai entry baru
                entry_with_font_size_pos.append([txt,fnt,size,[x0,y0,x1,y1]])
                pos_dummy = pos
                page_dummy = page
                
        elif "bold" not in fnt.lower() and entry != []:
            entry.append(txt) 
            entry_with_font_size_pos.append([txt,fnt,size,[x0,y0,x1,y1]])
            
        else:
            result["Entri"].append(txt.strip())
            result["entry_font_size_pos"].append([[txt,fnt,size,[x0,y0,x1,y1]]])
            result["posisi_entry"].append(pos)
            result["page"].append(page)
            

    if entry != []: # jika ada entry yang tertinggal
        entry_result = " ".join(entry).strip()
        result["Entri"].append(entry_result)
        result["entry_font_size_pos"].append(entry_with_font_size_pos)
        result["posisi_entry"].append(pos_dummy)
        result["page"].append(page_dummy)

    return result

In [75]:
# algoritma bersihkan entry dari fonem
def clean_entry(data):
    result = {
        "Entri":[],
        "entry_font_size_pos":[],
        "posisi_entry":[],
        "page":[]
    }
    
    for i in range(len(data["Entri"])): # remove fonem
        txt = data["Entri"][i] # data text
        
        if not is_contain_only_whitespaces(txt):
            
            entry_font_size_pos = data["entry_font_size_pos"][i]
            txt = re.sub(r'\[.*?\]',"",txt)
            entry_font_size_pos = clean_entry_font_size_paranthesis(entry_font_size_pos)

            txt = re.sub(r'\/.*?\/',"",txt)
            entry_font_size_pos = clean_entry_font_size_slash(entry_font_size_pos)

            clean = re.sub(' +', ' ', txt) ## remove multiple whitespace
            result["Entri"].append(clean.strip())
            result["entry_font_size_pos"].append(entry_font_size_pos)

            result['posisi_entry'].append(data['posisi_entry'][i])
            result['page'].append(data['page'][i])
    
    for j in range(1,len(result['Entri'])): # fix symbol
        array_simbol = []; array_simbol_font_size_pos = []
        
        prev_txt_split = result["Entri"][j-1].split(" ")
        prev_entri_font_size_pos = result['entry_font_size_pos'][j-1]
        
        # buang seluruh simbol, kecuali ; pada entri sebelumnya
        while (prev_txt_split[-1] != "") and (not is_end_entri(prev_txt_split[-1][-1])):
            if (prev_txt_split[-1][0].isalnum()) or (prev_txt_split[-1][-1].isalnum()): 
                break
                
            else:
                if (prev_txt_split==[] or prev_entri_font_size_pos == []):break
                
                array_simbol.append(prev_txt_split[-1])
                array_simbol_font_size_pos.append(prev_entri_font_size_pos[-1])
                del prev_txt_split[-1]
                del prev_entri_font_size_pos[-1]
                
                result["Entri"][j-1] = " ".join(prev_txt_split)
                result['entry_font_size_pos'][j-1] = prev_entri_font_size_pos
            
            if (prev_txt_split==[] or prev_entri_font_size_pos == []):break
        
        txt_split = result['Entri'][j].split(" ")
        if is_end_entri(txt_split[0]): 
            result['Entri'][j-1] = result['Entri'][j-1] + txt_split[0]
            result['entry_font_size_pos'][j-1].append(result['entry_font_size_pos'][j][0])
            
            del txt_split[0]
            result['entry_font_size_pos'][j] = result['entry_font_size_pos'][j][1:]
            result['Entri'][j] = " ".join(txt_split)
        
        if array_simbol != []:
            new_entry = []
            new_entry.extend(array_simbol)
            new_entry.extend(txt_split)
            result['Entri'][j] = " ".join(new_entry)
            
            new_entry_font_size_pos = []
            new_entry_font_size_pos.extend(array_simbol_font_size_pos)
            new_entry_font_size_pos.extend(result['entry_font_size_pos'][j])
            result['entry_font_size_pos'][j] = new_entry_font_size_pos    
    
    for l in range(len(result['entry_font_size_pos'])):
        if result['entry_font_size_pos'][l] != []:
            result['posisi_entry'][l] = result['entry_font_size_pos'][l][0][-1]
        
    return result

In [76]:
def clean_entry_font_size_paranthesis(data):
    clean_data = []
    i = 0
    
    while i < len(data):
        txt = data[i][0]
        if re.match(r'^.*\[.*?\].*$',str(txt)): ## kasus ...[..]...
            clean = re.sub(r'\[.*?\]',"",txt)
            if clean == "":
                i += 1
            else:
                clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])
                i += 1
        elif re.match(r'^.*\[.*',str(txt)): ## kasus ...[...
            nxt = i+1
            if nxt > len(data)-1: # i di indeks terakhir
                clean_data.append(data[i])
                break
                
            nxt_txt = data[nxt][0]
            while not re.match(r'^.*\].*$',str(nxt_txt)): # mencari "...]...."
                nxt += 1
                if nxt > len(data)-1: break
                nxt_txt = data[nxt][0]
            
            if nxt > len(data)-1: # jika "....]..." tidak ditemukan
                for k in range(i,nxt):
                    clean_data.append(data[k])
                break
            else:
                ## append [ pertama
                clean = txt.split("[",1)[0]
                if clean != "":
                    clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])
                    
                ## append ] kedua
                clean_nxt = nxt_txt.split("]",1)[1]
                if clean_nxt != "":
                    clean_data.append([clean_nxt.strip(),data[nxt][1],data[nxt][2],data[i][3]])
                
                i = nxt+1
        else:
            clean_data.append(data[i])
            i += 1
    
    return clean_data


def clean_entry_font_size_slash(data):
    clean_data = []
    i = 0
    
    while i < len(data):
        txt = data[i][0]
        if re.match(r'^.*\/.*?\/.*$',str(txt)): ## kasus .../../...
            clean = re.sub(r'\/.*?\/',"",txt)
            if clean == "":
                i += 1
            else:
                clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])
                i += 1
        elif re.match(r'^.*\/.*',str(txt)): ## kasus .../...
            nxt = i+1
            if nxt > len(data)-1: # i di indeks terakhir
                clean_data.append(data[i])
                break
                
            nxt_txt = data[nxt][0]
            while not re.match(r'^.*\/.*$',str(nxt_txt)): # mencari ".../...."
                nxt += 1
                if nxt > len(data)-1: break
                nxt_txt = data[nxt][0]
            
            if nxt > len(data)-1: # jika "..../..." tidak ditemukan
                for k in range(i,nxt):
                    clean_data.append(data[k])
                break
            else:
                ## append / pertama
                clean = txt.split("/",1)[0]
                if clean != "":
                    clean_data.append([clean.strip(),data[i][1],data[i][2],data[i][3]])
                    
                ## append / kedua
                clean_nxt = nxt_txt.split("/",1)[1]
                if clean_nxt != "":
                    clean_data.append([clean_nxt.strip(),data[nxt][1],data[nxt][2],data[i][3]])
                
                i = nxt+1
        else:
            clean_data.append(data[i])
            i += 1
    
    return clean_data

In [77]:
def fix_page(pages):
    clean_page = []
    cnt = 1;
    
    for i in range(len(pages)):
        if i == 0:
            clean_page.append(cnt)
        else:
            if isinstance(pages[i], list):
                clean_page.append([cnt,cnt+1])
                cnt += 1
            else:
                if isinstance(pages[i-1], list):
                    clean_page.append(cnt)
                else:
                    if pages[i] == pages[i-1]:
                        clean_page.append(cnt)
                    else:
                        cnt += 1
                        clean_page.append(cnt)
    return clean_page

In [79]:
# memisahkan prakategorial
def seperate_prakategorial(data):
    result = {
        "Entri":[],
        "entry_font_size_pos":[],
        "posisi_entry":[],
        "page":[]
    }
    
    for i in range(len(data["Entri"])):
        txt = data["Entri"][i]
        split_txt = txt.strip().split(",",1)
        
        if len(split_txt) < 2 or txt[-1] == ",": # tidak terdapat koma atau koma berada di akhir
            result['Entri'].append(txt)
            result['entry_font_size_pos'].append(data['entry_font_size_pos'][i])
            result['page'].append(data['page'][i])
            result['posisi_entry'].append(data['posisi_entry'][i])
        
        else:
            frst_entri = split_txt[0].strip().split(" ")
            sec_entri = split_txt[1].strip().split(" ")
            
            for j in range(len(frst_entri)):
                frst_entri[j] = frst_entri[j].strip()
            
            for k in range(len(sec_entri)):
                sec_entri[k] = sec_entri[k].strip()
                
            if len(frst_entri) <= 2 and (frst_entri[0] == "" or frst_entri[0] == ","): # koma berada di awal entri
                result['Entri'].append(txt)
                result['entry_font_size_pos'].append(data['entry_font_size_pos'][i])
                result['page'].append(data['page'][i])
                result['posisi_entry'].append(data['posisi_entry'][i])
            
            else:
                inf_frst_entri = data['entry_font_size_pos'][i][:len(frst_entri)]
                
                if "bold" in inf_frst_entri[-1][1].lower() or frst_entri[-1] in POS:
                    if (len(frst_entri) + len(sec_entri)) == len(data['entry_font_size_pos'][i]): # kasus koma menempel
                        frst_entri[-1] = frst_entri[-1] + ","
                        inf_sec_entri = data['entry_font_size_pos'][i][len(frst_entri):]

                    else: # kasus koma tidak menempel
                        frst_entri.append(",")
                        inf_frst_entri = data['entry_font_size_pos'][i][:len(frst_entri)+1]
                        inf_sec_entri = data['entry_font_size_pos'][i][len(frst_entri)+1:]
                        
                    # entri pertama
                    result['Entri'].append(" ".join(frst_entri))
                    result['entry_font_size_pos'].append(inf_frst_entri)
                    result['page'].append(data['page'][i])
                    result['posisi_entry'].append(data['posisi_entry'][i])

                    # entri kedua
                    result['Entri'].append(" ".join(sec_entri))
                    result['entry_font_size_pos'].append(inf_sec_entri)
                    result['page'].append(data['page'][i])
                    result['posisi_entry'].append(data['posisi_entry'][i])

                else:
                    result['Entri'].append(txt)
                    result['entry_font_size_pos'].append(data['entry_font_size_pos'][i])
                    result['page'].append(data['page'][i])
                    result['posisi_entry'].append(data['posisi_entry'][i])
                
                
    return result

In [80]:
def categorize_prakategorial(entries):
    output = []
    
    for i in entries:
        txt_split = i.split(" ")
        if i == "" or len(i)==1:
            output.append(0)
        else:
            if re.match(r'.*\,$',str(i)) and len(txt_split) <= 3: 
                output.append(1)
            elif is_contain_only_whitespaces(i[-2]) and (i[-1] in POS):
                output.append(1)
            else:
                output.append(0)
    return output

In [81]:
def build_corpus_one_entry_by_font(data):
    # tahapan awal, pendekatan dengan font
    result = make_entry_by_font(data)
    clean_result = clean_entry(result)
    clean_result = seperate_prakategorial(clean_result)
    clean_result["is_padanan_lema"] = categorize_prakategorial(clean_result["Entri"])
    clean_result["page"] = fix_page(clean_result["page"])
    
    return clean_result

### Main Program

In [85]:
# ganti directory
directory_CSV = "CSV JSON all information"
directory_hasil = "CSV One Entry JSON With Font Approach"

for filename in tqdm(os.listdir(directory_CSV)):
    data = pd.read_csv(directory_CSV + "/" + filename)
    data = data.dropna()
    data = data.reset_index(drop=True)
    input_fonts = data["font"].values.tolist()
    new_filename = os.path.splitext(filename)[0]
    
    if is_contain_bold_and_italic(input_fonts):
        print("====" + new_filename + "====")
        CSV_res = build_corpus_one_entry_by_font(data)

        result_csv = pd.DataFrame.from_dict(CSV_res)
        result_csv = result_csv.reset_index(drop=True)
        
        result_csv = result_csv.dropna()
        result_csv = result_csv.reset_index(drop=True)
        
        result_csv.to_csv(directory_hasil + "/" + new_filename + "-one_entry_from_JSON-font.csv",index=False)

 22%|██████████████████▋                                                                 | 2/9 [00:00<00:00, 13.44it/s]

====41. Kamus Bahasa Indonesia-Bali A-K (1997)-page_58_59-hasil-ekstraksi====
====43. Kamus Bahasa Indonesia-Bahasa Minangkabau I (1994)-page_23_24-hasil-ekstraksi====
====54. Kamus Bahasa Indonesia Mentawai (1998)-page_23_24-hasil-ekstraksi====
====57. Kamus Bahasa Bugis-Indonesia (1977)-page_20_21-hasil-ekstraksi====


 67%|████████████████████████████████████████████████████████                            | 6/9 [00:00<00:00, 12.83it/s]

====61. Kamus Banjar-Indonesia (1977)-page_51_52-hasil-ekstraksi====
====68. Kamus Dwibahasa Bahasa Talaud - Bahasa Indonesia (2018)-page_19_20-hasil-ekstraksi====
====77. Kamus Samawa-Indonesia Edisi 2 (2017)-page_27_28-hasil-ekstraksi====


 89%|██████████████████████████████████████████████████████████████████████████▋         | 8/9 [00:00<00:00,  9.51it/s]

====89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indonesia (2017)-page_17_18-hasil-ekstraksi====


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 10.61it/s]

====91. Kamus Simalungun - Indonesia (edisi kedua) (2015)-page_48_49-hasil-ekstraksi====





In [86]:
directory_hasil = "CSV One Entry JSON With Font Approach"

# drop null data 
for filename in tqdm(os.listdir(directory_hasil)):
    data_clean = pd.read_csv(directory_hasil + "/" + filename)
    data_clean = data_clean.dropna()
    data_clean = data_clean[data_clean["entry_font_size_pos"] != "[]"]
    data_clean = data_clean.reset_index(drop=True)
    
    data_clean.to_csv(directory_hasil + "/" + filename,index=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 84.38it/s]


### Main Program Kamus Full (JSON)

In [87]:
directory_CSV = "[Full] CSV JSON all information - Final"
directory_hasil = "[Full] CSV One Entry JSON With Font Approach"

for filename in tqdm(os.listdir(directory_CSV)):
    data = pd.read_csv(directory_CSV + "/" + filename)
    
    data = data.dropna()
    data = data.reset_index(drop=True)
    
    input_fonts = data["font"].values.tolist()
    new_filename = os.path.splitext(filename)[0]
    
    if is_contain_bold_and_italic(input_fonts):
        print("====" + new_filename + "====")
        CSV_res = build_corpus_one_entry_by_font(data)
        
        result_csv = pd.DataFrame.from_dict(CSV_res)
        result_csv = result_csv[result_csv["Entri"] != ""]
        result_csv = result_csv[result_csv["entry_font_size_pos"] != "[]"]
        result_csv = result_csv.reset_index(drop=True)

        result_csv = pd.DataFrame.from_dict(CSV_res)
        result_csv.to_csv(directory_hasil + "/" + new_filename + "-one_entry_from_JSON-font.csv",index=False)

  0%|                                                                                           | 0/47 [00:00<?, ?it/s]

====10. Kamus Bahasa Indonesia-Dayak Deah Edisi I (2013)-hasil-ekstraksi====


  2%|█▊                                                                                 | 1/47 [00:06<04:39,  6.07s/it]

====12. Kamus Bahasa Indonesia-Kaidipang L-Z (2000)-hasil-ekstraksi====


  4%|███▌                                                                               | 2/47 [00:19<07:57, 10.62s/it]

====14. Kamus Bahasa Indonesia-Bahasa Minangkabau II (1994)-hasil-ekstraksi====


  6%|█████▎                                                                             | 3/47 [00:32<08:35, 11.71s/it]

====15. Kamus Bahasa Indonesia-Pasir (1997)-hasil-ekstraksi====


  9%|███████                                                                            | 4/47 [00:45<08:39, 12.07s/it]

====16. Kamus Bahasa Indonesia Karo A-K (1998)-hasil-ekstraksi====


 11%|████████▊                                                                          | 5/47 [00:58<08:40, 12.40s/it]

====17. Kamus Melayu Makasar-Indonesia (1985)-hasil-ekstraksi====


 13%|██████████▌                                                                        | 6/47 [01:02<06:37,  9.69s/it]

====18. Kamus Bahasa Jawa-Bahasa Indonesia I (1993)-hasil-ekstraksi====


 15%|████████████▎                                                                      | 7/47 [01:11<06:15,  9.38s/it]

====19. Kamus Bahasa Indoensia-Melayu Riau (1997)-hasil-ekstraksi====


 17%|██████████████▏                                                                    | 8/47 [01:23<06:37, 10.18s/it]

====2. Kamus Melayu-Indonesia (1985)-hasil-ekstraksi====


 19%|███████████████▉                                                                   | 9/47 [01:31<06:01,  9.52s/it]

====20. Kamus Bahasa Melayu Ambon-Indonesia (1998)-hasil-ekstraksi====


 21%|█████████████████▍                                                                | 10/47 [01:35<04:44,  7.68s/it]

====21. Kamus Bahasa Indonesia-Sentani A-K (1999)-hasil-ekstraksi====


 23%|███████████████████▏                                                              | 11/47 [01:39<04:00,  6.68s/it]

====23. Kamus Dwibahasa Dayak Ngaju-Indonesia (2013)-hasil-ekstraksi====


 26%|████████████████████▉                                                             | 12/47 [01:44<03:30,  6.02s/it]

====24. Kamus Minangkabau-Indonesia (1985)-hasil-ekstraksi====


 28%|██████████████████████▋                                                           | 13/47 [01:53<03:55,  6.92s/it]

====26. Kamus Bahasa Indonesia-Bahasa Tonsea II (1996)-hasil-ekstraksi====


 30%|████████████████████████▍                                                         | 14/47 [01:56<03:08,  5.72s/it]

====27. Kamus Bahasa Indonesia-Saluan (2012)-hasil-ekstraksi====


 32%|██████████████████████████▏                                                       | 15/47 [01:59<02:41,  5.04s/it]

====28. Kamus Bahasa Kutai-Bahasa Indonesia (2013)-hasil-ekstraksi====


 34%|███████████████████████████▉                                                      | 16/47 [02:10<03:31,  6.82s/it]

====29. Kata Tetun Indonesia (1985)-hasil-ekstraksi====


 36%|█████████████████████████████▋                                                    | 17/47 [02:14<02:58,  5.93s/it]

====31. Kamus Sumbawa-Indonesia (1985)-hasil-ekstraksi====


 38%|███████████████████████████████▍                                                  | 18/47 [02:19<02:41,  5.58s/it]

====32. Kamus Melayu Langkat-Indonesia (1985)-hasil-ekstraksi====


 40%|█████████████████████████████████▏                                                | 19/47 [02:23<02:29,  5.33s/it]

====33. Kamus Wolio Indonesia (1985)-hasil-ekstraksi====


 43%|██████████████████████████████████▉                                               | 20/47 [02:29<02:22,  5.28s/it]

====34. Kamus Bahasa Indonesia-Bali L-Z (1998)-hasil-ekstraksi====


 45%|████████████████████████████████████▋                                             | 21/47 [02:38<02:52,  6.64s/it]

====36. Kamus Bahasa Indonesia-Kulawi (2012)-hasil-ekstraksi====


 47%|██████████████████████████████████████▍                                           | 22/47 [02:45<02:43,  6.54s/it]

====38. Kamus Bahasa Indonesia-Karo L-Z (1999)-hasil-ekstraksi====


 49%|████████████████████████████████████████▏                                         | 23/47 [03:02<03:56,  9.85s/it]

====4. Kamus Bahasa Indonesia-Jambi A-K (1998)-hasil-ekstraksi====


 51%|█████████████████████████████████████████▊                                        | 24/47 [03:07<03:14,  8.45s/it]

====41. Kamus Bahasa Indonesia-Bali A-K (1997)-hasil-ekstraksi====


 53%|███████████████████████████████████████████▌                                      | 25/47 [03:18<03:21,  9.15s/it]

====42. Kamus Bahasa Indonesia-Bahasa Sunda II (1993)-hasil-ekstraksi====


 55%|█████████████████████████████████████████████▎                                    | 26/47 [03:32<03:43, 10.63s/it]

====44. Kamus Melayu Deli-Indonesia (1985)-hasil-ekstraksi====


 57%|███████████████████████████████████████████████                                   | 27/47 [03:36<02:48,  8.41s/it]

====46. Kamus Bahasa Banjar Dialek Hulu-Indonesia (2008)-hasil-ekstraksi====


 60%|████████████████████████████████████████████████▊                                 | 28/47 [03:51<03:21, 10.59s/it]

====5. Kamus Bahasa Indonesia-Bahasa Tonsea I (1996)-hasil-ekstraksi====


 62%|██████████████████████████████████████████████████▌                               | 29/47 [03:54<02:30,  8.35s/it]

====51. Kamus Bahasa Bali Kuno-Indonesia (1985)-hasil-ekstraksi====


 64%|████████████████████████████████████████████████████▎                             | 30/47 [03:57<01:54,  6.71s/it]

====52. Kamus Ogan-Indonesia (1985)-hasil-ekstraksi====


 66%|██████████████████████████████████████████████████████                            | 31/47 [04:04<01:47,  6.72s/it]

====54. Kamus Bahasa Indonesia Mentawai (1998)-hasil-ekstraksi====


 68%|███████████████████████████████████████████████████████▊                          | 32/47 [04:06<01:21,  5.44s/it]

====55. Kamus Bahasa Indonesia Bakumpai I (1995)-hasil-ekstraksi====


 70%|█████████████████████████████████████████████████████████▌                        | 33/47 [04:12<01:14,  5.35s/it]

====56. Kamus Lampung-Indonesia (1985)-hasil-ekstraksi====


 72%|███████████████████████████████████████████████████████████▎                      | 34/47 [04:18<01:14,  5.74s/it]

====58. Kamus Melayu Ketapang-Indonesia A-M (2010)-hasil-ekstraksi====


 74%|█████████████████████████████████████████████████████████████                     | 35/47 [04:23<01:06,  5.50s/it]

====60. Kamus Sunda-Indonesia (1985)-hasil-ekstraksi====


 77%|██████████████████████████████████████████████████████████████▊                   | 36/47 [04:35<01:21,  7.45s/it]

====63. Kamus Bahasa Indonesia-Lampung Dialek A (1999)-hasil-ekstraksi====


 79%|████████████████████████████████████████████████████████████████▌                 | 37/47 [04:44<01:18,  7.87s/it]

====66. Kamus Melayu Bali-Indonesia (1985)-hasil-ekstraksi====


 81%|██████████████████████████████████████████████████████████████████▎               | 38/47 [04:47<00:58,  6.54s/it]

====68. Kamus Dwibahasa Bahasa Talaud - Bahasa Indonesia (2018)-hasil-ekstraksi====


 83%|████████████████████████████████████████████████████████████████████              | 39/47 [04:56<00:56,  7.07s/it]

====71. Kamus dwibahasa Bugis-Indonesia (2017)-hasil-ekstraksi====


 85%|█████████████████████████████████████████████████████████████████████▊            | 40/47 [04:57<00:37,  5.32s/it]

====78. Kamus Tolaki-Indonesia (1985)-hasil-ekstraksi====


 87%|███████████████████████████████████████████████████████████████████████▌          | 41/47 [05:01<00:30,  5.07s/it]

====8. Kamus Indonesia-Angkola (1995)-hasil-ekstraksi====


 89%|█████████████████████████████████████████████████████████████████████████▎        | 42/47 [05:06<00:24,  4.95s/it]

====85. Kamus Tondano-Indonesia (1985)-hasil-ekstraksi====


 91%|███████████████████████████████████████████████████████████████████████████       | 43/47 [05:15<00:24,  6.15s/it]

====87. Kamus Bahasa Indonesia-Kaidipang (A-K) (1999)-hasil-ekstraksi====


 94%|████████████████████████████████████████████████████████████████████████████▊     | 44/47 [05:27<00:23,  7.97s/it]

====89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indonesia (2017)-hasil-ekstraksi====


 96%|██████████████████████████████████████████████████████████████████████████████▌   | 45/47 [05:29<00:12,  6.07s/it]

====9. Kamus Manado-Indonesia (1985)-hasil-ekstraksi====


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 46/47 [05:33<00:05,  5.57s/it]

====91. Kamus Simalungun - Indonesia (edisi kedua) (2015)-hasil-ekstraksi====


100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [05:44<00:00,  7.34s/it]


In [88]:
directory_hasil = "[Full] CSV One Entry JSON With Font Approach"

for filename in tqdm(os.listdir(directory_hasil)):
    data_clean = pd.read_csv(directory_hasil + "/" + filename)
    data_clean = data_clean.dropna()
    data_clean = data_clean[data_clean["entry_font_size_pos"] != "[]"]
    data_clean = data_clean.reset_index(drop=True)
    
    data_clean.to_csv(directory_hasil + "/" + filename,index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:09<00:00,  5.26it/s]


### Main Program (XML) ###

In [9]:
directory_CSV = "CSV XML All Information"
directory_hasil = "CSV One Entry XML With Font Approach"

for filename in tqdm(os.listdir(directory_CSV)):
    data = pd.read_csv(directory_CSV + "/" + filename)
    data.rename(columns={"kata":"text"},inplace=True)
    data = data.dropna()
    data = data.reset_index(drop=True)
    input_fonts = data["font"].values.tolist()
    new_filename = os.path.splitext(filename)[0]
    
    if is_contain_bold_and_italic(input_fonts):
        print("====" + new_filename + "====")
        CSV_res = build_corpus_one_entry_by_font(data)

        result_csv = pd.DataFrame.from_dict(CSV_res)
        result_csv.to_csv(directory_hasil + "/" + new_filename + "-one_entry_from_XML.csv",index=False)
#         try:
#             CSV_res = build_corpus_one_entry_by_font(data)

#             result_csv = pd.DataFrame.from_dict(CSV_res)
#             result_csv.to_csv(directory_hasil + "/" + new_filename + "-one_entry_from_XML.csv",index=False)
#         except:
#             print("==== Kamus Gagal ====")
#             print(new_filename)

 17%|█████████████▊                                                                     | 3/18 [00:00<00:00, 42.85it/s]

====41. Kamus Bahasa Indonesia-Bali A-K (1997)-page_58_59_kata====





KeyError: 'size'

### Cek Kamus ###

In [23]:
kamus = pd.read_csv("coba 89-one_entry_from_JSON-font.csv")

In [24]:
kamus = kamus.dropna()
kamus = kamus[kamus["entry_font_size_pos"] != "[]"]

In [25]:
entry_font_size_pos = []

for i in kamus["entry_font_size_pos"].values.tolist():
    entry_font_size_pos.append(literal_eval(i))

In [26]:
# tampilkan seluruh baris dan seluruh nilai pada kolom
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

display(kamus)

# reset option
pd.reset_option("display")

Unnamed: 0,Entri,entry_font_size_pos,posisi_entry,page
0,1,"[['1', 'timesnewromanpsmt', 10.0, [221.4, 535.47, 228.9, 548.81]]]","[221.4, 535.47, 228.9, 548.81]",1
1,A aba n bete; talas: nja aba dun trung ibu membakar bete,"[['A', 'timesnewromanps-boldmt', 20.0, [216.6, 54.12, 236.0, 81.82]], ['aba', 'timesnewromanps-boldmt', 12.0, [85.0, 103.27, 103.4, 119.89]], ['n', 'timesnewromanps-italicmt', 12.0, [136.0, 104.39, 142.0, 119.04]], ['bete;', 'timesnewromanpsmt', 12.0, [142.0, 103.16, 166.5, 119.17]], ['talas:', 'timesnewromanpsmt', 12.0, [171.39, 103.16, 195.89, 119.17]], ['nja', 'timesnewromanps-italicmt', 12.0, [201.39, 104.39, 217.48, 119.04]], ['aba', 'timesnewromanps-italicmt', 12.0, [222.85, 104.39, 233.57, 119.04]], ['dun', 'timesnewromanps-italicmt', 12.0, [244.31, 104.39, 249.66, 119.04]], ['trung', 'timesnewromanps-italicmt', 12.0, [265.77, 104.39, 276.48, 119.04]], ['ibu', 'timesnewromanpsmt', 12.0, [293.2, 103.16, 311.3, 119.17]], ['membakar', 'timesnewromanpsmt', 12.0, [317.33, 103.16, 359.56, 119.17]], ['bete', 'timesnewromanpsmt', 12.0, [106.4, 116.96, 129.4, 132.97]]]","[216.6, 54.12, 236.0, 81.82]",1
2,abletsing a terbalik: nemot amblemba king on abletsing dia memakai baju terbalik,"[['abletsing', 'timesnewromanps-boldmt', 12.0, [85.0, 133.27, 130.78, 149.89]], ['a', 'timesnewromanps-italicmt', 12.0, [183.59, 134.39, 189.59, 149.04]], ['terbalik:', 'timesnewromanpsmt', 12.0, [189.59, 133.16, 236.6, 149.17]], ['nemot', 'timesnewromanps-italicmt', 12.0, [237.8, 134.39, 266.85, 149.04]], ['amblemba', 'timesnewromanps-italicmt', 12.0, [272.65, 134.39, 313.32, 149.04]], ['king', 'timesnewromanps-italicmt', 12.0, [324.93, 134.39, 336.56, 149.04]], ['on', 'timesnewromanps-italicmt', 12.0, [353.98, 134.39, 348.18, 149.04]], ['abletsing', 'timesnewromanps-italicmt', 12.0, [106.4, 147.98, 153.4, 162.64]], ['dia', 'timesnewromanpsmt', 12.0, [153.4, 146.76, 168.76, 162.77]], ['memakai', 'timesnewromanpsmt', 12.0, [173.88, 146.76, 204.6, 162.77]], ['baju', 'timesnewromanpsmt', 12.0, [214.84, 146.76, 225.08, 162.77]], ['terbalik', 'timesnewromanpsmt', 12.0, [240.44, 146.76, 266.04, 162.77]]]","[85.0, 133.27, 130.78, 149.89]",1
3,abusi n kakek: kandei abusi mse ket ong kakek saya masih hidup,"[['abusi', 'timesnewromanps-boldmt', 12.0, [85.0, 160.67, 115.2, 177.29]], ['n', 'timesnewromanps-italicmt', 12.0, [154.8, 161.79, 160.8, 176.44]], ['kakek:', 'timesnewromanpsmt', 12.0, [160.8, 160.56, 200.4, 176.57]], ['kandei', 'timesnewromanps-italicmt', 12.0, [202.0, 161.79, 233.9, 176.44]], ['abusi', 'timesnewromanps-italicmt', 12.0, [239.21, 161.79, 260.48, 176.44]], ['mse', 'timesnewromanps-italicmt', 12.0, [271.11, 161.79, 276.43, 176.44]], ['ket', 'timesnewromanps-italicmt', 12.0, [292.38, 161.79, 292.38, 176.44]], ['ong', 'timesnewromanps-italicmt', 12.0, [313.64, 161.79, 308.33, 176.44]], ['kakek', 'timesnewromanpsmt', 12.0, [329.59, 160.56, 365.58, 176.57]], ['saya', 'timesnewromanpsmt', 12.0, [106.4, 174.36, 128.0, 190.37]], ['masih', 'timesnewromanpsmt', 12.0, [133.4, 174.36, 155.0, 190.37]], ['hidup', 'timesnewromanpsmt', 12.0, [165.8, 174.36, 182.0, 190.37]]]","[85.0, 160.67, 115.2, 177.29]",1
4,agha n kakak; abang: nggo syoli kandei agha orang itu itu kakak saya;,"[['agha', 'timesnewromanps-boldmt', 12.0, [85.0, 190.67, 112.6, 207.29]], ['n', 'timesnewromanps-italicmt', 12.0, [152.0, 191.79, 158.0, 206.44]], ['kakak;', 'timesnewromanpsmt', 12.0, [158.0, 190.56, 194.64, 206.57]], ['abang:', 'timesnewromanpsmt', 12.0, [200.75, 190.56, 231.28, 206.57]], ['nggo', 'timesnewromanps-italicmt', 12.0, [240.78, 191.79, 262.96, 206.44]], ['syoli', 'timesnewromanps-italicmt', 12.0, [268.51, 191.79, 290.69, 206.44]], ['kandei', 'timesnewromanps-italicmt', 12.0, [301.79, 191.79, 323.97, 206.44]], ['agha', 'timesnewromanps-italicmt', 12.0, [340.61, 191.79, 346.15, 206.44]], ['orang', 'timesnewromanpsmt', 12.0, [106.4, 204.16, 130.6, 220.17]], ['itu', 'timesnewromanpsmt', 12.0, [135.44, 204.16, 145.12, 220.17]], ['itu', 'timesnewromanpsmt', 12.0, [154.8, 204.16, 159.64, 220.17]], ['kakak', 'timesnewromanpsmt', 12.0, [174.16, 204.16, 183.84, 220.17]], ['saya;', 'timesnewromanpsmt', 12.0, [203.2, 204.16, 208.04, 220.17]]]","[85.0, 190.67, 112.6, 207.29]",1
5,agha kambung n kakak perempuan: kandei agha kambung syoli mbei ati awum kakak perempuan saya sudah menikah,"[['agha', 'timesnewromanps-boldmt', 12.0, [106.4, 220.47, 133.2, 237.09]], ['kambung', 'timesnewromanps-boldmt', 12.0, [139.9, 220.47, 180.11, 237.09]], ['n', 'timesnewromanps-italicmt', 12.0, [262.79, 221.59, 268.79, 236.24]], ['kakak', 'timesnewromanpsmt', 12.0, [268.79, 220.36, 299.04, 236.37]], ['perempuan:', 'timesnewromanpsmt', 12.0, [305.09, 220.36, 359.53, 236.37]], ['kandei', 'timesnewromanps-italicmt', 12.0, [106.4, 235.18, 140.34, 249.84]], ['agha', 'timesnewromanps-italicmt', 12.0, [145.99, 235.18, 162.97, 249.84]], ['kambung', 'timesnewromanps-italicmt', 12.0, [174.27, 235.18, 202.56, 249.84]], ['syoli', 'timesnewromanps-italicmt', 12.0, [219.52, 235.18, 230.84, 249.84]], ['mbei', 'timesnewromanps-italicmt', 12.0, [253.46, 235.18, 253.47, 249.84]], ['ati', 'timesnewromanps-italicmt', 12.0, [281.74, 235.18, 270.44, 249.84]], ['awum', 'timesnewromanps-italicmt', 12.0, [304.37, 235.18, 293.07, 249.84]], ['kakak', 'timesnewromanpsmt', 12.0, [329.01, 233.96, 365.6, 249.97]], ['perempuan', 'timesnewromanpsmt', 12.0, [106.4, 247.76, 156.54, 263.77]], ['saya', 'timesnewromanpsmt', 12.0, [162.11, 247.76, 178.83, 263.77]], ['sudah', 'timesnewromanpsmt', 12.0, [189.97, 247.76, 206.69, 263.77]], ['menikah', 'timesnewromanpsmt', 12.0, [223.4, 247.76, 245.69, 263.77]]]","[106.4, 220.47, 133.2, 237.09]",1
6,"agrog-agrog v putar, memutari: nemot gereja yap kanding agrog-agrog mereka memutari gereja","[['agrog-agrog', 'timesnewromanps-boldmt', 12.0, [85.0, 264.07, 150.39, 280.69]], ['v', 'timesnewromanps-italicmt', 12.0, [228.81, 265.19, 234.14, 279.84]], ['putar,', 'timesnewromanpsmt', 12.0, [234.21, 263.96, 269.69, 279.97]], ['memutari:', 'timesnewromanpsmt', 12.0, [275.6, 263.96, 322.91, 279.97]], ['nemot', 'timesnewromanps-italicmt', 12.0, [333.22, 265.19, 365.62, 279.84]], ['gereja', 'timesnewromanps-italicmt', 12.0, [106.42, 278.78, 139.22, 293.44]], ['yap', 'timesnewromanps-italicmt', 12.0, [144.68, 278.78, 155.62, 293.44]], ['kanding', 'timesnewromanps-italicmt', 12.0, [166.55, 278.78, 193.88, 293.44]], ['agrog-agrog', 'timesnewromanps-italicmt', 12.0, [210.28, 278.78, 254.01, 293.44]], ['mereka', 'timesnewromanpsmt', 12.0, [270.4, 277.56, 308.48, 293.57]], ['memutari', 'timesnewromanpsmt', 12.0, [314.83, 277.56, 359.25, 293.57]], ['gereja', 'timesnewromanpsmt', 12.0, [106.4, 291.36, 138.79, 307.37]]]","[85.0, 264.07, 150.39, 280.69]",1
7,agrok amtung a pusing: kat yegembu agrok amtung kepalaku terasa pusing,"[['agrok', 'timesnewromanps-boldmt', 12.0, [85.0, 307.67, 116.5, 324.29]], ['amtung', 'timesnewromanps-boldmt', 12.0, [122.8, 307.67, 154.3, 324.29]], ['a', 'timesnewromanps-italicmt', 12.0, [245.0, 308.79, 251.0, 323.44]], ['pusing:', 'timesnewromanpsmt', 12.0, [251.0, 307.56, 295.4, 323.57]], ['kat', 'timesnewromanps-italicmt', 12.0, [298.79, 308.79, 317.01, 323.44]], ['yegembu', 'timesnewromanps-italicmt', 12.0, [323.08, 308.79, 359.51, 323.44]], ['agrok', 'timesnewromanps-italicmt', 12.0, [106.4, 322.59, 134.31, 337.24]], ['amtung', 'timesnewromanps-italicmt', 12.0, [139.89, 322.59, 167.81, 337.24]], ['kepalaku', 'timesnewromanpsmt', 12.0, [173.4, 321.36, 215.29, 337.37]], ['terasa', 'timesnewromanpsmt', 12.0, [220.53, 321.36, 246.71, 337.37]], ['pusing', 'timesnewromanpsmt', 12.0, [257.18, 321.36, 278.13, 337.37]]]","[85.0, 307.67, 116.5, 324.29]",1
8,ai n bapak: kat ai nmo su Markus nama bapak saya Markus,"[['ai', 'timesnewromanps-boldmt', 12.0, [85.0, 337.67, 97.4, 354.29]], ['n', 'timesnewromanps-italicmt', 12.0, [125.39, 338.79, 131.39, 353.44]], ['bapak:', 'timesnewromanpsmt', 12.0, [131.19, 337.56, 169.59, 353.57]], ['kat', 'timesnewromanps-italicmt', 12.0, [170.19, 338.79, 186.54, 353.44]], ['ai', 'timesnewromanps-italicmt', 12.0, [191.99, 338.79, 197.44, 353.44]], ['nmo', 'timesnewromanps-italicmt', 12.0, [208.34, 338.79, 213.79, 353.44]], ['su', 'timesnewromanps-italicmt', 12.0, [230.14, 338.79, 224.69, 353.44]], ['Markus', 'timesnewromanps-italicmt', 12.0, [246.49, 338.79, 257.39, 353.44]], ['nama', 'timesnewromanpsmt', 12.0, [279.79, 337.56, 302.67, 353.57]], ['bapak', 'timesnewromanpsmt', 12.0, [308.39, 337.56, 331.27, 353.57]], ['saya', 'timesnewromanpsmt', 12.0, [342.71, 337.56, 354.15, 353.57]], ['Markus', 'timesnewromanpsmt', 12.0, [106.4, 351.16, 146.0, 367.17]]]","[85.0, 337.67, 97.4, 354.29]",1
9,alei n akal; pikiran: snou nak nmon alei manusia mempunyai akal;,"[['alei', 'timesnewromanps-boldmt', 12.0, [85.0, 367.47, 106.19, 384.09]], ['n', 'timesnewromanps-italicmt', 12.0, [144.59, 368.59, 150.59, 383.24]], ['akal;', 'timesnewromanpsmt', 12.0, [150.59, 367.36, 175.94, 383.37]], ['pikiran:', 'timesnewromanpsmt', 12.0, [181.01, 367.36, 216.5, 383.37]], ['snou', 'timesnewromanps-italicmt', 12.0, [222.38, 368.59, 244.47, 383.24]], ['nak', 'timesnewromanps-italicmt', 12.0, [250.0, 368.59, 261.04, 383.24]], ['nmon', 'timesnewromanps-italicmt', 12.0, [272.09, 368.59, 283.13, 383.24]], ['alei', 'timesnewromanps-italicmt', 12.0, [299.71, 368.59, 305.22, 383.24]], ['manusia', 'timesnewromanpsmt', 12.0, [322.6, 367.36, 365.59, 383.37]], ['mempunyai', 'timesnewromanpsmt', 12.0, [106.4, 380.96, 158.0, 396.97]], ['akal;', 'timesnewromanpsmt', 12.0, [163.73, 380.96, 186.67, 396.97]]]","[85.0, 367.47, 106.19, 384.09]",1
