In [1]:
import pandas as pd
import numpy as np
import os
import re
import json as js
from pathlib import Path
from tqdm import tqdm
from ast import literal_eval
import fastDamerauLevenshtein as fdl
import fastwer

In [2]:
# Folder holding the input CSV files that the processing loop reads.
source = "5. Kategorisasi Main Entry"
# Folder that receives the generated "-splitted.csv" output files.
hasil = "6. Splitted"

### Bentuk Pertama

Algoritma ini dibangun dengan syarat berikut:

* Pemisah antara atribut definisi lema dengan contoh kalimat adalah tanda (:)
* Pemisah antara setiap kata tujuan yang sinonim adalah tanda (;)
* Dapat digunakan pada kamus Indonesia - Daerah atau Daerah - Indonesia
* <b>Entri yang polisemi</b> sudah dipisah menjadi beberapa entri berbeda

Kamus yang memenuhi kriteria berdasarkan hasil sampel dan pengolahan: <br>
Kamus Indonesia - Mentawai dan Kamus Talaud - Indonesia

In [3]:
def count_jumlah_penggunaan_titikKoma(s):
    """Count the tokens of ``s`` that end with a semicolon.

    The text is cut on single space characters and every resulting piece is
    tested against a regular expression that requires a trailing ";".

    Args:
        s: Text to scan.

    Returns:
        int: How many pieces matched.
    """
    return sum(1 for token in s.split(" ") if re.match(r'.*\;$', token))

def count_jumlah_penggunaan_titikDua(s):
    """Count the tokens of ``s`` that end with a colon.

    The text is cut on single space characters and every resulting piece is
    tested against a regular expression that requires a trailing ":".

    Args:
        s: Text to scan.

    Returns:
        int: How many pieces matched.
    """
    return sum(1 for token in s.split(" ") if re.match(r'.*\:$', token))

def is_nomor_halaman(s):
    """Tell whether ``s`` is made up solely of digits, as judged by ``str.isdigit``.

    Args:
        s: String to test.

    Returns:
        bool: True when ``s.isdigit()`` holds, False otherwise (including
        for the empty string).
    """
    return s.isdigit()

def find_titik_dua_in_sinonim_entri_anomali(splitted_entry):
    """Find the first non-leading segment that is too long to be a plain synonym.

    Every segment except the first is cut on single spaces; a segment that
    yields more than two pieces is treated as the anomaly.

    Args:
        splitted_entry: List of string segments (the caller passes an entry
            that was split on ";").

    Returns:
        tuple: ``(True, 0)`` when no segment after the first exceeds the
        tolerance, otherwise ``(False, i)`` where ``i`` is the position of
        the first segment that does.
    """
    max_pieces = 2  # tolerance
    for position, segment in enumerate(splitted_entry[1:], start=1):
        if len(segment.split(" ")) > max_pieces:
            return False, position
    return True, 0

def split_entry_by_spec_deli_index(delimeter, index, entry):
    """Cut ``entry`` in two at the ``index``-th occurrence of ``delimeter``.

    The head keeps the first ``index`` segments joined back together with the
    same delimiter they were split on; the tail is everything after the
    ``index``-th delimiter, untouched. When ``entry`` contains fewer than
    ``index`` delimiters, the cut happens at the last one that exists.

    Args:
        delimeter: Separator string to cut on.
        index: Ordinal (1-based) of the delimiter occurrence to cut at.
        entry: Text to cut.

    Returns:
        ``None`` when ``index`` is 0, otherwise a tuple ``(head, tail)``.
        ``head`` is ``""`` when nothing was cut (no delimiter present, or a
        negative ``index``).
    """
    if index == 0:
        return None
    if index < 0:
        # A negative maxsplit would mean "split everywhere"; the contract here
        # is that nothing is cut at all.
        return "", entry

    parts = entry.split(delimeter, index)
    # Re-join with the delimiter that was actually used for splitting rather
    # than a fixed ";", so the head is a faithful prefix of the input.
    return delimeter.join(parts[:-1]), parts[-1]

In [4]:
def split_contoh_kalimat_dan_definisi_lema(entries):
    """Separate every entry into a lemma-definition part and an example-sentence part.

    First pass: an entry in which at least one space-separated token ends with
    ":" is cut at its first ":"; both halves are stripped. Any other entry is
    kept whole (stripped) with an empty example.

    Second pass: for entries whose example is still empty, the text is split
    on ";" and checked for an over-long segment. If one is found, the entry
    is cut there: the segments before it stay as the lemma definition and the
    remainder becomes the example sentence.

    Args:
        entries: Iterable of entry strings.

    Returns:
        tuple: ``(clean_definisi_lema, clean_contoh_kalimat)``, two lists with
        one item per input entry.
    """
    definisi_lema, contoh_kalimat = [], []
    for raw_entry in entries:
        if count_jumlah_penggunaan_titikDua(raw_entry) > 0:
            head, _, tail = raw_entry.strip().partition(":")
            definisi_lema.append(head.strip())
            contoh_kalimat.append(tail.strip())
        else:
            # No example sentence in this entry.
            definisi_lema.append(raw_entry.strip())
            contoh_kalimat.append("")

    # Handle inconsistent entries where the example was not introduced by ":".
    clean_definisi_lema, clean_contoh_kalimat = [], []
    for raw_lema, example in zip(definisi_lema, contoh_kalimat):
        if example != "":
            clean_definisi_lema.append(raw_lema)
            clean_contoh_kalimat.append(example)
            continue

        is_regular, cut_at = find_titik_dua_in_sinonim_entri_anomali(raw_lema.split(";"))
        if is_regular:
            clean_definisi_lema.append(raw_lema)
            clean_contoh_kalimat.append("")
        else:
            lema, recovered_example = split_entry_by_spec_deli_index(";", cut_at, raw_lema)
            clean_definisi_lema.append(lema)
            clean_contoh_kalimat.append(recovered_example)

    return clean_definisi_lema, clean_contoh_kalimat

def split_entry_font_pos_size(clean_definisi_lema, clean_contoh_kalimat,entry_font_size_pos):
    """Divide each entry's per-token metadata between lemma definition and example.

    The number of space-separated pieces in the (stripped) lemma definition
    decides where the metadata list is cut: that many leading items go to the
    lemma definition, the rest go to the example sentence. Entries without an
    example keep their whole metadata list on the lemma side.

    Args:
        clean_definisi_lema: List of lemma-definition strings.
        clean_contoh_kalimat: List of example-sentence strings ("" when absent).
        entry_font_size_pos: List of per-entry metadata lists, indexed in
            parallel with the two lists above.

    Returns:
        tuple: ``(definisi_lema_font_size_pos, contoh_kalimat_font_size_pos)``.
    """
    definisi_lema_font_size_pos = []
    contoh_kalimat_font_size_pos = []

    for idx, lemma_text in enumerate(clean_definisi_lema):
        token_count = len(lemma_text.strip().split(" "))
        example = clean_contoh_kalimat[idx]
        token_meta = entry_font_size_pos[idx]

        if example == "":
            definisi_lema_font_size_pos.append(token_meta)
            contoh_kalimat_font_size_pos.append([])
            continue

        definisi_lema_font_size_pos.append(token_meta[:token_count])
        contoh_kalimat_font_size_pos.append(token_meta[token_count:])

    return definisi_lema_font_size_pos, contoh_kalimat_font_size_pos

In [5]:
# Make sure the output folder exists so that to_csv does not fail on a fresh run.
os.makedirs(hasil, exist_ok=True)

# Only CSV files are inputs. os.listdir also returns anything else that lives
# in the folder (e.g. hidden checkpoint folders), which pd.read_csv cannot open.
csv_filenames = sorted(
    name for name in os.listdir(source) if name.lower().endswith(".csv")
)

for filename in tqdm(csv_filenames):
    data = pd.read_csv(os.path.join(source, filename))
    # Drop every row that has a missing value in any column, then renumber.
    data = data.dropna()
    data = data.reset_index(drop=True)
    entries = data["Entri"].values.tolist()
    # The column stores the textual repr of a Python list; literal_eval turns
    # it back into a list and only accepts literals, never arbitrary code.
    entry_font_pos_size = [
        literal_eval(raw_value)
        for raw_value in data["entry_font_size_pos"].values.tolist()
    ]

    definisi_lema, contoh_kalimat = split_contoh_kalimat_dan_definisi_lema(entries)
    definisi_lema_font_pos_size, contoh_kalimat_font_pos_size = split_entry_font_pos_size(
        definisi_lema, contoh_kalimat, entry_font_pos_size
    )

    clean_res = {
        "definisi_lema":definisi_lema,
        "contoh_kalimat":contoh_kalimat,
        "definisi_lema_font_size_pos":definisi_lema_font_pos_size,
        "contoh_kalimat_font_size_pos":contoh_kalimat_font_pos_size,
        "posisi_entry":data["posisi_entry"].values.tolist(),
        "main_entry":data["main_entry"].values.tolist(),
        "is_padanan_lema":data["is_padanan_lema"].values.tolist(),
        "is_anomali":data["is_anomali"].values.tolist(),
        "page":data["page"].values.tolist()
    }

    # Output name: input name without its extension, plus "-splitted.csv".
    new_filename = os.path.splitext(filename)[0]
    data_kamus_res = pd.DataFrame.from_dict(clean_res)
    data_kamus_res.to_csv(os.path.join(hasil, new_filename + "-splitted.csv"),index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 43/43 [01:44<00:00,  2.44s/it]


In [6]:
# Example: load one of the generated "-splitted.csv" files and display it.
contoh = pd.read_csv(hasil + "/91_Replace-categorizeAnomali-WithMainEntry-splitted.csv")
contoh

Unnamed: 0,definisi_lema,contoh_kalimat,definisi_lema_font_size_pos,contoh_kalimat_font_size_pos,posisi_entry,main_entry,is_padanan_lema,is_anomali,page
0,a n huruf pertama dalam aksara Simalungun (dar...,,"[['a', 'timesnewromanps-boldmt', 10.0, [70.9, ...",[],"[70.9, 77.46, 75.9, 91.31]",1,0,0,1
1,abab n abu (hasil pembakaran),abab ni rih abu lalang; 2 debu;,"[['abab', 'timesnewromanps-boldmt', 10.0, [70....","[['abab', 'timesnewromanps-italicmt', 10.0, [1...","[70.9, 113.86, 92.06, 127.71]",1,0,0,1
2,marabab v berdebu,marabab jabu ai rumah itu berdebu,"[['marabab', 'timesnewromanps-boldmt', 10.0, [...","[['marabab', 'timesnewromanps-italicmt', 10.0,...","[85.1, 148.36, 123.96, 162.21]",0,0,0,1
3,abal n jalan setapak (yang dilalui oleh manusia),,"[['abal', 'timesnewromanps-boldmt', 10.0, [70....",[],"[70.9, 173.26, 91.8, 187.11]",1,0,0,1
4,mardalan ia hum abal dia berjalan dari jalan s...,,"[['mardalan', 'timesnewromanps-italicmt', 10.0...",[],"[85.1, 197.09, 127.85, 209.3]",0,0,1,1
...,...,...,...,...,...,...,...,...,...
10616,weiweian n alat memintal tali,,"[['weiweian', 'timesnewromanps-boldmt', 10.0, ...",[],"[160.5, 138.76, 199.96, 152.61]",0,0,0,263
10617,w . wa . weiwei,,"[['w', 'timesnewromanps-boldmt', 9.0, [325.4, ...",[],"[325.4, 15.2, 329.35, 27.67]",0,0,1,"[263, 264]"
10618,1 ya n huruf keempat belas dalam aksara Simalu...,,"[['1', 'timesnewromanps-boldmt', 6.5, [70.9, 7...",[],"[70.9, 76.61, 74.15, 85.62]",1,0,0,264
10619,2 ya n bara,,"[['2', 'timesnewromanps-boldmt', 6.5, [70.9, 1...",[],"[70.9, 113.01, 74.15, 122.02]",1,0,0,264
