In [1]:
import pandas as pd
import numpy as np
import os
import re
import json as js
from pathlib import Path
from tqdm import tqdm
import fastDamerauLevenshtein as fdl
import fastwer

In [2]:
# Folder names used by the evaluation: the manually entered (gold) CSVs and the results folder.
directory_manual_entry, directory_hasil = "Manual Entry", "Hasil Evaluasi"

### Program Evaluasi

In [3]:
def find_similarity(gold_entries, entries):
    """Pair every gold entry with its most similar extracted entry.

    Each item of ``gold_entries`` is scored against every item of ``entries``
    with ``fdl.damerauLevenshtein(..., similarity=True)``.  The first candidate
    reaching the strictly highest score wins; if no candidate scores above 0
    (or ``entries`` is empty) the match is ``""`` with a score of ``0``.

    Returns:
        A tuple of three parallel lists: the gold entries (in input order),
        the best-matching extracted entry for each, and the matching score.
    """
    matched_gold = []
    matched_entries = []
    best_scores = []

    for gold in gold_entries:
        top_score, top_candidate = 0, ""

        for candidate in entries:
            score = fdl.damerauLevenshtein(gold, candidate, similarity=True)
            # Strict comparison: on a tie the earlier candidate is kept.
            if score > top_score:
                top_score, top_candidate = score, candidate

        matched_gold.append(gold)
        matched_entries.append(top_candidate)
        best_scores.append(top_score)

    return matched_gold, matched_entries, best_scores

In [4]:
def evaluate_entry(gold_entries, entries):
    """Match gold entries against extracted entries and return the result as columns.

    Delegates to ``find_similarity`` and labels its three parallel lists so the
    result can be turned into a DataFrame directly.

    Returns:
        dict with the keys ``gold_entry``, ``similar_entry`` and
        ``similarity_values`` (in that order), each mapping to a list.
    """
    column_names = ("gold_entry", "similar_entry", "similarity_values")
    return dict(zip(column_names, find_similarity(gold_entries, entries)))

### Main Program (JSON General Font Approach)

In [5]:
def _list_data_files(directory):
    """Return the sorted names of the regular, non-hidden files inside ``directory``.

    Hidden entries and sub-directories (for example the ``.ipynb_checkpoints``
    folder Jupyter may create) are skipped so they are never paired with, or
    parsed as, a data file.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith(".") and os.path.isfile(os.path.join(directory, name))
    )


def _share_of_gold(matches, gold_count):
    """Return the number of True values in ``matches`` divided by ``gold_count``, rounded to 2 decimals."""
    return round(int(matches.sum()) / gold_count, 2)


def run_program_evaluation(manual_entries_directory, target_entries_directory, hasil_directory, input_type, approach_type):
    """Evaluate extracted dictionary entries against manually entered (gold) entries.

    The files of both input directories are sorted by name and paired by
    position.  For every pair the ``Entri`` columns are compared with
    ``evaluate_entry``; the per-entry result is written to
    ``<hasil_directory>/Hasil Evaluasi Pendekatan <approach> <input><name>.csv``
    and one summary row is collected.  All summary rows are finally written to
    ``<hasil_directory>/Ringkasan Evaluasi Pendekatan <approach> bentuk <input>.csv``.

    Args:
        manual_entries_directory: folder with the gold CSV files (column ``Entri``).
        target_entries_directory: folder with the extracted CSV files (column ``Entri``).
        hasil_directory: output folder; created if it does not exist yet.
        input_type: label used in the output file names (converted with ``str``).
        approach_type: label used in the output file names (converted with ``str``).

    Returns:
        None.  The results are written to CSV files only.

    Raises:
        ValueError: if the two directories do not contain the same number of
            data files (positional pairing would be wrong), or if a gold file
            has no entries (the percentages would be undefined).
    """
    manual_entries = _list_data_files(manual_entries_directory)
    extraction_files = _list_data_files(target_entries_directory)

    # Files are paired by position, so a differing count means at least one
    # pair would be wrong (or missing) - fail loudly instead.
    if len(manual_entries) != len(extraction_files):
        raise ValueError(
            "Cannot pair files by position: "
            + str(len(manual_entries)) + " file(s) in '" + str(manual_entries_directory) + "' vs "
            + str(len(extraction_files)) + " file(s) in '" + str(target_entries_directory) + "'"
        )

    os.makedirs(hasil_directory, exist_ok=True)

    # Convert once so both output file names accept non-string labels.
    approach_label = str(approach_type)
    input_label = str(input_type)

    # Length of the "Dataset Manual Entri -" prefix carried by the gold file
    # names (see the printed progress lines); it is cut from the output name.
    manual_prefix_length = 22

    summary_columns = [
        "kamus",
        "banyak_entri_sebenarnya",
        "banyak_entri_pengolahan",
        "mean_similarity_per_entri",
        "Persentase entri diatas 0.8",
        "Persentase entri diatas 0.85",
        "Persentase entri diatas 0.9",
        "Persentase entri benar",
        "word_error_rate",
        "characted_error_rate",
    ]
    summary_rows = []

    for filename_manual_entry, filename_entry_extraction in tqdm(list(zip(manual_entries, extraction_files))):
        print("===== Evaluasi Kamus " + filename_manual_entry + " =====")

        manual_entry = pd.read_csv(os.path.join(manual_entries_directory, filename_manual_entry))
        entry_extraction = pd.read_csv(os.path.join(target_entries_directory, filename_entry_extraction))

        # Rows with any missing value are discarded from the extraction result.
        entry_extraction = entry_extraction.dropna().reset_index(drop=True)

        entries = entry_extraction["Entri"].values.tolist()
        gold_entries = manual_entry["Entri"].values.tolist()
        gold_count = len(gold_entries)
        if gold_count == 0:
            raise ValueError("Manual entry file '" + filename_manual_entry + "' contains no entries")

        data_res = pd.DataFrame.from_dict(evaluate_entry(gold_entries, entries))

        # Save the per-entry result as CSV.
        dictionary_label = os.path.splitext(filename_manual_entry)[0][manual_prefix_length:]
        result_filename = (
            "Hasil Evaluasi Pendekatan " + approach_label + " " + input_label + dictionary_label + ".csv"
        )
        data_res.to_csv(os.path.join(hasil_directory, result_filename), index=False)

        # Summarize: shares are relative to the number of gold entries.
        similarities = data_res["similarity_values"]

        # WER / CER over the whole page: extracted text is the hypothesis,
        # gold text is the reference.
        expected_text = "\n".join(gold_entries)
        extracted_text = "\n".join(entries)

        summary_rows.append({
            "kamus": os.path.splitext(filename_entry_extraction)[0],
            "banyak_entri_sebenarnya": gold_count,
            "banyak_entri_pengolahan": len(entries),
            "mean_similarity_per_entri": similarities.mean(),
            "Persentase entri diatas 0.8": _share_of_gold(similarities > 0.8, gold_count),
            "Persentase entri diatas 0.85": _share_of_gold(similarities > 0.85, gold_count),
            "Persentase entri diatas 0.9": _share_of_gold(similarities > 0.9, gold_count),
            "Persentase entri benar": _share_of_gold(similarities == 1, gold_count),
            "word_error_rate": round(fastwer.score_sent(extracted_text, expected_text, char_level=False), 2),
            "characted_error_rate": round(fastwer.score_sent(extracted_text, expected_text, char_level=True), 2),
        })

    # Save the summary as CSV (one row per dictionary, fixed column order).
    data_summarize = pd.DataFrame(summary_rows, columns=summary_columns)
    summary_filename = "Ringkasan Evaluasi Pendekatan " + approach_label + " bentuk " + input_label + ".csv"
    data_summarize.to_csv(os.path.join(hasil_directory, summary_filename), index=False)

### JSON Font Approach

In [12]:
# Evaluate the extraction stored in "CSV One Entry JSON With Font Approach"
# against the manual entries; outputs are labelled "Font (2)" / "JSON".
run_program_evaluation(
    manual_entries_directory=directory_manual_entry,
    target_entries_directory="CSV One Entry JSON With Font Approach",
    hasil_directory="Hasil Evaluasi",
    input_type="JSON",
    approach_type="Font (2)",
)

 11%|█████████▎                                                                          | 1/9 [00:00<00:01,  5.51it/s]

===== Evaluasi Kamus Dataset Manual Entri - 41. Kamus Bahasa Indonesia - Bali.csv =====


 22%|██████████████████▋                                                                 | 2/9 [00:00<00:01,  4.82it/s]

===== Evaluasi Kamus Dataset Manual Entri - 43. Kamus Bahasa Indonesia - Minangkabau I.csv =====


 33%|████████████████████████████                                                        | 3/9 [00:00<00:00,  6.12it/s]

===== Evaluasi Kamus Dataset Manual Entri - 54. Kamus Bahasa Indonesia Mentawai.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 57. Kamus Bahasa Bugis-Indonesia.csv =====


 44%|█████████████████████████████████████▎                                              | 4/9 [00:01<00:02,  2.26it/s]

===== Evaluasi Kamus Dataset Manual Entri - 61. Kamus Banjar-Indonesia.csv =====


 67%|████████████████████████████████████████████████████████                            | 6/9 [00:01<00:01,  2.90it/s]

===== Evaluasi Kamus Dataset Manual Entri - 68. Kamus Talaud - Indonesia.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 77. Kamus Samawa-Indonesia.csv =====


 78%|█████████████████████████████████████████████████████████████████▎                  | 7/9 [00:02<00:00,  2.72it/s]

===== Evaluasi Kamus Dataset Manual Entri - 89. Kamus Bahasa Mooi - Indonesia.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 91. Kamus Simalungun Indonesia.csv =====


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.11it/s]


In [13]:
# Reload the summary CSV written by the "Font (2)" / "JSON" run and show it as the cell output.
data_summarize = pd.read_csv("Hasil Evaluasi" + "/Ringkasan Evaluasi Pendekatan Font (2) bentuk JSON.csv")
data_summarize

Unnamed: 0,kamus,banyak_entri_sebenarnya,banyak_entri_pengolahan,mean_similarity_per_entri,Persentase entri diatas 0.8,Persentase entri diatas 0.85,Persentase entri diatas 0.9,Persentase entri benar,word_error_rate,characted_error_rate
0,41. Kamus Bahasa Indonesia-Bali A-K (1997)-pag...,62,55,0.841378,0.71,0.71,0.68,0.34,23.08,3.63
1,43. Kamus Bahasa Indonesia-Bahasa Minangkabau ...,28,17,0.607245,0.39,0.39,0.36,0.11,16.63,7.27
2,54. Kamus Bahasa Indonesia Mentawai (1998)-pag...,53,44,0.860293,0.72,0.72,0.64,0.38,24.15,4.01
3,57. Kamus Bahasa Bugis-Indonesia (1977)-page_2...,88,34,0.375872,0.03,0.02,0.01,0.01,79.32,29.19
4,61. Kamus Banjar-Indonesia (1977)-page_51_52-h...,112,12,0.289234,0.01,0.01,0.0,0.0,68.34,32.0
5,68. Kamus Dwibahasa Bahasa Talaud - Bahasa Ind...,56,66,0.980579,0.98,0.98,0.96,0.91,2.61,3.2
6,77. Kamus Samawa-Indonesia Edisi 2 (2017)-page...,68,31,0.590762,0.37,0.35,0.35,0.24,39.71,31.07
7,89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indones...,36,35,0.983382,0.97,0.97,0.97,0.89,2.66,0.51
8,91. Kamus Simalungun - Indonesia (edisi kedua)...,82,72,0.879138,0.79,0.78,0.76,0.6,12.46,3.85


In [31]:
# Evaluate the extraction stored in "CSV One Entry JSON With Font + Posisi Approach"
# against the manual entries; outputs are labelled "Font + Posisi (2)" / "JSON".
run_program_evaluation(
    manual_entries_directory=directory_manual_entry,
    target_entries_directory="CSV One Entry JSON With Font + Posisi Approach",
    hasil_directory="Hasil Evaluasi",
    input_type="JSON",
    approach_type="Font + Posisi (2)",
)

 11%|█████████▎                                                                          | 1/9 [00:00<00:01,  6.78it/s]

===== Evaluasi Kamus Dataset Manual Entri - 41. Kamus Bahasa Indonesia - Bali.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 43. Kamus Bahasa Indonesia - Minangkabau I.csv =====


 33%|████████████████████████████                                                        | 3/9 [00:00<00:00,  6.42it/s]

===== Evaluasi Kamus Dataset Manual Entri - 54. Kamus Bahasa Indonesia Mentawai.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 57. Kamus Bahasa Bugis-Indonesia.csv =====


 44%|█████████████████████████████████████▎                                              | 4/9 [00:01<00:02,  2.33it/s]

===== Evaluasi Kamus Dataset Manual Entri - 61. Kamus Banjar-Indonesia.csv =====


 67%|████████████████████████████████████████████████████████                            | 6/9 [00:01<00:01,  3.00it/s]

===== Evaluasi Kamus Dataset Manual Entri - 68. Kamus Talaud - Indonesia.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 77. Kamus Samawa-Indonesia.csv =====


 78%|█████████████████████████████████████████████████████████████████▎                  | 7/9 [00:02<00:00,  2.76it/s]

===== Evaluasi Kamus Dataset Manual Entri - 89. Kamus Bahasa Mooi - Indonesia.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 91. Kamus Simalungun Indonesia.csv =====


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.16it/s]


In [32]:
# Reload the summary CSV written by the "Font + Posisi (2)" / "JSON" run and show it as the cell output.
data_summarize = pd.read_csv("Hasil Evaluasi" + "/Ringkasan Evaluasi Pendekatan Font + Posisi (2) bentuk JSON.csv")
data_summarize

Unnamed: 0,kamus,banyak_entri_sebenarnya,banyak_entri_pengolahan,mean_similarity_per_entri,Persentase entri diatas 0.8,Persentase entri diatas 0.85,Persentase entri diatas 0.9,Persentase entri benar,word_error_rate,characted_error_rate
0,41. Kamus Bahasa Indonesia-Bali A-K (1997)-pag...,62,55,0.845098,0.73,0.73,0.69,0.37,22.79,3.56
1,43. Kamus Bahasa Indonesia-Bahasa Minangkabau ...,28,20,0.752159,0.64,0.64,0.61,0.21,15.61,7.15
2,54. Kamus Bahasa Indonesia Mentawai (1998)-pag...,53,47,0.849251,0.7,0.7,0.6,0.38,23.73,4.05
3,57. Kamus Bahasa Bugis-Indonesia (1977)-page_2...,88,36,0.390141,0.06,0.02,0.02,0.02,79.32,29.19
4,61. Kamus Banjar-Indonesia (1977)-page_51_52-h...,112,26,0.443272,0.16,0.15,0.12,0.04,65.45,31.64
5,68. Kamus Dwibahasa Bahasa Talaud - Bahasa Ind...,56,57,0.980579,0.98,0.98,0.96,0.91,4.96,3.2
6,77. Kamus Samawa-Indonesia Edisi 2 (2017)-page...,68,35,0.60132,0.4,0.38,0.38,0.26,40.69,31.07
7,89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indones...,36,35,0.983382,0.97,0.97,0.97,0.89,2.66,0.51
8,91. Kamus Simalungun - Indonesia (edisi kedua)...,82,79,0.945416,0.94,0.91,0.9,0.67,10.86,3.72


### JSON Font + Posisi

In [38]:
# Evaluate the extraction stored in "CSV One Entry JSON With Font + Posisi Approach"
# against the manual entries; outputs are labelled "Font_Posisi" / "JSON".
# NOTE(review): this reads the same target directory as the "Font + Posisi (2)"
# run; only the label used in the output file names differs.
run_program_evaluation(
    manual_entries_directory=directory_manual_entry,
    target_entries_directory="CSV One Entry JSON With Font + Posisi Approach",
    hasil_directory="Hasil Evaluasi",
    input_type="JSON",
    approach_type="Font_Posisi",
)

 11%|█████████▎                                                                          | 1/9 [00:00<00:01,  5.87it/s]

===== Evaluasi Kamus Dataset Manual Entri - 41. Kamus Bahasa Indonesia - Bali.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 43. Kamus Bahasa Indonesia - Minangkabau I.csv =====


 33%|████████████████████████████                                                        | 3/9 [00:00<00:01,  5.81it/s]

===== Evaluasi Kamus Dataset Manual Entri - 54. Kamus Bahasa Indonesia Mentawai.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 57. Kamus Bahasa Bugis-Indonesia.csv =====


 44%|█████████████████████████████████████▎                                              | 4/9 [00:01<00:02,  2.10it/s]

===== Evaluasi Kamus Dataset Manual Entri - 61. Kamus Banjar-Indonesia.csv =====


 67%|████████████████████████████████████████████████████████                            | 6/9 [00:02<00:01,  2.66it/s]

===== Evaluasi Kamus Dataset Manual Entri - 68. Kamus Talaud - Indonesia.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 77. Kamus Samawa-Indonesia.csv =====


 89%|██████████████████████████████████████████████████████████████████████████▋         | 8/9 [00:02<00:00,  3.23it/s]

===== Evaluasi Kamus Dataset Manual Entri - 89. Kamus Bahasa Mooi - Indonesia.csv =====
===== Evaluasi Kamus Dataset Manual Entri - 91. Kamus Simalungun Indonesia.csv =====


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:03<00:00,  2.84it/s]


In [39]:
# Reload the summary CSV written by the "Font_Posisi" / "JSON" run and show it as the cell output.
data_summarize = pd.read_csv("Hasil Evaluasi" + "/Ringkasan Evaluasi Pendekatan Font_Posisi bentuk JSON.csv")
data_summarize

Unnamed: 0,kamus,banyak_entri_sebenarnya,banyak_entri_pengolahan,mean_similarity_per_entri,Persentase entri diatas 0.8,Persentase entri diatas 0.85,Persentase entri diatas 0.9,Persentase entri benar,word_error_rate,characted_error_rate
0,41. Kamus Bahasa Indonesia-Bali A-K (1997)-pag...,62,55,0.845098,0.73,0.73,0.69,0.37,22.79,3.56
1,43. Kamus Bahasa Indonesia-Bahasa Minangkabau ...,28,20,0.752159,0.64,0.64,0.61,0.21,15.61,7.15
2,54. Kamus Bahasa Indonesia Mentawai (1998)-pag...,53,47,0.849251,0.7,0.7,0.6,0.38,23.73,4.05
3,57. Kamus Bahasa Bugis-Indonesia (1977)-page_2...,88,36,0.390141,0.06,0.02,0.02,0.02,79.32,29.19
4,61. Kamus Banjar-Indonesia (1977)-page_51_52-h...,112,26,0.443272,0.16,0.15,0.12,0.04,65.45,31.64
5,68. Kamus Dwibahasa Bahasa Talaud - Bahasa Ind...,56,57,0.980579,0.98,0.98,0.96,0.91,4.96,3.2
6,77. Kamus Samawa-Indonesia Edisi 2 (2017)-page...,68,35,0.60132,0.4,0.38,0.38,0.26,40.69,31.07
7,89. Kamus Dwibahasa Bahasa Mooi-Bahasa Indones...,36,35,0.983382,0.97,0.97,0.97,0.89,2.66,0.51
8,91. Kamus Simalungun - Indonesia (edisi kedua)...,82,79,0.945416,0.94,0.91,0.9,0.67,10.86,3.72
