In [1]:
#1. rutin1 import module
import pandas as pd
import os
import sys

In [2]:
# 2. Routine 2: put the repository root on sys.path so project modules
# (e.g. `utils`) can be imported from this notebook.
# The root is taken to be two directory levels above the current working directory.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [3]:
# 3. Routine 3: load the dataset and build the combined text column.
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r29.xlsx")
df = pd.read_excel(dataset_path)

# Each document is the title ("judul") followed by ". " and the body ("isi").
title_with_separator = df["judul"] + ". "
df["text"] = title_with_separator + df["isi"]

In [8]:
# preprocess data
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# Stop-word removers already built, keyed by the stop-word file they were loaded from.
# Re-running this cell resets the cache, so an edited stop-word file is picked up again.
_STOPWORD_REMOVERS = {}


def _get_stopword_remover():
    """Return the stop-word remover for data/all_stop_words.txt, reading the file only once."""
    stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    remover = _STOPWORD_REMOVERS.get(stopwords_path)
    if remover is None:
        with open(stopwords_path, 'r') as f:
            stopwords = [line.strip() for line in f]
        remover = StopWordRemover(ArrayDictionary(stopwords))
        _STOPWORD_REMOVERS[stopwords_path] = remover
    return remover


def preprocess(text):
    """Clean one document and strip stop words from it.

    Steps, in order:
      1. drop stand-alone single ASCII letters;
      2. put a space after every period so words joined by "." are separated;
      3. replace every character that is not an ASCII letter or a period with a space;
      4. lowercase;
      5. collapse each run of digits / non-word characters (this includes the
         remaining periods) into one space and trim the ends;
      6. pass the result through Sastrawi's StopWordRemover, built from the words
         listed in data/all_stop_words.txt under ``repo_root``.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the NaN pandas
        yields for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    # The remover is cached: the stop-word file is not re-read for every document.
    return _get_stopword_remover().remove(text)

# preprocess is deliberately run twice: the second pass cleans the output of the first.
# NOTE(review): presumably the second pass removes single letters / stop words that only
# become stand-alone tokens after the first pass strips digits and punctuation — confirm.
df['preprocessed_text'] = df['text'].apply(preprocess).apply(preprocess)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Term counts over word 1- to 3-grams; the vocabulary is capped at 10,000 terms and
# terms occurring in more than 80% of the documents are ignored.
cv = CountVectorizer(
    max_df=0.8,
    max_features=10000,
    ngram_range=(1, 3),
)
X = cv.fit_transform(df['preprocessed_text'])

# Learn the IDF weights from the count matrix of the whole corpus (fit returns the estimator).
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True).fit(X)

# Vocabulary terms, used later to turn column indices back into keywords.
# get_feature_names_out() is used because the older get_feature_names() raised an error.
feature_names = cv.get_feature_names_out()

# Number of keywords to extract per document
n_tfidf = 3

In [12]:
from utils import sort_coo, extract_topn_from_vector

# Result layout: key_1..key_n followed by score_1..score_n, with n = n_tfidf.
key_columns = [f"key_{rank}" for rank in range(1, n_tfidf + 1)]
score_columns = [f"score_{rank}" for rank in range(1, n_tfidf + 1)]

# TF-IDF weights for every document in a single pass; row i belongs to document i.
tfidf_matrix = tfidf_transformer.transform(cv.transform(df['preprocessed_text']))

# Collect one plain list per document and build the DataFrame once at the end,
# instead of growing a DataFrame with pd.concat on every iteration.
rows = []
for row_number in range(tfidf_matrix.shape[0]):
    # A one-row slice keeps the same 1 x vocabulary shape a single-document transform has.
    doc_vector = tfidf_matrix[row_number:row_number + 1]

    sorted_items = sort_coo(doc_vector.tocoo())
    # Used as a mapping keyword -> score.
    # NOTE(review): assumed to be ordered best-first by utils.extract_topn_from_vector — confirm.
    keywords = extract_topn_from_vector(feature_names, sorted_items, n_tfidf)

    top_keywords = list(keywords.keys())[:n_tfidf]
    top_scores = list(keywords.values())[:n_tfidf]
    # Documents with fewer than n_tfidf terms are padded so every row has the same width.
    missing = n_tfidf - len(top_keywords)
    rows.append(top_keywords + [None] * missing + top_scores + [float("nan")] * missing)

predict_tfidf = pd.DataFrame(rows, columns=key_columns + score_columns)
predict_tfidf[score_columns] = predict_tfidf[score_columns].round(2)
predict_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3
0,fase tender,diajukan fase tender,diajukan fase,0.32,0.32,0.32
1,template document,template,document,0.41,0.36,0.22
2,ruangan kantor,ruangan,kantor,0.39,0.34,0.29


Evaluation

In [13]:
from utils import check_similarity, eval

# Reference keywords: one list of the seven k1..k7 cells per document.
target_columns = ["k1", "k2", "k3", "k4", "k5", "k6", "k7"]
targets = df[target_columns].values.tolist()

# Same data as a frame with positional column labels, for joining with the predictions later.
df_targets = pd.DataFrame(targets)

In [14]:
# TF-IDF evaluation: score each document's predicted keywords against its targets.
# `eval` here is the helper imported from utils, not Python's built-in eval.
predicted_key_columns = ['key_1', 'key_2', 'key_3']
predict_tfidf_list = predict_tfidf[predicted_key_columns].values.tolist()

# NOTE(review): the meaning of the third positional argument (True) is defined in utils.eval — confirm.
eval_tfidf = eval(predict_tfidf_list, targets, True).round(3)
eval_tfidf.columns = predicted_key_columns + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']

# Keep only the flexible scores to simplify the evaluation result.
eval_tfidf = eval_tfidf[predicted_key_columns + ['flex_recall', 'flex_prec']]
eval_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,full_match,partial_match,partial_match,0.429,1.0
2,no_match,no_match,partial_match,0.143,0.333


In [15]:
# Calculate the TF-IDF scores using the flexible score: exact match = 1, partial match = 1, no match = 0
tfidf_recall = eval_tfidf['flex_recall'].mean()
tfidf_prec = eval_tfidf['flex_prec'].mean()

# F1 is the harmonic mean of precision and recall. When both are exactly 0 the
# formula would divide 0 by 0, so F1 is reported as 0.0 in that case.
precision_recall_sum = tfidf_prec + tfidf_recall
if precision_recall_sum == 0:
    tfidf_f1 = 0.0
else:
    tfidf_f1 = 2 * (tfidf_prec * tfidf_recall) / precision_recall_sum

# Create a DataFrame with the scores
summary = pd.DataFrame({'tfidf': [tfidf_recall, tfidf_prec, tfidf_f1]}, index=['recall', 'precision', 'F1'])
summary = summary.round(3)
summary

Unnamed: 0,tfidf
recall,0.132
precision,0.308
F1,0.185


In [16]:
# Combine the predictions, the target keywords and the evaluation result side by side.
# Only the leading key_*/score_* columns of predict_tfidf are used (one of each per
# extracted keyword), so running this cell again does not append the target and
# evaluation columns a second time.
n_prediction_columns = 2 * n_tfidf
predict_tfidf = pd.concat(
    [predict_tfidf.iloc[:, :n_prediction_columns], df_targets, eval_tfidf],
    axis=1,
)
# NOTE(review): eval_tfidf also has columns named key_1..key_3, so the combined frame
# carries duplicate column labels.
predict_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3,0,1,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,fase tender,diajukan fase tender,diajukan fase,0.32,0.32,0.32,persetujuan tertulis,prosedur,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,template document,template,document,0.41,0.36,0.22,template document,exhibit c,acuan,pengelolaan,dokumen,,,full_match,partial_match,partial_match,0.429,1.0
2,ruangan kantor,ruangan,kantor,0.39,0.34,0.29,ruang kantor,change inquiry,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,partial_match,0.143,0.333


In [19]:
# Write predictions to excel file
from utils import write_excel

# Sheet name and file name handed to utils.write_excel for the combined prediction table.
sheet_name = 'tfidf'
output_filename = "std_tfidf_prediction.xlsx"
write_excel(predict_tfidf, sheet_name, output_filename)

Todo
- menyederhanakan preprocess dengan menempatkannya di utils, practical word. Jika memerlukan dataset, agak repot menentukan relative path-nya; solusi sekarang: fungsi dipindahkan ke main file
- 