In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

In [2]:
# Read the dataset and join the title ("judul") and body ("isi") columns
# into a single text field of the form "<judul>. <isi>".
path = "./data/dataset_ekstraksi_r27.xlsx"
df = pd.read_excel(path)
title_with_period = df["judul"] + ". "
df["text"] = title_with_period + df["isi"]

In [3]:
# Text cleaning applied to every document before vectorization
_STOPWORD_FILE = './data/stopword_tala_sastrawi.txt'
_stopword_remover = None  # built lazily on first use, then shared by all calls


def _get_stopword_remover():
    """Return the shared StopWordRemover, building it on the first call.

    The stopword file is read once here instead of once per document.
    """
    global _stopword_remover
    if _stopword_remover is None:
        with open(_STOPWORD_FILE, 'r') as f:
            stopwords = [line.strip() for line in f]
        _stopword_remover = StopWordRemover(ArrayDictionary(stopwords))
    return _stopword_remover


def preprocess(text):
    """Normalize one document and remove stopwords.

    Steps, in order:
      1. delete standalone single ASCII letters,
      2. put a space after every period,
      3. replace everything except ASCII letters and '.' with a space,
      4. lowercase,
      5. collapse every run of digits / non-word characters into one space,
      6. strip leading and trailing whitespace,
      7. pass the result through the Sastrawi StopWordRemover built from
         ``./data/stopword_tala_sastrawi.txt``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.

    Notes
    -----
    Step 1 runs before digits and punctuation are stripped, so a letter glued
    to a digit (e.g. the "k" in "k3") survives a single pass and only becomes
    a standalone token afterwards.
    """
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+", " ", text)
    text = text.strip()

    # Reuse the cached remover; avoids re-reading the stopword file per call
    # and no longer shadows the builtin `str`.
    return _get_stopword_remover().remove(text)

# Clean every document twice. A second pass catches single letters that only
# become standalone tokens once the first pass has stripped adjacent digits
# (presumably the reason for the repeated call).
df['preprocessed_text'] = df['text'].apply(preprocess).apply(preprocess)

usulan personil proposed key personnel surat kontraktor no jtb rj pj tertanggal oktober perihal usulan personil pengganti personil diajukan fase tender perusahaan tambahan penjelasan kontraktor perusahaan memahami pergantian personil disebabkan nya durasi pengumuman pemenang lelang tanggal efektif kontrak perusahaan mengharapkan personil pengganti memiliki kualifikasi minimal kualifikasi personil diajukan fase tender perusahaan mengharapkan kontraktor perubahan organisasi diajukan fase tender organisasi personil diajukan fase tender salah faktor menentukan penilaian teknis penawaran diinformasikan perusahaan assessment personil pengganti diusulkan kandidat diterima personil pengganti informasi telah fungsi fungsi terkait konsorsium rjj email tersendiri rangka kontraktor prosedur dilaksanakan persetujuan tertulis perusahaan kontraktor kesempatan membuat perbandingan organisasi surat diatas organisasi diajukan fase tender menyampaikannya perusahaan penjelasan alasan perbedaan organisasi 

In [4]:
# Rank the stored entries of a COO matrix, highest value first
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of ``coo_matrix`` sorted by value, descending.

    Ties on value are broken by column index, larger index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` ranked items to ``{feature name: score}``.

    ``sorted_items`` is a sequence of ``(feature index, score)`` pairs, already
    ordered best-first. Each score is rounded to 3 decimals and keyed by
    ``feature_names[index]``; the dict keeps the ranking order.
    """
    return {
        feature_names[col]: round(weight, 3)
        for col, weight in sorted_items[:topn]
    }

def get_top_phrase(corpus, n=None):
    """Return the most frequent 2- and 3-word phrases of a single text.

    ``corpus`` is treated as one document. At most 2000 phrases are kept by
    the vectorizer. The result is a list of ``(phrase, count)`` tuples sorted
    by count, descending, cut to the first ``n`` entries (all when ``n`` is
    None).
    """
    phrase_counter = CountVectorizer(ngram_range=(2, 3), max_features=2000)
    phrase_counter.fit([corpus])
    counts = phrase_counter.transform([corpus])
    totals = counts.sum(axis=0)

    ranked = [
        (phrase, totals[0, col])
        for phrase, col in phrase_counter.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [5]:
from openpyxl import load_workbook

def write_excel(df, sheet_name, filename):
    """
    Write ``df`` to the sheet ``sheet_name`` of the Excel workbook ``filename``.

    If the workbook already exists it is opened in append mode, so its other
    sheets are kept and a sheet with the same name is replaced. If it does not
    exist, a new workbook holding only this sheet is created; a missing parent
    directory is created first.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write (the index is not written).
    sheet_name : str
        Name of the target sheet.
    filename : str or path-like
        Path of the ``.xlsx`` file.
    """
    import os  # function-scope import: the notebook's import cell does not provide it

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # `if_sheet_exists` is only accepted in append mode, and append mode needs
    # an existing file, so the writer options depend on whether the file exists.
    if os.path.exists(filename):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_options = {'mode': 'w'}

    # The context manager saves and closes the workbook. This replaces the
    # manual `writer.book = ...` / `writer.save()` sequence, which current
    # pandas no longer supports, and the un-imported `Workbook()` fallback.
    with pd.ExcelWriter(filename, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

In [6]:
# Number of top-ranked terms kept per document
n_tfidf = 3

# Count uni- to tri-grams, capped at 10000 features; max_df=0.8 ignores terms
# whose document frequency exceeds 80% of the documents
cv = CountVectorizer(ngram_range=(1, 3), max_df=0.8, max_features=10000)
X = cv.fit_transform(df['preprocessed_text'])

# Learn the IDF weights from the count matrix
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True).fit(X)

# Vocabulary terms by column index
# (the older .get_feature_names() method raised an error here)
feature_names = cv.get_feature_names_out()

In [7]:
# Keyword-only table: the top `n_tfidf` terms of every document, one row each.
# NOTE: the following cell rebuilds `predict_tfidf` from scratch with scores added.
keyword_rows = []
for doc in df['preprocessed_text']:
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, n_tfidf)
    # The dict keys are already ordered best-first, so they form the row directly
    keyword_rows.append(list(keywords))

# Build the frame once: concatenating inside the loop copied the table on every
# iteration and left every row with index label 0.
predict_tfidf = pd.DataFrame(keyword_rows)

In [8]:
# Top `n_tfidf` keywords and their tf-idf scores for every document, one row each.
# Column names follow n_tfidf: key_1..key_n followed by score_1..score_n.
key_cols = [f'key_{rank}' for rank in range(1, n_tfidf + 1)]
score_cols = [f'score_{rank}' for rank in range(1, n_tfidf + 1)]

prediction_rows = []
for doc in df['preprocessed_text']:
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, n_tfidf)

    # Pad short results so a document with fewer than n_tfidf scored terms
    # still yields a full-width row (missing key -> None, missing score -> NaN).
    n_missing = n_tfidf - len(keywords)
    prediction_rows.append(
        list(keywords.keys()) + [None] * n_missing
        + list(keywords.values()) + [float('nan')] * n_missing
    )

# Build the frame once instead of concatenating inside the loop
predict_tfidf = pd.DataFrame(prediction_rows, columns=key_cols + score_cols)
predict_tfidf[score_cols] = predict_tfidf[score_cols].round(2)
predict_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3
0,fase tender,personil,personil pengganti,0.34,0.33,0.3
1,template,document,pengelolaan dokumen,0.32,0.2,0.16
2,ruangan kantor,ruangan,kantor,0.32,0.28,0.24


EVALUATION

In [9]:
from utils.f_evaluation import check_similarity, eval

# Gold-standard keywords (columns k1..k7) for every document.
# The file is read into its own frame so that `df`, which carries the derived
# 'text' and 'preprocessed_text' columns used by the cells above, is not
# overwritten; those cells can then be re-run after this one.
path = "./data/dataset_ekstraksi_r27.xlsx"
gold_df = pd.read_excel(path)
targets = gold_df[["k1", "k2", "k3", "k4", "k5", "k6", "k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [10]:
# Score the TF-IDF keywords against the gold keywords
match_cols = ['key_1', 'key_2', 'key_3']
predict_tfidf_list = predict_tfidf[match_cols].values.tolist()

# `eval` here is the project helper imported from utils.f_evaluation, not the builtin
eval_tfidf = eval(predict_tfidf_list, targets, True)
eval_tfidf = eval_tfidf.round(3)
eval_tfidf.columns = match_cols + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']

# Keep only the per-keyword results and the flexible scores to simplify the evaluation output
eval_tfidf = eval_tfidf[match_cols + ['flex_recall', 'flex_prec']]
eval_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,partial_match,partial_match,no_match,0.286,0.667
2,no_match,no_match,partial_match,0.143,0.333


In [11]:
# Calculate the TF-IDF scores using the flexible metrics:
# exact match = 1, partial match = 1, no match = 0
tfidf_recall = eval_tfidf['flex_recall'].mean()
tfidf_prec = eval_tfidf['flex_prec'].mean()

# F1 is the harmonic mean of precision and recall; it is defined as 0 when
# both are 0 so the cell does not divide by zero.
precision_plus_recall = tfidf_prec + tfidf_recall
if precision_plus_recall:
    tfidf_f1 = 2 * (tfidf_prec * tfidf_recall) / precision_plus_recall
else:
    tfidf_f1 = 0.0

# Create a DataFrame with the scores
summary = pd.DataFrame({'tfidf': [tfidf_recall, tfidf_prec, tfidf_f1]}, index=['recall', 'precision', 'F1'])
summary = summary.round(3)
summary

Unnamed: 0,tfidf
recall,0.13
precision,0.303
F1,0.182


In [12]:
# Place predictions, gold keywords and evaluation columns side by side.
# NOTE: eval_tfidf also has key_1..key_3 columns, so those labels appear twice in the result.
frames_side_by_side = [predict_tfidf, df_targets, eval_tfidf]
predict_tfidf = pd.concat(frames_side_by_side, axis="columns")
predict_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3,0,1,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,fase tender,personil,personil pengganti,0.34,0.33,0.3,persetujuan tertulis,prosedur,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,template,document,pengelolaan dokumen,0.32,0.2,0.16,template document,exhibit c,acuan,pengelolaan,dokumen,,,partial_match,partial_match,no_match,0.286,0.667
2,ruangan kantor,ruangan,kantor,0.32,0.28,0.24,ruang kantor,change inquiry,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,partial_match,0.143,0.333


In [None]:
# Save the combined prediction / evaluation table to the 'tfidf' sheet of the results workbook
sheet_name = 'tfidf'
output_file = 'result/02_phrase_prediction.xlsx'
write_excel(df=predict_tfidf, sheet_name=sheet_name, filename=output_file)