In [1]:
# 1. Routine 1: import modules
import pandas as pd
import os
import sys

In [2]:
# 2. Routine 2: put the repository root (the parent of the current working
#    directory) on sys.path so that modules living there can be imported.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guarded so that re-running this cell does not keep appending duplicates.
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [3]:
# 3. Routine 3: load the dataset and build the text column used downstream.
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r30_lg.xlsx")
df = pd.read_excel(io=dataset_path)
# "text" is the title ("judul") followed by ". " and the body ("isi").
# NOTE(review): a row with a missing judul or isi ends up with NaN in "text".
df["text"] = df["judul"].add(". ").add(df["isi"])

In [4]:
# preprocess data
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# Stop-word removers already built, keyed by the path of their word list.
# Re-running this cell empties the cache, which is how an edited word list
# gets picked up again.
_STOPWORD_REMOVER_CACHE = {}


def _get_stopword_remover(stopwords_path):
    """Return the StopWordRemover for the word list at ``stopwords_path``, building it once per path."""
    remover = _STOPWORD_REMOVER_CACHE.get(stopwords_path)
    if remover is None:
        # One stop word per line; surrounding whitespace is stripped.
        with open(stopwords_path, 'r') as f:
            stopwords = [line.strip() for line in f]
        remover = StopWordRemover(ArrayDictionary(stopwords))
        _STOPWORD_REMOVER_CACHE[stopwords_path] = remover
    return remover


def preprocess(text):
    """Normalise a document and strip stop words from it.

    Steps, in order:
      1. drop stand-alone single ASCII letters;
      2. put a space after every period;
      3. replace every character that is not an ASCII letter or a period
         with a space;
      4. lowercase;
      5. collapse each run of digits / non-word characters (this includes the
         periods and the whitespace) into a single space and trim the ends;
      6. remove the stop words listed in
         ``notebooks/stopwords_tuning/all_stop_words.txt`` under ``repo_root``.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned, lower-cased, space-separated text.

    Raises:
        TypeError: from ``re.sub`` if ``text`` is not a string (e.g. NaN).
        OSError: if the stop-word file cannot be opened.
    """
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+", " ", text)
    text = text.strip()

    stopwords_path = os.path.join(repo_root, "notebooks/stopwords_tuning/all_stop_words.txt")
    # The remover is built on first use only; previously the file was re-read
    # and the dictionary rebuilt for every single document.
    return _get_stopword_remover(stopwords_path).remove(text)

# Two passes of preprocess(): the second one runs the same cleaning again on
# the output of the first.
df['preprocessed_text'] = df['text'].apply(preprocess).apply(preprocess)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Count uni-, bi- and tri-grams over the cleaned corpus, capped at 10,000
# features and skipping terms whose document frequency exceeds 80%.
cv = CountVectorizer(ngram_range=(1, 3), max_features=10000, max_df=0.8)
X = cv.fit_transform(df['preprocessed_text'])

# Learn the (smoothed) IDF weights from the count matrix.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True).fit(X)

# Vocabulary terms by feature index. get_feature_names_out() is used because
# the previously used get_feature_names() raised an error.
feature_names = cv.get_feature_names_out()

# Number of keywords requested from extract_topn_from_vector per document.
n_tfidf = 10

In [6]:
from utils import sort_coo, extract_topn_from_vector

# Layout of the prediction table: n_tfidf keyword columns followed by the
# n_tfidf matching score columns. The names follow n_tfidf instead of being
# hard-coded for ten keywords.
key_columns = [f'key_{rank}' for rank in range(1, n_tfidf + 1)]
score_columns = [f'score_{rank}' for rank in range(1, n_tfidf + 1)]

# Rows are collected in a plain list and turned into a DataFrame once, instead
# of growing a DataFrame with pd.concat on every iteration.
prediction_rows = []
for doc in df['preprocessed_text']:
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

    sorted_items = sort_coo(tf_idf_vector.tocoo())
    # Assumes a mapping keyword -> score in rank order, which is how the
    # result was consumed before (DataFrame.from_dict(..., orient='index'))
    # - TODO confirm against utils.extract_topn_from_vector.
    keywords = extract_topn_from_vector(feature_names, sorted_items, n_tfidf)

    terms = list(keywords.keys())[:n_tfidf]
    scores = list(keywords.values())[:n_tfidf]
    # A document may yield fewer than n_tfidf keywords; pad both halves so
    # every row has the same width and keys stay aligned with their scores.
    padding = [None] * (n_tfidf - len(terms))
    prediction_rows.append(terms + padding + scores + padding)

predict_tfidf = pd.DataFrame(prediction_rows, columns=key_columns + score_columns)
# Scores are shown with two decimals; padded cells become NaN.
predict_tfidf[score_columns] = predict_tfidf[score_columns].astype(float).round(2)
predict_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,score_1,score_2,score_3,score_4,score_5,score_6,score_7,score_8,score_9,score_10
0,fase tender,diajukan fase tender,diajukan fase,personil,personil pengganti,organisasi,fase,tender,pengganti,diajukan,0.32,0.32,0.32,0.31,0.28,0.27,0.24,0.24,0.21,0.18
1,template,document,ctr exhibit coordination,ctr exhibit,ctr,pada jambaran tiung,pada jambaran,acuan pengelolaan,processing facilities,project mengacu,0.4,0.25,0.22,0.22,0.21,0.21,0.21,0.21,0.2,0.19
2,ruangan kantor,ruangan,kantor,artikel,appendix,disediakan lokasi,accommodation,lingkup scope,komposisi,scope,0.38,0.34,0.29,0.22,0.2,0.19,0.19,0.18,0.18,0.17


Evaluation

In [7]:
from utils import check_similarity, eval

# Reference keywords: dataset columns k1..k7, one list per document, plus the
# same values as a DataFrame for the combined reports.
target_columns = [f"k{position}" for position in range(1, 8)]
targets = df[target_columns].values.tolist()
df_targets = pd.DataFrame(data=targets)

In [8]:
# Evaluation of the TF-IDF predictions, top 10 keywords.
# `eval` is the helper imported from utils, not the Python builtin.
key_cols_10 = [f'key_{rank}' for rank in range(1, 11)]
predict_tfidf_list_10 = predict_tfidf.loc[:, key_cols_10].values.tolist()
eval_tfidf_10 = eval(predict_tfidf_list_10, targets, True).round(3)
eval_tfidf_10.columns = key_cols_10 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the per-keyword results and the flexible metrics to simplify the
# evaluation table.
eval_tfidf_10 = eval_tfidf_10[key_cols_10 + ['flex_recall', 'flex_prec']]
eval_tfidf_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,flex_recall,flex_prec
0,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,full_match,no_match,0.143,0.1
1,partial_match,partial_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.286,0.2
2,no_match,no_match,partial_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.143,0.1


In [9]:
# Aggregate TF-IDF scores for the top 10 keywords, using the flexible score:
# exact match = 1, partial match = 1, no match = 0.
tfidf_recall_10 = eval_tfidf_10['flex_recall'].mean()
tfidf_prec_10 = eval_tfidf_10['flex_prec'].mean()
# F1 is the harmonic mean of precision and recall. It is defined as 0 when
# both are 0, instead of evaluating 0/0 (NaN plus a runtime warning).
f1_denominator_10 = tfidf_prec_10 + tfidf_recall_10
tfidf_f1_10 = 2 * (tfidf_prec_10 * tfidf_recall_10) / f1_denominator_10 if f1_denominator_10 else 0.0

# Summary table. The column is labelled 'tfidf': these are TF-IDF scores, the
# former 'textrank' label was a leftover from another notebook.
summary_10 = pd.DataFrame({'tfidf': [tfidf_recall_10, tfidf_prec_10, tfidf_f1_10]}, index=['recall', 'precision', 'F1'])
summary_10 = summary_10.round(3)
summary_10

Unnamed: 0,textrank
recall,0.32
precision,0.224
F1,0.263


In [10]:
# Evaluation of the TF-IDF predictions, top 5 keywords.
# `eval` is the helper imported from utils, not the Python builtin.
key_cols_5 = [f'key_{rank}' for rank in range(1, 6)]
predict_tfidf_list_5 = predict_tfidf.loc[:, key_cols_5].values.tolist()
eval_tfidf_5 = eval(predict_tfidf_list_5, targets, True).round(3)
eval_tfidf_5.columns = key_cols_5 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the per-keyword results and the flexible metrics to simplify the
# evaluation table.
eval_tfidf_5 = eval_tfidf_5[key_cols_5 + ['flex_recall', 'flex_prec']]
eval_tfidf_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,flex_recall,flex_prec
0,no_match,no_match,no_match,no_match,no_match,0.0,0.0
1,partial_match,partial_match,no_match,no_match,no_match,0.286,0.4
2,no_match,no_match,partial_match,no_match,no_match,0.143,0.2


In [11]:
# Aggregate TF-IDF scores for the top 5 keywords, using the flexible score:
# exact match = 1, partial match = 1, no match = 0.
tfidf_recall_5 = eval_tfidf_5['flex_recall'].mean()
tfidf_prec_5 = eval_tfidf_5['flex_prec'].mean()
# F1 is the harmonic mean of precision and recall. It is defined as 0 when
# both are 0, instead of evaluating 0/0 (NaN plus a runtime warning).
f1_denominator_5 = tfidf_prec_5 + tfidf_recall_5
tfidf_f1_5 = 2 * (tfidf_prec_5 * tfidf_recall_5) / f1_denominator_5 if f1_denominator_5 else 0.0

# Summary table. The column is labelled 'tfidf': these are TF-IDF scores, the
# former 'textrank' label was a leftover from another notebook.
summary_5 = pd.DataFrame({'tfidf': [tfidf_recall_5, tfidf_prec_5, tfidf_f1_5]}, index=['recall', 'precision', 'F1'])
summary_5 = summary_5.round(3)
summary_5

Unnamed: 0,textrank
recall,0.201
precision,0.281
F1,0.234


In [12]:
# Evaluation of the TF-IDF predictions, top 3 keywords.
# `eval` is the helper imported from utils, not the Python builtin.
key_cols_3 = [f'key_{rank}' for rank in range(1, 4)]
predict_tfidf_list_3 = predict_tfidf.loc[:, key_cols_3].values.tolist()
eval_tfidf_3 = eval(predict_tfidf_list_3, targets, True).round(3)
eval_tfidf_3.columns = key_cols_3 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the per-keyword results and the flexible metrics to simplify the
# evaluation table.
eval_tfidf_3 = eval_tfidf_3[key_cols_3 + ['flex_recall', 'flex_prec']]
eval_tfidf_3.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,partial_match,partial_match,no_match,0.286,0.667
2,no_match,no_match,partial_match,0.143,0.333


In [13]:
# Aggregate TF-IDF scores for the top 3 keywords, using the flexible score:
# exact match = 1, partial match = 1, no match = 0.
tfidf_recall_3 = eval_tfidf_3['flex_recall'].mean()
tfidf_prec_3 = eval_tfidf_3['flex_prec'].mean()
# F1 is the harmonic mean of precision and recall. It is defined as 0 when
# both are 0, instead of evaluating 0/0 (NaN plus a runtime warning).
f1_denominator_3 = tfidf_prec_3 + tfidf_recall_3
tfidf_f1_3 = 2 * (tfidf_prec_3 * tfidf_recall_3) / f1_denominator_3 if f1_denominator_3 else 0.0

# Summary table. The column is labelled 'tfidf': these are TF-IDF scores, the
# former 'textrank' label was a leftover from another notebook.
summary_3 = pd.DataFrame({'tfidf': [tfidf_recall_3, tfidf_prec_3, tfidf_f1_3]}, index=['recall', 'precision', 'F1'])
summary_3 = summary_3.round(3)
summary_3

Unnamed: 0,textrank
recall,0.133
precision,0.309
F1,0.186


In [14]:
# Top-10 report: TF-IDF predictions, reference keywords and evaluation results
# side by side.
# NOTE(review): predict_tfidf and eval_tfidf_10 both carry key_1..key_10, so
# the combined frame has duplicate column labels.
predict_tfidf_10 = pd.concat(
    objs=[predict_tfidf, df_targets, eval_tfidf_10],
    axis="columns",
)
predict_tfidf_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,key_3.1,key_4.1,key_5.1,key_6.1,key_7.1,key_8.1,key_9.1,key_10.1,flex_recall,flex_prec
0,fase tender,diajukan fase tender,diajukan fase,personil,personil pengganti,organisasi,fase,tender,pengganti,diajukan,...,no_match,no_match,no_match,no_match,no_match,no_match,full_match,no_match,0.143,0.1
1,template,document,ctr exhibit coordination,ctr exhibit,ctr,pada jambaran tiung,pada jambaran,acuan pengelolaan,processing facilities,project mengacu,...,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.286,0.2
2,ruangan kantor,ruangan,kantor,artikel,appendix,disediakan lokasi,accommodation,lingkup scope,komposisi,scope,...,partial_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.143,0.1


In [15]:
# Top-5 report: TF-IDF predictions, reference keywords and evaluation results
# side by side.
# NOTE(review): predict_tfidf and eval_tfidf_5 both carry key_1..key_5, so
# the combined frame has duplicate column labels.
predict_tfidf_5 = pd.concat(
    objs=[predict_tfidf, df_targets, eval_tfidf_5],
    axis="columns",
)
predict_tfidf_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,4,5,6,key_1.1,key_2.1,key_3.1,key_4.1,key_5.1,flex_recall,flex_prec
0,fase tender,diajukan fase tender,diajukan fase,personil,personil pengganti,organisasi,fase,tender,pengganti,diajukan,...,,,,no_match,no_match,no_match,no_match,no_match,0.0,0.0
1,template,document,ctr exhibit coordination,ctr exhibit,ctr,pada jambaran tiung,pada jambaran,acuan pengelolaan,processing facilities,project mengacu,...,dokumen,,,partial_match,partial_match,no_match,no_match,no_match,0.286,0.4
2,ruangan kantor,ruangan,kantor,artikel,appendix,disediakan lokasi,accommodation,lingkup scope,komposisi,scope,...,services for company,exhibit a,,no_match,no_match,partial_match,no_match,no_match,0.143,0.2


In [16]:
# Top-3 report: TF-IDF predictions, reference keywords and evaluation results
# side by side.
# NOTE(review): predict_tfidf and eval_tfidf_3 both carry key_1..key_3, so
# the combined frame has duplicate column labels.
predict_tfidf_3 = pd.concat(
    objs=[predict_tfidf, df_targets, eval_tfidf_3],
    axis="columns",
)
predict_tfidf_3.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,fase tender,diajukan fase tender,diajukan fase,personil,personil pengganti,organisasi,fase,tender,pengganti,diajukan,...,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,template,document,ctr exhibit coordination,ctr exhibit,ctr,pada jambaran tiung,pada jambaran,acuan pengelolaan,processing facilities,project mengacu,...,acuan,pengelolaan,dokumen,,,partial_match,partial_match,no_match,0.286,0.667
2,ruangan kantor,ruangan,kantor,artikel,appendix,disediakan lokasi,accommodation,lingkup scope,komposisi,scope,...,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,partial_match,0.143,0.333


In [17]:
# Write predictions to excel file
from utils import write_excel

output_file = 'SE1_tfidf.xlsx'

sheet_name_10 = 'SE1_tfidf_10'
sheet_name_5 = 'SE1_tfidf_5'
sheet_name_3 = 'SE1_tfidf_3'

# One sheet per cut-off (top 10, top 5, top 3), all passed the same output_file.
for report, sheet_name in (
    (predict_tfidf_10, sheet_name_10),
    (predict_tfidf_5, sheet_name_5),
    (predict_tfidf_3, sheet_name_3),
):
    write_excel(report, sheet_name, output_file)

  writer.book = book
  writer.save()
