In [34]:
#1. rutin1 import module
import pandas as pd
import os
import sys
import warnings
import matplotlib.pyplot as plt
#from sklearn.metrics.pairwise import cosine_similarity

warnings.simplefilter(action='ignore', category=UserWarning)

In [35]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(repo_root)

In [36]:
#3. rutin3 Load the dataset
#dataset_path = os.path.join(repo_root, "notebooks/postager_nlp-id/dataset_ekstraksi_r29_pos_sm.xlsx")
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r30_lg.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]

In [37]:
# tuning paramater
tuning_multiplier = 1  #aktor pengali dari score jika kata tersebut merupakan frase. default = 1 (perlu variasi 0.6 - 0.75)
tuning_f_phrase = 3  #score minimum utk bisa disebut frase
m_prediksi = 10  #jumlah top -n keyword prediksi
n_top_phrase = 3   #jumlah frase yg akan di cari dalam fungsi get_top_phrase

In [38]:
# Preprocess
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    #stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    stopwords_path = os.path.join(repo_root, "notebooks/stopwords_tuning/all_stop_words.txt")
    with open(stopwords_path, 'r') as f:
        stopwords = [line.strip() for line in f]

    dictionary = ArrayDictionary(stopwords)
    str = StopWordRemover(dictionary)
    text = str.remove(text)

    return text

df['preprocessed_text'] = df['text'].apply(preprocess)
df_tr = df['preprocessed_text'].apply(preprocess)

In [39]:
import numpy as np
import math
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer

def build_graph(vocab_len, processed_text, vocabulary):
    """
    Builds a weighted edge graph based on co-occurrences of words in the text.
    + perlu ada tambahan formula untuk menghitung score kata yg ada dalam title menjadi lebih besar. (1, 1.5, 2)
    """
    weighted_edge = np.zeros((vocab_len, vocab_len), dtype=np.float32)
    score = np.ones((vocab_len), dtype=np.float32)
    window_size = 3  
    covered_coocurrences = []

    for i in range(vocab_len):
        for j in range(vocab_len):
            if j == i:
                weighted_edge[i][j] = 0
            else:
                for window_start in range(len(processed_text) - window_size):
                    window_end = window_start + window_size
                    window = processed_text[window_start:window_end]
                    if (vocabulary[i] in window) and (vocabulary[j] in window):
                        index_of_i = window_start + window.index(vocabulary[i])
                        index_of_j = window_start + window.index(vocabulary[j])
                        if [index_of_i,index_of_j] not in covered_coocurrences:
                            weighted_edge[i][j] += 1 / math.fabs(index_of_i - index_of_j)
                            covered_coocurrences.append([index_of_i, index_of_j])

    inout = np.sum(weighted_edge, axis=1)
  
    MAX_ITERATIONS = 50
    d = 0.85
    threshold = 0.0001
    for _ in range(MAX_ITERATIONS):
        prev_score = np.copy(score)
        for i in range(vocab_len):
            summation = 0
            for j in range(vocab_len):
                if weighted_edge[i][j] != 0:
                    summation += (weighted_edge[i][j] / inout[j]) * score[j]
            score[i] = (1 - d) + d * summation
        if np.sum(np.fabs(prev_score - score)) <= threshold:
            break

    return vocabulary, score

def score_phrases(unique_phrases, vocabulary, score, multiplier=tuning_multiplier):
    """
    Computes the score of each phrase using the given vocabulary, word scores, and multiplier.
    """
    phrase_scores = []
    keywords = []
    for phrase in unique_phrases:
        phrase_score = 0
        keyword = ''
        for word in phrase:
            keyword += str(word) + " "
            phrase_score += score[vocabulary.index(word)]
        phrase_score *= multiplier
        phrase_scores.append(phrase_score)
        keywords.append(keyword.strip())

    return keywords, phrase_scores


def get_top_phrase(corpus, n=n_top_phrase):  #perlu ada improvement karena phrase yg di hasilkan masih blm proper
    vec1 = CountVectorizer(ngram_range=(2,3),  
            max_features=2000).fit([corpus])
    bag_of_words = vec1.transform([corpus])
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     
                  vec1.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], 
                reverse=True)
    # perlu di buat filter jika pola tidak mengikuti kaidah kata majemuk indonesia di excludekan.
    return words_freq[:n]

def predict_keywords(text, m=10, f_phrase=5, tuning_multiplier=1):
    """
    Predicts the top m keywords and top f_phrase phrases for the given text.
    processed_text = text keseluruhan
    vocabulary = unique word dalam proccesesed_text
    """
    processed_text = word_tokenize(text)
    vocabulary = list(set(processed_text))
    vocab_len = len(vocabulary)
    vocabulary, score = build_graph(vocab_len, processed_text, vocabulary)
    unigram = pd.DataFrame({
        'Keyword': vocabulary,
        'Score': score
    }).nlargest(m, 'Score')
    
    bi_trigram = pd.DataFrame(get_top_phrase(text, n=50), columns=['Phrase', 'Score'])
    bi_trigram = bi_trigram[bi_trigram['Score'] >= f_phrase]
    bi_trigram['Tokens'] = bi_trigram['Phrase'].apply(word_tokenize)
    unique_phrases = bi_trigram['Tokens'].values.tolist()
    keywords, phrase_scores = score_phrases(unique_phrases, vocabulary, score, tuning_multiplier) #BUG_1 not accesed by pylance, krn tidak di gunakan di procss selanjutnya
    # memasukan score ke dalam dataframe
    bi_trigram = pd.DataFrame({
        'Phrase': keywords,
        'Score': phrase_scores
    }).nlargest(m, 'Score')

      # Combine unigram and bi_trigram dataframes
    predict_keywords = pd.concat([unigram, bi_trigram[['Phrase', 'Score']].rename(columns={'Phrase': 'Keyword'})])\
                    .sort_values('Score', ascending=False)\
                    .nlargest(m, 'Score')\
                    .reset_index(drop=True)

    return predict_keywords

In [40]:
predict_textrank = pd.DataFrame()
data_ind = 778
print('Processing index', data_ind, end='...! ')
keyphrase = predict_keywords(df_tr[data_ind], m_prediksi, tuning_f_phrase, tuning_multiplier).reset_index(drop=True)
print(keyphrase)
a = pd.DataFrame(keyphrase.Keyword).T.reset_index(drop=True)
print(a)
b = pd.DataFrame(keyphrase.Score).round(2).T.reset_index(drop=True)
print(b)
keyphrase = pd.concat([a, b], axis=1)
print('keyphrase.shape[1] :', keyphrase.shape[1])
    
# Ensure that keyphrase has the same number of columns as max_columns by filling in with NaN
max_columns = 20
missing_cols = max_columns - keyphrase.shape[1]
print(missing_cols)

if missing_cols > 0:
    # diisi Nan
    #nan_cols = pd.DataFrame(np.nan, index=keyphrase.index, columns=[f'col{col}' for col in range(keyphrase.shape[1], max_columns)])
    #keyphrase = pd.concat([keyphrase, nan_cols], axis=1)

    # diisi 0
    zero_cols = pd.DataFrame(11, index=keyphrase.index, columns=[f'col{col}' for col in range(keyphrase.shape[1], max_columns)])
    keyphrase = pd.concat([keyphrase, zero_cols], axis=1)

predict_textrank = pd.concat([predict_textrank, keyphrase], ignore_index=True)
print('Done')

Processing index 778...!                  Keyword     Score
0      rockwool malaysia  5.646587
1                usa aml  4.865901
2                    aml  3.340655
3               rockwool  3.226769
4  approved manufacturer  3.114580
5               malaysia  2.419818
6             insulation  2.122571
7                lapinus  1.846596
8               approved  1.558757
9           manufacturer  1.555823
                   0        1    2         3                      4         5  \
0  rockwool malaysia  usa aml  aml  rockwool  approved manufacturer  malaysia   

            6        7         8             9  
0  insulation  lapinus  approved  manufacturer  
      0     1     2     3     4     5     6     7     8     9
0  5.65  4.87  3.34  3.23  3.11  2.42  2.12  1.85  1.56  1.56
keyphrase.shape[1] : 20
0
Done


In [41]:
predict_textrank

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1
0,rockwool malaysia,usa aml,aml,rockwool,approved manufacturer,malaysia,insulation,lapinus,approved,manufacturer,5.65,4.87,3.34,3.23,3.11,2.42,2.12,1.85,1.56,1.56


In [42]:
print(df_tr[976])

pembayaran rekening tambahan side letter sehubungan bersama memohon membayarkan tagihan sebagai terinci tabel dapat rekening tambahan side letter cl inv iii march cover letter invoice inv pp rek usd march correct invoice progress feb inv mp rek usd march correct invoice payment milestone inv cco rek usd march correct invoice dapat perhatian ucapkan


In [43]:
# Predict keywords for all sentences in the dataframe and save the in dataframe
#from utils import build_graph, score_phrases, get_top_phrase, predict_keywords, write_excel
predict_textrank = pd.DataFrame()
for i in df_tr.index: # ada error di index 394
#for i in df.loc[970:].index:
    print('Processing index', i, end='...! ')
    keyphrase = predict_keywords(df_tr[i], m_prediksi, tuning_f_phrase, tuning_multiplier).reset_index(drop=True)
    a = pd.DataFrame(keyphrase.Keyword).T.reset_index(drop=True)
    b = pd.DataFrame(keyphrase.Score).round(2).T.reset_index(drop=True)
    keyphrase = pd.concat([a, b], axis=1)
    
    # Ensure that keyphrase has the same number of columns as max_columns by filling in with NaN
    max_columns = 20
    missing_cols = max_columns - keyphrase.shape[1]
    if missing_cols > 0:
        nan_cols = pd.DataFrame(np.nan, index=keyphrase.index, columns=[f'col{col}' for col in range(keyphrase.shape[1], max_columns)])
        keyphrase = pd.concat([keyphrase, nan_cols], axis=1)

    predict_textrank = pd.concat([predict_textrank, keyphrase], ignore_index=True)
    print('Done')
#predict_textrank.columns = ['key_1', 'key_2','key_3','score_1', 'score_2','score_3'] 
predict_textrank.columns = ['key_1', 'key_2','key_3', 'key_4', 'key_5','key_6', 'key_7', 'key_8','key_9','key_10','score_1', 'score_2','score_3','score_4', 'score_5','score_6','score_7', 'score_8','score_9','score_10'] 

Processing index 0...! Done
Processing index 1...! Done
Processing index 2...! Done
Processing index 3...! Done
Processing index 4...! Done
Processing index 5...! Done
Processing index 6...! Done
Processing index 7...! Done
Processing index 8...! Done
Processing index 9...! Done
Processing index 10...! Done
Processing index 11...! Done
Processing index 12...! Done
Processing index 13...! Done
Processing index 14...! Done
Processing index 15...! Done
Processing index 16...! Done
Processing index 17...! Done
Processing index 18...! Done
Processing index 19...! Done
Processing index 20...! Done
Processing index 21...! Done
Processing index 22...! Done
Processing index 23...! Done
Processing index 24...! Done
Processing index 25...! Done
Processing index 26...! Done
Processing index 27...! Done
Processing index 28...! Done
Processing index 29...! Done
Processing index 30...! Done
Processing index 31...! Done
Processing index 32...! Done
Processing index 33...! Done
Processing index 34...! 

EVALUATION

In [44]:
from utils import eval

targets = df[["k1", "k2", "k3","k4", "k5", "k6","k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [45]:
# Evaluation TextRank top 10
predict_textrank_list_10 = predict_textrank[['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10']].values.tolist()
eval_textrank_10 = eval(predict_textrank_list_10, targets, True).round(3)
eval_textrank_10.columns = ['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_10 = eval_textrank_10[['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,flex_recall,flex_prec
0,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.0,0.0
1,partial_match,partial_match,full_match,full_match,no_match,no_match,no_match,no_match,no_match,no_match,0.571,0.4
2,no_match,partial_match,no_match,no_match,no_match,no_match,no_match,partial_match,no_match,no_match,0.286,0.2


In [46]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_10 = eval_textrank_10['flex_recall'].mean()
textrank_prec_10 = eval_textrank_10['flex_prec'].mean()
textrank_f1_10 = 2 * (textrank_prec_10 * textrank_recall_10) / (textrank_prec_10 + textrank_recall_10)

# Create a DataFrame with the scores
summary_10 = pd.DataFrame({'textrank': [textrank_recall_10, textrank_prec_10, textrank_f1_10]}, index=['recall', 'precision', 'F1'])
summary_10 = summary_10.round(3)
summary_10

Unnamed: 0,textrank
recall,0.318
precision,0.222
F1,0.261


In [47]:
# Evaluation TextRank top 5
predict_textrank_list_5 = predict_textrank[['key_1','key_2','key_3', 'key_4','key_5']].values.tolist()
eval_textrank_5 = eval(predict_textrank_list_5, targets, True).round(3)
eval_textrank_5.columns = ['key_1','key_2','key_3', 'key_4','key_5','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_5 = eval_textrank_5[['key_1','key_2','key_3', 'key_4','key_5', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,flex_recall,flex_prec
0,no_match,no_match,no_match,no_match,no_match,0.0,0.0
1,partial_match,partial_match,full_match,full_match,no_match,0.571,0.8
2,no_match,partial_match,no_match,no_match,no_match,0.143,0.2


In [48]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_5 = eval_textrank_5['flex_recall'].mean()
textrank_prec_5 = eval_textrank_5['flex_prec'].mean()
textrank_f1_5 = 2 * (textrank_prec_5 * textrank_recall_5) / (textrank_prec_5 + textrank_recall_5)

# Create a DataFrame with the scores
summary_5 = pd.DataFrame({'textrank': [textrank_recall_5, textrank_prec_5, textrank_f1_5]}, index=['recall', 'precision', 'F1'])
summary_5 = summary_5.round(3)
summary_5

Unnamed: 0,textrank
recall,0.189
precision,0.265
F1,0.221


In [49]:
# Evaluation TextRank top 3
predict_textrank_list_3 = predict_textrank[['key_1','key_2','key_3']].values.tolist()
eval_textrank_3 = eval(predict_textrank_list_3, targets, True).round(3)
eval_textrank_3.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_3 = eval_textrank_3[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,partial_match,partial_match,full_match,0.429,1.0
2,no_match,partial_match,no_match,0.143,0.333


In [50]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_3 = eval_textrank_3['flex_recall'].mean()
textrank_prec_3 = eval_textrank_3['flex_prec'].mean()
textrank_f1_3 = 2 * (textrank_prec_3 * textrank_recall_3) / (textrank_prec_3 + textrank_recall_3)

# Create a DataFrame with the scores
summary_3 = pd.DataFrame({'textrank': [textrank_recall_3, textrank_prec_3, textrank_f1_3]}, index=['recall', 'precision', 'F1'])
summary_3 = summary_3.round(3)
summary_3

Unnamed: 0,textrank
recall,0.116
precision,0.271
F1,0.163


In [51]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_10 = pd.concat([predict_textrank, df_targets, eval_textrank_10], axis=1)
predict_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,key_3.1,key_4.1,key_5.1,key_6.1,key_7.1,key_8.1,key_9.1,key_10.1,flex_recall,flex_prec
0,personil diajukan fase,diajukan fase tender,organisasi diajukan fase,personil pengganti,personil diajukan,fase tender,organisasi diajukan,personil,diajukan fase,tender,...,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.0,0.0
1,document,template,pengelolaan,acuan,demikian,facilities,processing,biru,atas,tiung,...,full_match,full_match,no_match,no_match,no_match,no_match,no_match,no_match,0.571,0.4
2,ruangan kantor,kantor,ruangan,artikel,appendix,usulan,scope,company,dengan,komposisi,...,no_match,no_match,no_match,no_match,no_match,partial_match,no_match,no_match,0.286,0.2


In [52]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_5 = pd.concat([predict_textrank, df_targets, eval_textrank_5], axis=1)
predict_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,4,5,6,key_1.1,key_2.1,key_3.1,key_4.1,key_5.1,flex_recall,flex_prec
0,personil diajukan fase,diajukan fase tender,organisasi diajukan fase,personil pengganti,personil diajukan,fase tender,organisasi diajukan,personil,diajukan fase,tender,...,,,,no_match,no_match,no_match,no_match,no_match,0.0,0.0
1,document,template,pengelolaan,acuan,demikian,facilities,processing,biru,atas,tiung,...,dokumen,,,partial_match,partial_match,full_match,full_match,no_match,0.571,0.8
2,ruangan kantor,kantor,ruangan,artikel,appendix,usulan,scope,company,dengan,komposisi,...,services for company,exhibit a,,no_match,partial_match,no_match,no_match,no_match,0.143,0.2


In [53]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_3 = pd.concat([predict_textrank, df_targets, eval_textrank_3], axis=1)
predict_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,personil diajukan fase,diajukan fase tender,organisasi diajukan fase,personil pengganti,personil diajukan,fase tender,organisasi diajukan,personil,diajukan fase,tender,...,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,document,template,pengelolaan,acuan,demikian,facilities,processing,biru,atas,tiung,...,acuan,pengelolaan,dokumen,,,partial_match,partial_match,full_match,0.429,1.0
2,ruangan kantor,kantor,ruangan,artikel,appendix,usulan,scope,company,dengan,komposisi,...,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,partial_match,no_match,0.143,0.333


In [54]:
# Write predictions to excel file
from utils import write_excel

sheet_name_10 = '1_tr_phrase_countvectorizer_10'
sheet_name_5 = '1_tr_phrase_countvectorizer_5'
sheet_name_3 = '1_tr_phrase_countvectorizer_3'

output_file = '1_tr_phrase_countvectorizer.xlsx'
write_excel(predict_textrank_10, sheet_name_10, output_file)
write_excel(predict_textrank_5, sheet_name_5, output_file)
write_excel(predict_textrank_3, sheet_name_3, output_file)

  writer.book = book
  writer.save()
