In [1]:
import pandas as pd
import re
import numpy as np
import math
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

In [2]:
# tuning paramater
tuning_multiplier = 1  #aktor pengali dari score jika kata tersebut merupakan frase. default = 1 (perlu variasi 0.6 - 0.75)
tuning_f_phrase = 3  #score minimum utk bisa disebut frase
m_prediksi = 3  #jumlah top -n keyword prediksi
n_top_phrase = 3   #jumlah frase yg akan di cari dalam fungsi get_top_phrase

In [3]:
# Preprocess
def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    with open('./data/stopword_tala_sastrawi.txt', 'r') as f:
        stopword_tala_sastrawi = [line.strip() for line in f]

    dictionary = ArrayDictionary(stopword_tala_sastrawi)
    str = StopWordRemover(dictionary)
    text = str.remove(text)

    return text

In [4]:
df = pd.read_excel('data/dataset_ekstraksi_r27.xlsx', sheet_name='dataset')
df['judul'] = df['judul'].astype(str) + " "
df = df["judul"] + df["isi"]
df = df.apply(preprocess)
df = df.apply(preprocess)

In [5]:
def build_graph(vocab_len, processed_text, vocabulary):
    """
    Builds a weighted edge graph based on co-occurrences of words in the text.
    + perlu ada tambahan formula untuk menghitung score kata yg ada dalam title menjadi lebih besar. (1, 1.5, 2)
    """
    weighted_edge = np.zeros((vocab_len, vocab_len), dtype=np.float32)
    score = np.ones((vocab_len), dtype=np.float32)
    window_size = 3  
    covered_coocurrences = []

    for i in range(vocab_len):
        for j in range(vocab_len):
            if j == i:
                weighted_edge[i][j] = 0
            else:
                for window_start in range(len(processed_text) - window_size):
                    window_end = window_start + window_size
                    window = processed_text[window_start:window_end]
                    if (vocabulary[i] in window) and (vocabulary[j] in window):
                        index_of_i = window_start + window.index(vocabulary[i])
                        index_of_j = window_start + window.index(vocabulary[j])
                        if [index_of_i,index_of_j] not in covered_coocurrences:
                            weighted_edge[i][j] += 1 / math.fabs(index_of_i - index_of_j)
                            covered_coocurrences.append([index_of_i, index_of_j])

    inout = np.sum(weighted_edge, axis=1)
  
    MAX_ITERATIONS = 50
    d = 0.85
    threshold = 0.0001
    for _ in range(MAX_ITERATIONS):
        prev_score = np.copy(score)
        for i in range(vocab_len):
            summation = 0
            for j in range(vocab_len):
                if weighted_edge[i][j] != 0:
                    summation += (weighted_edge[i][j] / inout[j]) * score[j]
            score[i] = (1 - d) + d * summation
        if np.sum(np.fabs(prev_score - score)) <= threshold:
            break

    return vocabulary, score

In [6]:
def score_phrases(unique_phrases, vocabulary, score, multiplier=tuning_multiplier):
    """
    Computes the score of each phrase using the given vocabulary, word scores, and multiplier.
    """
    phrase_scores = []
    keywords = []
    for phrase in unique_phrases:
        phrase_score = 0
        keyword = ''
        for word in phrase:
            keyword += str(word) + " "
            phrase_score += score[vocabulary.index(word)]
        phrase_score *= multiplier
        phrase_scores.append(phrase_score)
        keywords.append(keyword.strip())

    return keywords, phrase_scores

In [7]:
def get_top_phrase(corpus, n=n_top_phrase):  #perlu ada improvement karena phrase yg di hasilkan masih blm proper
    vec1 = CountVectorizer(ngram_range=(2,3),  
            max_features=2000).fit([corpus])
    bag_of_words = vec1.transform([corpus])
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     
                  vec1.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], 
                reverse=True)
    # perlu di buat filter jika pola tidak mengikuti kaidah kata majemuk indonesia di excludekan.
    return words_freq[:n]

In [8]:
def predict_keywords(text, m=5, f_phrase=5, tuning_multiplier=1):
    """
    Predicts the top m keywords and top f_phrase phrases for the given text.
    processed_text = text keseluruhan
    vocabulary = unique word dalam proccesesed_text
    """
    processed_text = word_tokenize(text)
    vocabulary = list(set(processed_text))
    vocab_len = len(vocabulary)
    vocabulary, score = build_graph(vocab_len, processed_text, vocabulary)
    unigram = pd.DataFrame({
        'Keyword': vocabulary,
        'Score': score
    }).nlargest(m, 'Score')
    
    bi_trigram = pd.DataFrame(get_top_phrase(text, n=50), columns=['Phrase', 'Score'])
    bi_trigram = bi_trigram[bi_trigram['Score'] >= f_phrase]
    bi_trigram['Tokens'] = bi_trigram['Phrase'].apply(word_tokenize)
    unique_phrases = bi_trigram['Tokens'].values.tolist()
    keywords, phrase_scores = score_phrases(unique_phrases, vocabulary, score, tuning_multiplier) #BUG_1 not accesed by pylance, krn tidak di gunakan di procss selanjutnya
    # memasukan score ke dalam dataframe
    bi_trigram = pd.DataFrame({
        'Phrase': keywords,
        'Score': phrase_scores
    }).nlargest(m, 'Score')

      # Combine unigram and bi_trigram dataframes
    predict_keywords = pd.concat([unigram, bi_trigram[['Phrase', 'Score']].rename(columns={'Phrase': 'Keyword'})])\
                    .sort_values('Score', ascending=False)\
                    .nlargest(m, 'Score')\
                    .reset_index(drop=True)

    return predict_keywords

In [9]:
from openpyxl import load_workbook

def write_excel(df, sheet_name, filename):
    """
    Writes the given dataframe to an excel file with the given filename and sheet name.
    If the sheet already exists in the file, the data in the sheet will be overwritten.
    """
    try:
        book = load_workbook(filename)  # Load the existing workbook
    except FileNotFoundError:
        book = Workbook()  # If the file doesn't exist, create a new workbook

    writer = pd.ExcelWriter(filename, engine='openpyxl')
    writer.book = book
    
    if sheet_name in book.sheetnames:  # If sheet already exists, delete it
        idx = book.sheetnames.index(sheet_name)
        sheet = book[sheet_name]
        book.remove(sheet)
        writer.sheets = {ws.title:ws for ws in book.worksheets}
        
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    writer.save()

In [10]:
# Predict keywords for all sentences in the dataframe and save the in dataframe
predict_textrank = pd.DataFrame()
for i in df.index:
    keyphrase = predict_keywords(df[i], m_prediksi, tuning_f_phrase, tuning_multiplier).reset_index(drop=True)
    a = pd.DataFrame(keyphrase.Keyword).T.reset_index(drop=True)
    b = pd.DataFrame(keyphrase.Score).round(2).T.reset_index(drop=True)
    keyphrase = pd.concat([a, b], axis=1)
    #keyphrase.columns = ['key_1', 'key_2','key_3','score_1', 'score_2','score_3'] 

    predict_textrank = pd.concat([predict_textrank, keyphrase], ignore_index=True)
predict_textrank.columns = ['key_1', 'key_2','key_3','score_1', 'score_2','score_3'] 

EVALUATION

In [11]:
from utils.f_evaluation import check_similarity, eval

df = pd.read_excel('data/dataset_ekstraksi_r27.xlsx', sheet_name='dataset')
targets = df[["k1", "k2", "k3","k4", "k5", "k6","k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [12]:
# Evaluation TextRank
predict_textrank_list = predict_textrank[['key_1','key_2','key_3']].values.tolist()
eval_textrank = eval(predict_textrank_list, targets, True).round(3)
eval_textrank.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank = eval_textrank[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,no_match,full_match,no_match,0.143,0.333
2,no_match,no_match,no_match,0.0,0.0


In [13]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall = eval_textrank['flex_recall'].mean()
textrank_prec = eval_textrank['flex_prec'].mean()
textrank_f1 = 2 * (textrank_prec * textrank_recall) / (textrank_prec + textrank_recall)

# Create a DataFrame with the scores
summary = pd.DataFrame({'textrank': [textrank_recall, textrank_prec, textrank_f1]}, index=['recall', 'precision', 'F1'])
summary = summary.round(3)
summary

Unnamed: 0,textrank
recall,0.083
precision,0.193
F1,0.116


In [14]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank = pd.concat([predict_textrank, df_targets, eval_textrank], axis=1)
predict_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3,0,1,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,personil diajukan fase,diajukan fase tender,personil diajukan,9.55,7.33,7.15,persetujuan tertulis,prosedur,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,jtb,dokumen,gpf,2.25,1.62,1.55,template document,exhibit c,acuan,pengelolaan,dokumen,,,no_match,full_match,no_match,0.143,0.333
2,scope of work,ruangan kantor,of work,5.18,4.36,3.75,ruang kantor,change inquiry,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,no_match,0.0,0.0


In [15]:
# Write predictions to excel file
sheet_name = 'tr_phrase'
output_file = 'result/02_phrase_prediction.xlsx'
write_excel(predict_textrank, sheet_name, output_file)