R1 dengan w2v model yg di peroleh dari pretraining model wikipedia indonesia.
yg di process unigram, basis dari paper Yujun

In [10]:
#1. rutin1 import module
import pandas as pd
import os
import sys
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(repo_root)

In [12]:
#3. rutin3 Load the dataset
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r29_sm.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]

In [13]:
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

def preprocess(text):
    '''
    fungsi untuk menghilangkan karakter yg tidak bermakna dan menghilangkan stopword.
    referensi stopword: tala + sastrawi + custom
    '''
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    with open(stopwords_path, 'r') as f:
        stopwords = [line.strip() for line in f]
    
    dictionary = ArrayDictionary(stopwords)
    str = StopWordRemover(dictionary)
    text = str.remove(text) # 2x cleaning stop word
    text = str.remove(text)
    return text

def preprocess_tokenize(text):
    '''
    fungsi untuk memproses text yg sudah di process di preprocess() menjadi token
    dilakukan 3x preprocessing, karena stopword masih sering lewat kalau hanya 1x
    '''
    text = preprocess(text)
    text = preprocess(text)
    text = preprocess(text)
    tokens = text.split()
    tokens = [token for token in tokens if token]  # remove any empty tokens
    return tokens

In [14]:
def gen_word_embeddings(text_row, w2v_model):
    # Get the word embeddings for each word in the preprocessed text
    word_embeddings = {}
    for word in text_row:
        if word in w2v_model.wv:
            word_embeddings[word] = w2v_model.wv[word]
    return word_embeddings

In [15]:
def construct_weighted_graph(word_embeddings):
    # Calculate the cosine similarity matrix between all pairs of word embeddings
    cosine_similarities = cosine_similarity(list(word_embeddings.values()))
    # Construct a weighted graph representation of the text
    graph = {}
    for i, word_i in enumerate(word_embeddings.keys()):
        graph[word_i] = {}
        for j, word_j in enumerate(word_embeddings.keys()):
            if i != j:
                graph[word_i][word_j] = cosine_similarities[i][j]
    return graph

In [16]:
def textrank(graph, d=0.85, max_iter=100, tol=1e-4):
    # Initialize all node scores to 1
    scores = {node: 1 for node in graph.keys()}
    # Iterate until convergence
    for i in range(max_iter):
        old_scores = dict(scores)
        for node_i, neighbors in graph.items():
            # Calculate the new score for node i
            score_i = 1 - d
            for node_j, weight_ij in neighbors.items():
                score_i += d * weight_ij * scores[node_j] / sum(graph[node_j].values())
            scores[node_i] = score_i
        # Check for convergence
        max_diff = max([abs(old_scores[node] - scores[node]) for node in graph.keys()])
        if max_diff < tol:
            break
    return scores

In [17]:
def extract_keywords(w2v_model, text_row, n=10):
    preprocessed_text = preprocess_tokenize(text_row)
    word_embeddings = gen_word_embeddings(preprocessed_text, w2v_model)
    graph = construct_weighted_graph(word_embeddings)
    scores = textrank(graph)
    sorted_words = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    top_n_keywords = [word for word, score in sorted_words[:n]]
    keyword_df = pd.DataFrame({'keywords': top_n_keywords, 'score': [score for word, score in sorted_words[:n]]})

    return keyword_df

In [18]:
from gensim.models import Word2Vec

pred_w2v_tr_tune = pd.DataFrame()
w2v_path = os.path.join(repo_root, "models/w2v/idwiki_word2vec_100_new_lower.model")
w2v_model = Word2Vec.load(w2v_path)
for i in df.index:
    text_row = df['text'][i]
    unigram = extract_keywords(w2v_model, text_row, n=3).reset_index(drop=True)
    a = pd.DataFrame(unigram.keywords).T.reset_index(drop=True)
    b = pd.DataFrame(unigram.score).T.reset_index(drop=True)

    # add extra empty columns to a and b dataframes if necessary
    if a.shape[1] < 3:
        for i in range(3 - a.shape[1]):
            a[f'col{i+1}'] = ''
            b[f'col{i+1}'] = ''
    unigram = pd.concat([a, b], axis=1)
    if unigram.shape[1] < 6:
        for i in range(6 - unigram.shape[1]):
            unigram[f'col{i+1}'] = ''
    unigram.columns = ['key_1', 'key_2','key_3','score_1', 'score_2','score_3']

    pred_w2v_tr_tune = pd.concat([pred_w2v_tr_tune, unigram], ignore_index=True)
    pred_w2v_tr_tune[['score_1', 'score_2', 'score_3']] = pred_w2v_tr_tune[['score_1', 'score_2', 'score_3']].round(3)

pred_w2v_tr_tune

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3
0,penilaian,prosedur,teknis,1.699,1.597,1.426
1,facilities,biru,processing,6.818348e+121,5.64893e+121,1.304287e+121
2,specification,table,services,1.717,1.713,1.63
3,pengadaan,inspeksi,control,2.066,1.873,1.733
4,iwan,bp,hamzah,3.793,2.629,2.614
5,facilities,tiung,conduct,1.279973e+61,1.210797e+61,6.923449000000001e+60
6,itt,sensitivity,mitigation,2.65394e+27,2.272604e+27,2.202527e+27
7,berkoordinasi,aspek,dampak,1.282634e+38,1.204476e+38,1.0958320000000001e+38
8,persyaratan,ketentuan,dibutuhkan,1.764,1.749,1.633
9,perizinan,peraturan,pemerintah,2.165,1.888,1.852


EVALUASI

In [19]:
from utils import eval

targets = df[["k1", "k2", "k3","k4", "k5", "k6","k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [20]:
# Evaluation TextRank
predict_w2v_tr_list = pred_w2v_tr_tune[['key_1','key_2','key_3']].values.tolist()
eval_w2v_textrank = eval(predict_w2v_tr_list, targets, True).round(3)
eval_w2v_textrank.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_w2v_textrank = eval_w2v_textrank[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_w2v_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,full_match,no_match,0.143,0.333
1,no_match,no_match,no_match,0.0,0.0
2,no_match,no_match,partial_match,0.143,0.333


In [21]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
w2v_textrank_recall = eval_w2v_textrank['flex_recall'].mean()
w2v_textrank_prec = eval_w2v_textrank['flex_prec'].mean()
w2v_textrank_f1 = 2 * (w2v_textrank_prec * w2v_textrank_recall) / (w2v_textrank_prec + w2v_textrank_recall)

# Create a DataFrame with the scores
summary = pd.DataFrame({'textrank': [w2v_textrank_recall, w2v_textrank_prec, w2v_textrank_f1]}, index=['recall', 'precision', 'F1'])
summary = summary.round(3)
summary

Unnamed: 0,textrank
recall,0.059
precision,0.138
F1,0.083


In [22]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_w2v_textrank = pd.concat([pred_w2v_tr_tune, df_targets, eval_w2v_textrank], axis=1)
predict_w2v_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3,0,1,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,penilaian,prosedur,teknis,1.699,1.597,1.426,persetujuan tertulis,prosedur,usulan,pengganti,,,,no_match,full_match,no_match,0.143,0.333
1,facilities,biru,processing,6.818348e+121,5.64893e+121,1.304287e+121,template document,exhibit c,acuan,pengelolaan,dokumen,,,no_match,no_match,no_match,0.0,0.0
2,specification,table,services,1.717,1.713,1.63,ruang kantor,change inquiry,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,partial_match,0.143,0.333


In [23]:
# Write predictions to excel file
from utils import write_excel

sheet_name = 'w2v_ia_textrank'
output_file = 'w2v_ia_textrank.xlsx'
write_excel(predict_w2v_textrank, sheet_name, output_file)

  writer.book = book
  writer.save()
