Combine PKE textrank and word2vec - unigram

1. Imports & Setup

In [18]:
#1. rutin1 import module
import pandas as pd
import os
import sys
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(repo_root)

2. Load Dataset

In [20]:
#3. rutin3 Load the dataset
dataset_path = os.path.join(repo_root, "notebooks/postager_nlp-id/dataset_ekstraksi_r29_pos_sm.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]
df_pos = df['pos_sentence_list']

In [21]:
# Preprocess
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    with open(stopwords_path, 'r') as f:
        stopwords = [line.strip() for line in f]

    dictionary = ArrayDictionary(stopwords)
    str = StopWordRemover(dictionary)
    text = str.remove(text)

    return text

df['preprocessed_text'] = df['text'].apply(preprocess)
df_tr = df['preprocessed_text'].apply(preprocess)

3. Process

In [22]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

def extract_keyphrases_with_embeddings(text, w2v_model, n=3):
    # Get words that have embeddings
    words = [word for word in text.split() if word in w2v_model.wv.key_to_index]
    word_embeddings = [w2v_model.wv[word] for word in words]

    # If no word embeddings are present, return an empty list
    if not word_embeddings:
        return []
    
    # Compute the cosine similarity between word embeddings
    cosine_matrix = cosine_similarity(word_embeddings)
    
    # Create a graph and connect words with high similarity
    G = nx.Graph()
    for i in range(len(words)):
        for j in range(len(words)):
            if cosine_matrix[i][j] > 0.5:  # This threshold can be adjusted
                G.add_edge(words[i], words[j], weight=cosine_matrix[i][j])
    
    # Compute the PageRank scores to rank the words
    scores = nx.pagerank(G)

    # Extract top N keyphrases along with their scores
    ranked_words = sorted(((scores[word], word) for word in words if word in scores), reverse=True)
    keyphrases_with_scores = [(word, score) for score, word in ranked_words[:n]]

    return keyphrases_with_scores


In [23]:
w2v_path = os.path.join(repo_root, "models/w2v/idwiki_word2vec_100_new_lower.model")
w2v_model = Word2Vec.load(w2v_path)

multitext

In [24]:
predict_textrank = pd.DataFrame()
for i in df_tr.index:
    #text = df["text"][i]
    text = df_tr[i]
    keyphrases = extract_keyphrases_with_embeddings(text, w2v_model, 3)
    df_keyphrases = pd.DataFrame(keyphrases, columns=['Keyword', 'Score'])
    a = pd.DataFrame(df_keyphrases.Keyword).T.reset_index(drop=True)
    b = pd.DataFrame(df_keyphrases.Score).round(3).T.reset_index(drop=True)
    df_keyphrases = pd.concat([a, b], axis=1)
    df_keyphrases.columns = ['key_1', 'key_2','key_3','score_1', 'score_2','score_3']
    predict_textrank = pd.concat([predict_textrank, df_keyphrases], ignore_index=True)
predict_textrank


Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3
0,penilaian,penjelasan,penjelasan,0.021,0.02,0.02
1,facilities,processing,document,0.062,0.062,0.056
2,akan,ruang,provided,0.023,0.023,0.021
3,pengadaan,control,instrument,0.054,0.052,0.049
4,persetujuan,bp,request,0.052,0.043,0.043
5,facilities,soil,soil,0.055,0.053,0.053
6,mitigation,analysis,solusi,0.026,0.022,0.022
7,memperhatikan,ketentuan,ketentuan,0.029,0.024,0.024
8,ketentuan,ketentuan,ketentuan,0.023,0.023,0.023
9,mekanisme,regarding,perizinan,0.019,0.018,0.018


EVALUATION

In [25]:
from utils import eval

targets = df[["k1", "k2", "k3","k4", "k5", "k6","k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [26]:
# Evaluation TextRank
predict_textrank_list = predict_textrank[['key_1','key_2','key_3']].values.tolist()
eval_textrank = eval(predict_textrank_list, targets, True).round(3)
eval_textrank.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank = eval_textrank[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,no_match,no_match,partial_match,0.143,0.333
2,no_match,partial_match,no_match,0.143,0.333


In [27]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall = eval_textrank['flex_recall'].mean()
textrank_prec = eval_textrank['flex_prec'].mean()
textrank_f1 = 2 * (textrank_prec * textrank_recall) / (textrank_prec + textrank_recall)

# Create a DataFrame with the scores
summary = pd.DataFrame({'textrank': [textrank_recall, textrank_prec, textrank_f1]}, index=['recall', 'precision', 'F1'])
summary = summary.round(3)
summary

Unnamed: 0,textrank
recall,0.057
precision,0.133
F1,0.08


In [28]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank = pd.concat([predict_textrank, df_targets, eval_textrank], axis=1)
predict_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3,0,1,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,penilaian,penjelasan,penjelasan,0.021,0.02,0.02,persetujuan tertulis,prosedur,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,facilities,processing,document,0.062,0.062,0.056,template document,exhibit c,acuan,pengelolaan,dokumen,,,no_match,no_match,partial_match,0.143,0.333
2,akan,ruang,provided,0.023,0.023,0.021,ruang kantor,change inquiry,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,partial_match,no_match,0.143,0.333


In [29]:
# Write predictions to excel file
from utils import write_excel

sheet_name = 'w2v_tr_unigram'
output_file = 'w2v_textrank_unigram.xlsx'
write_excel(predict_textrank, sheet_name, output_file)

  writer.book = book
  writer.save()
