Combine PKE textrank and word2vec N-gram

V1 Feature
- Membuat jalan program
- Ada deteksi unigram/bigram/trigram

V2
- filter jika kata tidak ada dalam model embedding w2v
- implementasi stopwords

1. Imports & Setup

In [1]:
#1. rutin1 import module
import pandas as pd
import os
import sys
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(repo_root)

2. Load Dataset

In [3]:
#3. rutin3 Load the dataset
dataset_path = os.path.join(repo_root, "notebooks/postager_nlp-id/dataset_ekstraksi_r29_pos_sm.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]
#df_pos = df['pos_sentence_list']

In [4]:
# Preprocess
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    with open(stopwords_path, 'r') as f:
        stopwords = [line.strip() for line in f]

    dictionary = ArrayDictionary(stopwords)
    str = StopWordRemover(dictionary)
    text = str.remove(text)

    return text

df['preprocessed_text'] = df['text'].apply(preprocess)
df_tr = df['preprocessed_text'].apply(preprocess)

3. Process

In [5]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

from nltk.util import ngrams

def generate_ngrams(words, n=2):
    """Generate ngrams from a list of words."""
    return [" ".join(gram) for gram in ngrams(words, n)]

def get_phrase_embedding(phrase, w2v_model):
    """Get the averaged word embedding for a phrase."""
    words = phrase.split()
    embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv.key_to_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return None

from collections import Counter

def extract_keyphrases_with_ngrams(text, w2v_model, n=3):
    # Read stopwords from the file
    stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    with open(stopwords_path, 'r') as file:
        stopwords = set(file.read().strip().splitlines())

    # Tokenize the text into unigrams
    pre_unigrams = text.split()
    unigrams = [word for word in text.split() if word not in stopwords]
    
    # Generate bigrams and trigrams
    bigrams = generate_ngrams(pre_unigrams, 2)
    trigrams = generate_ngrams(pre_unigrams, 3)

    # Count occurrences of bigrams and trigrams
    bigram_counts = Counter(bigrams)
    trigram_counts = Counter(trigrams)

    # Filter bigrams and trigrams that appear more than 2 times
    bigrams = [bigram for bigram in bigrams if bigram_counts[bigram] > 2]
    trigrams = [trigram for trigram in trigrams if trigram_counts[trigram] > 2]

    # Combine unigrams, filtered bigrams, and filtered trigrams
    all_tokens = unigrams + bigrams + trigrams
    
    # Get embeddings for each token (averaging word embeddings for bigrams/trigrams)
    token_embeddings = [get_phrase_embedding(token, w2v_model) for token in all_tokens]
    
    # Filter out tokens that don't have embeddings
    tokens, embeddings = zip(*[(token, emb) for token, emb in zip(all_tokens, token_embeddings) if emb is not None])
    
    # Compute the cosine similarity between token embeddings
    cosine_matrix = cosine_similarity(embeddings)
    
    # Create a graph and connect tokens with high similarity
    G = nx.Graph()
    for i in range(len(tokens)):
        for j in range(len(tokens)):
            if cosine_matrix[i][j] > 0.5:  # This threshold can be adjusted
                G.add_edge(tokens[i], tokens[j], weight=cosine_matrix[i][j])
    
    # Compute the PageRank scores to rank the tokens
    scores = nx.pagerank(G)

    # Extract top N keyphrases along with their scores
    ranked_tokens = sorted(((scores[token], token) for token in tokens if token in scores), reverse=True)
    keyphrases_with_scores = [(token, score) for score, token in ranked_tokens[:n]]

    return keyphrases_with_scores

In [6]:
w2v_path = os.path.join(repo_root, "models/w2v/idwiki_word2vec_100_new_lower.model")
w2v_model = Word2Vec.load(w2v_path)

multitext

In [8]:
text = df["text"][i]
text

'Izin Mendirikan Bangunan "Bangunan Pioneer" - GPF Project\n(Construction Permit for "Pioneer Building" - GPF Project). Terlampir kami sampaikan surat lzin\nMendirikan Bangunan No. 410 Tahun 2017\nyang diterbitkan oleh Dinas Penanaman\nModal dan Pelayanan Terpadu Satu Pintu\nKab: Bojonegoro pada tanggal 29 Desember\n2017, sebagai referensi terkait dengan\npelaksanaan pekerjaan.\nDemikian disampaikan, terima kasih atas\nperhatiannya.'

In [9]:
keyphrases = extract_keyphrases_with_ngrams(text, w2v_model, 3)
keyphrases

[('referensi', 0.5), ('diterbitkan', 0.5)]

In [11]:
predict_textrank = pd.DataFrame()
for i in df_tr.index:
    text = df["text"][i] # sblm di preprocess
    #text = df_tr[i] # setelah di preprocess
    keyphrases = extract_keyphrases_with_ngrams(text, w2v_model, 3)
    df_keyphrases = pd.DataFrame(keyphrases, columns=['Keyword', 'Score'])
    a = pd.DataFrame(df_keyphrases.Keyword).T.reset_index(drop=True)
    b = pd.DataFrame(df_keyphrases.Score).round(3).T.reset_index(drop=True)
    df_keyphrases = pd.concat([a, b], axis=1)

    # Check if there are missing columns and add them with zero values
    missing_columns = 6 - df_keyphrases.shape[1]
    for _ in range(missing_columns):
        df_keyphrases[df_keyphrases.shape[1]] = 0

    df_keyphrases.columns = ['key_1', 'key_2','key_3','score_1', 'score_2','score_3']
    predict_textrank = pd.concat([predict_textrank, df_keyphrases], ignore_index=True)
predict_textrank

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3
0,yang diajukan,yang diajukan,yang diajukan,0.028,0.028,0.028
1,perhatiannya,pengelolaan,acuan,0.333,0.333,0.333
2,ruangan kantor,ruangan kantor,ruangan kantor,0.04,0.04,0.04
3,nya,inspeksi,fungsi,0.167,0.167,0.167
4,tanggai,kunjungan,dilaksanakan,0.25,0.25,0.25
5,conduct,to,the,0.167,0.167,0.167
6,solusi yang,solusi yang,solusi yang,0.039,0.039,0.039
7,memperhatikan,menyetujui,memenuhi,0.058,0.041,0.04
8,memenuhi,memenuhi,memperhatikan,0.043,0.043,0.039
9,regarding,memperhatikan,on,0.04,0.036,0.036


EVALUATION

In [None]:
from utils import eval

targets = df[["k1", "k2", "k3","k4", "k5", "k6","k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [None]:
# Evaluation TextRank
predict_textrank_list = predict_textrank[['key_1','key_2','key_3']].values.tolist()
eval_textrank = eval(predict_textrank_list, targets, True).round(3)
eval_textrank.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank = eval_textrank[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,no_match,no_match,no_match,0.0,0.0
2,no_match,no_match,no_match,0.0,0.0


In [None]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall = eval_textrank['flex_recall'].mean()
textrank_prec = eval_textrank['flex_prec'].mean()
textrank_f1 = 2 * (textrank_prec * textrank_recall) / (textrank_prec + textrank_recall)

# Create a DataFrame with the scores
summary = pd.DataFrame({'textrank': [textrank_recall, textrank_prec, textrank_f1]}, index=['recall', 'precision', 'F1'])
summary = summary.round(3)
summary

Unnamed: 0,textrank
recall,0.005
precision,0.011
F1,0.007


In [None]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank = pd.concat([predict_textrank, df_targets, eval_textrank], axis=1)
predict_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3,0,1,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,yang diajukan pada,yang diajukan pada,yang diajukan pada,0.022,0.022,0.022,persetujuan tertulis,prosedur,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,sampaikan,ucapkan,yang,0.074,0.072,0.071,template document,exhibit c,acuan,pengelolaan,dokumen,,,no_match,no_match,no_match,0.0,0.0
2,dan,dan,dan,0.022,0.022,0.022,ruang kantor,change inquiry,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,no_match,0.0,0.0


In [None]:
# Write predictions to excel file
'''
from utils import write_excel

sheet_name = 'w2v_tr_phrase'
output_file = 'w2v_textrank_ngram_v2.xlsx'
write_excel(predict_textrank, sheet_name, output_file)
'''

  writer.book = book
  writer.save()
