Combine PKE textrank and word2vec N-gram
Feature 

V1 Feature
- Membuat jalan program
- Ada deteksi unigram/bigram/trigram

V2
- filter jika kata tidak ada dalam model embedding w2v
- implementasi stopwords

V3
- deteksi bigram dari frekuensi lebih dari 2x

V4
- detection bigram and trigram using nlp-id
- preprocessing tanpa stopwords

V5
- unigram and trigram di normalize hurufnya
- score jika kata ada dalam judul

V6
- membuat filter Verb dan Noun dari nlp-id


1. Imports & Setup

In [27]:
#1. rutin1 import module
import pandas as pd
import os
import sys
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# 2. Routine 2: put the repository root on sys.path so that the project's
# packages (the ones activated by __init__.py) can be imported from here.
# The root is resolved as two directory levels above the current working directory.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
# Notebook cells are frequently re-run; only append when the path is missing
# so repeated executions do not pile up duplicate sys.path entries.
if repo_root not in sys.path:
    sys.path.append(repo_root)

2. Load Dataset

In [29]:
# 3. Routine 3: read the dataset workbook stored inside the repository
dataset_path = os.path.join(repo_root, "notebooks/postager_nlp-id/dataset_ekstraksi_r29_pos_sm.xlsx")
df = pd.read_excel(dataset_path)
# The full document text is the title ("judul"), a ". " separator, then the body ("isi")
title_with_separator = df["judul"] + ". "
df["text"] = title_with_separator + df["isi"]

#df_pos = df['pos_sentence_list']

In [30]:
# Preprocess
import re

def preprocess(text):
    """Normalise raw text into lowercase ASCII words separated by single spaces.

    Steps, in order: drop stand-alone single ASCII letters, put a space after
    every period, blank out every character that is not an ASCII letter or a
    period, lowercase, collapse every run of digits/non-word characters
    (periods included) into one space, and trim the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_single_letters = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Make sure words on both sides of a period end up separated
    period_spaced = without_single_letters.replace('.', '. ')
    letters_and_periods = re.sub('[^a-zA-Z.]', ' ', period_spaced).lower()
    collapsed = re.sub("(\\d|\\W)+", " ", letters_and_periods)
    return collapsed.strip()

# Cleaned copy of every document's full text (title + body), kept as a separate Series
df_tr = df['text'].apply(preprocess)
# The title column is cleaned in place, so df["judul"] holds the preprocessed title from here on
df["judul"] = df["judul"].apply(preprocess)

3. Process

In [31]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

from nltk.util import ngrams

def generate_ngrams(words, n=2):
    """Build space-joined n-grams from a sequence of words.

    Args:
        words: The words to combine, in order.
        n: Number of words per n-gram (default 2).

    Returns:
        A list of strings, one per n-gram, with the words joined by a space.
    """
    joined_grams = []
    for gram in ngrams(words, n):
        joined_grams.append(" ".join(gram))
    return joined_grams

def get_phrase_embedding(phrase, w2v_model):
    """Average the word vectors of the in-vocabulary words of a phrase.

    Args:
        phrase: Whitespace-separated words.
        w2v_model: Object exposing ``wv`` with ``key_to_index`` membership
            and vector lookup by word.

    Returns:
        The element-wise mean of the vectors of the words found in the
        vocabulary, or ``None`` when none of the words is known.
    """
    vectors = []
    for word in phrase.split():
        keyed_vectors = w2v_model.wv
        # Out-of-vocabulary words are skipped rather than treated as errors
        if word in keyed_vectors.key_to_index:
            vectors.append(keyed_vectors[word])

    if not vectors:
        return None
    return np.mean(vectors, axis=0)

from collections import Counter
from nlp_id.tokenizer import PhraseTokenizer 

def detect_bigram(text):
    """Return the two-word phrases that nlp-id's PhraseTokenizer finds in ``text``.

    The text is first cleaned with ``preprocess``; a phrase counts as a
    bigram when it contains exactly one space.

    Args:
        text: Raw document text.

    Returns:
        A list of two-word phrases, in tokenizer order.
    """
    cleaned = preprocess(text)
    two_word_phrases = []
    for phrase in PhraseTokenizer().tokenize(cleaned):
        # Exactly one space means exactly two words
        if phrase.count(" ") == 1:
            two_word_phrases.append(phrase)
    return two_word_phrases

def detect_trigram(text):
    """Return the three-word phrases that nlp-id's PhraseTokenizer finds in ``text``.

    The text is first cleaned with ``preprocess``; a phrase counts as a
    trigram when it contains exactly two spaces.

    Args:
        text: Raw document text.

    Returns:
        A list of three-word phrases, in tokenizer order.
    """
    cleaned = preprocess(text)
    three_word_phrases = []
    for phrase in PhraseTokenizer().tokenize(cleaned):
        # Exactly two spaces means exactly three words
        if phrase.count(" ") == 2:
            three_word_phrases.append(phrase)
    return three_word_phrases

def extract_keyphrases_with_ngrams(text, w2v_model, judul, n=3):
    """Rank unigram/bigram/trigram candidates of a document with PageRank.

    Candidates are the whitespace-separated words of ``text`` that are not
    stopwords, plus the phrases returned by ``detect_bigram`` and
    ``detect_trigram``. Each candidate is embedded with
    ``get_phrase_embedding``; candidates without an embedding are dropped.
    Candidates whose cosine similarity exceeds 0.5 are linked in a graph,
    PageRank scores the nodes, and candidates found in the title get their
    score doubled.

    Args:
        text: Document text to extract keyphrases from.
        w2v_model: Word embedding model, passed to ``get_phrase_embedding``.
        judul: Iterable of title strings; a candidate is boosted when it is
            a substring of any element.
        n: Number of keyphrases to return (default 3).

    Returns:
        A list of ``(token, score)`` tuples without duplicate tokens, ordered
        from highest to lowest score, holding at most ``n`` entries for
        ``n >= 1``. An empty list is returned when no candidate has an
        embedding.

    Raises:
        OSError: If the stopwords file cannot be opened.
    """
    # Stopwords live in the repository, one word per line
    stopwords_path = os.path.join(repo_root, "notebooks/stopwords_tuning/all_stop_words.txt")
    with open(stopwords_path, 'r', encoding='utf-8') as file:
        stopwords = set(file.read().strip().splitlines())

    # Unigrams come from the text exactly as given (no preprocessing here)
    unigrams = [word for word in text.split() if word not in stopwords]

    # Bigrams and trigrams come from the nlp-id based detectors
    bigrams = detect_bigram(text)
    trigrams = detect_trigram(text)

    all_tokens = unigrams + bigrams + trigrams

    # Keep only the candidates that have an embedding (phrases are averaged)
    embedded = []
    for token in all_tokens:
        embedding = get_phrase_embedding(token, w2v_model)
        if embedding is not None:
            embedded.append((token, embedding))

    # Without this guard the zip(*...) unpacking below raises ValueError
    # for documents in which no candidate is known to the embedding model.
    if not embedded:
        return []

    tokens, embeddings = zip(*embedded)

    # Pairwise cosine similarity between all candidate embeddings
    cosine_matrix = cosine_similarity(embeddings)

    # Link candidates with high similarity. The diagonal (i == j) is included
    # on purpose: a candidate whose self-similarity passes the threshold gets
    # a self-loop and is therefore present in the graph.
    G = nx.Graph()
    for i, source in enumerate(tokens):
        for j, target in enumerate(tokens):
            similarity = cosine_matrix[i][j]
            if similarity > 0.5:  # This threshold can be adjusted
                G.add_edge(source, target, weight=similarity)

    # PageRank scores rank the candidates
    scores = nx.pagerank(G)

    # Double the score of candidates that occur in the title.
    # NOTE(review): this is a substring test against each element of
    # ``judul``; if the elements are single words, multi-word candidates can
    # never match and short candidates match inside longer words - confirm
    # this is the intended behaviour.
    for token in scores:
        if any(token in title for title in judul):
            scores[token] *= 2

    # Highest score first; ties are broken by the token text (descending)
    ranked_tokens = sorted(((scores[token], token) for token in tokens if token in scores), reverse=True)

    keyphrases_with_scores = []
    seen_tokens = set()  # the same token can occur many times in the text

    for score, token in ranked_tokens:
        if token not in seen_tokens:
            keyphrases_with_scores.append((token, score))
            seen_tokens.add(token)
            if len(keyphrases_with_scores) >= n:
                break  # Stop when the desired number of keyphrases is reached

    return keyphrases_with_scores

In [32]:
# Load the Word2Vec model stored under the repository's models/ folder.
# The commented-out line below is the alternative w2v_100 model path.
#w2v_path = os.path.join(repo_root, "models/w2v_100/idwiki_word2vec_100_new_lower.model")
w2v_path = os.path.join(repo_root, "models/w2v_200/idwiki_word2vec_200_new_lower.model")
# NOTE(review): gensim model loading is presumably pickle-based, so only load
# model files from a trusted source - confirm against the gensim documentation.
w2v_model = Word2Vec.load(w2v_path)

multitext

In [34]:
# Run keyphrase extraction on every document and build one prediction row per
# document: the top keyphrases followed by their scores.
key_columns = ['key_1', 'key_2', 'key_3']
score_columns = ['score_1', 'score_2', 'score_3']
top_n = len(key_columns)

rows = []
for i in df_tr.index:
    text = df["text"][i]  # raw text, before preprocessing
    #text = df_tr[i]  # alternative: the preprocessed text
    ls_judul = df["judul"][i].split()
    keyphrases = extract_keyphrases_with_ngrams(text, w2v_model, ls_judul, top_n)

    # Pad to exactly top_n (keyword, score) pairs with zeros. Padding the
    # pairs, rather than appending zero columns after the scores, keeps every
    # keyword and score under its own column when fewer than top_n
    # keyphrases are returned.
    padded = list(keyphrases)[:top_n]
    padded += [(0, 0)] * (top_n - len(padded))
    rows.append([keyword for keyword, _ in padded] + [score for _, score in padded])

# Build the frame once instead of concatenating inside the loop
predict_textrank = pd.DataFrame(rows, columns=key_columns + score_columns)
predict_textrank[score_columns] = predict_textrank[score_columns].round(3)
predict_textrank.head(3)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3
0,usulan,key,usulan personil penting,0.043,0.031,0.023
1,ucapkan,template document,perhatiannya,0.167,0.167,0.167
2,ruangan,ruang,inquiry,0.041,0.041,0.04


EVALUATION

In [35]:
from utils import eval

# Reference keyphrases: columns k1..k7 of the dataset, as a list of rows
# and as a DataFrame with default integer column labels
target_columns = ["k1", "k2", "k3", "k4", "k5", "k6", "k7"]
targets = df[target_columns].values.tolist()
df_targets = pd.DataFrame(data=targets)

In [36]:
# Evaluation TextRank
# NOTE: `eval` here is the project helper imported from `utils`, which shadows
# Python's built-in eval(); its implementation is not shown in this notebook.
# The three predicted keyphrases per document, as a list of rows
predict_textrank_list = predict_textrank[['key_1','key_2','key_3']].values.tolist()
# NOTE(review): the meaning of the third positional argument (True) is defined by utils.eval - verify there
eval_textrank = eval(predict_textrank_list, targets, True).round(3)
# Label the result columns; presumably utils.eval returns these seven columns in this order - TODO confirm
eval_textrank.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank = eval_textrank[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # keep only the flexible scores to simplify the evaluation output
eval_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,full_match,no_match,no_match,0.143,0.333
1,no_match,full_match,no_match,0.143,0.333
2,no_match,partial_match,partial_match,0.286,0.667


In [37]:
# Calculate TextRank score using the flexible score: exact match = 1, partial match = 1, no match = 0
textrank_recall = eval_textrank['flex_recall'].mean()
textrank_prec = eval_textrank['flex_prec'].mean()
# F1 is the harmonic mean of precision and recall. When both are 0 the
# formula would divide 0 by 0, so F1 is defined as 0.0 in that case.
textrank_pr_sum = textrank_prec + textrank_recall
textrank_f1 = 2 * (textrank_prec * textrank_recall) / textrank_pr_sum if textrank_pr_sum else 0.0

# Create a DataFrame with the scores
summary = pd.DataFrame({'textrank': [textrank_recall, textrank_prec, textrank_f1]}, index=['recall', 'precision', 'F1'])
summary = summary.round(3)
summary

Unnamed: 0,textrank
recall,0.11
precision,0.255
F1,0.153


In [38]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
# side by side (axis=1), aligned on the row index.
# NOTE(review): predict_textrank and eval_textrank both have key_1..key_3
# columns, so the result carries duplicate column labels. Re-running this cell
# appends the target and evaluation columns again because predict_textrank is
# overwritten with the combined frame.
predict_textrank = pd.concat([predict_textrank, df_targets, eval_textrank], axis=1)
predict_textrank.head(3)

Unnamed: 0,key_1,key_2,key_3,score_1,score_2,score_3,0,1,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,usulan,key,usulan personil penting,0.043,0.031,0.023,persetujuan tertulis,prosedur,usulan,pengganti,,,,full_match,no_match,no_match,0.143,0.333
1,ucapkan,template document,perhatiannya,0.167,0.167,0.167,template document,exhibit c,acuan,pengelolaan,dokumen,,,no_match,full_match,no_match,0.143,0.333
2,ruangan,ruang,inquiry,0.041,0.041,0.04,ruang kantor,change inquiry,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,partial_match,partial_match,0.286,0.667


In [39]:
# Write predictions to excel file
# `write_excel` is a project helper from `utils`; its implementation is not shown in this notebook.
from utils import write_excel

sheet_name = 'w2v_tr_phrase'
# NOTE(review): the file name says v5 while the notebook header lists V6 features - confirm the intended name.
output_file = 'w2v_textrank_ngram_v5.xlsx'
# NOTE(review): the recorded run of this cell ended with "BadZipFile: File is not a zip file";
# presumably an existing file at this path is not a valid .xlsx workbook - verify before re-running.
write_excel(predict_textrank, sheet_name, output_file)

BadZipFile: File is not a zip file

next todo
1. unigram and trigram di normalize hurufnya
2. hanya kata verb dan noun yg akan di hitung
3. score jika kata ada dalam judul