Combine PKE textrank and word2vec - unigram

1. Imports & Setup

In [1]:
#1. rutin1 import module
import pandas as pd
import os
import sys
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(repo_root)

2. Load Dataset

In [3]:
#3. rutin3 Load the dataset
dataset_path = os.path.join(repo_root, "notebooks/postager_nlp-id/dataset_ekstraksi_r29_pos_sm.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]
df_pos = df['pos_sentence_list']

In [4]:
# Preprocess
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    with open(stopwords_path, 'r') as f:
        stopwords = [line.strip() for line in f]

    dictionary = ArrayDictionary(stopwords)
    str = StopWordRemover(dictionary)
    text = str.remove(text)

    return text

df['preprocessed_text'] = df['text'].apply(preprocess)
df_tr = df['preprocessed_text'].apply(preprocess)

3. Process

In [9]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

def extract_keyphrases_with_embeddings(text, w2v_model):
    # Get words that have embeddings
    words = [word for word in text.split() if word in w2v_model.wv.key_to_index]
    word_embeddings = [w2v_model.wv[word] for word in words]

    # If no word embeddings are present, return an empty list
    if not word_embeddings:
        return []
    
    # Compute the cosine similarity between word embeddings
    cosine_matrix = cosine_similarity(word_embeddings)
    
    # Create a graph and connect words with high similarity
    G = nx.Graph()
    for i in range(len(words)):
        for j in range(len(words)):
            if cosine_matrix[i][j] > 0.5:  # This threshold can be adjusted
                G.add_edge(words[i], words[j], weight=cosine_matrix[i][j])
    
    # Compute the PageRank scores to rank the words
    scores = nx.pagerank(G)

    # Extract top N keyphrases along with their scores
    ranked_words = sorted(((scores[word], word) for word in words if word in scores), reverse=True)
    keyphrases_with_scores = [(word, score) for score, word in ranked_words[:3]]

    return keyphrases_with_scores


In [6]:
w2v_path = os.path.join(repo_root, "models/w2v/idwiki_word2vec_100_new_lower.model")
w2v_model = Word2Vec.load(w2v_path)

single text

In [7]:
# Assuming w2v_model is your loaded Word2Vec model
text = df_tr[1]
keyphrases = extract_keyphrases_with_embeddings(text, w2v_model)
print(keyphrases)

[('facilities', 0.06241239526183723), ('processing', 0.06168987817329495), ('document', 0.05590062009264529), ('document', 0.05590062009264529), ('procedure', 0.055817361636743196), ('tiung', 0.05263157894736842), ('project', 0.05263157894736842), ('perhatiannya', 0.05263157894736842), ('pengelolaan', 0.05263157894736842), ('pada', 0.05263157894736842)]


multitext

In [10]:
# Assuming df_pos is your dataframe with texts
results = []

# Iterate over the dataframe
for text in df_pos:
    keyphrases_with_scores = extract_keyphrases_with_embeddings(text, w2v_model)
    results.extend(keyphrases_with_scores)

# Convert the results to a dataframe
predict_textrank = pd.DataFrame(results, columns=['Keyword', 'Score'])
predict_textrank.head()


Unnamed: 0,Keyword,Score
0,usulan,0.5
1,personil,0.5
2,personil,0.5
3,tiung,1.0
4,instrument,1.0
