POS Filter

In [21]:
#1. rutin1 import module
import pandas as pd
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import string

import networkx as nx
from collections import defaultdict

warnings.simplefilter(action='ignore', category=UserWarning)

In [22]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(repo_root)

In [23]:
#3. rutin3 Load the dataset
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r30_lg.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]

In [24]:
#4 Preprocess
import re
'''
stopwords tidak masuk dalam preprocessing
'''
def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    return text

df["text"] = df['text'].apply(preprocess)
df["judul"] = df["judul"].apply(preprocess)

In [25]:
#5 Load stopword
stopwords_path = os.path.join(repo_root, "notebooks/stopwords_tuning/all_stop_words.txt")
with open(stopwords_path, 'r') as file:
    stopwords = set(file.read().strip().splitlines())

In [26]:
# Fungsi Phrase Detection

from collections import Counter
from nlp_id_local.tokenizer import PhraseTokenizer 
from nlp_id_local.postag import PosTag
from nltk.util import ngrams

def generate_ngrams(words, n=2):
    """Generate ngrams from a list of words."""
    return [" ".join(gram) for gram in ngrams(words, n)]

def detect_bigram(text):
    
    tokenizer = PhraseTokenizer()
    phrases = tokenizer.tokenize(text)
    # Include only bigrams whose individual words are in available_tokens
    bigrams_only = [phrase for phrase in phrases if phrase.count(" ") == 1]

    return bigrams_only

def detect_trigram(text):

    tokenizer = PhraseTokenizer()
    phrases = tokenizer.tokenize(text)
    # Include only trigrams whose individual words are in available_tokens
    trigrams_only = [phrase for phrase in phrases if phrase.count(" ") == 2 ]

    return trigrams_only

# Function to incorporate bigrams and trigrams in the correct sequence
def incorporate_bigrams_trigrams(unigrams, bigrams, trigrams):
    combined_tokens = []
    skip = 0

    for i in range(len(unigrams)):
        if skip > 0:
            skip -= 1
            continue

        bigram_formed = ' '.join(unigrams[i:i+2]) in bigrams
        trigram_formed = ' '.join(unigrams[i:i+3]) in trigrams

        if bigram_formed:
            combined_tokens.append(' '.join(unigrams[i:i+2]))
            skip = 1
        elif trigram_formed:
            combined_tokens.append(' '.join(unigrams[i:i+3]))
            skip = 2
        else:
            combined_tokens.append(unigrams[i])

    return combined_tokens

def detect_all_tokens(text):
    unigrams = [word for word in text.split()]
    bigrams = detect_bigram(text)
    trigrams = detect_trigram(text)
    
    # Incorporating bigrams and trigrams into the sequence of tokens
    all_tokens = incorporate_bigrams_trigrams(unigrams, bigrams, trigrams)

    # Combine unigrams, filtered bigrams, and filtered trigrams
    #all_tokens = unigrams + bigrams + trigrams

    return all_tokens

In [27]:
# Fungsi Visualisasi
def visualize_graph(G, labels):
    # Remove self-loops (edges that connect a node to itself)
    G.remove_edges_from(nx.selfloop_edges(G))

    fig = plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G, seed=42)
    #nx.draw(G, pos=pos, with_labels=False, font_weight="bold", node_size=5000, node_color='skyblue')
    nx.draw(G, pos=pos, with_labels=False, font_weight="bold")
    nx.draw_networkx_labels(G, pos, labels, font_size=12)
    plt.show()

In [28]:
# Load Word2vec Model
import os
from gensim.models import Word2Vec

def load_word2vec_model(model_path):
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"The provided Word2Vec model path does not exist: {model_path}")
    
    w2v_model = Word2Vec.load(model_path) 
    available_tokens = set(w2v_model.wv.key_to_index)
    
    return w2v_model, available_tokens

def get_unique_tokens_pos(all_tokens, pos_model_path):
    """
    Get unique POS tags for tokens.
    """
    pos_model_path = os.path.join(repo_root, "notebooks/nlp-id_retraining/train_tuned.pkl")
    postagger = PosTag(pos_model_path)
    pos_tokens = []
    seen_tokens = set()
    
    for token in all_tokens:
        if token not in seen_tokens:
            seen_tokens.add(token)
            tokens_pos = postagger.get_phrase_tag(token)
            pos_tokens.append(tokens_pos)
    return pos_tokens

def flatten_list_of_lists(list_of_lists):
    """
    Flatten a list of lists into a single list.
    """
    return [item for sublist in list_of_lists for item in sublist]

def get_phrase_embedding(phrase, w2v_model):
    """Get the averaged word embedding for a phrase."""
    words = phrase.split()
    embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv.key_to_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return None

In [29]:
# Target 1 : Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(w1, w2, w2v_model):
    if w1 not in w2v_model.wv or w2 not in w2v_model.wv:
        return 0
    vec1 = w2v_model.wv[w1]
    vec2 = w2v_model.wv[w2]
    similarity = cosine_similarity([vec1], [vec2])[0][0]
    return similarity

TEXTRANK and ADJUSTMENT

In [30]:
# Load Word2Vec Model
def load_word2vec_model(model_path):
    w2v_model = Word2Vec.load(model_path)
    available_tokens = w2v_model.wv.index_to_key
    return w2v_model, available_tokens


In [31]:
# Postagging Function
def postag_tokens(tokens, postagger):
    pos_tokens = [postagger.get_phrase_tag(token) for token in tokens]
    # Flatten the list
    return [item for sublist in pos_tokens for item in sublist]


In [32]:
# Build Co-occurrence Matrix
def build_co_occurrence_matrix(words, window_size=3):
    co_occurrence = defaultdict(int)
    for i in range(len(words) - window_size + 1):
        window = words[i:i+window_size]
        for j in range(window_size):
            for k in range(j+1, window_size):
                w1, w2 = sorted([window[j], window[k]])
                if w1 != w2:
                    co_occurrence[(w1, w2)] += 1
    return co_occurrence

# Build Graph and Compute TextRank
def build_graph_and_compute_textrank(co_occurrence, w2v_model):
    G = nx.Graph()
    for (w1, w2), weight1 in co_occurrence.items():
        weight2 = get_cosine_similarity(w1, w2, w2v_model)
        weight3 = weight1 * weight2
        if weight2 > 0:
            G.add_edge(w1, w2, weight=weight3)
    return nx.pagerank(G)

In [33]:
# Filter and Rank Keywords
def filter_and_rank_keywords(scores, stopwords, num_keywords=10):
    ranked_words = sorted(((score, word) for word, score in scores.items()), reverse=True)
    ranked_words_nstopword = [(score, word) for score, word in ranked_words if word not in stopwords]
    
    keyphrases_with_scores = []
    seen_tokens = set()
    for score, token in ranked_words_nstopword:
        if token not in seen_tokens:
            keyphrases_with_scores.append((token, score))
            seen_tokens.add(token)
            if len(keyphrases_with_scores) >= num_keywords:
                break

    return keyphrases_with_scores

# Attach POS Tags to Keywords
def attach_pos_tags(keywords, pos_tokens):
    pos_dict = dict(pos_tokens)
    return [(word, score, pos_dict.get(word, 'UNK')) for word, score in keywords]

# Filter by Selected POS Tags
def filter_by_pos(keywords_with_pos, selected_pos):
    return [item for item in keywords_with_pos if item[2] in selected_pos]



#SAMPLE SINGLE DATASET

In [43]:
# Load Word2Vec Model
w2v_model_path = os.path.join(repo_root, "notebooks/word2vec_model/w2v_wiki_own_phrase_training_200.model")
w2v_model, available_tokens = load_word2vec_model(w2v_model_path)
#sample_tokens = list(available_tokens)[:5]
#print(sample_tokens)

# Processing Text
text = df["text"][1]
words = detect_all_tokens(text)
print(text)
print("================================================================")
print(words)

# Postagging
from nlp_id.postag import PosTag
postagger = PosTag() 
pos_tokens = postag_tokens(words, postagger)
print(pos_tokens)

# Building Co-occurrence Matrix
co_occurrence = build_co_occurrence_matrix(words)

# Building Graph and Computing TextRank
scores = build_graph_and_compute_textrank(co_occurrence, w2v_model)
print("mengandung stopword:", scores)
print("================================================================")

# Filtering and Ranking Keywords
keyphrases_with_scores = filter_and_rank_keywords(scores, stopwords)
print("stopword di filter :",keyphrases_with_scores)
print("================================================================")

# Attaching POS Tags
keywords_with_pos = attach_pos_tags(keyphrases_with_scores, pos_tokens)
print("diberi postag :",keywords_with_pos)
print("================================================================")

# Filtering by Selected POS Tags
selected_pos_tags = {'NN', 'NNP', 'VB', 'NP', 'VP'}
filtered_list = filter_by_pos(keywords_with_pos, selected_pos_tags)
print("filtered postag :", filtered_list)


template document jtb gpf project mengacu kepada dokumen jtb cp ctr exhibit coordination procedure kami sampaikan template document yang akan dipergunakan pada proyek jambaran tiung biru jtb gas processing facilities gpf demikian disampaikan sebagai acuan pengelolaan dokumen atas perhatiannya kami ucapkan terima kasih
['template document', 'jtb', 'gpf', 'project', 'mengacu', 'kepada', 'dokumen', 'jtb', 'cp', 'ctr', 'exhibit', 'coordination', 'procedure', 'kami', 'sampaikan', 'template document', 'yang', 'akan', 'dipergunakan', 'pada', 'proyek', 'jambaran', 'tiung', 'biru', 'jtb', 'gas', 'processing', 'facilities', 'gpf', 'demikian', 'disampaikan', 'sebagai', 'acuan', 'pengelolaan', 'dokumen', 'atas', 'perhatiannya', 'kami', 'ucapkan', 'terima', 'kasih']
[('template document', 'NP'), ('jtb', 'SC'), ('gpf', 'NN'), ('project', 'NN'), ('mengacu', 'VP'), ('kepada', 'IN'), ('dokumen', 'NN'), ('jtb', 'SC'), ('cp', 'NN'), ('ctr', 'NN'), ('exhibit', 'NN'), ('coordination', 'FW'), ('procedure', 

ALL DATASETS

In [50]:
postagger = PosTag()
# Load Word2Vec Model
w2v_model_path = os.path.join(repo_root, "notebooks/word2vec_model/w2v_wiki_own_phrase_training_200.model")
w2v_model, available_tokens = load_word2vec_model(w2v_model_path)

predict_textrank = pd.DataFrame()
#for i in df.index:
for i in df.loc[2:5].index:    
    print('Processing index', i, end='...! ')
    # Processing Text
    text = df["text"][i]
    words = detect_all_tokens(text)
    # Postagging
    pos_tokens = postag_tokens(words, postagger)
    # Building Co-occurrence Matrix
    co_occurrence = build_co_occurrence_matrix(words)
    # Building Graph and Computing TextRank
    scores = build_graph_and_compute_textrank(co_occurrence, w2v_model)
    # Filtering and Ranking Keywords
    keyphrases_with_scores = filter_and_rank_keywords(scores, stopwords)
    # Attaching POS Tags
    keywords_with_pos = attach_pos_tags(keyphrases_with_scores, pos_tokens)
    # Filtering by Selected POS Tags
    selected_pos_tags = {'NN', 'NNP', 'VB', 'NP', 'VP'}
    filtered_list = filter_by_pos(keywords_with_pos, selected_pos_tags)

    df_keyphrases = pd.DataFrame(filtered_list, columns=['Keyword', 'Score', 'pos'])
    a = pd.DataFrame(df_keyphrases.Keyword).T.reset_index(drop=True)
    a = a.reindex(columns=range(10), fill_value=0)
    b = pd.DataFrame(df_keyphrases.Score).round(3).T.reset_index(drop=True)
    b = b.reindex(columns=range(10), fill_value=0)
    c = pd.DataFrame(df_keyphrases.pos).round(3).T.reset_index(drop=True)
    c = c.reindex(columns=range(10), fill_value=0)
    df_keyphrases = pd.concat([a, b, c], axis=1)

    # Check if there are missing columns and add them with zero values
    missing_columns = 30 - df_keyphrases.shape[1]
    for _ in range(missing_columns):
        df_keyphrases[df_keyphrases.shape[1]] = 0

    df_keyphrases.columns = ['key_1', 'key_2','key_3', 'key_4', 'key_5','key_6', 'key_7', 'key_8','key_9','key_10',
                             'score_1', 'score_2','score_3','score_4', 'score_5','score_6','score_7', 'score_8','score_9','score_10',
                             'pos_1', 'pos_2','pos_3', 'pos_4', 'pos_5','pos_6', 'pos_7', 'pos_8','pos_9','pos_10'] 
    predict_textrank = pd.concat([predict_textrank, df_keyphrases], ignore_index=True)
    print('Done')
predict_textrank.head(3)

Processing index 2...! Done
Processing index 3...! Done
Processing index 4...! Done
Processing index 5...! Done


Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,pos_1,pos_2,pos_3,pos_4,pos_5,pos_6,pos_7,pos_8,pos_9,pos_10
0,kantor,ruangan,usulan,artikel,lokasi,accommodation,diskusi,klarifikasi,tabel,0,...,NN,NN,NN,NN,NN,NN,NN,NN,NN,0
1,pengadaan,konstruksi,deliverables,fungsi,menanggapi,inspeksi,0,0,0,0,...,NN,NN,NN,NN,VP,NN,0,0,0,0
2,iwan,permit,request,hamzah,manager,bp,kunjungan,lapangan,0,0,...,NN,NN,NN,NN,NN,NN,NN,NN,0,0


EVALUATION

In [51]:
from utils import eval

targets = df[["k1", "k2", "k3","k4", "k5", "k6","k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [52]:
# Evaluation TextRank top 10
predict_textrank_list_10 = predict_textrank[['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10']].values.tolist()
eval_textrank_10 = eval(predict_textrank_list_10, targets, True).round(3)
eval_textrank_10.columns = ['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_10 = eval_textrank_10[['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,flex_recall,flex_prec
0,no_match,no_match,full_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.143,0.1
1,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.0,0.0
2,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.0,0.0


In [53]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_10 = eval_textrank_10['flex_recall'].mean()
textrank_prec_10 = eval_textrank_10['flex_prec'].mean()
textrank_f1_10 = 2 * (textrank_prec_10 * textrank_recall_10) / (textrank_prec_10 + textrank_recall_10)

# Create a DataFrame with the scores
summary_10 = pd.DataFrame({'textrank': [textrank_recall_10, textrank_prec_10, textrank_f1_10]}, index=['recall', 'precision', 'F1'])
summary_10 = summary_10.round(3)
summary_10

Unnamed: 0,textrank
recall,0.036
precision,0.025
F1,0.029


In [54]:
# Evaluation TextRank top 5
predict_textrank_list_5 = predict_textrank[['key_1','key_2','key_3', 'key_4','key_5']].values.tolist()
eval_textrank_5 = eval(predict_textrank_list_5, targets, True).round(3)
eval_textrank_5.columns = ['key_1','key_2','key_3', 'key_4','key_5','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_5 = eval_textrank_5[['key_1','key_2','key_3', 'key_4','key_5', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,flex_recall,flex_prec
0,no_match,no_match,full_match,no_match,no_match,0.143,0.2
1,no_match,no_match,no_match,no_match,no_match,0.0,0.0
2,no_match,no_match,no_match,no_match,no_match,0.0,0.0


In [55]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_5 = eval_textrank_5['flex_recall'].mean()
textrank_prec_5 = eval_textrank_5['flex_prec'].mean()
textrank_f1_5 = 2 * (textrank_prec_5 * textrank_recall_5) / (textrank_prec_5 + textrank_recall_5)

# Create a DataFrame with the scores
summary_5 = pd.DataFrame({'textrank': [textrank_recall_5, textrank_prec_5, textrank_f1_5]}, index=['recall', 'precision', 'F1'])
summary_5 = summary_5.round(3)
summary_5

Unnamed: 0,textrank
recall,0.036
precision,0.05
F1,0.042


In [56]:
# Evaluation TextRank top 3
predict_textrank_list_3 = predict_textrank[['key_1','key_2','key_3']].values.tolist()
eval_textrank_3 = eval(predict_textrank_list_3, targets, True).round(3)
eval_textrank_3.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_3 = eval_textrank_3[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,full_match,0.143,0.333
1,no_match,no_match,no_match,0.0,0.0
2,no_match,no_match,no_match,0.0,0.0


In [57]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_3 = eval_textrank_3['flex_recall'].mean()
textrank_prec_3 = eval_textrank_3['flex_prec'].mean()
textrank_f1_3 = 2 * (textrank_prec_3 * textrank_recall_3) / (textrank_prec_3 + textrank_recall_3)

# Create a DataFrame with the scores
summary_3 = pd.DataFrame({'textrank': [textrank_recall_3, textrank_prec_3, textrank_f1_3]}, index=['recall', 'precision', 'F1'])
summary_3 = summary_3.round(3)
summary_3

Unnamed: 0,textrank
recall,0.036
precision,0.083
F1,0.05


In [58]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_10 = pd.concat([predict_textrank, df_targets, eval_textrank_10], axis=1)
predict_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,key_3.1,key_4.1,key_5.1,key_6.1,key_7.1,key_8.1,key_9.1,key_10.1,flex_recall,flex_prec
0,kantor,ruangan,usulan,artikel,lokasi,accommodation,diskusi,klarifikasi,tabel,0.0,...,full_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.143,0.1
1,pengadaan,konstruksi,deliverables,fungsi,menanggapi,inspeksi,0,0,0,0.0,...,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.0,0.0
2,iwan,permit,request,hamzah,manager,bp,kunjungan,lapangan,0,0.0,...,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.0,0.0


In [59]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_5 = pd.concat([predict_textrank, df_targets, eval_textrank_5], axis=1)
predict_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,4,5,6,key_1.1,key_2.1,key_3.1,key_4.1,key_5.1,flex_recall,flex_prec
0,kantor,ruangan,usulan,artikel,lokasi,accommodation,diskusi,klarifikasi,tabel,0.0,...,,,,no_match,no_match,full_match,no_match,no_match,0.143,0.2
1,pengadaan,konstruksi,deliverables,fungsi,menanggapi,inspeksi,0,0,0,0.0,...,dokumen,,,no_match,no_match,no_match,no_match,no_match,0.0,0.0
2,iwan,permit,request,hamzah,manager,bp,kunjungan,lapangan,0,0.0,...,services for company,exhibit a,,no_match,no_match,no_match,no_match,no_match,0.0,0.0


In [60]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_3 = pd.concat([predict_textrank, df_targets, eval_textrank_3], axis=1)
predict_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,kantor,ruangan,usulan,artikel,lokasi,accommodation,diskusi,klarifikasi,tabel,0.0,...,usulan,pengganti,,,,no_match,no_match,full_match,0.143,0.333
1,pengadaan,konstruksi,deliverables,fungsi,menanggapi,inspeksi,0,0,0,0.0,...,acuan,pengelolaan,dokumen,,,no_match,no_match,no_match,0.0,0.0
2,iwan,permit,request,hamzah,manager,bp,kunjungan,lapangan,0,0.0,...,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,no_match,0.0,0.0


In [61]:
import pandas as pd
from openpyxl import load_workbook

def write_excel(df, sheet_name, filename):
    """
    Writes the given dataframe to an excel file with the given filename and sheet name.
    If the sheet already exists in the file, the data in the sheet will be overwritten.
    """
    try:
        # Try to load the existing workbook
        with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
            if sheet_name in writer.book.sheetnames:
                # If sheet already exists, remove it
                sheet = writer.book[sheet_name]
                writer.book.remove(sheet)

            # Write the dataframe to the excel file
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    except FileNotFoundError:
        # If the file doesn't exist, create a new workbook
        with pd.ExcelWriter(filename, engine='openpyxl', mode='w') as writer:
            df.to_excel(writer, sheet_name=sheet_name, index=False)


In [None]:
# Write predictions to excel file
#from utils import write_excel

sheet_name_10 = 'SE5_tr_w2v_posfilter_10'
sheet_name_5 = 'SE5_tr_w2v_posfilter_5'
sheet_name_3 = 'SE5_tr_w2v_posfilter_3'

output_file = 'SE5_tr_w2v_posfilter.xlsx'
write_excel(predict_textrank_10, sheet_name_10, output_file)
write_excel(predict_textrank_5, sheet_name_5, output_file)
write_excel(predict_textrank_3, sheet_name_3, output_file)