POS Filter

In [12]:
#1. rutin1 import module
import pandas as pd
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import string

import networkx as nx
from collections import defaultdict

warnings.simplefilter(action='ignore', category=UserWarning)

In [13]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(repo_root)

In [14]:
#3. rutin3 Load the dataset
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r30_lg.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]

In [15]:
# Preprocess
import re
'''
stopwords tidak masuk dalam preprocessing
'''
def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    return text

df["text"] = df['text'].apply(preprocess)
df["judul"] = df["judul"].apply(preprocess)

In [16]:
stopwords_path = os.path.join(repo_root, "notebooks/stopwords_tuning/all_stop_words.txt")
with open(stopwords_path, 'r') as file:
    stopwords = set(file.read().strip().splitlines())

In [17]:
from collections import Counter
from nlp_id_local.tokenizer import PhraseTokenizer 
from nlp_id_local.postag import PosTag
from nltk.util import ngrams

def generate_ngrams(words, n=2):
    """Generate ngrams from a list of words."""
    return [" ".join(gram) for gram in ngrams(words, n)]

def detect_bigram(text):
    
    tokenizer = PhraseTokenizer()
    phrases = tokenizer.tokenize(text)
    # Include only bigrams whose individual words are in available_tokens
    bigrams_only = [phrase for phrase in phrases if phrase.count(" ") == 1]

    return bigrams_only

def detect_trigram(text):

    tokenizer = PhraseTokenizer()
    phrases = tokenizer.tokenize(text)
    # Include only trigrams whose individual words are in available_tokens
    trigrams_only = [phrase for phrase in phrases if phrase.count(" ") == 2 ]

    return trigrams_only

def detect_all_tokens(text):
    unigrams = [word for word in text.split()]
    bigrams = detect_bigram(text)
    trigrams = detect_trigram(text)
    
    # Combine unigrams, filtered bigrams, and filtered trigrams
    all_tokens = unigrams + bigrams + trigrams

    return all_tokens

def visualize_graph(G, labels):
    # Remove self-loops (edges that connect a node to itself)
    G.remove_edges_from(nx.selfloop_edges(G))

    fig = plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G, seed=42)
    #nx.draw(G, pos=pos, with_labels=False, font_weight="bold", node_size=5000, node_color='skyblue')
    nx.draw(G, pos=pos, with_labels=False, font_weight="bold")
    nx.draw_networkx_labels(G, pos, labels, font_size=12)
    plt.show()

In [18]:
import os
from gensim.models import Word2Vec

def load_word2vec_model(model_path):
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"The provided Word2Vec model path does not exist: {model_path}")
    
    w2v_model = Word2Vec.load(model_path) 
    available_tokens = set(w2v_model.wv.key_to_index)
    
    return w2v_model, available_tokens

# Example usage
model_path = os.path.join(repo_root, "models/w2v_200/idwiki_word2vec_200_new_lower.model")
w2v_model, available_tokens = load_word2vec_model(model_path)

# Show a sample of available tokens
sample_tokens = list(available_tokens)[:5]
sample_tokens

from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(w1, w2, w2v_model):
    if w1 not in w2v_model.wv or w2 not in w2v_model.wv:
        return 0
    vec1 = w2v_model.wv[w1]
    vec2 = w2v_model.wv[w2]
    similarity = cosine_similarity([vec1], [vec2])[0][0]
    return similarity



EXPLORATORY PURPOSE

In [19]:
# Example usage
model_path = os.path.join(repo_root, "models/w2v_200/idwiki_word2vec_200_new_lower.model")
w2v_model, available_tokens = load_word2vec_model(model_path)

# Show a sample of available tokens
sample_tokens = list(available_tokens)[:5]
sample_tokens

['mcallister', 'suffers', 'vandalisme', 'ahriman', 'mgf']

In [20]:
model_path = os.path.join(repo_root, "notebooks/nlp-id_retraining/train_tuned.pkl")

def get_unique_tokens_pos(all_tokens, model_path):
    """
    Get unique POS tags for tokens.
    """
    postagger = PosTag(model_path)
    pos_tokens = []
    seen_tokens = set()
    
    for token in all_tokens:
        if token not in seen_tokens:
            seen_tokens.add(token)
            tokens_pos = postagger.get_phrase_tag(token)
            pos_tokens.append(tokens_pos)
    return pos_tokens

def flatten_list_of_lists(list_of_lists):
    """
    Flatten a list of lists into a single list.
    """
    return [item for sublist in list_of_lists for item in sublist]

def filter_tokens_by_pos(flat_tokens, pos_filters):
    """
    Filter tokens based on their POS tags and ensure they're unique.
    """
    seen_tokens = set()
    return [token[0] for token in flat_tokens if token[1] in pos_filters and not (token[0] in seen_tokens or seen_tokens.add(token[0]))]

def get_phrase_embedding(phrase, w2v_model):
    """Get the averaged word embedding for a phrase."""
    words = phrase.split()
    embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv.key_to_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return None

In [62]:
def textrank(text, num_keywords=10):
    # Tokenize the text
    words = detect_all_tokens(text)
    # Load stopwords for the specified language
    stop_words = stopwords
    
    # Filter out stopwords and punctuation
    words = [word.lower() for word in words if word.lower() not in stop_words and word not in string.punctuation]
    
    # Path to the POS tagging model
    model_path = os.path.join(repo_root, "notebooks/nlp-id_retraining/train_tuned.pkl")
    
    # Get unique POS tags for tokens
    pos_tokens = get_unique_tokens_pos(words, model_path)
    flat_pos_tokens = flatten_list_of_lists(pos_tokens)
    selected_pos = {'NN', 'NNP', 'VB', 'NP', 'VP'}  # Exclude FW
    filtered_tokens = filter_tokens_by_pos(flat_pos_tokens, selected_pos)
    
    # Use filtered_tokens instead of words for the following processing
    # Build a co-occurrence matrix
    co_occurrence = defaultdict(int)
    window_size = 3
    for i in range(len(filtered_tokens) - window_size + 1):
        window = filtered_tokens[i:i+window_size]
        for j in range(window_size):
            for k in range(j+1, window_size):
                w1, w2 = sorted([window[j], window[k]])
                if w1 != w2:
                    co_occurrence[(w1, w2)] += 1
    
    # Build a graph
    G = nx.Graph()
    for (w1, w2), weight1 in co_occurrence.items():
        weight2 = get_cosine_similarity(w1, w2, w2v_model)
        weight3 = weight1 * weight2
        if weight2 > 0:
            G.add_edge(w1, w2, weight=weight3)
    
    # Compute TextRank scores
    scores = nx.pagerank(G)
    
    # Prepare labels
    labels = {node: f'{node}\n({score:.2f})' for node, score in scores.items()}

    # Sort words by scores
    ranked_words = sorted(((score, word) for word, score in scores.items()), reverse=True)
    
    # Extract the top keywords
    keywords = [word for score, word in ranked_words[:num_keywords]]

    keyphrases_with_scores = []
    seen_tokens = set()  # Set to keep track of tokens that have already been added

    for score, token in ranked_words:
        if token not in seen_tokens:
            keyphrases_with_scores.append((token, score))
            seen_tokens.add(token)  # Mark the token as seen
            if len(keyphrases_with_scores) >= num_keywords:
                break
    
    return keyphrases_with_scores, labels


In [60]:
def textrank(text, num_keywords=10):
    # Tokenize the text
    words = detect_all_tokens(text)
    # Load stopwords for the specified language
    stop_words = stopwords
    
    # Filter out stopwords and punctuation
    words = [word.lower() for word in words if word.lower() not in stop_words and word not in string.punctuation]
    
    # Filter tokens only for selected POS
    pos_tokens = get_unique_tokens_pos(words, model_path)
    flat_pos_tokens = flatten_list_of_lists(pos_tokens)
    selected_pos = {'NN', 'NNP', 'VB', 'NP', 'VP'} # FW di exclude
    filtered_tokens = filter_tokens_by_pos(flat_pos_tokens, selected_pos)

    # Get embeddings for each token (averaging word embeddings for bigrams/trigrams)
    token_embeddings = [get_phrase_embedding(token, w2v_model) for token in filtered_tokens]
    
    # Filter out tokens that don't have embeddings
    tokens, embeddings = zip(*[(token, emb) for token, emb in zip(filtered_tokens, token_embeddings) if emb is not None])

    # Compute the cosine similarity between token embeddings
    cosine_matrix = cosine_similarity(embeddings)

    # Create a mapping from tokens to indices
    token_to_index = {token: idx for idx, token in enumerate(tokens)}

    # Build a co-occurrence matrix
    co_occurrence = defaultdict(int)
    window_size = 3
    for i in range(len(words) - window_size + 1):
        window = words[i:i+window_size]
        for j in range(window_size):
            for k in range(j+1, window_size):
                w1, w2 = sorted([window[j], window[k]])
                if w1 != w2:
                    co_occurrence[(w1, w2)] += 1
    
    #print("stop-1")
    
    # Build a graph
    G = nx.Graph()
    for (w1, w2), weight1 in co_occurrence.items():
        # Convert w1 and w2 to indices
        idx1 = token_to_index.get(w1)
        idx2 = token_to_index.get(w2)
        if idx1 is not None and idx2 is not None:
            # Look up the cosine similarity value
            weight2 = cosine_matrix[idx1][idx2]
            weight3 = weight1 * weight2
            print("weight1: ", weight1, "weight2: ", weight2, "weight3: ", weight3)
            if weight3 > 0:
                G.add_edge(w1, w2, weight=weight3)
    
    #print("stop-1")
    # Compute TextRank scores
    scores = nx.pagerank(G)
    print("stop-2")
    # Prepare labels
    labels = {node: f'{node}\n({score:.2f})' for node, score in scores.items()}

    # Sort words by scores
    ranked_words = sorted(((score, word) for word, score in scores.items()), reverse=True)
    
    # Extract the top keywords
    keywords = [word for score, word in ranked_words[:num_keywords]]

    keyphrases_with_scores = []
    seen_tokens = set()  # Set to keep track of tokens that have already been added

    for score, token in ranked_words:
        if token not in seen_tokens:
            keyphrases_with_scores.append((token, score))
            seen_tokens.add(token)  # Mark the token as seen
            if len(keyphrases_with_scores) >= num_keywords:
                break 
    
    return keyphrases_with_scores, labels


ITERASI UNTUK ALL DATA

In [63]:
predict_textrank = pd.DataFrame()
#for i in df.index:
for i in df.loc[2:3].index:    
    print('Processing index', i, end='...! ')
    text = df["text"][i]
    keyphrases, labels = textrank(text, num_keywords=10)
    df_keyphrases = pd.DataFrame(keyphrases, columns=['Keyword', 'Score'])
    a = pd.DataFrame(df_keyphrases.Keyword).T.reset_index(drop=True)
    b = pd.DataFrame(df_keyphrases.Score).round(3).T.reset_index(drop=True)
    df_keyphrases = pd.concat([a, b], axis=1)

    # Check if there are missing columns and add them with zero values
    missing_columns = 20 - df_keyphrases.shape[1]
    for _ in range(missing_columns):
        df_keyphrases[df_keyphrases.shape[1]] = 0

    df_keyphrases.columns = ['key_1', 'key_2','key_3', 'key_4', 'key_5','key_6', 'key_7', 'key_8','key_9','key_10','score_1', 'score_2','score_3','score_4', 'score_5','score_6','score_7', 'score_8','score_9','score_10'] 
    predict_textrank = pd.concat([predict_textrank, df_keyphrases], ignore_index=True)
    print('Done')
predict_textrank.head(3)

Processing index 2...! Done
Processing index 3...! Done


Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,score_1,score_2,score_3,score_4,score_5,score_6,score_7,score_8,score_9,score_10
0,ruangan,fasilitas,kesepahaman,persyaratan,klarifikasi,provided,specification,disediakan,exhibit,provisions,0.034,0.033,0.031,0.03,0.03,0.029,0.028,0.028,0.028,0.027
1,konstruksi,inspeksi,pengadaan,deliverables,instrument,lead,fungsi,pic,mac,ditindaklanjuti,0.147,0.135,0.12,0.107,0.106,0.103,0.082,0.08,0.078,0.041


In [51]:
text = df["text"][2]
text

'change inquiry terkait usulan perubahan lingkup kerja scope of work terkait aakomodasi dan ruangan kantor bagi perusahaan mengacu pada risalah rapat mingguan proyek jtb tanggal oktober butir nomor perihal usulan perubahan lingkup kerja scope of work terkait akomodasi dan ruangan kantor bagi perusahaan yang akan disediakan oleh kontraktor di lokasi iapangan proyek jtb nanti dengan ini disampaikan detail dari usulan tersebut dengan merujuk pada draft kontrak jtb no exhibit scope of work appendix contractor provided faciiities services for company dalam hal ini perusahaan bermaksud untuk mengeluarkan beberapa detaii dibawah dari iingkup kerja kontraktor artikel berikut tabel accommodation provisions at jtb work site artikel specification for accommodation faciiities selain itu dengan merujuk appendix yang sama pada table offices for company at work sites perusahaan bermaksud untuk mengubah komposisi ruangan kantor yang akan disediakan oleh kontraktor di lokasi lapangan jtb sebagai beriku

In [52]:
words = detect_all_tokens(text)
# Load stopwords for the specified language
stop_words = stopwords
    
# Filter out stopwords and punctuation
words = [word.lower() for word in words if word.lower() not in stop_words and word not in string.punctuation]

# Filter tokens only for selected POS
pos_tokens = get_unique_tokens_pos(words, model_path)
flat_pos_tokens = flatten_list_of_lists(pos_tokens)
selected_pos = {'NN', 'NNP', 'VB', 'NP', 'VP'} # FW di exclude
filtered_tokens = filter_tokens_by_pos(flat_pos_tokens, selected_pos)

# Get embeddings for each token (averaging word embeddings for bigrams/trigrams)
token_embeddings = [get_phrase_embedding(token, w2v_model) for token in filtered_tokens]
    
# Filter out tokens that don't have embeddings
tokens, embeddings = zip(*[(token, emb) for token, emb in zip(filtered_tokens, token_embeddings) if emb is not None])

# Compute the cosine similarity between token embeddings
cosine_matrix = cosine_similarity(embeddings)

# Create a mapping from tokens to indices
token_to_index = {token: idx for idx, token in enumerate(tokens)}

# Build a co-occurrence matrix
co_occurrence = defaultdict(int)
window_size = 3
for i in range(len(words) - window_size + 1):
    window = words[i:i+window_size]
    for j in range(window_size):
        for k in range(j+1, window_size):
            w1, w2 = sorted([window[j], window[k]])
            if w1 != w2:
                co_occurrence[(w1, w2)] += 1

In [59]:
embeddings

(array([-0.71836114, -0.136896  ,  0.27016383,  0.9596047 ,  1.024652  ,
         0.5947294 , -2.2668989 ,  0.62044585,  1.6218511 , -0.07148043,
        -0.33811706, -0.7942006 , -0.15159613,  0.48766953, -1.4052051 ,
        -1.5146728 , -0.36034408,  1.1042314 ,  1.379718  ,  0.8279138 ,
        -1.8058146 , -2.2353516 , -0.10186423,  1.4271581 , -1.6157588 ,
         0.21610251,  0.20297745,  2.1320896 , -0.6631063 , -1.0607784 ,
         1.9267074 ,  1.3095099 ,  0.4224259 , -0.7780308 ,  1.998459  ,
        -0.19648096,  1.4829661 ,  0.91902727, -0.3392262 , -1.5060623 ,
         0.24620531,  0.46940145,  0.884749  ,  1.3725715 ,  2.3011136 ,
        -0.29624993, -0.47156608, -0.97174144, -1.6487098 ,  1.5750712 ,
        -2.7269826 , -1.0029615 , -0.3117341 , -0.14963403,  0.15339556,
        -1.1506423 , -0.82434016,  1.3854079 , -1.0185047 ,  3.3096771 ,
         0.6527332 ,  0.3250351 , -0.35816813, -0.6380095 ,  0.10352544,
        -0.54938257,  2.414384  ,  0.891717  ,  1.0

In [58]:
cosine_matrix

array([[ 1.0000001 ,  0.1252531 ,  0.07225999, ..., -0.01430885,
         0.27603194,  0.18782316],
       [ 0.1252531 ,  1.0000001 ,  0.3094655 , ...,  0.00866984,
         0.25537154,  0.17564687],
       [ 0.07225999,  0.3094655 ,  1.0000001 , ..., -0.12403543,
         0.2398963 ,  0.18023914],
       ...,
       [-0.01430885,  0.00866984, -0.12403543, ...,  0.99999994,
        -0.04562457, -0.11879002],
       [ 0.27603194,  0.25537154,  0.2398963 , ..., -0.04562457,
         1.        ,  0.298647  ],
       [ 0.18782316,  0.17564687,  0.18023914, ..., -0.11879002,
         0.298647  ,  1.0000002 ]], dtype=float32)

In [57]:
G = nx.Graph()
for (w1, w2), weight1 in co_occurrence.items():
    # Convert w1 and w2 to indices
    idx1 = token_to_index.get(w1)
    idx2 = token_to_index.get(w2)
    if idx1 is not None and idx2 is not None:
        # Look up the cosine similarity value
        weight2 = cosine_matrix[idx1][idx2]
        weight3 = weight1 * weight2
        print("weight1: ", weight1, "weight2: ", weight2, "weight3: ", weight3)
        G.add_edge(w1, w2, weight=weight1)

weight1:  4 weight2:  0.1252531 weight3:  0.50101238489151
weight1:  8 weight2:  0.29649663 weight3:  2.3719730377197266
weight1:  1 weight2:  -0.04021917 weight3:  -0.04021916911005974
weight1:  2 weight2:  -0.095299184 weight3:  -0.19059836864471436
weight1:  1 weight2:  -0.11399478 weight3:  -0.1139947772026062
weight1:  2 weight2:  0.09066809 weight3:  0.1813361793756485
weight1:  1 weight2:  -0.004879038 weight3:  -0.0048790378496050835
weight1:  2 weight2:  0.15471394 weight3:  0.3094278872013092
weight1:  1 weight2:  0.2330167 weight3:  0.233016699552536
weight1:  2 weight2:  0.17396757 weight3:  0.34793514013290405
weight1:  1 weight2:  0.10776435 weight3:  0.1077643483877182
weight1:  2 weight2:  0.2232756 weight3:  0.44655120372772217
weight1:  1 weight2:  0.17549703 weight3:  0.17549702525138855
weight1:  2 weight2:  0.16498284 weight3:  0.3299656808376312
weight1:  1 weight2:  0.09397074 weight3:  0.09397073835134506
weight1:  1 weight2:  0.23784369 weight3:  0.237843692302

In [56]:
scores = nx.pagerank(G)
scores

{'lingkup': 0.017392137517998934,
 'usulan': 0.036039561378518326,
 'kantor': 0.061962957888187155,
 'ruangan': 0.04339784753258695,
 'mengacu': 0.016207452685155364,
 'risalah': 0.017792216544734778,
 'rapat': 0.01616921049286098,
 'mingguan': 0.013861252569395417,
 'butir': 0.013656769533525551,
 'nomor': 0.01592220429146587,
 'akomodasi': 0.01090985368654577,
 'disediakan': 0.025770613880502388,
 'lokasi': 0.026972973855838293,
 'iapangan': 0.015656352395946354,
 'detail': 0.0166318607520618,
 'draft': 0.015478313171575234,
 'exhibit': 0.012865792214406452,
 'appendix': 0.03254688292709314,
 'contractor': 0.014499324802507131,
 'provided': 0.016344276794356905,
 'services': 0.011613556946422994,
 'mengeluarkan': 0.012230292863205488,
 'dibawah': 0.010244213403649132,
 'artikel': 0.03960886695702066,
 'tabel': 0.026404890987802698,
 'accommodation': 0.024702478579776713,
 'provisions': 0.01569694358980749,
 'specification': 0.013572919528367056,
 'offices': 0.00810934760554743,
 'sit

In [None]:
connected_components = nx.connected_components(G)
largest_connected_component = max(connected_components, key=len)
G_sub = G.subgraph(largest_connected_component)
scores = nx.pagerank(G_sub)


In [49]:
scores = nx.eigenvector_centrality(G)
scores

{'lingkup': 0.03630014701958274,
 'usulan': 0.05501829228661656,
 'kantor': 0.44333925277584463,
 'ruangan': 0.35011503057301413,
 'mengacu': 0.16332410533587982,
 'risalah': 0.11720086930411844,
 'rapat': 0.052948287281791455,
 'mingguan': 0.03186719250661399,
 'butir': 0.017864265517364804,
 'nomor': 0.018506868063884435,
 'akomodasi': 0.14063793644962028,
 'disediakan': 0.2336453458199591,
 'lokasi': 0.20820033608286642,
 'iapangan': 0.09539965630352998,
 'detail': 0.06598133456010481,
 'draft': 0.030659200605472326,
 'exhibit': 0.05987980448786355,
 'appendix': 0.2676017236996826,
 'contractor': 0.054936458634986904,
 'provided': 0.056517160620376525,
 'services': 0.01090658505986954,
 'mengeluarkan': 0.007830155442669042,
 'dibawah': 0.03529002367612121,
 'artikel': 0.2003760157078369,
 'tabel': 0.21098889691171563,
 'accommodation': 0.1463693460897279,
 'provisions': 0.10758588481307597,
 'specification': 0.07700726228351927,
 'offices': 0.05630178411889803,
 'sites': 0.064573781

EVALUATION

In [None]:
from utils import eval

targets = df[["k1", "k2", "k3","k4", "k5", "k6","k7"]].values.tolist()
df_targets = pd.DataFrame(targets)

In [None]:
# Evaluation TextRank top 10
predict_textrank_list_10 = predict_textrank[['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10']].values.tolist()
eval_textrank_10 = eval(predict_textrank_list_10, targets, True).round(3)
eval_textrank_10.columns = ['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_10 = eval_textrank_10[['key_1','key_2','key_3', 'key_4','key_5','key_6', 'key_7','key_8','key_9', 'key_10', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,flex_recall,flex_prec
0,no_match,full_match,no_match,no_match,partial_match,no_match,full_match,no_match,no_match,no_match,0.429,0.3
1,partial_match,partial_match,no_match,no_match,no_match,no_match,full_match,partial_match,full_match,no_match,0.714,0.5
2,partial_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.143,0.1


In [None]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_10 = eval_textrank_10['flex_recall'].mean()
textrank_prec_10 = eval_textrank_10['flex_prec'].mean()
textrank_f1_10 = 2 * (textrank_prec_10 * textrank_recall_10) / (textrank_prec_10 + textrank_recall_10)

# Create a DataFrame with the scores
summary_10 = pd.DataFrame({'textrank': [textrank_recall_10, textrank_prec_10, textrank_f1_10]}, index=['recall', 'precision', 'F1'])
summary_10 = summary_10.round(3)
summary_10

Unnamed: 0,textrank
recall,0.318
precision,0.223
F1,0.262


In [None]:
# Evaluation TextRank top 5
predict_textrank_list_5 = predict_textrank[['key_1','key_2','key_3', 'key_4','key_5']].values.tolist()
eval_textrank_5 = eval(predict_textrank_list_5, targets, True).round(3)
eval_textrank_5.columns = ['key_1','key_2','key_3', 'key_4','key_5','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_5 = eval_textrank_5[['key_1','key_2','key_3', 'key_4','key_5', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,flex_recall,flex_prec
0,no_match,full_match,no_match,no_match,partial_match,0.286,0.4
1,partial_match,partial_match,no_match,no_match,no_match,0.286,0.4
2,partial_match,no_match,no_match,no_match,no_match,0.143,0.2


In [None]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_5 = eval_textrank_5['flex_recall'].mean()
textrank_prec_5 = eval_textrank_5['flex_prec'].mean()
textrank_f1_5 = 2 * (textrank_prec_5 * textrank_recall_5) / (textrank_prec_5 + textrank_recall_5)

# Create a DataFrame with the scores
summary_5 = pd.DataFrame({'textrank': [textrank_recall_5, textrank_prec_5, textrank_f1_5]}, index=['recall', 'precision', 'F1'])
summary_5 = summary_5.round(3)
summary_5

Unnamed: 0,textrank
recall,0.217
precision,0.304
F1,0.254


In [None]:
# Evaluation TextRank top 3
predict_textrank_list_3 = predict_textrank[['key_1','key_2','key_3']].values.tolist()
eval_textrank_3 = eval(predict_textrank_list_3, targets, True).round(3)
eval_textrank_3.columns = ['key_1', 'key_2','key_3','strict_recall', 'strict_prec', 'flex_recall','flex_prec']
eval_textrank_3 = eval_textrank_3[['key_1', 'key_2','key_3', 'flex_recall','flex_prec']] # untuk menyederhanakan hasil evaluasi
eval_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,full_match,no_match,0.143,0.333
1,partial_match,partial_match,no_match,0.286,0.667
2,partial_match,no_match,no_match,0.143,0.333


In [None]:
# Calculate TextRank Score, using flexible score : exact maatch =1, partial match = 1, no match = 0
textrank_recall_3 = eval_textrank_3['flex_recall'].mean()
textrank_prec_3 = eval_textrank_3['flex_prec'].mean()
textrank_f1_3 = 2 * (textrank_prec_3 * textrank_recall_3) / (textrank_prec_3 + textrank_recall_3)

# Create a DataFrame with the scores
summary_3 = pd.DataFrame({'textrank': [textrank_recall_3, textrank_prec_3, textrank_f1_3]}, index=['recall', 'precision', 'F1'])
summary_3 = summary_3.round(3)
summary_3

Unnamed: 0,textrank
recall,0.156
precision,0.365
F1,0.219


In [None]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_10 = pd.concat([predict_textrank, df_targets, eval_textrank_10], axis=1)
predict_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,key_3.1,key_4.1,key_5.1,key_6.1,key_7.1,key_8.1,key_9.1,key_10.1,flex_recall,flex_prec
0,personil,pengganti,tender,penjelasan,persetujuan,penilaian,usulan,diajukan,kandidat,pengumuman,...,no_match,no_match,partial_match,no_match,full_match,no_match,no_match,no_match,0.429,0.3
1,template,document,facilities,processing,procedure,coordination,pengelolaan,exhibit,acuan,tiung,...,no_match,no_match,no_match,no_match,full_match,partial_match,full_match,no_match,0.714,0.5
2,kantor,ruangan,appendix,lapangan,kesepahaman,lokasi,usulan,klarifikasi,mingguan,offices,...,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,0.143,0.1


In [None]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_5 = pd.concat([predict_textrank, df_targets, eval_textrank_5], axis=1)
predict_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,4,5,6,key_1.1,key_2.1,key_3.1,key_4.1,key_5.1,flex_recall,flex_prec
0,personil,pengganti,tender,penjelasan,persetujuan,penilaian,usulan,diajukan,kandidat,pengumuman,...,,,,no_match,full_match,no_match,no_match,partial_match,0.286,0.4
1,template,document,facilities,processing,procedure,coordination,pengelolaan,exhibit,acuan,tiung,...,dokumen,,,partial_match,partial_match,no_match,no_match,no_match,0.286,0.4
2,kantor,ruangan,appendix,lapangan,kesepahaman,lokasi,usulan,klarifikasi,mingguan,offices,...,services for company,exhibit a,,partial_match,no_match,no_match,no_match,no_match,0.143,0.2


In [None]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_3 = pd.concat([predict_textrank, df_targets, eval_textrank_3], axis=1)
predict_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,personil,pengganti,tender,penjelasan,persetujuan,penilaian,usulan,diajukan,kandidat,pengumuman,...,usulan,pengganti,,,,no_match,full_match,no_match,0.143,0.333
1,template,document,facilities,processing,procedure,coordination,pengelolaan,exhibit,acuan,tiung,...,acuan,pengelolaan,dokumen,,,partial_match,partial_match,no_match,0.286,0.667
2,kantor,ruangan,appendix,lapangan,kesepahaman,lokasi,usulan,klarifikasi,mingguan,offices,...,lingkup kerja,akomodasi,services for company,exhibit a,,partial_match,no_match,no_match,0.143,0.333


In [None]:
import pandas as pd
from openpyxl import load_workbook

def write_excel(df, sheet_name, filename):
    """
    Writes the given dataframe to an excel file with the given filename and sheet name.
    If the sheet already exists in the file, the data in the sheet will be overwritten.
    """
    try:
        # Try to load the existing workbook
        with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
            if sheet_name in writer.book.sheetnames:
                # If sheet already exists, remove it
                sheet = writer.book[sheet_name]
                writer.book.remove(sheet)

            # Write the dataframe to the excel file
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    except FileNotFoundError:
        # If the file doesn't exist, create a new workbook
        with pd.ExcelWriter(filename, engine='openpyxl', mode='w') as writer:
            df.to_excel(writer, sheet_name=sheet_name, index=False)


In [None]:
# Write predictions to excel file
#from utils import write_excel

sheet_name_10 = '13b_TR_w2v_lg_combined_10'
sheet_name_5 = '13b_TR_w2v_lg_combined_5'
sheet_name_3 = '13b_TR_w2v_lg_combined_3'

output_file = '13b_TR_w2v_lg_combined.xlsx'
write_excel(predict_textrank_10, sheet_name_10, output_file)
write_excel(predict_textrank_5, sheet_name_5, output_file)
write_excel(predict_textrank_3, sheet_name_3, output_file)