POS Filter

In [1]:
#1. rutin1 import module
import pandas as pd
import os
import sys
import warnings
import matplotlib.pyplot as plt
import numpy as np
import string

import networkx as nx
from collections import defaultdict

warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
# 2. Routine 2: put the repository root (the parent of the current working
# directory) on sys.path so that project-local packages can be imported.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append so that re-running this cell does not add duplicate entries.
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [3]:
# 3. Routine 3: read the dataset and add a "text" column made of the "judul"
# column, the separator ". ", and the "isi" column.
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r30_lg.xlsx")
df = pd.read_excel(dataset_path).assign(
    text=lambda frame: frame["judul"] + ". " + frame["isi"]
)

In [4]:
#4 Preprocess
import re
'''
stopwords tidak masuk dalam preprocessing
'''
def preprocess(text):
    """Normalise raw text to lower-case ASCII words separated by single spaces.

    Steps, in order: drop stand-alone single ASCII letters, put a space after
    every period, replace everything that is not an ASCII letter or a period
    with a space, lower-case, collapse every run of digits / non-word
    characters (periods included) into one space, and trim the ends.
    Stopwords are not removed here.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_single_letters = re.sub(r"\b[a-zA-Z]\b", "", text)
    spaced_periods = without_single_letters.replace('.', '. ')
    letters_and_periods = re.sub('[^a-zA-Z.]', ' ', spaced_periods)
    collapsed = re.sub("(\\d|\\W)+", " ", letters_and_periods.lower())
    return collapsed.strip()

# Clean the combined text column and the "judul" column with preprocess().
for column_name in ("text", "judul"):
    df[column_name] = df[column_name].apply(preprocess)

In [5]:
# 5. Load the stopword list: one entry per line of the text file.
# Only the file content as a whole is stripped; individual lines are kept as-is.
stopwords_path = os.path.join(repo_root, "notebooks/stopwords_tuning/all_stop_words.txt")
with open(stopwords_path, 'r') as stopword_file:
    stopword_lines = stopword_file.read().strip().splitlines()
stopwords = set(stopword_lines)

In [6]:
# Fungsi Phrase Detection

from collections import Counter
from nlp_id_local.tokenizer import PhraseTokenizer 
from nlp_id_local.postag import PosTag
from nltk.util import ngrams

def generate_ngrams(words, n=2):
    """Build space-joined n-gram strings from a list of words.

    Args:
        words: Sequence of word strings.
        n: Size of each n-gram (defaults to 2).

    Returns:
        A list with one string per n-gram produced by ``ngrams(words, n)``.
    """
    joined_grams = []
    for gram in ngrams(words, n):
        joined_grams.append(" ".join(gram))
    return joined_grams

def detect_bigram(text):
    """Return the two-word phrases that PhraseTokenizer finds in ``text``.

    A phrase counts as a bigram when it contains exactly one space. No
    vocabulary filtering is applied here.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokenizer's phrases that contain exactly one space.
    """
    bigrams_only = []
    for phrase in PhraseTokenizer().tokenize(text):
        if phrase.count(" ") == 1:
            bigrams_only.append(phrase)
    return bigrams_only

def detect_trigram(text):
    """Return the three-word phrases that PhraseTokenizer finds in ``text``.

    A phrase counts as a trigram when it contains exactly two spaces. No
    vocabulary filtering is applied here.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokenizer's phrases that contain exactly two spaces.
    """
    trigrams_only = []
    for phrase in PhraseTokenizer().tokenize(text):
        if phrase.count(" ") == 2:
            trigrams_only.append(phrase)
    return trigrams_only

def detect_all_tokens(text):
    """Return every candidate token of ``text``: unigrams, then bigrams, then trigrams.

    Unigrams are the whitespace-separated words of ``text``. Bigrams and
    trigrams are the phrases returned by ``PhraseTokenizer`` that contain
    exactly one or exactly two spaces.

    Args:
        text: The (already preprocessed) string to tokenize.

    Returns:
        A single list: unigrams first, followed by bigrams, followed by trigrams.
    """
    unigrams = text.split()

    # Tokenize once and reuse the result for both phrase lengths, instead of
    # building a tokenizer and tokenizing the same text once per length.
    # list() makes the result safe to iterate twice.
    phrases = list(PhraseTokenizer().tokenize(text))
    bigrams = [phrase for phrase in phrases if phrase.count(" ") == 1]
    trigrams = [phrase for phrase in phrases if phrase.count(" ") == 2]

    return unigrams + bigrams + trigrams


In [7]:
# Fungsi Visualisasi
def visualize_graph(G, labels):
    """Draw ``G`` with a spring layout and the supplied node labels.

    Self-loop edges are removed from ``G`` itself before drawing, so the
    caller's graph object is modified.

    Args:
        G: The networkx graph to draw.
        labels: Mapping passed to ``nx.draw_networkx_labels`` as the node labels.
    """
    self_loops = nx.selfloop_edges(G)
    G.remove_edges_from(self_loops)

    plt.figure(figsize=(12, 12))
    # Fixed seed so the layout is the same on every run.
    layout = nx.spring_layout(G, seed=42)
    # Nodes and edges are drawn without labels; the labels are added separately.
    nx.draw(G, pos=layout, with_labels=False, font_weight="bold")
    nx.draw_networkx_labels(G, layout, labels, font_size=12)
    plt.show()

In [8]:
# Load Word2vec Model
import os
from gensim.models import Word2Vec

def load_word2vec_model(model_path):
    """Load a saved Word2Vec model together with its vocabulary.

    Args:
        model_path: Path of the saved model file.

    Returns:
        A tuple ``(w2v_model, available_tokens)`` where ``available_tokens``
        is the set of keys in ``w2v_model.wv.key_to_index``.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    if os.path.exists(model_path):
        w2v_model = Word2Vec.load(model_path)
        return w2v_model, set(w2v_model.wv.key_to_index)

    raise FileNotFoundError(f"The provided Word2Vec model path does not exist: {model_path}")

# Load the Word2Vec model and its vocabulary into notebook-level variables.
# Alternative model path, kept for reference:
#w2v_model_path = os.path.join(repo_root, "models/w2v_200/idwiki_word2vec_200_new_lower.model")
w2v_model_path = os.path.join(repo_root, "notebooks/word2vec_model/w2v_wiki_own_phrase_training_200.model")
w2v_model, available_tokens = load_word2vec_model(w2v_model_path)

# Take five tokens from the vocabulary as a sample. available_tokens is a set,
# so which five tokens end up in the sample is not a fixed order.
sample_tokens = list(available_tokens)[:5]
#sample_tokens

# Path of the POS-tagger model file (the same file later cells pass to PosTag).
pos_model_path = os.path.join(repo_root, "notebooks/nlp-id_retraining/train_tuned.pkl")

def get_unique_tokens_pos(all_tokens, pos_model_path):
    """POS-tag each distinct token once, in order of first appearance.

    Args:
        all_tokens: Iterable of hashable tokens; repeated tokens are tagged once.
        pos_model_path: Path handed to ``PosTag`` to build the tagger.

    Returns:
        A list with one entry per distinct token: the value returned by
        ``postagger.get_phrase_tag(token)``.
    """
    postagger = PosTag(pos_model_path)
    # dict.fromkeys drops repeats while keeping first-occurrence order.
    distinct_tokens = dict.fromkeys(all_tokens)
    return [postagger.get_phrase_tag(token) for token in distinct_tokens]

def flatten_list_of_lists(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Args:
        list_of_lists: Iterable of iterables.

    Returns:
        A new list holding all inner items in their original order
        (one nesting level is removed).
    """
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

def get_phrase_embedding(phrase, w2v_model):
    """Average the word vectors of the words in ``phrase``.

    Words that are not in ``w2v_model.wv.key_to_index`` are skipped.

    Args:
        phrase: Whitespace-separated words.
        w2v_model: Model exposing ``wv`` (vector lookup) and ``wv.key_to_index``.

    Returns:
        The element-wise mean of the found vectors, or ``None`` when no word
        of the phrase is in the vocabulary.
    """
    found_vectors = []
    for word in phrase.split():
        if word in w2v_model.wv.key_to_index:
            found_vectors.append(w2v_model.wv[word])

    if not found_vectors:
        return None
    return np.mean(found_vectors, axis=0)

In [19]:
# Target 1 : Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(w1, w2, w2v_model):
    """Cosine similarity between the vectors of two tokens.

    Args:
        w1: First token.
        w2: Second token.
        w2v_model: Model whose ``wv`` attribute supports ``in`` and lookup by token.

    Returns:
        The similarity computed by ``cosine_similarity`` for the two vectors,
        or ``0`` when either token is missing from ``w2v_model.wv``.
    """
    vectors = w2v_model.wv
    if w1 in vectors and w2 in vectors:
        # cosine_similarity returns a 1x1 matrix for two single-row inputs.
        return cosine_similarity([vectors[w1]], [vectors[w2]])[0][0]
    return 0

EXPLORATORY PURPOSE

===============================================================================

In [26]:
# Load the Word2Vec model and show a sample of the tokens in its vocabulary.
# This repeats the load done in the earlier model-loading cell with the same path.

# Alternative model path, kept for reference:
#w2v_model_path = os.path.join(repo_root, "models/w2v_200/idwiki_word2vec_200_new_lower.model")
w2v_model_path = os.path.join(repo_root, "notebooks/word2vec_model/w2v_wiki_own_phrase_training_200.model")

w2v_model, available_tokens = load_word2vec_model(w2v_model_path)

# Show five tokens from the vocabulary; available_tokens is a set, so which
# five are shown is not a fixed order.
sample_tokens = list(available_tokens)[:5]
sample_tokens

['tipologi',
 'benicio_del_toro_traffic',
 'pemberontak_kurdi',
 'strategi_pit_stop',
 'persentase']

In [42]:
# Inspect one document (index 2): print its cleaned text, a separator line,
# and the list of tokens/phrases detected in it.
predict_textrank = pd.DataFrame()
text = df["text"][2]
words = detect_all_tokens(text)
separator = "================================================================"
print(text, separator, words, sep="\n")

change inquiry terkait usulan perubahan lingkup kerja scope of work terkait aakomodasi dan ruangan kantor bagi perusahaan mengacu pada risalah rapat mingguan proyek jtb tanggal oktober butir nomor perihal usulan perubahan lingkup kerja scope of work terkait akomodasi dan ruangan kantor bagi perusahaan yang akan disediakan oleh kontraktor di lokasi iapangan proyek jtb nanti dengan ini disampaikan detail dari usulan tersebut dengan merujuk pada draft kontrak jtb no exhibit scope of work appendix contractor provided faciiities services for company dalam hal ini perusahaan bermaksud untuk mengeluarkan beberapa detaii dibawah dari iingkup kerja kontraktor artikel berikut tabel accommodation provisions at jtb work site artikel specification for accommodation faciiities selain itu dengan merujuk appendix yang sama pada table offices for company at work sites perusahaan bermaksud untuk mengubah komposisi ruangan kantor yang akan disediakan oleh kontraktor di lokasi lapangan jtb sebagai berikut

Cabang #1

1

In [50]:
# Branch 1:
#   - POS-tag every token, without removing stopwords first,
#   - stopwords are filtered after TextRank,
#   - POS tags are shown without any filtering.

# 1.1 POS tagging
model_path = os.path.join(repo_root, "notebooks/nlp-id_retraining/train_tuned.pkl")
postagger = PosTag(model_path)

# Tag each token and flatten the per-token results into a single list.
pos_tokens = [tagged for token in words for tagged in postagger.get_phrase_tag(token)]
print(pos_tokens)

[('change', 'FW'), ('inquiry', 'FW'), ('terkait', 'VP'), ('usulan', 'NN'), ('perubahan', 'NN'), ('lingkup', 'NN'), ('kerja', 'NN'), ('scope', 'FW'), ('of', 'NNP'), ('work', 'FW'), ('terkait', 'VP'), ('aakomodasi', 'NN'), ('dan', 'CC'), ('ruangan', 'NN'), ('kantor', 'NN'), ('bagi', 'IN'), ('perusahaan', 'NN'), ('mengacu', 'VP'), ('pada', 'IN'), ('risalah', 'NN'), ('rapat', 'NN'), ('mingguan', 'NN'), ('proyek', 'NN'), ('jtb', 'SC'), ('tanggal', 'NN'), ('oktober', 'FW'), ('butir', 'NN'), ('nomor', 'NN'), ('perihal', 'NN'), ('usulan', 'NN'), ('perubahan', 'NN'), ('lingkup', 'NN'), ('kerja', 'NN'), ('scope', 'FW'), ('of', 'NNP'), ('work', 'FW'), ('terkait', 'VP'), ('akomodasi', 'NN'), ('dan', 'CC'), ('ruangan', 'NN'), ('kantor', 'NN'), ('bagi', 'IN'), ('perusahaan', 'NN'), ('yang', 'PR'), ('akan', 'ADV'), ('disediakan', 'VP'), ('oleh', 'IN'), ('kontraktor', 'NN'), ('di', 'IN'), ('lokasi', 'NN'), ('iapangan', 'NN'), ('proyek', 'NN'), ('jtb', 'SC'), ('nanti', 'VP'), ('dengan', 'IN'), ('ini', 

In [None]:
# Trial for the similarity weight: look up the vectors of two tokens directly.
# Alternative first token, taken from the tagged list:
#w1 = pos_tokens[3][0]
# NOTE(review): the sample vocabulary shown earlier in this notebook joins phrase
# words with underscores (e.g. 'pemberontak_kurdi'); a space-separated phrase
# may therefore be missing from the model, in which case the lookup below would
# raise KeyError -- confirm.
w1 = "tidak mengalami"
print(w1)
# Second token: the word part of the fifth tagged entry.
w2 = pos_tokens[4][0]
print(w2)

# Direct lookups; unlike get_cosine_similarity there is no membership check here.
vec1 = w2v_model.wv[w1]
vec2 = w2v_model.wv[w2]
print(vec1)
print(vec2)

In [66]:
# Exploratory TextRank run on the single document tokenized above.
# `words` is the output of detect_all_tokens (unigrams, then bigrams, then
# trigrams), so the sliding window runs over that concatenated list.

# Build a co-occurrence matrix: count each unordered pair of different tokens
# that appear together inside a window of `window_size` consecutive tokens.
co_occurrence = defaultdict(int)
window_size = 3
for i in range(len(words) - window_size + 1):
    window = words[i:i+window_size]
    for j in range(window_size):
        for k in range(j+1, window_size):
            # Sort the pair so (a, b) and (b, a) share one dictionary key.
            w1, w2 = sorted([window[j], window[k]])
            if w1 != w2:
                co_occurrence[(w1, w2)] += 1

# Build a graph: edge weight = co-occurrence count * cosine similarity.
# Pairs whose similarity is not positive (including the 0 returned for tokens
# missing from the Word2Vec model) get no edge.
G = nx.Graph()
for (w1, w2), weight1 in co_occurrence.items():
    weight2 = get_cosine_similarity(w1, w2, w2v_model)
    weight3 = weight1 * weight2
    if weight2 > 0:
        G.add_edge(w1, w2, weight=weight3)
    
# Compute TextRank scores (PageRank over the weighted graph); stopwords are
# still present at this point.
scores = nx.pagerank(G)
print("mengandung stopword:", scores)
print("================================================================")

# Prepare labels: node name plus its score rounded to two decimals.
labels = {node: f'{node}\n({score:.2f})' for node, score in scores.items()}

# Sort words by scores, highest first (ties fall back to the word itself).
ranked_words = sorted(((score, word) for word, score in scores.items()), reverse=True)

# Filter out stopwords from ranked_words
ranked_words_nstopword = [(score, word) for score, word in ranked_words if word not in stopwords]

# Extract the top keywords
num_keywords = 10
keywords = [word for score, word in ranked_words_nstopword[:num_keywords]]

keyphrases_with_scores = []
seen_tokens = set()  # Set to keep track of tokens that have already been added

# Collect (token, score) pairs in rank order until num_keywords are gathered.
for score, token in ranked_words_nstopword:
    if token not in seen_tokens:
        keyphrases_with_scores.append((token, score))
        seen_tokens.add(token)  # Mark the token as seen
        if len(keyphrases_with_scores) >= num_keywords:
            break 

print("stopword di filter :",keyphrases_with_scores)
print("================================================================")
# Attach POS tags
# Convert pos_tokens to a dictionary for easier lookup
pos_dict = dict(pos_tokens)

# Now, attach POS tags to the keywords ('UNK' when a keyword has no tag entry)
keywords_with_pos = [(word, score, pos_dict.get(word, 'UNK')) for word, score in keyphrases_with_scores]

print("diberi postag :",keywords_with_pos)

print("================================================================")

# List of selected POS tags
selected_pos = {'NN', 'NNP', 'VB', 'NP', 'VP'}
# Filter the list for selected POS tags
filtered_list = [item for item in keywords_with_pos if item[2] in selected_pos]

print("filtered postag :", filtered_list)

mengandung stopword: {'change': 0.008735969531285503, 'inquiry': 0.008781083787446246, 'terkait': 0.010890431236290188, 'usulan': 0.01898959724398794, 'perubahan': 0.018503569527817586, 'lingkup': 0.010059524568753735, 'kerja': 0.009882544708713175, 'scope': 0.014431876345375091, 'of': 0.010230482227770923, 'work': 0.025093486759222607, 'dan': 0.028533992369774764, 'ruangan': 0.019138725960427698, 'kantor': 0.023577436810948486, 'bagi': 0.00636861266283274, 'perusahaan': 0.01784832497446911, 'mengacu': 0.002687173905003783, 'pada': 0.0173747268060892, 'risalah': 0.0060185788842203915, 'rapat': 0.00773920335414393, 'mingguan': 0.006769065111959663, 'proyek': 0.007163721541288373, 'jtb': 0.011393715314397682, 'tanggal': 0.006545908824535309, 'oktober': 0.007337872002168348, 'butir': 0.00816761295538232, 'nomor': 0.006515947033101089, 'perihal': 0.008079093771347108, 'akomodasi': 0.006013153955258769, 'yang': 0.022577210549608605, 'akan': 0.012009548244184891, 'disediakan': 0.006249723578

===================

In [11]:
def _filter_tokens_by_pos(pos_tokens, selected_pos):
    """Return the token of every ``(token, tag)`` pair whose tag is in ``selected_pos``.

    NOTE(review): the original cell called ``filter_tokens_by_pos``, which is
    not defined anywhere in the visible notebook. This helper is reconstructed
    from how its result is used (plain token strings that are sorted, compared
    and used as graph nodes) -- confirm it matches the lost definition.
    """
    return [token for token, tag in pos_tokens if tag in selected_pos]


def textrank(text, num_keywords=10, window_size=3):
    """Rank the POS-filtered tokens of ``text`` with similarity-weighted TextRank.

    Pipeline: detect unigrams/phrases, drop stopwords and punctuation, POS-tag
    the distinct tokens, keep only tokens with a selected POS tag, count
    co-occurrences inside a sliding window, weight each pair by
    ``count * cosine similarity`` and run PageRank on the resulting graph.

    Reads the notebook-level ``stopwords``, ``repo_root`` and ``w2v_model``.

    NOTE(review): ``get_unique_tokens_pos`` tags each distinct token once, so
    the sliding window runs over de-duplicated tokens rather than over the
    running text -- confirm this is intended.

    Args:
        text: Preprocessed document text.
        num_keywords: Maximum number of keyphrases to return; values <= 0
            give an empty list.
        window_size: Number of consecutive tokens in one co-occurrence window.

    Returns:
        A tuple ``(keyphrases_with_scores, labels)``: a list of
        ``(token, score)`` pairs sorted by descending score, and a dict mapping
        every graph node to a ``"<node>\\n(<score>)"`` label string.
    """
    # Tokenize, then drop stopwords and punctuation.
    candidates = detect_all_tokens(text)
    words = [
        word.lower()
        for word in candidates
        if word.lower() not in stopwords and word not in string.punctuation
    ]

    # POS-tag the distinct tokens and keep only the selected tags (FW is excluded).
    model_path = os.path.join(repo_root, "notebooks/nlp-id_retraining/train_tuned.pkl")
    pos_tokens = get_unique_tokens_pos(words, model_path)
    flat_pos_tokens = flatten_list_of_lists(pos_tokens)
    selected_pos = {'NN', 'NNP', 'VB', 'NP', 'VP'}
    filtered_tokens = _filter_tokens_by_pos(flat_pos_tokens, selected_pos)

    # Count unordered pairs of different tokens that share a window.
    co_occurrence = defaultdict(int)
    for start in range(len(filtered_tokens) - window_size + 1):
        window = filtered_tokens[start:start + window_size]
        for j in range(window_size):
            for k in range(j + 1, window_size):
                # Sort so (a, b) and (b, a) share one key.
                w1, w2 = sorted([window[j], window[k]])
                if w1 != w2:
                    co_occurrence[(w1, w2)] += 1

    # Edge weight = co-occurrence count * cosine similarity; pairs without a
    # positive similarity (including tokens unknown to the model) get no edge.
    G = nx.Graph()
    for (w1, w2), count in co_occurrence.items():
        similarity = get_cosine_similarity(w1, w2, w2v_model)
        if similarity > 0:
            G.add_edge(w1, w2, weight=count * similarity)

    # TextRank scores.
    scores = nx.pagerank(G)

    labels = {node: f'{node}\n({score:.2f})' for node, score in scores.items()}

    # Highest score first; ties fall back to the token itself.
    ranked_words = sorted(((score, word) for word, score in scores.items()), reverse=True)

    # The score dict has unique keys, so a plain slice is enough; clamping keeps
    # num_keywords <= 0 from returning any item.
    top_ranked = ranked_words[:max(num_keywords, 0)]
    keyphrases_with_scores = [(word, score) for score, word in top_ranked]

    return keyphrases_with_scores, labels


ITERASI UNTUK ALL DATA

In [74]:
# Run TextRank on every document and collect one row per document:
# key_1..key_10 hold the keyphrases, score_1..score_10 the matching scores.
NUM_KEYWORDS = 10
key_columns = [f'key_{rank}' for rank in range(1, NUM_KEYWORDS + 1)]
score_columns = [f'score_{rank}' for rank in range(1, NUM_KEYWORDS + 1)]

rows = []
# For a quick trial run, iterate over df.loc[2:3].index instead.
for i in df.index:
    print('Processing index', i, end='...! ')
    text = df["text"][i]
    keyphrases, labels = textrank(text, num_keywords=NUM_KEYWORDS)

    keys = [keyword for keyword, _ in keyphrases][:NUM_KEYWORDS]
    scores = [score for _, score in keyphrases][:NUM_KEYWORDS]
    # Pad keys and scores separately with zeros. Padding only at the end of the
    # combined row would shift scores into the key_* columns whenever fewer
    # than NUM_KEYWORDS keyphrases are returned.
    padding = [0] * (NUM_KEYWORDS - len(keys))
    rows.append(keys + padding + scores + padding)
    print('Done')

# Build the frame once instead of concatenating inside the loop.
predict_textrank = pd.DataFrame(rows, columns=key_columns + score_columns)
predict_textrank[score_columns] = predict_textrank[score_columns].round(3)
predict_textrank.head(3)

Processing index 0...! Done
Processing index 1...! Done
Processing index 2...! Done
Processing index 3...! Done
Processing index 4...! Done
Processing index 5...! Done
Processing index 6...! Done
Processing index 7...! Done
Processing index 8...! Done
Processing index 9...! Done
Processing index 10...! Done
Processing index 11...! Done
Processing index 12...! Done
Processing index 13...! Done
Processing index 14...! Done
Processing index 15...! Done
Processing index 16...! Done
Processing index 17...! Done
Processing index 18...! Done
Processing index 19...! Done
Processing index 20...! Done
Processing index 21...! Done
Processing index 22...! Done
Processing index 23...! Done
Processing index 24...! Done
Processing index 25...! Done
Processing index 26...! Done
Processing index 27...! Done
Processing index 28...! Done
Processing index 29...! Done
Processing index 30...! Done
Processing index 31...! Done
Processing index 32...! Done
Processing index 33...! Done
Processing index 34...! 

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,score_1,score_2,score_3,score_4,score_5,score_6,score_7,score_8,score_9,score_10
0,alasan,personnel,penjelasan,persetujuan,prosedur,diusulkan,penilaian,proposed,tambahan,faktor,0.038,0.033,0.031,0.03,0.03,0.029,0.027,0.027,0.027,0.027
1,facilities,processing,exhibit,ctr,procedure,project,acuan,pengelolaan,document,perhatian,0.139,0.129,0.121,0.107,0.09,0.089,0.081,0.081,0.079,0.046
2,ruangan,fasilitas,kesepahaman,persyaratan,klarifikasi,provided,specification,disediakan,exhibit,provisions,0.034,0.033,0.031,0.03,0.03,0.029,0.028,0.028,0.028,0.027


EVALUATION

In [75]:
from utils import eval

# Target keyphrases used for the evaluation: columns k1..k7 of the dataset,
# as one list per document and as a DataFrame.
targets = df[[f"k{position}" for position in range(1, 8)]].values.tolist()
df_targets = pd.DataFrame(data=targets)

In [76]:
# Evaluate the top-10 TextRank predictions.
# `eval` here is the function imported from `utils`, not the Python builtin.
keys_top10 = [f'key_{rank}' for rank in range(1, 11)]
predict_textrank_list_10 = predict_textrank[keys_top10].values.tolist()
eval_textrank_10 = eval(predict_textrank_list_10, targets, True).round(3)
eval_textrank_10.columns = keys_top10 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the per-key results and the flexible scores to simplify the table.
eval_textrank_10 = eval_textrank_10[keys_top10 + ['flex_recall', 'flex_prec']]
eval_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,flex_recall,flex_prec
0,no_match,no_match,no_match,partial_match,full_match,no_match,no_match,no_match,no_match,no_match,0.286,0.2
1,no_match,no_match,partial_match,no_match,no_match,no_match,full_match,full_match,partial_match,no_match,0.571,0.4
2,no_match,no_match,no_match,no_match,no_match,no_match,no_match,no_match,partial_match,no_match,0.143,0.1


In [77]:
# TextRank top-10 scores under the flexible scheme:
# exact match = 1, partial match = 1, no match = 0.
# Recall and precision are averaged over documents; F1 is computed from the two means.
textrank_recall_10 = eval_textrank_10['flex_recall'].mean()
textrank_prec_10 = eval_textrank_10['flex_prec'].mean()
textrank_f1_10 = 2 * (textrank_prec_10 * textrank_recall_10) / (textrank_prec_10 + textrank_recall_10)

# One-column summary table, rounded to three decimals.
summary_10 = pd.DataFrame(
    {'textrank': [textrank_recall_10, textrank_prec_10, textrank_f1_10]},
    index=['recall', 'precision', 'F1'],
).round(3)
summary_10

Unnamed: 0,textrank
recall,0.181
precision,0.127
F1,0.149


In [78]:
# Evaluate the top-5 TextRank predictions.
# `eval` here is the function imported from `utils`, not the Python builtin.
keys_top5 = [f'key_{rank}' for rank in range(1, 6)]
predict_textrank_list_5 = predict_textrank[keys_top5].values.tolist()
eval_textrank_5 = eval(predict_textrank_list_5, targets, True).round(3)
eval_textrank_5.columns = keys_top5 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the per-key results and the flexible scores to simplify the table.
eval_textrank_5 = eval_textrank_5[keys_top5 + ['flex_recall', 'flex_prec']]
eval_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,flex_recall,flex_prec
0,no_match,no_match,no_match,partial_match,full_match,0.286,0.4
1,no_match,no_match,partial_match,no_match,no_match,0.143,0.2
2,no_match,no_match,no_match,no_match,no_match,0.0,0.0


In [79]:
# TextRank top-5 scores under the flexible scheme:
# exact match = 1, partial match = 1, no match = 0.
# Recall and precision are averaged over documents; F1 is computed from the two means.
textrank_recall_5 = eval_textrank_5['flex_recall'].mean()
textrank_prec_5 = eval_textrank_5['flex_prec'].mean()
textrank_f1_5 = 2 * (textrank_prec_5 * textrank_recall_5) / (textrank_prec_5 + textrank_recall_5)

# One-column summary table, rounded to three decimals.
summary_5 = pd.DataFrame(
    {'textrank': [textrank_recall_5, textrank_prec_5, textrank_f1_5]},
    index=['recall', 'precision', 'F1'],
).round(3)
summary_5

Unnamed: 0,textrank
recall,0.101
precision,0.141
F1,0.118


In [80]:
# Evaluate the top-3 TextRank predictions.
# `eval` here is the function imported from `utils`, not the Python builtin.
keys_top3 = [f'key_{rank}' for rank in range(1, 4)]
predict_textrank_list_3 = predict_textrank[keys_top3].values.tolist()
eval_textrank_3 = eval(predict_textrank_list_3, targets, True).round(3)
eval_textrank_3.columns = keys_top3 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the per-key results and the flexible scores to simplify the table.
eval_textrank_3 = eval_textrank_3[keys_top3 + ['flex_recall', 'flex_prec']]
eval_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,flex_recall,flex_prec
0,no_match,no_match,no_match,0.0,0.0
1,no_match,no_match,partial_match,0.143,0.333
2,no_match,no_match,no_match,0.0,0.0


In [81]:
# TextRank top-3 scores under the flexible scheme:
# exact match = 1, partial match = 1, no match = 0.
# Recall and precision are averaged over documents; F1 is computed from the two means.
textrank_recall_3 = eval_textrank_3['flex_recall'].mean()
textrank_prec_3 = eval_textrank_3['flex_prec'].mean()
textrank_f1_3 = 2 * (textrank_prec_3 * textrank_recall_3) / (textrank_prec_3 + textrank_recall_3)

# One-column summary table, rounded to three decimals.
summary_3 = pd.DataFrame(
    {'textrank': [textrank_recall_3, textrank_prec_3, textrank_f1_3]},
    index=['recall', 'precision', 'F1'],
).round(3)
summary_3

Unnamed: 0,textrank
recall,0.064
precision,0.15
F1,0.09


In [82]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_10 = pd.concat([predict_textrank, df_targets, eval_textrank_10], axis=1)
predict_textrank_10.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,key_3.1,key_4.1,key_5.1,key_6.1,key_7.1,key_8.1,key_9.1,key_10.1,flex_recall,flex_prec
0,alasan,personnel,penjelasan,persetujuan,prosedur,diusulkan,penilaian,proposed,tambahan,faktor,...,no_match,partial_match,full_match,no_match,no_match,no_match,no_match,no_match,0.286,0.2
1,facilities,processing,exhibit,ctr,procedure,project,acuan,pengelolaan,document,perhatian,...,partial_match,no_match,no_match,no_match,full_match,full_match,partial_match,no_match,0.571,0.4
2,ruangan,fasilitas,kesepahaman,persyaratan,klarifikasi,provided,specification,disediakan,exhibit,provisions,...,no_match,no_match,no_match,no_match,no_match,no_match,partial_match,no_match,0.143,0.1


In [83]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_5 = pd.concat([predict_textrank, df_targets, eval_textrank_5], axis=1)
predict_textrank_5.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,4,5,6,key_1.1,key_2.1,key_3.1,key_4.1,key_5.1,flex_recall,flex_prec
0,alasan,personnel,penjelasan,persetujuan,prosedur,diusulkan,penilaian,proposed,tambahan,faktor,...,,,,no_match,no_match,no_match,partial_match,full_match,0.286,0.4
1,facilities,processing,exhibit,ctr,procedure,project,acuan,pengelolaan,document,perhatian,...,dokumen,,,no_match,no_match,partial_match,no_match,no_match,0.143,0.2
2,ruangan,fasilitas,kesepahaman,persyaratan,klarifikasi,provided,specification,disediakan,exhibit,provisions,...,services for company,exhibit a,,no_match,no_match,no_match,no_match,no_match,0.0,0.0


In [84]:
# Combine dataframe predict_textrank, df_targets and eval_textrank
predict_textrank_3 = pd.concat([predict_textrank, df_targets, eval_textrank_3], axis=1)
predict_textrank_3.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,alasan,personnel,penjelasan,persetujuan,prosedur,diusulkan,penilaian,proposed,tambahan,faktor,...,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,facilities,processing,exhibit,ctr,procedure,project,acuan,pengelolaan,document,perhatian,...,acuan,pengelolaan,dokumen,,,no_match,no_match,partial_match,0.143,0.333
2,ruangan,fasilitas,kesepahaman,persyaratan,klarifikasi,provided,specification,disediakan,exhibit,provisions,...,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,no_match,0.0,0.0


In [85]:
import pandas as pd
from openpyxl import load_workbook

def write_excel(df, sheet_name, filename):
    """Write ``df`` to sheet ``sheet_name`` of the Excel workbook ``filename``.

    The workbook is opened in append mode. If it already has a sheet with the
    same name, that sheet is removed from the workbook before the dataframe is
    written. If the file does not exist, a new workbook is created instead.
    The dataframe index is not written.

    NOTE(review): newer pandas releases offer ``if_sheet_exists='replace'`` for
    append mode -- consider it if the installed version supports it.

    Args:
        df: DataFrame to write.
        sheet_name: Name of the target sheet.
        filename: Path of the Excel file.
    """
    try:
        # Append to the existing workbook, dropping a same-named sheet first.
        with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
            workbook = writer.book
            if sheet_name in workbook.sheetnames:
                workbook.remove(workbook[sheet_name])
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    except FileNotFoundError:
        # No workbook yet: create one that holds just this sheet.
        with pd.ExcelWriter(filename, engine='openpyxl', mode='w') as writer:
            df.to_excel(writer, sheet_name=sheet_name, index=False)


In [86]:
# Write the three prediction/evaluation tables to one Excel file, one sheet each.
# The notebook-level write_excel defined above is used; the utils version is not imported.
sheet_name_10 = '14b_TR_w2v_lg_posfilter_10'
sheet_name_5 = '14b_TR_w2v_lg_posfilter_5'
sheet_name_3 = '14b_TR_w2v_lg_posfilter_3'

output_file = '14b_TR_w2v_lg_posfilter.xlsx'
sheets_to_write = (
    (predict_textrank_10, sheet_name_10),
    (predict_textrank_5, sheet_name_5),
    (predict_textrank_3, sheet_name_3),
)
for frame, sheet in sheets_to_write:
    write_excel(frame, sheet, output_file)