In [2]:
# 1. Routine 1: import modules
import pandas as pd
import os
import sys

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import coo_matrix

In [3]:
# 2. Routine 2: put the repository root (two directory levels above the current
#    working directory) on sys.path so that project modules can be imported.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if repo_root not in sys.path:
    # Re-running this cell must not keep appending duplicate entries.
    sys.path.append(repo_root)

# 3. Routine 3: load the dataset.
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r30_lg.xlsx")
df = pd.read_excel(dataset_path)

# Build one text field per row from the "judul" and "isi" columns.
# A missing cell would turn the whole concatenation into NaN, and a NaN text
# later breaks the regex-based preprocessing, so missing cells become "".
df["text"] = df["judul"].fillna("") + ". " + df["isi"].fillna("")

In [4]:
# preprocess data
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# Stop-word removers keyed by stop-word file path. Re-running this cell resets
# the cache, which forces the stop-word file to be read again.
_STOPWORD_REMOVERS = {}


def _get_stopword_remover(stopwords_path):
    """Return the StopWordRemover for ``stopwords_path``, building it on first use."""
    remover = _STOPWORD_REMOVERS.get(stopwords_path)
    if remover is None:
        with open(stopwords_path, 'r') as f:
            stopwords = [line.strip() for line in f]
        remover = StopWordRemover(ArrayDictionary(stopwords))
        _STOPWORD_REMOVERS[stopwords_path] = remover
    return remover


def preprocess(text):
    """Clean one document and strip its stop words.

    Steps, in order:
      1. drop stand-alone single ASCII letters;
      2. put a space after every period;
      3. replace every character that is not an ASCII letter or a period
         with a space;
      4. lowercase;
      5. collapse each run of digits / non-word characters into one space;
      6. trim leading and trailing whitespace;
      7. pass the result through a Sastrawi ``StopWordRemover`` built from
         ``notebooks/stopwords_tuning/all_stop_words.txt`` under ``repo_root``.

    The stop-word file is read once per path and the remover is reused, instead
    of re-reading the file for every document.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned text as returned by ``StopWordRemover.remove``.
    """
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    stopwords_path = os.path.join(repo_root, "notebooks/stopwords_tuning/all_stop_words.txt")
    return _get_stopword_remover(stopwords_path).remove(text)

# Run the cleaning step twice over the raw text and store the result as a column.
df['preprocessed_text'] = df['text'].apply(preprocess).apply(preprocess)
text = df['preprocessed_text']

# Preview the first three cleaned documents.
text.head(3)


0    usulan personil proposed key personnel no tert...
1    template document project mengacu ctr exhibit ...
2    inquiry usulan lingkup scope aakomodasi ruanga...
Name: preprocessed_text, dtype: object

In [5]:
# Order the entries of a tf-idf vector from highest to lowest score.
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of ``coo_matrix`` in descending order.

    Pairs are ordered by value first and by column index second, both
    descending. The argument only needs ``col`` and ``data`` attributes.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` items to ``{feature name: score}``.

    Args:
        feature_names: Sequence indexed by feature (column) index.
        sorted_items: Iterable of ``(feature index, score)`` pairs, already in
            the desired order.
        topn: How many leading pairs to keep.

    Returns:
        A dict, in the order of ``sorted_items``, whose values are the scores
        rounded to three decimals.
    """
    return {
        feature_names[col]: round(weight, 3)
        for col, weight in sorted_items[:topn]
    }

def get_top_phrase(corpus, n=None):
    """Return the ``n`` most frequent 2- and 3-word phrases of ``corpus``.

    ``corpus`` is treated as a single document. The vectorizer keeps at most
    2000 phrases. The result is a list of ``(phrase, count)`` tuples sorted by
    count, highest first; with ``n=None`` the whole list is returned.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 3), max_features=2000)
    vectorizer.fit([corpus])
    totals = vectorizer.transform([corpus]).sum(axis=0)

    ranked = [
        (phrase, totals[0, column])
        for phrase, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked[:n]

In [6]:
def process_text(text):
    """Fit a CountVectorizer on ``text`` and return its vocabulary terms.

    The vectorizer extracts 1- to 3-grams, ignores terms that occur in more
    than 80% of the documents, and keeps at most 10000 features.

    Only the feature names are returned, so the count matrix and the
    TfidfTransformer fit that used to be computed here (and then thrown away)
    are no longer produced.

    Args:
        text: Iterable of document strings.

    Returns:
        The array returned by ``CountVectorizer.get_feature_names_out()``.
    """
    cv = CountVectorizer(max_df=0.8, max_features=10000, ngram_range=(1, 3))
    cv.fit(text)
    return cv.get_feature_names_out()

feature_names = process_text(text)

In [6]:
# Fit the term-count vocabulary and the idf weights on the whole corpus.
cv = CountVectorizer(max_df=0.8, max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(text)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# get_feature_names_out() is used because .get_feature_names() raised an error here.
feature_names = cv.get_feature_names_out()

# Number of keywords kept per document.
n_tfidf = 10
key_columns = [f'key_{rank}' for rank in range(1, n_tfidf + 1)]
score_columns = [f'score_{rank}' for rank in range(1, n_tfidf + 1)]

# Collect one row per document and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies it on every iteration.
rows = []
for doc in df['preprocessed_text']:
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, n_tfidf)

    # A document with fewer than n_tfidf non-zero terms yields a shorter dict;
    # pad it so every row has the same n_tfidf keys and n_tfidf scores.
    padding = [float('nan')] * (n_tfidf - len(keywords))
    rows.append(list(keywords.keys()) + padding + list(keywords.values()) + padding)

predict_tfidf = pd.DataFrame(rows, columns=key_columns + score_columns)
predict_tfidf[score_columns] = predict_tfidf[score_columns].round(2)
predict_tfidf.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,score_1,score_2,score_3,score_4,score_5,score_6,score_7,score_8,score_9,score_10
0,fase tender,diajukan fase tender,diajukan fase,personil,personil pengganti,organisasi,fase,tender,pengganti,diajukan,0.32,0.32,0.32,0.31,0.28,0.27,0.24,0.24,0.21,0.18
1,template,document,processing facilities demikian,facilities demikian acuan,facilities demikian,demikian acuan pengelolaan,demikian acuan,biru processing facilities,biru processing,ctr,0.36,0.22,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.19
2,ruangan kantor,ruangan,kantor,artikel,appendix,disediakan lokasi,accommodation,lingkup scope,komposisi,scope,0.39,0.34,0.29,0.23,0.2,0.2,0.2,0.19,0.19,0.17


Evaluation

In [7]:
from utils import check_similarity, eval

# Target keyword columns k1..k7, kept both as a list of rows and as a DataFrame.
target_columns = ["k1", "k2", "k3", "k4", "k5", "k6", "k7"]
targets = df[target_columns].values.tolist()
df_targets = pd.DataFrame(targets)

In [8]:
# Evaluate the top-10 TF-IDF keywords against the target keywords.
keys_10 = [f'key_{rank}' for rank in range(1, 11)]
predict_tfidf_list_10 = predict_tfidf[keys_10].values.tolist()
eval_tfidf_10 = eval(predict_tfidf_list_10, targets, True).round(3)
eval_tfidf_10.columns = keys_10 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the flexible scores to simplify the evaluation table.
eval_tfidf_10 = eval_tfidf_10[keys_10 + ['flex_recall', 'flex_prec']]

# TF-IDF scores using the flexible metric: exact match = 1, partial match = 1, no match = 0.
tfidf_recall_10 = eval_tfidf_10['flex_recall'].mean()
tfidf_prec_10 = eval_tfidf_10['flex_prec'].mean()
# F1 is defined as 0 when precision and recall are both 0 (avoids 0/0 -> NaN).
f1_denominator_10 = tfidf_prec_10 + tfidf_recall_10
tfidf_f1_10 = 2 * (tfidf_prec_10 * tfidf_recall_10) / f1_denominator_10 if f1_denominator_10 else 0.0

# Summary table; the column is labelled after the method evaluated here (TF-IDF).
summary_10 = pd.DataFrame({'tfidf': [tfidf_recall_10, tfidf_prec_10, tfidf_f1_10]}, index=['recall', 'precision', 'F1'])
summary_10 = summary_10.round(3)

In [9]:
# Evaluate the top-5 TF-IDF keywords against the target keywords.
keys_5 = [f'key_{rank}' for rank in range(1, 6)]
predict_tfidf_list_5 = predict_tfidf[keys_5].values.tolist()
eval_tfidf_5 = eval(predict_tfidf_list_5, targets, True).round(3)
eval_tfidf_5.columns = keys_5 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the flexible scores to simplify the evaluation table.
eval_tfidf_5 = eval_tfidf_5[keys_5 + ['flex_recall', 'flex_prec']]

# TF-IDF scores using the flexible metric: exact match = 1, partial match = 1, no match = 0.
tfidf_recall_5 = eval_tfidf_5['flex_recall'].mean()
tfidf_prec_5 = eval_tfidf_5['flex_prec'].mean()
# F1 is defined as 0 when precision and recall are both 0 (avoids 0/0 -> NaN).
f1_denominator_5 = tfidf_prec_5 + tfidf_recall_5
tfidf_f1_5 = 2 * (tfidf_prec_5 * tfidf_recall_5) / f1_denominator_5 if f1_denominator_5 else 0.0

# Summary table; the column is labelled after the method evaluated here (TF-IDF).
summary_5 = pd.DataFrame({'tfidf': [tfidf_recall_5, tfidf_prec_5, tfidf_f1_5]}, index=['recall', 'precision', 'F1'])
summary_5 = summary_5.round(3)

In [10]:
# Evaluate the top-3 TF-IDF keywords against the target keywords.
keys_3 = [f'key_{rank}' for rank in range(1, 4)]
predict_tfidf_list_3 = predict_tfidf[keys_3].values.tolist()
eval_tfidf_3 = eval(predict_tfidf_list_3, targets, True).round(3)
eval_tfidf_3.columns = keys_3 + ['strict_recall', 'strict_prec', 'flex_recall', 'flex_prec']
# Keep only the flexible scores to simplify the evaluation table.
eval_tfidf_3 = eval_tfidf_3[keys_3 + ['flex_recall', 'flex_prec']]

# TF-IDF scores using the flexible metric: exact match = 1, partial match = 1, no match = 0.
tfidf_recall_3 = eval_tfidf_3['flex_recall'].mean()
tfidf_prec_3 = eval_tfidf_3['flex_prec'].mean()
# F1 is defined as 0 when precision and recall are both 0 (avoids 0/0 -> NaN).
f1_denominator_3 = tfidf_prec_3 + tfidf_recall_3
tfidf_f1_3 = 2 * (tfidf_prec_3 * tfidf_recall_3) / f1_denominator_3 if f1_denominator_3 else 0.0

# Summary table; the column is labelled after the method evaluated here (TF-IDF).
summary_3 = pd.DataFrame({'tfidf': [tfidf_recall_3, tfidf_prec_3, tfidf_f1_3]}, index=['recall', 'precision', 'F1'])
summary_3 = summary_3.round(3)
summary_3

Unnamed: 0,textrank
recall,0.132
precision,0.309
F1,0.185


In [11]:
# Place the TF-IDF predictions, the targets, and each evaluation table side by
# side, one combined frame per cut-off (top 10, top 5, top 3).
# Note: the evaluation tables reuse the key_N labels of the prediction frame,
# so the combined frames contain duplicate column names.
shared_frames = [predict_tfidf, df_targets]

predict_tfidf_10 = pd.concat(shared_frames + [eval_tfidf_10], axis=1)
predict_tfidf_5 = pd.concat(shared_frames + [eval_tfidf_5], axis=1)
predict_tfidf_3 = pd.concat(shared_frames + [eval_tfidf_3], axis=1)

predict_tfidf_3.head(3)

Unnamed: 0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,...,2,3,4,5,6,key_1.1,key_2.1,key_3.1,flex_recall,flex_prec
0,fase tender,diajukan fase tender,diajukan fase,personil,personil pengganti,organisasi,fase,tender,pengganti,diajukan,...,usulan,pengganti,,,,no_match,no_match,no_match,0.0,0.0
1,template,document,processing facilities demikian,facilities demikian acuan,facilities demikian,demikian acuan pengelolaan,demikian acuan,biru processing facilities,biru processing,ctr,...,acuan,pengelolaan,dokumen,,,partial_match,partial_match,no_match,0.286,0.667
2,ruangan kantor,ruangan,kantor,artikel,appendix,disediakan lokasi,accommodation,lingkup scope,komposisi,scope,...,lingkup kerja,akomodasi,services for company,exhibit a,,no_match,no_match,partial_match,0.143,0.333


In [None]:
from openpyxl import load_workbook

def write_excel(df, sheet_name, filename):
    """Write ``df`` to the sheet ``sheet_name`` of the Excel file ``filename``.

    If the file does not exist it is created. If it exists, it is opened in
    append mode so its other sheets are kept, and a sheet that already carries
    ``sheet_name`` is replaced by the new data.

    Replacement is delegated to pandas (``if_sheet_exists='replace'``) rather
    than deleting the sheet from ``writer.book`` by hand: with the default
    ``if_sheet_exists='error'`` the manual deletion is not guaranteed to stop
    pandas from rejecting the write.

    Args:
        df: DataFrame to write (the index is not written).
        sheet_name: Name of the target worksheet.
        filename: Path of the ``.xlsx`` file.
    """
    if os.path.exists(filename):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_options = {'mode': 'w'}

    with pd.ExcelWriter(filename, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)


In [None]:
# Save the three combined prediction frames as separate sheets of one workbook.
# The workbook path is relative, so it is resolved against the working directory.
output_file = 'SE11_tfidf.xlsx'

sheet_name_10 = 'SE11_tfidf_10'
sheet_name_5 = 'SE11_tfidf_5'
sheet_name_3 = 'SE11_tfidf_3'

for frame, sheet in (
    (predict_tfidf_10, sheet_name_10),
    (predict_tfidf_5, sheet_name_5),
    (predict_tfidf_3, sheet_name_3),
):
    write_excel(frame, sheet, output_file)