1. Mencari kata yg paling sering muncul dalam semua surat

In [1]:
#1. rutin1 import module
import pandas as pd
import os
import sys

In [2]:
#2. rutin2 membuat syspath ke root utk aktifkan __init__.py
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(repo_root)

In [3]:
#3. rutin3 Load the dataset
dataset_path = os.path.join(repo_root, "data/dataset_ekstraksi_r29.xlsx")
df = pd.read_excel(dataset_path)
df["text"] = df["judul"] +". "+ df["isi"]

In [4]:
# Preprocess
import re

def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    return text

df_preprocess = df['text'].apply(preprocess)
df_preprocess.head(3)

0    usulan personil penting proposed key personnel...
1    template document jtb gpf project mengacu kepa...
2    change inquiry terkait usulan perubahan lingku...
Name: text, dtype: object

In [5]:
# Mencari kata yang paling sering muncul
from collections import Counter

def get_top_words(df_column, m=50, n=300):
    '''
    fungsi untuk meendapatkan kata yang paling sering muncul, untuk dimasukan sebagai candidate stop word dengan paramater
    df_column: kolom yang akan dihitung
    m: jumlah kata yang akan diambil
    n : frequency minimum kata yang akan diambil
    '''
    word_counts = Counter()
    for text in df_column:
        words = text.split()
        word_counts.update(words)
    top_words = word_counts.most_common(m)
    df_top_words = pd.DataFrame(top_words, columns=['word', 'frequency'])
    df_top_words = df_top_words[df_top_words['frequency'] >= n].sort_values(by='frequency', ascending=False)

    return df_top_words

top_keywords = get_top_words(df_preprocess, 50, 300 )
print(top_keywords)

            word  frequency
0            dan       4234
1     kontraktor       4191
2           yang       3561
3          untuk       3048
4     perusahaan       2643
5         dengan       2433
6           atas       2139
7            ini       1936
8           pada       1898
9             di       1753
10            no       1616
11           jtb       1614
12       tanggal       1501
13          dari       1435
14         dapat       1367
15         dalam       1334
16      tersebut       1276
17         surat       1182
18          kami       1168
19          oleh       1038
20        kepada       1021
21          akan        949
22        proyek        931
23        terima        921
24   disampaikan        865
25       sebagai        860
26      demikian        852
27         kasih        829
28           hal        799
29       terkait        791
30         telah        773
31         tidak        759
32     pekerjaan        742
33         bahwa        738
34     perhatian    

In [6]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_score(df, top_keywords):
    '''
    Function to calculate the TF-IDF score for the top_keywords
    df: the input dataframe
    top_keywords: a dataframe containing the top keywords and their frequency
    '''
    text = df.str.cat(sep=' ')
    tfidf_vectorizer = TfidfVectorizer(vocabulary=top_keywords['word'].tolist(), smooth_idf=True, use_idf=True)
    tfidf = tfidf_vectorizer.fit_transform([text])
    tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf.data))
    top_keywords['tfidf_score'] = top_keywords['word'].apply(lambda x: tfidf_scores[x] if x in tfidf_scores else 0)
    return top_keywords.sort_values(by='tfidf_score', ascending=True)[:50]

# Example usage:
top_keywords = get_top_words(df_preprocess, 50, 300)
tfidf_scores = get_tfidf_score(df_preprocess, top_keywords)
print(tfidf_scores)

            word  frequency  tfidf_score
0            dan       4234     0.044582
1     kontraktor       4191     0.045406
2           yang       3561     0.045955
3          untuk       3048     0.046779
4     perusahaan       2643     0.046871
5         dengan       2433     0.048519
6           atas       2139     0.048885
7            ini       1936     0.049709
8           pada       1898     0.051906
9             di       1753     0.052821
10            no       1616     0.053920
11           jtb       1614     0.054103
12       tanggal       1501     0.058405
13          dari       1435     0.060694
14         dapat       1367     0.060694
15         dalam       1334     0.062617
16      tersebut       1276     0.067560
17         surat       1182     0.067926
18          kami       1168     0.069482
19          oleh       1038     0.070764
20        kepada       1021     0.072412
21          akan        949     0.073144
22        proyek        931     0.075890
23        terima

In [7]:
tfidf_scores[["word"]].to_csv("candidate_stopword.txt", index=False, header=False)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

def get_tfidf_top_words(df_column, m=50, n=400):
    '''
    Fungsi untuk mendapatkan kata yang paling sering muncul,
    untuk dimasukkan sebagai candidate stop word dengan parameter:
    df_column: kolom yang akan dihitung
    m: jumlah kata yang akan diambil
    n: frekuensi minimum untuk suatu kata untuk dianggap sebagai candidate stop word
    '''
    word_counts = Counter()
    for text in df_column:
        words = text.split()
        word_counts.update(words)
    top_words = word_counts.most_common(m)
    df_top_words = pd.DataFrame(top_words, columns=['word', 'frequency'])
    df_top_words = df_top_words[df_top_words['frequency'] >= n].sort_values(by='frequency', ascending=False)
    
    # Get only the top keywords
    keywords = df_top_words['word'].tolist()

    # Create a new dataframe to store the tfidf scores
    tfidf_df = pd.DataFrame(columns=['word', 'tfidf'])
    
    # Create a TfidfVectorizer object and fit on the corpus
    vectorizer = TfidfVectorizer(vocabulary=keywords, use_idf=True)
    X = vectorizer.fit_transform(df_column)

    # Compute the tfidf score for each keyword
    for i, keyword in enumerate(keywords):
        tfidf_score = X[:, vectorizer.vocabulary_[keyword]].toarray().sum()
        tfidf_df.loc[i] = [keyword, tfidf_score]
        
    # Sort by the tfidf score in ascending order
    tfidf_df = tfidf_df.sort_values(by='tfidf', ascending=True).reset_index(drop=True)
    
    # Return the top 50 words with the lowest tfidf score
    return tfidf_df.head(50)

tfidf_top_words = get_tfidf_top_words(df_preprocess, 50, 300)
print(tfidf_top_words)


            word       tfidf
0          covid   33.130793
1      melakukan   40.899210
2        berikut   44.613363
3         sesuai   45.352822
4           agar   46.234510
5          tidak   49.757862
6   kerjasamanya   50.547921
7             ke   51.562680
8        dokumen   53.301108
9            gpf   56.539551
10         telah   57.213537
11         bahwa   58.890416
12     perhatian   59.889230
13           hal   60.267971
14           gas   60.806016
15     pekerjaan   62.393674
16       bersama   63.758835
17        change   63.865454
18         kasih   64.758205
19        vendor   65.612570
20      demikian   65.654323
21            of   66.920863
22       sebagai   67.648204
23       terkait   69.378713
24        terima   70.134076
25        kepada   71.337403
26          oleh   72.049376
27     sampaikan   72.919475
28   disampaikan   78.368209
29          akan   78.874276
30        proyek   85.509612
31      tersebut   87.705177
32         dapat   89.941874
33         sur

In [9]:
# Mencari kata dg score TFIDF paling kecil

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Combine multiple rows into a single text
text = df_preprocess.str.cat(sep=' ')


cv = CountVectorizer(max_df=0.8, max_features=10000, ngram_range=(1, 1))
X = cv.fit_transform(df_preprocess)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# extract from random title
feature_names = cv.get_feature_names_out() #sblmnya method ini .get_feature_names() muncul error
n_minimum = 50

predict_tfidf = pd.DataFrame()

tf_idf_vector = tfidf_transformer.transform(cv.transform([text]))

# create a dictionary with feature names and corresponding tf-idf scores
feature_tfidf_scores = {}
for feature, score in zip(feature_names, tf_idf_vector.toarray()[0]):
    feature_tfidf_scores[feature] = score

# sort the dictionary by value in ascending order and take the first n_minimum items
sorted_tfidf_scores = sorted(feature_tfidf_scores.items(), key=lambda x: x[1])
top_words = sorted_tfidf_scores[:n_minimum]

# create a DataFrame with the top words and their tf-idf scores
df_top_words = pd.DataFrame(top_words, columns=['word', 'tfidf'])
print(df_top_words)



               word     tfidf
0                aa  0.000444
1               aae  0.000444
2        aakomodasi  0.000444
3                ab  0.000444
4               abb  0.000444
5       abdurrahman  0.000444
6             about  0.000444
7               abs  0.000444
8           absensi  0.000444
9         absorbent  0.000444
10              abu  0.000444
11              acc  0.000444
12       accentance  0.000444
13         accepted  0.000444
14   accomplishment  0.000444
15       accordance  0.000444
16              acd  0.000444
17           achmad  0.000444
18        acitivity  0.000444
19  acknowledgement  0.000444
20              acs  0.000444
21           adajan  0.000444
22           adakan  0.000444
23         adaptasi  0.000444
24        addiional  0.000444
25          adhered  0.000444
26       adisasmito  0.000444
27           adityo  0.000444
28       administer  0.000444
29      adminitrasi  0.000444
30         adrianto  0.000444
31          adverse  0.000444
32        

COMBINED STOPWORDS
Tala + Sastrawi + Custom

In [10]:
# preprocess data
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

def preprocess(text):
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = text.replace('.', '. ')
    text = re.sub('[^a-zA-Z.]', ' ', text)
    text = text.lower()
    text = re.sub("(\\d|\\W)+"," ",text)
    text = text.strip()

    stopwords_path = os.path.join(repo_root, "data/all_stop_words.txt")
    with open(stopwords_path, 'r') as f:
        stopwords = [line.strip() for line in f]

    dictionary = ArrayDictionary(stopwords)
    str = StopWordRemover(dictionary)
    text = str.remove(text)

    return text

In [11]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# ambil list stopword sastrawi
stop_factory = StopWordRemoverFactory()
stopword_sastrawi = stop_factory.get_stop_words()

# ambil list stopword Tala
with open('stopword_list_tala.txt', 'r') as f:
    stopword_tala = [line.strip() for line in f]

# ambil list stopword candidate corpus skrg  
with open('candidate_stopword.txt', 'r') as f:
    stopword_corpus = [line.strip() for line in f]

stop_words = stopword_sastrawi + stopword_tala + stopword_corpus 

In [12]:
with open('all_stop_words.txt', 'w') as f:
    f.write('\n'.join(stop_words))