Definisi fungsi

In [4]:
import pandas as pd
import json
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
nltk.download('punkt')

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(file.read().splitlines())
    
def load_lexicon(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(json.load(file))  
    
slang_dict = json.load(open("txt/kamusSlang.json", "r", encoding="utf-8"))
stopwords = load_file('txt/stopwords-1.txt')
kamus_indonesia = load_file('txt/kamusIndonesia.txt')
pos_lexicon = load_lexicon('leksikon/leksikon-pos.json')
neg_lexicon = load_lexicon('leksikon/leksikon-neg.json')

def preprocessing(text, slang_dict, stopwords, kamus_indonesia, stemmer):
    text = text.lower()  # Case folding
    text = re.sub(r"\\t|\\n|\\u|\\|http[s]?://\\S+|[@#][A-Za-z0-9_]+", " ", text)  # Menghapus karakter khusus
    text = re.sub(r"\\d+", "", text)  # Menghapus angka
    text = text.translate(str.maketrans("", "", string.punctuation))  # Menghapus tanda baca (pakai import string)
    text = re.sub(r"\\s+", ' ', text).strip()  # merapihkan spasi ganda
    text = re.sub(r"\b[a-zA-Z]\b", "", text) # Menghapus satu huruf (besar/kecil)
    text = ' '.join([slang_dict.get(word, word) for word in text.split()]) # Normalisasi (pemanfaatan kamus slang)
    text = word_tokenize(text) # Tokenisasi (sebelum stemming)
    text = [stemmer.stem(word) for word in text] # Stemming
    text = [word for word in text if word not in stopwords and len(word) > 3 and word in kamus_indonesia] # Stopwords & memilah kata
    text = ' '.join(text)
    return text

def hitung_sentimen(text, pos_lexicon, neg_lexicon):
    pos_count = sum(1 for word in text.split() if word in pos_lexicon)
    neg_count = sum(1 for word in text.split() if word in neg_lexicon)
    if pos_count > neg_count:
        return 'Positif', 1
    elif neg_count > pos_count:
        return 'Negatif', -1
    else:
        return 'Netral', 0



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


https://github.com/stopwords-iso/stopwords-id # referens stopwords

* Implementasi preprocessing & hapus nilai null
* proses dibawah 15m 13.5s

In [5]:
df = pd.read_csv('code-filter-crawling/crawling.csv')
df.rename(columns={"full_text" : "teks"}, inplace=True)
df['teks'] = df['teks'].apply(lambda x: preprocessing(x, slang_dict, stopwords, kamus_indonesia,stemmer))

# Hapus baris yang memiliki nilai kosong (termasuk yang berisi spasi atau karakter non-huruf)
df = df[df['teks'].str.strip().astype(bool)]

Pelabelan sentimen

In [6]:
# Lanjutkan dengan analisis
df[['label' ,'skor']] = df['teks'].apply(lambda x: pd.Series(hitung_sentimen(x, pos_lexicon, neg_lexicon)))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 0 to 1144
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   teks    1144 non-null   object
 1   label   1144 non-null   object
 2   skor    1144 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 35.8+ KB


Menyimpan hasil pelabelan

In [7]:
df.to_csv('dataset_berlabel.csv', index=False)

Pengujian model logistic regression (tanpa smote)

In [2]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, accuracy_score
# from imblearn.over_sampling import SMOTE

# data = pd.read_csv('dataset_berlabel/dataset_berlabel.csv')

# # Preprocessing teks (case folding, tokenization, dsb.) bisa dilakukan di sini
# # data['teks'] = data['teks'].apply(lambda x: preprocessing(x, slang_dict, stopwords, kamus_indonesia, stemmer))

# X = data['teks']
# y = data['label']


# # Membagi data menjadi training dan testing
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# vectorizer = TfidfVectorizer()


# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)


# # Melatih model Logistic Regression
# model = LogisticRegression()
# model.fit(X_train_tfidf, y_train)

# # Memprediksi hasil untuk data testing
# y_pred = model.predict(X_test_tfidf)

# # Evaluasi model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7685589519650655
Classification Report:
               precision    recall  f1-score   support

     Negatif       0.76      0.99      0.86       150
      Netral       0.77      0.26      0.39        38
     Positif       0.89      0.41      0.57        41

    accuracy                           0.77       229
   macro avg       0.81      0.56      0.61       229
weighted avg       0.78      0.77      0.73       229



Pengujian model logistic regression (pakai smote)

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

data = pd.read_csv('dataset_berlabel.csv')

# Preprocessing teks (case folding, tokenization, dsb.) bisa dilakukan di sini
# data['teks'] = data['teks'].apply(lambda x: preprocessing(x, slang_dict, stopwords, kamus_indonesia, stemmer))

X = data['teks']
y = data['label']
smote = SMOTE(random_state=42)

# Membagi data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer()


X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)
# Melatih model Logistic Regression
model = LogisticRegression()
model.fit(X_train_smote, y_train_smote)

# Memprediksi hasil untuk data testing
y_pred = model.predict(X_test_tfidf)

# Evaluasi model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7860262008733624
Classification Report:
               precision    recall  f1-score   support

     Negatif       0.87      0.91      0.89       150
      Netral       0.49      0.50      0.49        38
     Positif       0.75      0.59      0.66        41

    accuracy                           0.79       229
   macro avg       0.70      0.67      0.68       229
weighted avg       0.78      0.79      0.78       229



* Jurnal yang membahas penyimpanan model: https://katalog.ukdw.ac.id/8055/1/71190448_bab1_bab5_daftarpustaka.pdf
* Menyimpan model

In [11]:
import joblib

joblib.dump(model, "model/model_sentimen.pkl")
joblib.dump(vectorizer, "model/vectorizer_sentimen.pkl")


['vectorizer_sentimen.pkl']

Algoritma halaman preprocessing (perancangan)

In [18]:
import pandas as pd
import pandas as pd
import json
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
nltk.download('punkt')


# Misalkan kita menerima dataset baru dalam bentuk DataFrame
data_baru = pd.read_csv('code-filter-crawling/crawling.csv')
data_baru = data_baru.rename(columns={"full_text":"teks"})


# Preprocessing teks (termasuk case folding, tokenisasi, dll.) bisa dilakukan di sini jika diperlukan
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocessing(text, slang_dict, stopwords, kamus_indonesia, stemmer):
    text = text.lower()  # Case folding
    text = re.sub(r"\\t|\\n|\\u|\\|http[s]?://\\S+|[@#][A-Za-z0-9_]+", " ", text)  # Menghapus karakter khusus
    text = re.sub(r"\\d+", "", text)  # Menghapus angka
    text = text.translate(str.maketrans("", "", string.punctuation))  # Menghapus tanda baca (pakai import string)
    text = re.sub(r"\\s+", ' ', text).strip()  # merapihkan spasi ganda
    text = re.sub(r"\b[a-zA-Z]\b", "", text) # Menghapus satu huruf (besar/kecil)
    text = ' '.join([slang_dict.get(word, word) for word in text.split()]) # Normalisasi (pemanfaatan kamus slang)
    text = word_tokenize(text) # Tokenisasi (sebelum stemming)
    text = [stemmer.stem(word) for word in text] # Stemming
    text = [word for word in text if word not in stopwords and len(word) > 3 and word in kamus_indonesia] # Stopwords & memilah kata
    text = ' '.join(text)
    return text

def load_lexicon(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(json.load(file))

def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(file.read().splitlines())
    
slang_dict = json.load(open("txt/kamusSlang.json", "r", encoding="utf-8"))
stopwords = load_file('txt/stopwords-1.txt')
kamus_indonesia = load_file('txt/kamusIndonesia.txt')
pos_lexicon = load_lexicon('leksikon/leksikon-pos.json')
neg_lexicon = load_lexicon('leksikon/leksikon-neg.json')

data_baru['teks'] = data_baru['teks'].apply(lambda x: preprocessing(x, slang_dict, stopwords, kamus_indonesia, stemmer))
data_baru.to_csv("preprocessing/hasil.csv",index=False)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Algoritma halaman klasifikasi (perancangan)

In [2]:
import joblib
import pandas as pd
# Memuat model dan vectorizer yang telah disimpan
model = joblib.load('model_sentimen.pkl')
vectorizer = joblib.load('vectorizer_sentimen.pkl')

data_baru = pd.read_csv('preprocessing/preprocessing.csv')
data_baru = data_baru.dropna(subset=['teks'])
# Mengubah teks dari kolom 'teks' menjadi representasi numerik dengan vectorizer yang sudah dilatih
X_baru = vectorizer.transform(data_baru['teks'])

# Melakukan prediksi menggunakan model yang sudah dilatih
prediksi = model.predict(X_baru)

# Menambahkan hasil prediksi ke dalam dataset baru
data_baru['label'] = prediksi

# Menyimpan hasil prediksi ke file baru
data_baru.to_csv('hasil_prediksi.csv', index=False)

Algoritma halaman klasterisasi (perancangan)

In [21]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

# Membaca dataset
# df_selected = pd.read_csv('dataset-berlabel-aspek.csv')
df_selected = pd.read_csv('hasil_prediksi.csv')
# Pastikan semua nilai dalam kolom 'teks' adalah string, dan tangani NaN
df_selected['teks'] = df_selected['teks'].fillna('').astype(str)
# Memuat stopwords
def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(file.read().splitlines())
stopwords2 = load_file('txt/stopwords-2.txt')

# Mengahapus stopwords
def preprocessing(text, stopwords):
    text = [word for word in text.split() if word not in stopwords]
    return ' '.join(text) 

# Menghapus stopwords dari kolom 'teks' dan menyimpannya dalam kolom baru 'teks-kmeans'
df_selected['teks-kmeans'] = df_selected['teks'].apply(lambda x: preprocessing(x, stopwords2))

centroid_sentences = {
    'kompensasi': "gaji kompensasi",
    'kepuasan_kerja': "mental stres jam",
    'aktualisasi': "berkembang kembang jabatan skill",
    'hubungan_kerja': "hubungan jahat hubungan baik lingkung"
}
# Menghitung posisi dalam DataFrame untuk setiap centroid
num_rows = len(df_selected)
posisi = {
    int(num_rows * 0.25): centroid_sentences['kompensasi'],
    int(num_rows * 0.50): centroid_sentences['kepuasan_kerja'],
    int(num_rows * 0.75): centroid_sentences['aktualisasi'],
    int(num_rows * 0.90): centroid_sentences['hubungan_kerja']
}
# Menyisipkan kalimat ke dalam DataFrame pada posisi yang ditentukan
for pos, sentence in posisi.items():
    df_selected.at[pos, 'teks-kmeans'] = sentence
# Vektorisasi teks menggunakan TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_selected['teks-kmeans'])  # Menggunakan kolom teks yang telah dibersihkan
lokasi_centroid = X[list(posisi.keys())].toarray()

# K-means clustering
kmeans = KMeans(n_clusters=4, init=lokasi_centroid, n_init=10, random_state=0)
kmeans.fit(X)

# Menyimpan hasil klaster pada kolom baru 'skor-klaster-prediksi'
df_selected['label-klaster'] = kmeans.labels_

db_score = davies_bouldin_score(X.toarray(), kmeans.labels_)
print(f"Davies-Bouldin Score: {db_score:.2f}")

centroid_texts = set(centroid_sentences.values())
df_selected = df_selected[~df_selected['teks-kmeans'].isin(centroid_texts)].reset_index(drop=True)

ambil = pd.DataFrame(df_selected[['teks-kmeans', 'label', 'label-klaster']])
ambil.to_csv("klaster-prediksi.csv", index=False)

# Memisahkan klaster menjadi DataFrame yang berbeda dan menambahkan kolom 'label'
clusters = [df_selected[df_selected['label-klaster'] == i][['teks-kmeans', 'label', 'label-klaster']].reset_index(drop=True) for i in range(4)]

# Label untuk setiap klaster
label_klaster = ['kompensasi', 'kepuasan kerja', 'aktualisasi', 'hubungan kerja']

# Menampilkan dan menyimpan hasil
for label, cleaned_data in zip(label_klaster, clusters):  # Menyesuaikan penggunaan zip
    # print(f"Faktor {label.capitalize()}:")
    # print(cleaned_data[['teks-kmeans', 'label']])

    # Menyimpan data ke file
    cleaned_data.to_csv(f'klaster/{label}.csv', sep='\t', index=False, header=True)


Davies-Bouldin Score: 6.49


  super()._check_params_vs_input(X, default_n_init=10)


Pelabelan aspek

In [22]:
import pandas as pd
import json

df = pd.read_csv('klaster-prediksi.csv')

# Tentukan Aspek dengan Skor
def tentukan_aspek(text, aspek_kompensasi, aspek_kepuasan_kerja, aspek_aktualisasi, aspek_hubungan):
    # Hitung jumlah kata yang cocok dengan aspek
    kompensasi = sum(1 for word in text.split() if word in aspek_kompensasi)
    kepuasan = sum(1 for word in text.split() if word in aspek_kepuasan_kerja)
    aktualisasi = sum(1 for word in text.split() if word in aspek_aktualisasi)
    hubungan = sum(1 for word in text.split() if word in aspek_hubungan)

    # Tentukan label dan skor
    scores = {
        'Kompensasi': kompensasi,
        'Kepuasan Kerja': kepuasan,
        'Aktualisasi': aktualisasi,
        'Hubungan': hubungan
    }

    # Pilih aspek dengan skor terbanyak
    label_aspek = max(scores, key=scores.get)
    # skor_aspek = scores[label_aspek]

    return label_aspek

# Load lexicon for each aspect
def load_lexicon(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(json.load(file))

# Load leksikon dari file
aspek_kompensasi = load_lexicon('leksikon/aspek-kompensasi.json')
aspek_kepuasan = load_lexicon('leksikon/aspek-kepuasan-kerja.json')
aspek_aktualisasi = load_lexicon('leksikon/aspek-aktualisasi.json')
aspek_hubungan = load_lexicon('leksikon/aspek-hubungan-kerja.json')

df['teks-kmeans'] = df['teks-kmeans'].fillna('').astype(str)

# Misalkan df adalah DataFrame yang sudah ada dan memiliki kolom 'teks'
# Terapkan fungsi tentukan_aspek pada kolom 'teks' dan simpan hasilnya pada kolom baru
df[['label-aspek']] = df['teks-kmeans'].apply(
    lambda x: pd.Series(tentukan_aspek(x, aspek_kompensasi, aspek_kepuasan, aspek_aktualisasi, aspek_hubungan))
)

# Menambahkan kolom 'skor-label-aspek' berdasarkan kondisi tertentu
def assign_skor_label_aspek(label_aspek):
    if 'Kompensasi' in label_aspek:
        return 0
    elif 'Kepuasan Kerja' in label_aspek:
        return 1
    elif 'Aktualisasi' in label_aspek:
        return 2
    elif 'Hubungan' in label_aspek:
        return 3
    else:
        return  # Nilai default jika tidak ada yang cocok

# Terapkan fungsi ke kolom 'label-aspek' untuk membuat kolom 'skor-label-aspek'
df['skor-label-aspek-aktual'] = df['label-aspek'].apply(assign_skor_label_aspek)

df.to_csv('klaster-aktual.csv', index=False)
# Menampilkan hasil
print(df[['teks-kmeans', 'label-aspek',]].head())


                                         teks-kmeans label-aspek
0  pandemik bayar gaji selesa relaks fikir henti ...  Kompensasi
1                                         gaji cepat  Kompensasi
2  pilih salah harga level tutup bawa gerbong rom...  Kompensasi
3  bangun pagi semangat pergi tepu rutin gain mat...  Kompensasi
4  alas jahat banget arah visi jahat bungkus alas...  Kompensasi
