In [67]:
### 1.1. Lingkungan Teknis & Import Library
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [68]:
df = pd.read_csv("dataset_judol.csv")
df.tail()

Unnamed: 0,comment,label
18665,malah di ingatin wkwk,0
18666,"Iyaa..klo Abah Anies pastiI pinter jawab, tp​ ...",0
18667,Lebih tertarik baca komentarnya netizen ya 😅 a...,0
18668,12:15 bening mengkilat 😇,0
18669,yaelah tong tong,0


### <b> Imbalance handling

In [69]:
print(df[df["label"] == 0]['label'].count())
print(df[df["label"] == 1]['label'].count())

11288
7382


In [70]:
# Asumsikan 'df' adalah DataFrame Anda yang sudah bersih
# df['label'] berisi label 0 dan 1

# 1. Pisahkan DataFrame berdasarkan kelas
df_mayoritas = df[df.label == 0]
df_minoritas = df[df.label == 1]

# 2. Ambil sampel dari kelas mayoritas sebanyak jumlah kelas minoritas
df_mayoritas_undersampled = df_mayoritas.sample(n=len(df_minoritas), random_state=42)

# 3. Gabungkan kembali kedua DataFrame
df_undersampled = pd.concat([df_mayoritas_undersampled, df_minoritas])

# 4. Acak urutan DataFrame agar tidak berurutan (semua 0 dulu baru semua 1)
df = df_undersampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Verifikasi hasil
print("Jumlah data setelah Random Undersampling:")
print(df['label'].value_counts())


Jumlah data setelah Random Undersampling:
label
0    7382
1    7382
Name: count, dtype: int64


In [71]:
print(df[df["label"] == 0]['label'].count())
print(df[df["label"] == 1]['label'].count())

7382
7382


### <b> Text preprocessing

In [72]:
# Casefolding
NAMA_KOLOM_TEKS = 'comment'
df['comment'] = df[NAMA_KOLOM_TEKS].str.lower()

In [73]:
# Noise Removal
df['text_clean'] = df['comment'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

print("Hasil Noise Removal:")
print(df[['comment', 'text_clean']].head())

Hasil Noise Removal:
                                             comment  \
0  iya benar, jangan2 jawabannya lebih hebat\nnge...   
1                     gak mikir kali join alexis17 .   
2                     wakkaka si perusak mobil kan?😂   
3                               tumben g gontok".an😂   
4                            𝘼𝙇𝙀𝙓𝙄𝙎17 emang beda . !   

                                          text_clean  
0  iya benar jangan2 jawabannya lebih hebat\nngel...  
1                      gak mikir kali join alexis17   
2                       wakkaka si perusak mobil kan  
3                                  tumben g gontokan  
4                                    17 emang beda    


In [74]:
# Normalization word
kamus_normalisasi = {
    'ga': 'tidak', 'gak': 'tidak', 'tdk': 'tidak', 'engga': 'tidak',
    'jg': 'juga', 'jgn': 'jangan',
    'yg': 'yang', 'utk': 'untuk',
    'sm': 'sama', 'dgn': 'dengan',
    'klo': 'kalau', 'kalo': 'kalau',
    'krn': 'karena',
    'bgt': 'banget', 'skrg': 'sekarang',
    'trus': 'terus', 'sdh': 'sudah',
    'blm': 'belum', 'lg': 'lagi',
    'sya': 'saya', 'gw': 'saya', 'gue': 'saya',
    'lu': 'kamu', 'loe': 'kamu',
    'wkwk': 'tertawa', 'wkwkwk': 'tertawa', 'xixi': 'tertawa',
    'depo': 'deposit', 'wd': 'withdraw', 'jp': 'jackpot',
    'dpt': 'dapat', 'dapet': 'dapat',
}

# fungsi normalisasi
def normalize_text(text):
    # Memecah teks menjadi daftar kata dan mapping ke normalnya (jika ada)
    words = text.split()
    normalized_words = [kamus_normalisasi.get(word, word) for word in words]

    # Menggabungkan kembali kata-kata menjadi satu kalimat
    return ' '.join(normalized_words)

df['text_clean'] = df['text_clean'].apply(normalize_text)


print("\nContoh Normalisasi pada Kalimat Spesifik:")
kalimat_kotor = "klo gw depo lg skrg, dpt bonus ga ya?"
kalimat_bersih = "klo gw depo lg skrg dpt bonus ga ya" # setelah case folding & noise removal
kalimat_normal = normalize_text(kalimat_bersih)

print(f"Sebelum : '{kalimat_kotor}'")
print(f"Sesudah : '{kalimat_normal}'")


Contoh Normalisasi pada Kalimat Spesifik:
Sebelum : 'klo gw depo lg skrg, dpt bonus ga ya?'
Sesudah : 'kalau saya deposit lagi sekarang dapat bonus tidak ya'


In [75]:
# Remove stopword
stopword_list = [
    'di', 'dan', 'yang', 'untuk', 'pada', 'ke', 'karena', 'ini', 'itu',
    'dengan', 'tapi', 'juga', 'adalah', 'saya', 'kamu', 'dia', 'kita', 'kalian',
    'mereka', 'saja', 'jika', 'atau', 'dari', 'akan', 'sudah', 'telah',
    'belum', 'lagi', 'saat', 'seperti', 'hanya', 'bisa', 'jadi', 'buat',
]
stopword_set = set(stopword_list)

# fungsi untuk menghapus stopwords
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stopword_set]
    return ' '.join(filtered_words)

# Terapkan fungsi stopword removal ke kolom 'text_clean'
df['text_clean'] = df['text_clean'].apply(remove_stopwords)

print("Hasil Stopword Removal:")
print(df[['comment', 'text_clean']].head())

# Contoh spesifik untuk menunjukkan perubahan
print("\nContoh Stopword Removal pada Kalimat Spesifik:")
kalimat_sebelum = "indonesia butuh orang orang seperti kamu bang saya hormat" # setelah normalisasi
kalimat_sesudah = remove_stopwords(kalimat_sebelum)

print(f"Sebelum: '{kalimat_sebelum}'")
print(f"Sesudah: '{kalimat_sesudah}'")

Hasil Stopword Removal:
                                             comment  \
0  iya benar, jangan2 jawabannya lebih hebat\nnge...   
1                     gak mikir kali join alexis17 .   
2                     wakkaka si perusak mobil kan?😂   
3                               tumben g gontok".an😂   
4                            𝘼𝙇𝙀𝙓𝙄𝙎17 emang beda . !   

                                          text_clean  
0  iya benar jangan2 jawabannya lebih hebat ngela...  
1                     tidak mikir kali join alexis17  
2                       wakkaka si perusak mobil kan  
3                                  tumben g gontokan  
4                                      17 emang beda  

Contoh Stopword Removal pada Kalimat Spesifik:
Sebelum: 'indonesia butuh orang orang seperti kamu bang saya hormat'
Sesudah: 'indonesia butuh orang orang bang hormat'


In [76]:
df[df['label'] == 1].head()

Unnamed: 0,comment,label,text_clean
1,gak mikir kali join alexis17 .,1,tidak mikir kali join alexis17
4,𝘼𝙇𝙀𝙓𝙄𝙎17 emang beda . !,1,17 emang beda
7,gk ada bosan nya aku sama ⭐𝘼𝙇𝙀𝙓𝙄𝙎17,1,gk ada bosan nya aku sama 17
15,"dari awal cobain sampe sekarang, gak pernah ny...",1,awal cobain sampe sekarang tidak pernah nyesel...
17,"grafis 𝘿 𝐄 𝑊 а 𝘿 о 𝙍 а keren banget , bikin be...",1,grafis keren banget bikin betah main


### <b> Word Tokenization

In [77]:
# X dan y untuk memastikan alur yang jelas
X = df['text_clean'].values
y = df['label'].values

# Membagi data menjadi data latih dan uji.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

VOCAB_SIZE = 10000  # Jumlah kata unik yang akan kita simpan dalam kamus
MAX_SEQUENCE_LENGTH = 128 # Panjang maksimal setiap sekuens komentar

In [78]:
# oov_token: Kata yang tidak ada di kamus akan diubah menjadi token '<unk>'.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<unk>')
tokenizer.fit_on_texts(X_train)

#  Mengubah Teks menjadi Sekuens Angka
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [79]:
# Menyeragamkan Panjang Sekuens
X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post'
)

X_test_padded = pad_sequences(
    X_test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post'
)

In [80]:
# --- Verifikasi Hasil ---
print("Proses Vectorization Selesai.")
print("-" * 50)
# Tampilkan contoh hasil
contoh_index = 0
print(f"Teks Asli (setelah cleaning): \n{X_train[contoh_index]}")
print(f"\nSetelah diubah menjadi sekuens angka: \n{X_train_sequences[contoh_index]}")
print(f"\nSetelah di-padding menjadi {MAX_SEQUENCE_LENGTH} elemen: \n{X_train_padded[contoh_index]}")
print("-" * 50)
print(f"Bentuk (shape) dari data latih akhir: {X_train_padded.shape}")
print(f"Bentuk (shape) dari data uji akhir: {X_test_padded.shape}")

Proses Vectorization Selesai.
--------------------------------------------------
Teks Asli (setelah cleaning): 
malem nungguin update alexis17

Setelah diubah menjadi sekuens angka: 
[405, 575, 466, 3]

Setelah di-padding menjadi 128 elemen: 
[405 575 466   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
--------------------------------------------------
Bentuk (shape) dari data latih akhir: (11811, 128)
Bentuk (shape) dari data uji akhir: (2953, 128)


In [81]:
y_train

array([1, 0, 0, ..., 1, 1, 1])

### <b> Dataset to linguistic model

In [82]:
# Hyperparameter tuning
EMBEDDING_DIM = 64     # Ukuran vektor untuk setiap kata
UNITS = 64             # Jumlah unit/neuron di dalam layer LSTM/GRU/RNN
EPOCHS = 10            # Jumlah berapa kali model melihat keseluruhan data latih
BATCH_SIZE = 32        # Jumlah data yang diproses dalam satu waktu
PATIENCE = 3           # Berapa epoch model akan menunggu jika tidak ada peningkatan

# --- Callback untuk menghentikan training lebih awal (mencegah overfitting) ---
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True
)

In [83]:
# Train LSTM Model
def create_lstm_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(UNITS)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(UNITS, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(2, activation='softmax') # 2 output untuk 2 kelas (judol/bukan)
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Buat dan latih model LSTM
lstm_model = create_lstm_model()
lstm_model.summary()

history_lstm = lstm_model.fit(
    X_train_padded, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test_padded, y_test),
    callbacks=[early_stopping]
)



Epoch 1/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.7709 - loss: 0.4037 - val_accuracy: 0.9661 - val_loss: 0.0959
Epoch 2/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 30ms/step - accuracy: 0.9786 - loss: 0.0647 - val_accuracy: 0.9648 - val_loss: 0.1062
Epoch 3/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 32ms/step - accuracy: 0.9905 - loss: 0.0332 - val_accuracy: 0.9658 - val_loss: 0.1097
Epoch 4/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 31ms/step - accuracy: 0.9924 - loss: 0.0259 - val_accuracy: 0.9648 - val_loss: 0.1391


In [84]:
# Train GRU Model
def create_gru_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(UNITS)), # Cukup ganti LSTM dengan GRU
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(UNITS, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Buat dan latih model GRU
gru_model = create_gru_model()
gru_model.summary()

history_gru = gru_model.fit(
    X_train_padded, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test_padded, y_test),
    callbacks=[early_stopping]
)

Epoch 1/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 37ms/step - accuracy: 0.7957 - loss: 0.3962 - val_accuracy: 0.9614 - val_loss: 0.1070
Epoch 2/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 35ms/step - accuracy: 0.9810 - loss: 0.0594 - val_accuracy: 0.9695 - val_loss: 0.0898
Epoch 3/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 36ms/step - accuracy: 0.9901 - loss: 0.0350 - val_accuracy: 0.9658 - val_loss: 0.1051
Epoch 4/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 40ms/step - accuracy: 0.9931 - loss: 0.0253 - val_accuracy: 0.9648 - val_loss: 0.1244
Epoch 5/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 43ms/step - accuracy: 0.9941 - loss: 0.0193 - val_accuracy: 0.9644 - val_loss: 0.1246


In [85]:
# Train Simple RNN (baseline)
def create_rnn_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
        tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(UNITS)), # Menggunakan SimpleRNN
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(UNITS, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Buat dan latih model RNN
rnn_model = create_rnn_model()
rnn_model.summary()

history_rnn = rnn_model.fit(
    X_train_padded, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test_padded, y_test),
    callbacks=[early_stopping]
)

Epoch 1/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 33ms/step - accuracy: 0.7160 - loss: 0.4950 - val_accuracy: 0.9597 - val_loss: 0.1142
Epoch 2/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 29ms/step - accuracy: 0.9813 - loss: 0.0623 - val_accuracy: 0.9672 - val_loss: 0.1043
Epoch 3/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.9911 - loss: 0.0271 - val_accuracy: 0.9597 - val_loss: 0.1239
Epoch 4/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.9941 - loss: 0.0198 - val_accuracy: 0.9638 - val_loss: 0.1428
Epoch 5/10
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.9946 - loss: 0.0163 - val_accuracy: 0.9631 - val_loss: 0.1539


### <b> Evaluation metrics

In [86]:
def evaluate_model(model, X_test, y_test, model_name=""):
    print(f"\n--- Evaluasi Model: {model_name} ---")

    # Dapatkan prediksi dari model
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Hitung metrik
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Akurasi: {accuracy:.4f}")
    print(f"F1-Score (Weighted): {f1:.4f}")
    print("\nLaporan Klasifikasi:")
    print(classification_report(y_test, y_pred, target_names=['Bukan Judol (0)', 'Judol (1)']))

    # 3. Kembalikan metrik untuk tabel perbandingan
    return accuracy, f1

In [87]:
def evaluate_model(model, X_test, y_test, model_name=""):
    print(f"\n--- Evaluasi Model: {model_name} ---")
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)

    print("Laporan Klasifikasi:")
    print(classification_report(y_test, y_pred, target_names=['Bukan Judol (0)', 'Judol (1)']))
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    return accuracy, f1

print("--- Memulai Evaluasi untuk Semua Model ---")
acc_lstm, f1_lstm = evaluate_model(lstm_model, X_test_padded, y_test, "LSTM")
acc_gru, f1_gru = evaluate_model(gru_model, X_test_padded, y_test, "GRU")
acc_rnn, f1_rnn = evaluate_model(rnn_model, X_test_padded, y_test, "SimpleRNN")

# Comparation table
summary_df = pd.DataFrame({
    'Model': ['LSTM', 'GRU', 'Simple RNN'],
    'Akurasi': [acc_lstm, acc_gru, acc_rnn],
    'F1-Score (Weighted)': [f1_lstm, f1_gru, f1_rnn]
})

# Urutkan berdasarkan performa terbaik
summary_df = summary_df.sort_values(by='F1-Score (Weighted)', ascending=False)

print("\n\n===== RINGKASAN AKHIR PERBANDINGAN MODEL =====")
print(summary_df.round(4))

--- Memulai Evaluasi untuk Semua Model ---

--- Evaluasi Model: LSTM ---
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step
Laporan Klasifikasi:
                 precision    recall  f1-score   support

Bukan Judol (0)       0.95      0.98      0.97      1477
      Judol (1)       0.98      0.95      0.97      1476

       accuracy                           0.97      2953
      macro avg       0.97      0.97      0.97      2953
   weighted avg       0.97      0.97      0.97      2953


--- Evaluasi Model: GRU ---
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Laporan Klasifikasi:
                 precision    recall  f1-score   support

Bukan Judol (0)       0.97      0.97      0.97      1477
      Judol (1)       0.97      0.97      0.97      1476

       accuracy                           0.97      2953
      macro avg       0.97      0.97      0.97      2953
   weighted avg       0.97      0.97      0.97      2953


--- Evaluasi Mode