In [20]:
import re
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
def bersihkan_teks(teks):
    teks = teks.lower()
    teks = re.sub(r'\d+', '', teks)
    teks = re.sub(r"http\S+|www\S+|https\S+", "", teks, flags=re.MULTILINE)
    teks = re.sub(r"@\w+|#", "", teks)
    teks = re.sub(r"[^\w\s]", "", teks)
    teks = re.sub(r'<.*?>', '', teks)
    teks = re.sub(r'\s+', ' ', teks).strip()
    teks = " ".join([word for word in teks.split() if word not in stopwords.words("indonesian")])
    return teks

In [3]:
df = pd.read_csv("dataset_pengaduan.csv")

In [4]:
df['konten'] = df['pengaduan'].apply(bersihkan_teks)
df['pengaduan'] = df['konten']
df.drop(columns=['pengaduan'], inplace=True)

In [5]:
df_classification = df[['konten', 'kategori', 'sentimen']]
df_classification.head()

Unnamed: 0,konten,kategori,sentimen
0,kemacetan parah area parkir pintu masuk,Transportasi,negatif
1,kualitas pengajaran menurun perpustakaan perpu...,Pendidikan,positif
2,fasilitas sekolah manajemen buruk minim ruang ...,Pendidikan,negatif
3,ketersediaan obat darurat mencukupi menit pasien,Kesehatan,positif
4,jadwal bus menit terlambat penumpang pagi,Transportasi,netral


Ekstrasi fitur menggunakan TF-IDF untuk menghitung nilai dari kata.

In [6]:
tfidf = TfidfVectorizer()
konten_tfidf = tfidf.fit_transform(df_classification['konten']).toarray()

Mengubah kategori menjadi label agar bisa digabung untuk meningkatkan kualitas training.

In [7]:
ohe = OneHotEncoder()
kategori_encode = ohe.fit_transform(df_classification[['kategori']]).toarray()

Mengubah sentimen menjadi label, agar bisa digunakan untuk prediksi

In [8]:
le = LabelEncoder()
df_classification['sentimen'] = le.fit_transform(df_classification['sentimen'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_classification['sentimen'] = le.fit_transform(df_classification['sentimen'])


Split data dan Pelatihan

In [24]:
X = np.hstack((konten_tfidf, kategori_encode))
y = le.fit_transform(df_classification['sentimen'])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42)

In [36]:
y_train_dl = to_categorical(y_train)

In [38]:
model_xgb = XGBClassifier()
model_svc = SVC(kernel='linear', random_state=42) 
model_rf = RandomForestClassifier(random_state=42)
model_dl = Sequential([
    Embedding(input_dim=10000, output_dim=150, input_length=(X_train.shape[1],), name='Input'),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, name='LSTM_layer'),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='Dense_layer'),
    Dropout(0.3),
    Dense(3, activation='softmax', name='predict')
])



In [27]:
#Training Model XGBoost
model_xgb.fit(X_train, y_train)

#Prediksi
y_train_predic_xgb = model_xgb.predict(X_train)
y_test_predic_xgb = model_xgb.predict(X_test)

In [28]:
train_accuracy_xgb = accuracy_score(y_train, y_train_predic_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_test_predic_xgb)

print(f"Akurasi Training: {train_accuracy_xgb:.4f}")
print(f"Akurasi Testing: {test_accuracy_xgb:.4f}")

Akurasi Training: 0.5134
Akurasi Testing: 0.4780


In [29]:
#Training Model Random Forest
model_rf.fit(X_train, y_train)

#Prediksi
y_train_predic_rf = model_rf.predict(X_train)
y_test_predic_rf = model_rf.predict(X_test)

In [30]:
train_accuracy_rf = accuracy_score(y_train, y_train_predic_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_predic_rf)

print(f"Akurasi Training: {train_accuracy_rf:.4f}")
print(f"Akurasi Testing: {test_accuracy_rf:.4f}")

Akurasi Training: 0.5242
Akurasi Testing: 0.4540


In [40]:
model_dl.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [41]:
model_dl.summary()

In [42]:
early_stopping = EarlyStopping(patience=5, restore_best_weights=True, monitor='val_accuracy')

In [43]:
training = model_dl.fit(X_train, y_train_dl, epochs=10, batch_size=256, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 1s/step - accuracy: 0.4687 - loss: 1.7553 - val_accuracy: 0.5038 - val_loss: 1.3805
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 1s/step - accuracy: 0.4899 - loss: 1.3220 - val_accuracy: 0.5038 - val_loss: 1.1632
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1s/step - accuracy: 0.4924 - loss: 1.1431 - val_accuracy: 0.5038 - val_loss: 1.0771
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1s/step - accuracy: 0.4934 - loss: 1.0726 - val_accuracy: 0.5038 - val_loss: 1.0470
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 1s/step - accuracy: 0.4928 - loss: 1.0502 - val_accuracy: 0.5038 - val_loss: 1.0414
Epoch 6/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 1s/step - accuracy: 0.4894 - loss: 1.0437 - val_accuracy: 0.5038 - val_loss: 1.0344
