In [1]:
import re
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
def bersihkan_teks(teks):
    teks = teks.lower()
    teks = re.sub(r'\d+', '', teks)
    teks = re.sub(r"http\S+|www\S+|https\S+", "", teks, flags=re.MULTILINE)
    teks = re.sub(r"@\w+|#", "", teks)
    teks = re.sub(r"[^\w\s]", "", teks)
    teks = re.sub(r'<.*?>', '', teks)
    teks = re.sub(r'\s+', ' ', teks).strip()
    teks = " ".join([word for word in teks.split() if word not in stopwords.words("indonesian")])
    return teks

In [3]:
df = pd.read_csv("dataset_pengaduan.csv")

In [4]:
df['konten'] = df['pengaduan'].apply(bersihkan_teks)
df['pengaduan'] = df['konten']
df.drop(columns=['pengaduan'], inplace=True)

In [5]:
df_classification = df[['konten', 'kategori', 'sentimen']]
df_classification.head()

Unnamed: 0,konten,kategori,sentimen
0,kemacetan parah area parkir pintu masuk,Transportasi,negatif
1,kualitas pengajaran menurun perpustakaan perpu...,Pendidikan,positif
2,fasilitas sekolah manajemen buruk minim ruang ...,Pendidikan,negatif
3,ketersediaan obat darurat mencukupi menit pasien,Kesehatan,positif
4,jadwal bus menit terlambat penumpang pagi,Transportasi,netral


Ekstrasi fitur menggunakan TF-IDF untuk menghitung nilai dari kata.

In [6]:
tfidf = TfidfVectorizer()
konten_tfidf = tfidf.fit_transform(df_classification['konten'])

Mengubah kategori menjadi label agar bisa digabung untuk meningkatkan kualitas training.

In [7]:
ohe = OneHotEncoder()
kategori_encode = ohe.fit_transform(df_classification[['kategori']]).toarray()

In [8]:
konten_array = konten_tfidf.toarray()

Mengubah sentimen menjadi label, agar bisa digunakan untuk prediksi

In [9]:
le = LabelEncoder()
df_classification['sentimen'] = le.fit_transform(df_classification['sentimen'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_classification['sentimen'] = le.fit_transform(df_classification['sentimen'])


Split data dan Pelatihan

In [10]:
X = np.hstack((konten_array, kategori_encode))
y = df_classification['sentimen']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42)

In [12]:
model_xgb = XGBClassifier()
model_svc = SVC(kernel='linear', random_state=42)
model_rf = RandomForestClassifier(random_state=42)

In [13]:
#Training Model XGBoost
model_xgb.fit(X_train, y_train)

#Prediksi
y_train_predic_xgb = model_xgb.predict(X_train)
y_test_predic_xgb = model_xgb.predict(X_test)

In [14]:
train_accuracy_xgb = accuracy_score(y_train, y_train_predic_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_test_predic_xgb)

print(f"Akurasi Training: {train_accuracy_xgb:.4f}")
print(f"Akurasi Testing: {test_accuracy_xgb:.4f}")

Akurasi Training: 0.5134
Akurasi Testing: 0.4780


In [15]:
#Training Model SVC
model_svc.fit(X_train, y_train)

#Prediksi
y_train_predic_svc = model_svc.predict(X_train)
y_test_predic_svc = model_svc.predict(X_test)

In [16]:
train_accuracy_svc = accuracy_score(y_train, y_train_predic_svc)
test_accuracy_svc = accuracy_score(y_test, y_test_predic_svc)

print(f"Akurasi Training: {train_accuracy_svc:.4f}")
print(f"Akurasi Testing: {test_accuracy_svc:.4f}")

Akurasi Training: 0.4936
Akurasi Testing: 0.4883


In [17]:
#Training Model Random Forest
model_rf.fit(X_train, y_train)

#Prediksi
y_train_predic_rf = model_rf.predict(X_train)
y_test_predic_rf = model_rf.predict(X_test)

In [18]:
train_accuracy_rf = accuracy_score(y_train, y_train_predic_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_predic_rf)

print(f"Akurasi Training: {train_accuracy_rf:.4f}")
print(f"Akurasi Testing: {test_accuracy_rf:.4f}")

Akurasi Training: 0.5242
Akurasi Testing: 0.4540
