In [5]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torchtext.vocab import Vectors, Vocab
from xgboost import XGBRFClassifier

# Field / TabularDataset / BucketIterator are used further down; depending on the
# installed torchtext release they live in `torchtext.legacy.data` or `torchtext.data`.
try:
    from torchtext.legacy.data import BucketIterator, Field, TabularDataset
except ImportError:
    from torchtext.data import BucketIterator, Field, TabularDataset

OSError: [WinError 127] The specified procedure could not be found

In [None]:
# Fetch the NLTK stopword corpus that the Indonesian stopword set below is built from.
nltk.download('stopwords')

In [None]:
# Build the Sastrawi stemmer once; the text-cleaning function calls it per word.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
stop_words = set(stopwords.words("indonesian"))
def bersihkan_teks(teks):
    """Normalize one complaint text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip @mentions and
    '#' signs, strip digits, strip remaining punctuation, collapse whitespace,
    drop Indonesian stopwords, and stem each remaining word.

    HTML tags and URLs are removed *before* punctuation: once '<', '>', ':' and
    '/' have been stripped, the tag pattern can no longer match and tag names
    would be left behind as ordinary words.

    Args:
        teks: Raw text. Any non-string value (e.g. a missing cell read as NaN)
            is treated as empty text.

    Returns:
        The cleaned text as a single string; "" when nothing is left.
    """
    if not isinstance(teks, str):
        return ""

    teks = teks.lower()
    teks = re.sub(r'<.*?>', '', teks)
    teks = re.sub(r"http\S+|www\S+|https\S+", "", teks, flags=re.MULTILINE)
    teks = re.sub(r"@\w+|#", "", teks)
    teks = re.sub(r'\d+', '', teks)
    teks = re.sub(r"[^\w\s]", "", teks)
    teks = re.sub(r'\s+', ' ', teks).strip()
    tokens = " ".join(stemmer.stem(word) for word in teks.split() if word not in stop_words)
    return tokens

In [None]:
# Input CSV with the raw complaints, and the folder where the train/validation CSVs are written.
csv_file = "dataset_pengaduan.csv"
folder_data = "dataset/"

In [None]:
# Load the raw complaint dataset.
df = pd.read_csv(csv_file)

In [None]:
# Clean every complaint into `konten`, then drop the raw `pengaduan` column.
# Guarded so the cell can be re-run after `pengaduan` has already been dropped.
if 'pengaduan' in df.columns:
    df['konten'] = df['pengaduan'].apply(bersihkan_teks)
    df = df.drop(columns=['pengaduan'])

In [None]:
# Keep only the columns used downstream: date, cleaned text, category, and sentiment.
df_classification = df.loc[:, ['date', 'konten', 'kategori', 'sentimen']]
df_classification.head()

In [None]:
# Text field: whitespace tokenization, lowercased, batch-first tensors without lengths.
TEKS = Field(
    sequential=True,
    tokenize=str.split,
    lower=True,
    include_lengths=False,
    batch_first=True,
)

# Non-sequential fields read without a vocabulary.
# NOTE(review): `use_vocab=False` presumably requires already-numeric values in the
# CSV columns — confirm for `date`, `kategori` and `sentimen`.
KATEGORI = Field(sequential=False, use_vocab=False)
LABEL = Field(sequential=False, use_vocab=False)
TANGGAL = Field(sequential=False, use_vocab=False)

# (column name, field) pairs, in the column order of `df_classification`.
fields = list(zip(
    ('date', 'konten', 'kategori', 'sentimen'),
    (TANGGAL, TEKS, KATEGORI, LABEL),
))

# Pre-processing Data

## pre-sklearn

Ekstraksi fitur menggunakan TF-IDF untuk menghitung bobot setiap kata.

In [None]:
# Fit TF-IDF on the cleaned text; the result is converted to a dense array so it
# can be stacked with the other feature blocks.
tfidf = TfidfVectorizer()
konten_tfidf = tfidf.fit_transform(df_classification['konten']).toarray()

Mengubah kategori menjadi vektor one-hot agar bisa digabung dengan fitur lain untuk meningkatkan kualitas training.

In [None]:
# One-hot encode the category column as a dense array.
ohe = OneHotEncoder()
kategori_encoded = ohe.fit_transform(df_classification[['kategori']]).toarray()

Mengubah sentimen menjadi label, agar bisa digunakan untuk prediksi

In [None]:
# Encoder for the sentiment target; it is fitted in the split section below.
le = LabelEncoder()

# Split data

## Split for sklearn

In [None]:
# Date-derived features (year, month, day, weekday) scaled to [0, 1].
# They are computed here because nothing above this cell defines `date_features_scaled`.
tanggal = pd.to_datetime(df_classification['date'])
date_features_scaled = MinMaxScaler().fit_transform(
    np.column_stack([
        tanggal.dt.year,
        tanggal.dt.month,
        tanggal.dt.day,
        tanggal.dt.weekday,
    ])
)

# Full feature matrix for the classical models: TF-IDF text + one-hot category + scaled date parts.
feature = np.hstack((konten_tfidf, kategori_encoded, date_features_scaled))
# Integer-encoded sentiment labels.
target = le.fit_transform(df_classification['sentimen'])

In [None]:
# Conventional X / y aliases used by the split below.
X = feature
y = target

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42)

## Split for Deep Learning

### Ekstraksi Fitur khusus deep learning

### Split Data

In [None]:
# Row-level split for the deep-learning pipeline (same test size and seed as the sklearn split).
df_train, df_val = train_test_split(df_classification, test_size=0.2, random_state=42)

In [None]:
import os

# Create the output folder first; `to_csv` does not create missing directories.
os.makedirs(folder_data, exist_ok=True)

# Persist both splits so they can be read back as torchtext datasets.
df_train.to_csv(f"{folder_data}train_dataset.csv", index=False)
df_val.to_csv(f"{folder_data}val_dataset.csv", index=False)

In [None]:
# Read the two CSVs back as torchtext datasets, mapping columns to `fields`
# by position and skipping the header row.
train_data, val_data = TabularDataset.splits(
    path=folder_data,
    train='train_dataset.csv',
    validation='val_dataset.csv',
    format='csv',
    fields=fields,
    skip_header=True
)

In [None]:
# Sanity check: number of examples in each split.
print(f"Jumlah data train: {len(train_data)}")
print(f"Jumlah data val: {len(val_data)}")

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz
!gunzip cc.id.300.bin.gz

In [None]:
vectors = Vectors(name='cc.id.300.bin', cache='./.vector_cache')
max_words = 10000
TEKS.build_vocab(train_data, max_size=max_words, vectors=vectors, unk_init=torch.Tensor.normal_)

print(f"Ukuran vocab: {len(TEKS.vocab)}")

In [None]:
# One-hot encode `kategori` using categories learned from the training split only.
# - `handle_unknown='ignore'`: a category seen only in validation becomes an all-zero
#   row instead of raising.
# - `.toarray()`: the encoder returns a sparse matrix by default, while the tensor
#   conversions further down need dense arrays.
ohe = OneHotEncoder(handle_unknown='ignore')
kategori_train = ohe.fit_transform(df_train[['kategori']]).toarray()
kategori_val = ohe.transform(df_val[['kategori']]).toarray()

In [None]:
def _fitur_tanggal(seri_tanggal):
    """Return an (n_rows, 4) array holding year, month, day and weekday of a date column."""
    tanggal = pd.to_datetime(seri_tanggal)
    return np.column_stack([
        tanggal.dt.year,
        tanggal.dt.month,
        tanggal.dt.day,
        tanggal.dt.weekday,
    ])


# Rebind the frames with a parsed `date` column instead of assigning into the frames
# returned by the split (column assignment there can trigger SettingWithCopyWarning).
df_train = df_train.assign(date=pd.to_datetime(df_train['date']))
df_val = df_val.assign(date=pd.to_datetime(df_val['date']))

date_features_train = _fitur_tanggal(df_train['date'])
date_features_val = _fitur_tanggal(df_val['date'])

# Scale with statistics from the training split only, then reuse them for validation.
scaler = MinMaxScaler()
date_features_train_scaled = scaler.fit_transform(date_features_train)
date_features_val_scaled = scaler.transform(date_features_val)

print("date_features_train_scaled shape:", date_features_train_scaled.shape)
print("date_features_val_scaled shape:", date_features_val_scaled.shape)

In [None]:
# Integer-encode the sentiment labels; the encoder is fitted on the training split
# and reused for validation.
label_encoder = LabelEncoder()
y_train_nn = label_encoder.fit_transform(df_train['sentimen'])
y_val = label_encoder.transform(df_val['sentimen'])

print("y_train shape:", y_train_nn.shape)
print("y_val shape:", y_val.shape)

### Pengacakan data

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
accelerator = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Batch iterators over both datasets: 64 examples per batch, sorted by text length
# within each batch, tensors placed on `accelerator`.
# NOTE(review): batches are grouped by length, so the row order inside a batch
# presumably differs from the DataFrame order — confirm how each batch is matched
# to the separately built kategori / date / label arrays.
train_iter, val_iter = BucketIterator.splits(
    (train_data, val_data),
    batch_size=64,
    sort_within_batch=True,
    sort_key=lambda x: len(x.konten),
    device=accelerator
)

In [None]:
def prepare_batch(batch, kategori_data, date_data, labels):
    """Assemble model inputs and targets for one iterator batch.

    The training loop calls this helper as `prepare_batch`; it used to be defined
    under the name `batch`, which its own first parameter shadowed.

    Args:
        batch: Iterator batch exposing `konten` (token-index tensor) and `batch_idx`
            (row indices into the side arrays).
            NOTE(review): nothing in the visible notebook attaches `batch_idx` to
            individual batches — confirm where it is set.
        kategori_data: One-hot category matrix (dense array or scipy sparse matrix),
            indexed by row.
        date_data: Scaled date-feature array, indexed by row.
        labels: Integer label array, indexed by row.

    Returns:
        ((sequences, kategori, date_features), labels) with the three feature
        tensors and the label tensor on `accelerator`.
    """
    idx = batch.batch_idx
    sequences = batch.konten

    kategori_rows = kategori_data[idx]
    if hasattr(kategori_rows, "toarray"):
        # OneHotEncoder yields sparse matrices by default; tensors need dense data.
        kategori_rows = kategori_rows.toarray()

    kategori = torch.as_tensor(kategori_rows, dtype=torch.float32).to(accelerator)
    date_features = torch.as_tensor(date_data[idx], dtype=torch.float32).to(accelerator)
    labels = torch.as_tensor(labels[idx], dtype=torch.long).to(accelerator)
    return (sequences, kategori, date_features), labels


# Backward-compatible alias for the original function name.
batch = prepare_batch

In [None]:
# Attach a 0..N-1 index array to each iterator object.
# NOTE(review): the batch helper reads `batch_idx` from each *batch*, while this sets
# it on the *iterators* — confirm that per-batch indices are made available somewhere.
train_iter.batch_idx = np.arange(len(train_iter.dataset))
val_iter.batch_idx = np.arange(len(val_iter.dataset))

In [None]:
# Pretrained vectors aligned with the vocabulary; the second dimension is the embedding size.
embedding_matrix = TEKS.vocab.vectors
embedding_dim = embedding_matrix.shape[1]

print("Shape:", embedding_matrix.shape)

# Pembuatan Model

## Model sklearn

In [None]:
# Classical baselines: XGBoost's random-forest classifier with default settings, and a
# scikit-learn random forest with balanced class weights and a fixed seed.
model_xgb = XGBRFClassifier()
model_rf = RandomForestClassifier(class_weight='balanced', random_state=42)

## Model Deep Learning menggunakan Pytorch

In [None]:
class lstm_model(nn.Module):
    """Bidirectional LSTM text classifier with attention pooling and side features.

    The text branch embeds token indices, runs a 3-layer bidirectional LSTM, pools
    the outputs with a learned attention weight per time step, and projects to 128
    features. The side branch projects the concatenated category and date features
    to 64 features. Both are concatenated and classified by a small MLP head.

    Args:
        embedding_matrix: Pretrained embeddings of shape (vocab_size, embedding_dim);
            fine-tuned during training (`freeze=False`).
        max_len: Accepted for interface compatibility; not used by the layers here.
        num_kategori: Width of the one-hot category input.
        num_date: Width of the date-feature input.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of output classes.
        dropout: Dropout rate between LSTM layers and in the classifier head.
    """

    def __init__(self, embedding_matrix, max_len, num_kategori, num_date, hidden_dim=256, num_classes=3, dropout=0.3):
        super(lstm_model, self).__init__()
        # Output widths of the two branches. The head's input size is their sum, so
        # they are named once here to keep the layers consistent (the side branch used
        # to emit 364 features while the head expected 128 + 64).
        lstm_feature_dim = 128
        additional_feature_dim = 64

        # `as_tensor` accepts both tensors and numpy arrays.
        embedding_weights = torch.as_tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False)
        self.lstm = nn.LSTM(
            input_size=embedding_weights.shape[1],
            hidden_size=hidden_dim,
            num_layers=3,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        # Bidirectional LSTM outputs have 2 * hidden_dim features per time step.
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc_lstm = nn.Linear(hidden_dim * 2, lstm_feature_dim)
        self.fc_additional = nn.Linear(num_kategori + num_date, additional_feature_dim)
        self.fc = nn.Sequential(
            nn.Linear(lstm_feature_dim + additional_feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )

    def forward(self, inputs):
        """Compute class logits.

        Args:
            inputs: Tuple `(sequences, kategori, date_features)` where `sequences`
                holds token indices of shape (batch, seq_len), `kategori` has shape
                (batch, num_kategori) and `date_features` has shape (batch, num_date).

        Returns:
            Logits of shape (batch, num_classes).
        """
        sequences, kategori, date_features = inputs

        embedded = self.embedding(sequences)
        lstm_out, _ = self.lstm(embedded)

        # Attention pooling: softmax over the time dimension, then a weighted sum.
        attn_weights = torch.softmax(self.attention(lstm_out), dim=1)
        context = torch.sum(attn_weights * lstm_out, dim=1)

        text_features = self.fc_lstm(context)

        additional = torch.cat((kategori, date_features), dim=1)
        additional = self.fc_additional(additional)

        combined = torch.cat((text_features, additional), dim=1)
        return self.fc(combined)

Balancing sentiment

In [None]:
# Class distribution of the sentiment labels, to judge how imbalanced the data is.
print(df_classification['sentimen'].value_counts())

In [None]:
# "Balanced" class weights computed from the encoded training labels of the sklearn
# split (`y_train`), plus a {class index: weight} lookup.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {index: weight for index, weight in enumerate(class_weights)}

In [None]:
# for key in class_weight_dict:
#     class_weight_dict[key] = min(class_weight_dict[key], 1.5)

In [None]:
# Show the weight assigned to each class index.
print(class_weight_dict)

# Training Model

## Model sklearn

In [None]:
# Train the XGBoost random-forest classifier, then predict on both splits.
model_xgb.fit(X_train, y_train)
y_train_predic_xgb = model_xgb.predict(X_train)
y_test_predic_xgb = model_xgb.predict(X_test)

In [None]:
# Accuracy on the training and test splits; a large gap between them indicates overfitting.
train_accuracy_xgb = accuracy_score(y_train, y_train_predic_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_test_predic_xgb)

print("XGBoost:")
print(f"Akurasi Training: {train_accuracy_xgb:.4f}")
print(f"Akurasi Testing: {test_accuracy_xgb:.4f}")

In [None]:
#Training Model Random Forest
model_rf.fit(X_train, y_train)
y_train_predic_rf = model_rf.predict(X_train)
y_test_predic_rf = model_rf.predict(X_test)

In [None]:
# Accuracy on the training and test splits for the random forest.
train_accuracy_rf = accuracy_score(y_train, y_train_predic_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_predic_rf)

print("Random Forest:")
print(f"Akurasi Training: {train_accuracy_rf:.4f}")
print(f"Akurasi Testing: {test_accuracy_rf:.4f}")

## Model Pytorch

In [None]:
# Maximum sequence length, plus the widths of the two side-feature inputs taken
# from the prepared training arrays.
max_len = 100
num_kategori = kategori_train.shape[1]
num_tanggal = date_features_train_scaled.shape[1]

# Instantiate the network with the pretrained embedding matrix.
deep_model = lstm_model(
    embedding_matrix=embedding_matrix,
    max_len=max_len,
    num_kategori=num_kategori,
    num_date=num_tanggal,
    hidden_dim=256
)

In [None]:
# Move the model parameters to the selected device.
deep_model = deep_model.to(accelerator)

In [None]:
# CrossEntropyLoss needs its per-class weights as a float tensor on the same device
# as the model outputs, so convert here instead of passing the raw weight array.
criterion = nn.CrossEntropyLoss(
    weight=torch.as_tensor(class_weights, dtype=torch.float32).to(accelerator)
)
# AdamW with a small weight decay for regularization.
optimizer = optim.AdamW(deep_model.parameters(), lr=0.001, weight_decay=1e-5)

In [None]:
# Keep the class weights as a float tensor on the training device. `as_tensor` accepts
# an array or an existing tensor, so re-running this cell is safe.
class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(accelerator)

In [None]:
def _run_epoch(model, loader, criterion, kategori, tanggal, label, optimizer=None):
    """Run one pass over `loader`; train when `optimizer` is given, otherwise evaluate.

    Returns:
        (mean loss per batch, accuracy in percent) for this pass.
    """
    is_training = optimizer is not None
    model.train(is_training)

    total_loss = 0.0
    correct = 0
    seen = 0

    # Gradients are tracked only while training.
    with torch.set_grad_enabled(is_training):
        for data_batch in loader:
            inputs, labels = prepare_batch(data_batch, kategori, tanggal, label)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

    # `max(..., 1)` avoids a ZeroDivisionError on an empty loader.
    return total_loss / max(len(loader), 1), 100 * correct / max(seen, 1)


def training(model, train_loader, val_loader, criterion, optimizer, num_epochs, patience=5):
    """Train `model` with early stopping on the validation loss.

    Side features and labels are read from the notebook-level arrays
    `kategori_train` / `date_features_train_scaled` / `y_train_nn` (training) and
    `kategori_val` / `date_features_val_scaled` / `y_val` (validation).

    Args:
        model: Network called as `model((sequences, kategori, date_features))`.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        criterion: Loss function taking (outputs, labels).
        optimizer: Optimizer over the model parameters.
        num_epochs: Maximum number of epochs.
        patience: Epochs without validation-loss improvement before stopping.

    Returns:
        (train_losses, train_accs, val_losses, val_accs), one entry per completed epoch.
    """
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        # Use the loaders passed in (not the notebook globals) and the labels encoded
        # for the deep-learning split.
        epoch_loss, epoch_acc = _run_epoch(
            model, train_loader, criterion,
            kategori_train, date_features_train_scaled, y_train_nn,
            optimizer=optimizer,
        )
        train_losses.append(epoch_loss)
        train_accs.append(epoch_acc)

        epoch_val_loss, epoch_val_acc = _run_epoch(
            model, val_loader, criterion,
            kategori_val, date_features_val_scaled, y_val,
        )
        val_losses.append(epoch_val_loss)
        val_accs.append(epoch_val_acc)

        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}%, '
              f'Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_acc:.2f}%')

        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    return train_losses, train_accs, val_losses, val_accs

In [None]:
# Train for up to 50 epochs (the loop may stop early) and keep the per-epoch
# loss/accuracy histories for plotting.
train_losses, train_accs, val_losses, val_accs = training(
    deep_model,
    train_iter,
    val_iter,
    criterion,
    optimizer,
    num_epochs=50
)

In [None]:
def evaluate_model(model, sequences, kategori, date_features, labels, device):
    """Return the accuracy (fraction in [0, 1]) of `model` on one full set of inputs.

    All inputs are sent through the model in a single forward pass.

    Args:
        model: Network called as `model((sequences, kategori, date_features))`.
        sequences: Padded token-index sequences, shape (n, seq_len).
        kategori: One-hot category features, dense array or scipy sparse matrix.
        date_features: Date features, shape (n, num_date).
        labels: Integer class labels, length n.
        device: Torch device for the tensors.

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    model.eval()

    if hasattr(kategori, "toarray"):
        # OneHotEncoder returns sparse matrices by default; tensors need dense data.
        kategori = kategori.toarray()

    with torch.no_grad():
        # `as_tensor` converts regardless of the source integer/float width.
        sequences = torch.as_tensor(sequences, dtype=torch.long).to(device)
        kategori = torch.as_tensor(kategori, dtype=torch.float32).to(device)
        date_features = torch.as_tensor(date_features, dtype=torch.float32).to(device)
        labels = torch.as_tensor(labels, dtype=torch.long).to(device)

        outputs = model((sequences, kategori, date_features))
        predicted = outputs.argmax(dim=1)

        accuracy = (predicted == labels).sum().item() / len(labels)

    return accuracy

# Map every tokenized example to vocabulary indices. The text field is named `TEKS`
# in this notebook (there is no `TEXT`).
train_sequences = [
    [TEKS.vocab.stoi[word] for word in example.konten]
    for example in train_data
]
val_sequences = [
    [TEKS.vocab.stoi[word] for word in example.konten]
    for example in val_data
]

In [None]:
# Pad with the vocabulary's own padding index rather than `pad_sequences`' default 0.
# In the torchtext vocab, index 0 is presumably `<unk>` (default special-token order),
# so padding with 0 would feed unknown-word tokens to the model.
pad_idx = TEKS.vocab.stoi[TEKS.pad_token]
train_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post', value=pad_idx)
val_sequences = pad_sequences(val_sequences, maxlen=max_len, padding='post', value=pad_idx)

# Accuracy on the validation split (the variable keeps its original `test_accuracy` name).
test_accuracy = evaluate_model(
    deep_model,
    val_sequences,
    kategori_val,
    date_features_val_scaled,
    y_val,
    accelerator
)

print(f'Validation Accuracy: {test_accuracy*100:.2f}%')

In [None]:
def plot_training_history(train_losses, train_accs, val_loss, val_acc):
    """Plot per-epoch loss (left panel) and accuracy (right panel) for train and validation.

    Args:
        train_losses: Training loss per epoch.
        train_accs: Training accuracy (%) per epoch.
        val_loss: Validation loss per epoch.
        val_acc: Validation accuracy (%) per epoch.
    """
    # (subplot position, train series, validation series, title, y-axis label)
    panels = (
        (1, train_losses, val_loss, 'Loss per Epoch', 'Loss'),
        (2, train_accs, val_acc, 'Accuracy per Epoch', 'Accuracy (%)'),
    )

    plt.figure(figsize=(12, 4))

    for position, train_curve, val_curve, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(train_curve, label='Train', color='black')
        plt.plot(val_curve, label='Validation', color='purple')
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid()

    plt.tight_layout()
    plt.show()

# Plot the histories returned by `training` (train_losses, train_accs, val_losses, val_accs).
plot_training_history(train_losses, train_accs, val_losses, val_accs)