In [17]:
import re

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [1]:
# Colab-only step: open the file-upload widget; the value returned by
# files.upload() is displayed as the cell output.
# NOTE(review): presumably this is how dataset_new_4aspect.csv reaches the
# runtime before it is read with pd.read_csv — confirm when running elsewhere.
from google.colab import files
files.upload()

Saving dataset_new_4aspect.csv to dataset_new_4aspect.csv


{'dataset_new_4aspect.csv': b"content,score,aspek,sentimen\nsangat bagus untuk membantu bapak ibu guru dalam melihat datanya dengan cepat,5,Lainnya,Positif\nsangat membantu app ini karena kami membutuhkan informasi trimakasih simpatika,5,Lainnya,Positif\naplikasi paling top keren banget lengkap yang membuat aplikasi pintar banget sudah diberi kecerdasan ilmu dan mau berbagi terima kasih semoga sukses selalu,5,Lainnya,Positif\nalhamdulillah sangat membantu,5,Lainnya,Positif\nsaya berikan bintang karena semua ikhtiar demi kebaikan walaupun masih perlu pengembangan saya berikan apresiasi yang setinggi-tingginya terima kasih app ini sangat bermanfaat bagi kami para guru di seluruh nusantara,5,Lainnya,Positif\nboleh saya kasih full bintang tapi ada syaratnya hehe tolong permudah saya ketika saya lupa password karena sudah berbagai cara apapun saya ikuti instruksi untuk reset password namun belum berhasil juga hingga sekarang,5,Autentikasi,Negatif\naplikasi simpel dan mudah dipahami mudah-mu

In [60]:
# Load the labelled review dataset from CSV into a DataFrame and display it.
df = pd.read_csv("dataset_new_4aspect.csv")
df

Unnamed: 0,content,score,aspek,sentimen
0,sangat bagus untuk membantu bapak ibu guru dal...,5,Lainnya,Positif
1,sangat membantu app ini karena kami membutuhka...,5,Lainnya,Positif
2,aplikasi paling top keren banget lengkap yang ...,5,Lainnya,Positif
3,alhamdulillah sangat membantu,5,Lainnya,Positif
4,saya berikan bintang karena semua ikhtiar demi...,5,Lainnya,Positif
...,...,...,...,...
1636,bagus,1,Lainnya,Netral
1637,ok,1,Lainnya,Netral
1638,sukses,1,Lainnya,Netral
1639,sip,1,Lainnya,Netral


In [5]:
# Text-cleaning helper
def clean_text(text):
    """Normalise a raw review string for tokenisation.

    The text is lowercased, then every character that is neither a word
    character nor whitespace is removed, then all digit runs are removed.
    Whitespace is left untouched, so removed tokens may leave extra spaces.

    Args:
        text: The raw review. Missing values (``None`` or a float NaN, which
            is what pandas yields for empty CSV cells) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string (``''`` for missing input).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation (underscore is kept: it matches \w)
    text = re.sub(r'\d+', '', text)  # strip digits
    return text

In [7]:
# Clean every review in place by running clean_text over the 'content' column.
df['content'] = df['content'].map(clean_text)

In [61]:
# Partition the rows by sentiment label.
df_pos = df.loc[df['sentimen'].eq('Positif')]
df_neg = df.loc[df['sentimen'].eq('Negatif')]
df_neu = df.loc[df['sentimen'].eq('Netral')]

# Undersample the Positif class down to 700 rows (fixed seed for repeatability).
df_pos_under = df_pos.sample(n=700, random_state=42)

# Recombine the three groups; this is the frame later fed to SMOTE.
df_resampled = pd.concat((df_pos_under, df_neg, df_neu), axis=0)

In [62]:
# Separate features (review text) from labels (sentiment) as arrays.
X, y = (df_resampled[column].values for column in ('content', 'sentimen'))

# Encode the string labels as integers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [63]:
# Fit a word-level tokenizer on the review texts; words outside the 10000-word
# budget are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(X)

# Turn each text into a sequence of integer token ids.
X_sequences = tokenizer.texts_to_sequences(X)

# Bring every sequence to length 100, padding at the end.
X_padded = pad_sequences(X_sequences, maxlen=100, padding='post')

In [64]:
# --- 4. Apply SMOTE to the numeric data ---
# Class name -> integer code, in the order held by the fitted LabelEncoder.
label_mapping = {name: code for code, name in enumerate(label_encoder.classes_)}
print("Mapping Label:", label_mapping)

# Oversample only the minority classes, each up to 700 samples.
# NOTE(review): SMOTE interpolates between rows of X_padded, which hold token
# ids — confirm that the synthetic sequences are meaningful for this model.
# NOTE(review): resampling happens before train_test_split, so synthetic
# samples can end up in the test split — confirm this is intended.
minority_targets = {
    label_mapping['Negatif']: 700,
    label_mapping['Netral']: 700,
}
smote = SMOTE(sampling_strategy=minority_targets, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_padded, y_encoded)

# Convert the numeric labels back to their string names.
y_resampled = label_encoder.inverse_transform(y_resampled)

Mapping Label: {'Negatif': 0, 'Netral': 1, 'Positif': 2}


In [65]:
# Re-encode the string labels produced after SMOTE back to integers.
# NOTE(review): this overwrites y_resampled, so re-running the cell feeds
# integers to transform() — presumably that fails; run it once per kernel.
y_resampled = label_encoder.transform(y_resampled)

# Hold out 20% of the balanced data as the test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    random_state=42,
    test_size=0.2,
)


In [58]:
# --- 5. Train / test split ---
# Same arguments and seed as the split performed right after re-encoding.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    random_state=42,
    test_size=0.2,
)

# Report the per-class sample counts after balancing.
class_counts = np.unique(y_resampled, return_counts=True)
print("Jumlah data setelah balancing:", class_counts)

Jumlah data setelah balancing: (array([0, 1, 2]), array([700, 700, 700]))


In [83]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam

# CNN + stacked bidirectional-LSTM classifier with a 3-way softmax output.
# The input length is declared through an explicit Input layer and set to 100
# so that it agrees with the maxlen=100 used when padding the sequences
# (the previous Embedding(input_length=150) contradicted that padding, and the
# `input_length` argument is deprecated in Keras 3).
model = Sequential([
    Input(shape=(100,)),
    # input_dim=15000 exceeds the tokenizer's num_words=10000, so every token id is in range.
    Embedding(input_dim=15000, output_dim=128),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

# Compile with integer-label cross-entropy and a small Adam learning rate.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [84]:
# --- 7. Train the model ---
# NOTE(review): the test split doubles as validation data here — confirm that
# no model selection is based on these validation metrics.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_test, y_test),
    verbose=1,
)

Epoch 1/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 326ms/step - accuracy: 0.3618 - loss: 1.0969 - val_accuracy: 0.4833 - val_loss: 1.0874
Epoch 2/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 272ms/step - accuracy: 0.4413 - loss: 1.0824 - val_accuracy: 0.4952 - val_loss: 1.0344
Epoch 3/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 279ms/step - accuracy: 0.4909 - loss: 1.0162 - val_accuracy: 0.4738 - val_loss: 0.9785
Epoch 4/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 270ms/step - accuracy: 0.4927 - loss: 0.9653 - val_accuracy: 0.5667 - val_loss: 0.9026
Epoch 5/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 279ms/step - accuracy: 0.5341 - loss: 0.8837 - val_accuracy: 0.5548 - val_loss: 0.8665
Epoch 6/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 279ms/step - accuracy: 0.5629 - loss: 0.8211 - val_accuracy: 0.5690 - val_loss: 0.8392
Epoch 7/100
[1m

In [93]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Bidirectional, GRU, Dropout, Dense
from tensorflow.keras.optimizers import Adam  # imported here so the cell does not rely on another cell

# GRU counterpart of the CNN + recurrent classifier: same layout, with GRU
# units in place of LSTM units and a 3-way softmax output.
# The input length is declared through an explicit Input layer and set to 100
# so that it agrees with the maxlen=100 used when padding the sequences
# (the previous Embedding(input_length=150) contradicted that padding, and the
# `input_length` argument is deprecated in Keras 3).
model_gru = Sequential([
    Input(shape=(100,)),
    # input_dim=15000 exceeds the tokenizer's num_words=10000, so every token id is in range.
    Embedding(input_dim=15000, output_dim=128),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(GRU(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(GRU(64)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

# Compile with integer-label cross-entropy and a small Adam learning rate.
model_gru.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [94]:
# --- 7. Train the GRU model ---
# NOTE: this rebinds `history`, replacing the object returned by the earlier
# model.fit call.
history = model_gru.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_test, y_test),
    verbose=1,
)

Epoch 1/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 266ms/step - accuracy: 0.3605 - loss: 1.0961 - val_accuracy: 0.5167 - val_loss: 1.0790
Epoch 2/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 242ms/step - accuracy: 0.4972 - loss: 1.0704 - val_accuracy: 0.5119 - val_loss: 1.0202
Epoch 3/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 244ms/step - accuracy: 0.4989 - loss: 0.9939 - val_accuracy: 0.5381 - val_loss: 0.9289
Epoch 4/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 243ms/step - accuracy: 0.5219 - loss: 0.9057 - val_accuracy: 0.4810 - val_loss: 0.8956
Epoch 5/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 242ms/step - accuracy: 0.5521 - loss: 0.8557 - val_accuracy: 0.5643 - val_loss: 0.8501
Epoch 6/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 241ms/step - accuracy: 0.5704 - loss: 0.8273 - val_accuracy: 0.6143 - val_loss: 0.8179
Epoch 7/100
[1m

In [95]:
# --- 8. Prediction ---
def predict_sentiment(text, classifier=None):
    """Predict the sentiment label of a single review text.

    The text is tokenised with the notebook's fitted ``tokenizer``, padded to
    length 100 (post-padding, matching the training data), scored by the
    classifier, and the highest-probability class is decoded back to its
    string label with ``label_encoder``.

    Args:
        text: The review text to classify.
        classifier: Optional model exposing ``predict``; defaults to the
            notebook-level ``model`` so existing calls keep working. Pass
            ``model_gru`` to score with the GRU variant instead.

    Returns:
        The predicted sentiment label as decoded by ``label_encoder``.
    """
    if classifier is None:
        classifier = model

    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=100, padding='post')
    prediction = classifier.predict(padded)

    # Take the argmax along the class axis of the single-row batch rather
    # than over the flattened array.
    label_idx = int(np.argmax(prediction, axis=-1)[0])
    return label_encoder.inverse_transform([label_idx])[0]

# Contoh Prediksi


In [96]:
# Example: classify one short negative-sounding review.
sample_text = "Aplikasinya jelek"
predicted_label = predict_sentiment(sample_text)
print(f"Prediksi Sentimen: {predicted_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
Prediksi Sentimen: Netral


In [97]:
from sklearn.metrics import accuracy_score

# Score the held-out split with the model.
# NOTE(review): X_test was also passed as validation_data during training and
# comes from the SMOTE-resampled data — confirm this is the intended test set.
y_pred = model.predict(X_test)

# Turn the per-class probabilities into class indices.
y_pred_classes = np.argmax(y_pred, axis=1)

# Compute and report the accuracy.
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Akurasi Model: {accuracy:.4f}')

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step
Akurasi Model: 0.7000


In [15]:
# Example prediction on a single word.
text_test = "Jelek"
predicted_label = predict_sentiment(text_test)
print(f"Prediksi Sentimen: {predicted_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Prediksi Sentimen: Positif
