In [36]:
import json
import re
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import wordnet, stopwords
import nltk


In [37]:
# Download NLTK data (run only once)
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\skytr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skytr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# **1. Data Preprocessing**
stop_words = set(stopwords.words('english'))

# Fungsi untuk memproses teks
def preprocess_text(text):
    if isinstance(text, str):  # Periksa apakah input adalah string
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Hilangkan karakter selain huruf
        words = text.split()
        words = [word for word in words if word not in stop_words]  # Hilangkan stop words
        return ' '.join(words)
    else:
        raise ValueError("Input ke preprocess_text harus berupa string")

# Fungsi untuk augmentasi teks
def augment_text(text):
    if isinstance(text, str):
        words = text.split()
        augmented = []
        for word in words:
            synonyms = wordnet.synsets(word)
            if synonyms:
                synonym = synonyms[0].lemmas()[0].name()
                augmented.append(synonym)
            else:
                augmented.append(word)
        return ' '.join(augmented)
    else:
        raise ValueError("Input ke augment_text harus berupa string")


In [31]:
# Load Dataset
with open('dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

patterns = []
tags = []
responses = {}
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        processed_pattern = preprocess_text(pattern)
        patterns.append(processed_pattern)
        tags.append(intent["tag"])
    responses[intent["tag"]] = intent["responses"]

In [33]:
# Data Augmentation
augmented_patterns = patterns.copy()
augmented_tags = tags.copy()
for pattern, tag in zip(patterns, tags):
    augmented_patterns.append(augment_text(pattern))
    augmented_tags.append(tag)

patterns = augmented_patterns
tags = augmented_tags

In [34]:
# Encode Tags
encoder = LabelEncoder()
encoded_tags = encoder.fit_transform(tags)

In [35]:
tags

['salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'salam_pertama',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'perpisahan',
 'tentang_pembuat',
 'tentang_pembuat',
 'tentang_pembuat',
 'tentang_pembuat',
 'tentang_pembuat',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'kesulitan_matematika',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'tertarik_sains',
 'ter

In [39]:
# Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(patterns)
sequences = tokenizer.texts_to_sequences(patterns)
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post")

In [41]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, 
    encoded_tags, 
    test_size=0.2, 
    random_state=42, 
    stratify=encoded_tags
)

In [43]:
# **2. Model Architecture**
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1, 
        output_dim=200,  # Adjusted embedding size
        input_length=max_len
    ),
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    BatchNormalization(),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dropout(0.4),
    Dense(128, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dropout(0.3),
    Dense(len(encoder.classes_), activation="softmax")
])



In [45]:
model.summary()

In [47]:
# **3. Compile Model**
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100,
    decay_rate=0.9,
    staircase=True
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy", 
    metrics=["accuracy"]
)

In [49]:
# **4. Class Weights**
class_weights = {}
unique_tags, counts = np.unique(y_train, return_counts=True)
max_count = max(counts)
for tag, count in zip(unique_tags, counts):
    class_weights[tag] = max_count / count

# **5. Training**
early_stopping = EarlyStopping(
    monitor="val_accuracy", 
    patience=10, 
    restore_best_weights=True
)
history = model.fit(
    X_train, y_train, 
    epochs=500, 
    validation_split=0.2, 
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stopping]
)

Epoch 1/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 305ms/step - accuracy: 0.0749 - loss: 7.1396 - val_accuracy: 0.0000e+00 - val_loss: 5.7244
Epoch 2/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0636 - loss: 6.7905 - val_accuracy: 0.0000e+00 - val_loss: 5.5595
Epoch 3/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.1266 - loss: 6.6644 - val_accuracy: 0.0000e+00 - val_loss: 5.4066
Epoch 4/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.2168 - loss: 6.2295 - val_accuracy: 0.0750 - val_loss: 5.2610
Epoch 5/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.2649 - loss: 5.9543 - val_accuracy: 0.0750 - val_loss: 5.1280
Epoch 6/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.2872 - loss: 5.6358 - val_accuracy: 0.1000 - val_loss: 5.0034
Epoch 7/500
[1m5/5[0m [

In [50]:
# **6. Evaluation**
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Classification Report
y_pred = np.argmax(model.predict(X_test), axis=1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=encoder.classes_))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8783 - loss: 2.8293 
Test Loss: 2.8341, Test Accuracy: 0.8800
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 857ms/step

Classification Report:
                      precision    recall  f1-score   support

      belajar_bahasa       1.00      1.00      1.00         6
kesulitan_matematika       1.00      1.00      1.00         6
            minat_ai       1.00      1.00      1.00         2
       minat_aljabar       1.00      1.00      1.00         3
     minat_astronomi       0.75      1.00      0.86         3
      minat_geometri       0.43      1.00      0.60         3
     minat_teknologi       0.86      1.00      0.92         6
          perpisahan       1.00      0.75      0.86         4
       salam_pertama       1.00      1.00      1.00         4
          study_tips       1.00      0.60      0.75         5
     tentang_pembuat       1.00      1.00      1.00         2
      terta

In [51]:
# **7. Enhanced Prediction Function**
def predict_intent(text):
    processed_text = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    
    prediction = model.predict(padded_sequence)
    intent_index = np.argmax(prediction)
    intent = encoder.classes_[intent_index]
    
    return intent, responses[intent]

In [52]:
# Save Model and Tokenizer
model.save("chatbot_model.h5")
with open("tokenizer.json", "w") as tok_file:
    json.dump(tokenizer.to_json(), tok_file)
with open("label_encoder.json", "w") as enc_file:
    json.dump(encoder.classes_.tolist(), enc_file)

