In [5]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
# Membaca dataset
df = pd.read_csv('../preprocessing/clean_bahan.csv')

df['clean_bahan'] = df['clean_bahan'].astype(str)

# Tokenizer untuk bahan
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_bahan'])
sequences = tokenizer.texts_to_sequences(df['clean_bahan'])
X = pad_sequences(sequences, maxlen=100)  # padding sequences to same length

# Encode label judul menjadi angka
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['judul'])

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Parameter
vocab_size = len(tokenizer.word_index) + 1  # total kata unik
maxlen = 100  # panjang input
num_classes = len(label_encoder.classes_)  # jumlah kelas (judul)

# Membangun model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen))
model.add(LSTM(128, return_sequences=False))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',  # karena y bukan one-hot
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          2907904   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 55992)             7222968   
                                                                 
Total params: 10278968 (39.21 MB)
Trainable params: 10278968 (39.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Latih model
history = model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=500,
    validation_split=0.1
)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [11]:
# Evaluasi di test set
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

Test Loss: 32.9701, Test Accuracy: 0.0049
