In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SimpleRNN
from tensorflow.keras.utils import to_categorical

# Mengganti path dengan path file CSV Anda
file_path = 'data.csv'

# Memuat data dari CSV ke dalam DataFrame
data = pd.read_csv(file_path)

# Label atau target yang akan diprediksi
data['Label'] = data['Jenis Kelamin'].apply(lambda x: 0 if x == 'Perempuan' else 1)

# Memisahkan data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(data['Nama'], data['Label'], test_size=0.2, random_state=42)


In [2]:
# Tokenisasi teks
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train)

# Mengonversi teks menjadi urutan token
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding agar semua urutan memiliki panjang yang sama
maxlen = 20  # Misalnya, kita batasi panjang nama menjadi 20 karakter
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Konversi label menjadi format kategori (one-hot encoding)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)


In [3]:
# Inisialisasi model Sequential
model = Sequential()

# Layer embedding untuk mengubah token menjadi vektor embedding
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=maxlen))

# Layer LSTM dengan 64 unit
model.add(LSTM(64, return_sequences=True))  # Menggunakan LSTM dengan return_sequences=True untuk menghubungkannya dengan RNN

# Layer Simple RNN dengan 32 unit
model.add(SimpleRNN(32))

# Output layer dengan fungsi aktivasi softmax untuk klasifikasi
model.add(Dense(2, activation='softmax'))

# Compile model dengan optimizer Adam dan loss function categorical_crossentropy (karena kategori lebih dari 2)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Tampilkan ringkasan model
print(model.summary())

# Melatih model dengan data latih
model.fit(X_train_pad, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test_cat))



None
Epoch 1/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 16ms/step - accuracy: 0.5982 - loss: 0.6694 - val_accuracy: 0.7231 - val_loss: 0.5581
Epoch 2/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7233 - loss: 0.5621 - val_accuracy: 0.7727 - val_loss: 0.4920
Epoch 3/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7435 - loss: 0.5165 - val_accuracy: 0.7741 - val_loss: 0.4691
Epoch 4/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7588 - loss: 0.4796 - val_accuracy: 0.7893 - val_loss: 0.4584
Epoch 5/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7612 - loss: 0.4822 - val_accuracy: 0.7851 - val_loss: 0.4550
Epoch 6/10
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7747 - loss: 0.4680 - val_accuracy: 0.7782 - val_loss: 0.4558
Epoch 7/10
[1m91/91[0m [3

<keras.src.callbacks.history.History at 0x185150b1f40>

In [4]:
# Evaluasi model dengan data uji
loss, accuracy = model.evaluate(X_test_pad, y_test_cat)
print(f'Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7835 - loss: 0.4498
Loss: 0.4584, Accuracy: 0.7851


In [5]:
# Menyimpan model
model.save('model.h5')



In [6]:
import pickle

# Setelah melakukan tokenisasi, simpan tokenizer ke dalam file menggunakan pickle
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)