In [8]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import pandas as pd
from nltk.corpus import stopwords
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from keras.optimizers import Adam
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
stop_words = stopwords.words('indonesian')

In [9]:
df_train = pd.read_csv('datasets/new_dataset.csv')
MAX_SEQUENCE_LENGTH = 250
selected_column = df_train.columns[1:]
df_test = pd.read_csv('datasets/data_test.csv')
total_train_data = len(df_train)
df_train = df_train.append(df_test)
df_train

Unnamed: 0,questions,domain,dsc,email,gagal connect,laptop,login,moana,network,password,reset password,sccm,ticket,visa,vpn,wifi
0,bagaimana cara connect wifi dan visa,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
1,connect wifi,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,wifi bermasalah,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,saya ingin konek ke visa tapi tidak bisa,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,bagaimana cara connect wifi di visa?,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,bantu saya untuk membuat ticket komplain visa ...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
77,bagaimana cara connect izone visa dan vpn,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
78,saya ingin connect ke byod visa dan vpn,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
79,cara buat komplain visa management,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [10]:
def preprocessingText(text: str):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = text.lower()
    text = stemmer.stem(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text_tokenize = word_tokenize(text)
    text = [t for t in text_tokenize if t not in stop_words]
    text = " ".join(text)
    return text

In [11]:
df_train['questions'] = df_train['questions'].apply(lambda x: preprocessingText(x))
df_train['questions']

0                  connect wifi visa
1                       connect wifi
2                               wifi
3                         konek visa
4                  connect wifi visa
                   ...              
76        bantu ticket komplain visa
77            connect izone visa vpn
78             connect byod visa vpn
79          komplain visa management
80    pandu komplain visa management
Name: questions, Length: 453, dtype: object

In [12]:
num_words = 500
max_words = 100000

In [13]:
X = df_train['questions'].values
tokenizer = tf.keras.preprocessing.text.Tokenizer(max_words)
tokenizer.fit_on_texts(X)
X_train = tokenizer.texts_to_sequences(X)
X_train = tf.keras.utils.pad_sequences(X_train, MAX_SEQUENCE_LENGTH)

In [14]:
X_train_data = X_train[0:total_train_data]
X_test_data = X_train[total_train_data:]

In [15]:
columns = df_train.columns
y_train = np.array(df_train[columns[1:]])
y_train_data = y_train[0:total_train_data]
y_test_data = y_train[total_train_data:]

In [16]:
embeddings_index = {}
with open('embedding.txt', encoding='utf-8') as file:
    for line in file:
        values = line.rstrip().rsplit(', ')
        word = values[0]
        embed = np.asarray(values[1].split(' '), dtype='float32')
        embeddings_index[word] = embed

embed_size = 50
word_index = tokenizer.word_index
embed_num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size), dtype='float32')

In [17]:
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
MAX_LEN = 250
input = tf.keras.layers.Input(shape=(MAX_LEN,))
x = tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False)(input)
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(256, return_sequences=True, dropout=0.1,
                                                      recurrent_dropout=0.1))(x)
x = tf.keras.layers.Conv1D(128, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)

x = tf.keras.layers.concatenate([avg_pool, max_pool])

preds = tf.keras.layers.Dense(y_train.shape[1], activation="sigmoid")(x)

model = tf.keras.Model(input, preds)


optimizer = Adam(learning_rate=1e-3)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5, monitor='val_loss'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
    cp_callback
]
# model.fit(X_train, y_train, verbose=1, epochs=10, validation_split=0.2, callbacks=callbacks)
model.fit(X_train_data, y_train_data, verbose=1, epochs=10, validation_data=(X_test_data, y_test_data), callbacks=callbacks)

In [None]:
model.evaluate(X_train, y_train)



[0.04600673168897629, 0.7086092829704285]

In [None]:
'{:.3f}'.format(3.6592403e-13)
#  9.9997795e-01 -> 0.99997795

'0.000'

In [None]:
model.save('tf_model')



INFO:tensorflow:Assets written to: tf_model\assets


INFO:tensorflow:Assets written to: tf_model\assets


In [None]:
model = tf.keras.models.load_model('tf_model')

In [None]:
faq = 'buat ticket '
faq = preprocessingText(faq)
print(faq)
tokenize_faq = word_tokenize(faq)
tokenize_faq = [val for val in tokenize_faq if val not in stop_words]
faq = " ".join(tokenize_faq)
new_faq = [faq]
seq = tokenizer.texts_to_sequences(new_faq)
padded = tf.keras.utils.pad_sequences(seq, MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
threshold = 0.5
found = False
for i in range(len(selected_column)):
    if pred[0][i] > threshold:
        found = True
        pred1 = '{:.3f}'.format(pred[0][i])
        print(selected_column[i], pred1)

if not found:
    print("Tags tidak ditemukan")

ticket
ticket 0.990
