In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import Sastrawi

In [15]:
!pip install PySastrawi

Collecting PySastrawi
[?25l  Downloading https://files.pythonhosted.org/packages/61/84/b0a5454a040f81e81e6a95a5d5635f20ad43cc0c288f8b4966b339084962/PySastrawi-1.2.0-py2.py3-none-any.whl (210kB)
[K     |█▋                              | 10kB 13.1MB/s eta 0:00:01[K     |███▏                            | 20kB 17.8MB/s eta 0:00:01[K     |████▊                           | 30kB 12.6MB/s eta 0:00:01[K     |██████▎                         | 40kB 9.7MB/s eta 0:00:01[K     |███████▉                        | 51kB 10.5MB/s eta 0:00:01[K     |█████████▍                      | 61kB 7.1MB/s eta 0:00:01[K     |███████████                     | 71kB 7.8MB/s eta 0:00:01[K     |████████████▌                   | 81kB 8.4MB/s eta 0:00:01[K     |██████████████                  | 92kB 8.9MB/s eta 0:00:01[K     |███████████████▋                | 102kB 9.3MB/s eta 0:00:01[K     |█████████████████▏              | 112kB 9.3MB/s eta 0:00:01[K     |██████████████████▊             | 122kB 9

In [2]:
# Original corpus: https://github.com/dbrehmer/Knowself/blob/master/data/mypersonality/essays.csv
# The file loaded below is that corpus after translation to Bahasa Indonesia with Google Translate.

DATASET_URL = "https://raw.githubusercontent.com/lazuardi100/Hexa-Engineer/ML/Dataset/dataset.csv"
# A comma is read_csv's default delimiter, so it does not need to be spelled out.
df = pd.read_csv(DATASET_URL)

df.head()

Unnamed: 0,#AUTHID,TTEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Nah, sekarang saya baru saja bangun dari tidur...",n,y,y,n,y
1,1997_605191.txt,"Nah, di sini kita pergi dengan arus kesadaran ...",n,n,y,n,n
2,1997_687252.txt,Keyboard terbuka dan tombol untuk mendorong. H...,n,y,n,y,y
3,1997_568848.txt,Aku tidak percaya itu! Ini benar-benar terjadi...,y,n,y,y,n
4,1997_688160.txt,"Nah, di sini aku pergi dengan aliran tua yang ...",y,n,y,n,y


In [3]:
def changeLabel(labels):
  """Encode a 'y'/'n' label Series as 1/0, in place.

  Every entry equal to 'n' becomes 0; every other entry becomes 1.

  Args:
    labels: pandas Series of labels. It is modified in place.

  Returns:
    The same Series object that was passed in, now holding 0/1 values.
  """
  # Snapshot the values first so the writes below cannot affect the iteration.
  original_values = list(labels.values)
  for position, value in enumerate(original_values):
    # enumerate() yields positions, so write positionally with .iloc.
    # A label-based write (labels[position] = ...) hits the wrong rows, or
    # appends new ones, whenever the index is not the default 0..n-1 range.
    labels.iloc[position] = 0 if value == 'n' else 1
  return labels

# Encode every trait column and assign the result back explicitly.
# Relying on changeLabel() mutating `df[...]` through a temporary Series is
# chained assignment: it silently stops updating `df` when pandas
# Copy-on-Write is active, so work on a copy and store the returned Series.
for trait_column in ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']:
  df[trait_column] = changeLabel(df[trait_column].copy())

# Show the last encoded column, as the original cell did.
df['cOPN']

0       1
1       0
2       1
3       0
4       1
       ..
2462    0
2463    1
2464    0
2465    1
2466    1
Name: cOPN, Length: 2467, dtype: object

In [23]:
#Text Preprocessing referenced from https://github.com/ksnugroho/basic-text-preprocessing/blob/master/text-preprocessing.ipynb
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def preprocess_text(texts, stopword_remover=None, stemmer=None):
  """Clean a collection of texts and return the cleaned versions.

  Each text is lowercased, stripped of ASCII punctuation, trimmed of leading
  and trailing whitespace, passed through the stopword remover and finally
  stemmed.

  Args:
    texts: iterable of strings (e.g. a pandas Series or a list).
    stopword_remover: optional object with a `remove(text)` method. Defaults
      to the remover built by Sastrawi's StopWordRemoverFactory.
    stemmer: optional object with a `stem(text)` method. Defaults to the
      stemmer built by Sastrawi's StemmerFactory.

  Returns:
    A pandas Series with the same index and name when `texts` is a Series,
    otherwise a list, holding the cleaned texts in the original order.
  """
  # Build the tools once; re-creating the factories for every text is wasted work.
  if stopword_remover is None:
    stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
  if stemmer is None:
    stemmer = StemmerFactory().create_stemmer()
  punctuation_table = str.maketrans("", "", string.punctuation)

  def clean(text):
    # lowercase all characters in the text
    text = text.lower()
    # remove punctuation
    text = text.translate(punctuation_table)
    # remove leading and trailing whitespace
    text = text.strip()
    # remove stopwords
    text = stopword_remover.remove(text)
    # stemming
    return stemmer.stem(text)

  # Collect the results: rebinding the loop variable (as the previous version
  # did) never changes `texts`, so the input used to be returned untouched.
  cleaned = [clean(text) for text in texts]

  if isinstance(texts, pd.Series):
    # Keep index and name so the result can be assigned straight back to a column.
    return pd.Series(cleaned, index=texts.index, name=texts.name)
  return cleaned

df['TTEXT'] = preprocess_text(df['TTEXT'])

In [37]:
# Shuffle with a fixed seed so the train/validation/test split is reproducible
# after a kernel restart.
df = df.sample(frac=1, random_state=42)

# 95% train / 2.5% validation / 2.5% test, cut by position in the shuffled frame.
# Plain .iloc slices replace np.split(df, ...), which goes through
# DataFrame.swapaxes (deprecated in recent pandas releases).
train_end = int(.95 * len(df))
val_end = int(.975 * len(df))
train_dataset = df.iloc[:train_end]
val_dataset = df.iloc[train_end:val_end]
test_dataset = df.iloc[val_end:]

In [38]:
# For each split: the text column plus one label Series per trait column
# (cEXT, cNEU, cAGR, cCON, cOPN), unpacked in that order.
train_data = train_dataset['TTEXT']
(train_label_ext, train_label_neu, train_label_agr,
 train_label_con, train_label_opn) = (
    train_dataset[trait] for trait in ('cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'))

val_data = val_dataset['TTEXT']
(val_label_ext, val_label_neu, val_label_agr,
 val_label_con, val_label_opn) = (
    val_dataset[trait] for trait in ('cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'))

test_data = test_dataset['TTEXT']
(test_label_ext, test_label_neu, test_label_agr,
 test_label_con, test_label_opn) = (
    test_dataset[trait] for trait in ('cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'))

In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters and model idea taken from the NLP course by Deeplearning.ai.
vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data)

# Turn each split into integer sequences ...
training_sequences = tokenizer.texts_to_sequences(train_data)
val_sequences = tokenizer.texts_to_sequences(val_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# ... and pad/truncate all of them with the same settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = pad_sequences(training_sequences, **pad_options)
val_padded = pad_sequences(val_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [40]:
def _text_label_dataset(texts, labels):
  """Pair the raw text values with float32 labels in a tf.data.Dataset."""
  return tf.data.Dataset.from_tensor_slices(
      (texts.values, labels.values.astype(dtype=np.float32)))

# One dataset per (split, trait) combination.
train_dataset_ext = _text_label_dataset(train_data, train_label_ext)
train_dataset_neu = _text_label_dataset(train_data, train_label_neu)
train_dataset_agr = _text_label_dataset(train_data, train_label_agr)
train_dataset_con = _text_label_dataset(train_data, train_label_con)
train_dataset_opn = _text_label_dataset(train_data, train_label_opn)

val_dataset_ext = _text_label_dataset(val_data, val_label_ext)
val_dataset_neu = _text_label_dataset(val_data, val_label_neu)
val_dataset_agr = _text_label_dataset(val_data, val_label_agr)
val_dataset_con = _text_label_dataset(val_data, val_label_con)
val_dataset_opn = _text_label_dataset(val_data, val_label_opn)

test_dataset_ext = _text_label_dataset(test_data, test_label_ext)
test_dataset_neu = _text_label_dataset(test_data, test_label_neu)
test_dataset_agr = _text_label_dataset(test_data, test_label_agr)
test_dataset_con = _text_label_dataset(test_data, test_label_con)
test_dataset_opn = _text_label_dataset(test_data, test_label_opn)

In [41]:
# TF-Hub text-embedding module; the layer takes raw strings (scalar tf.string
# inputs) and is fine-tuned together with the classifier head (trainable=True).
embedding = "https://tfhub.dev/google/nnlm-id-dim128/2"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Binary classifier: hub embedding -> Dense(16, relu) -> Dropout(0.5) -> sigmoid unit.
model_ext = tf.keras.Sequential()
model_ext.add(hub_layer)
model_ext.add(tf.keras.layers.Dense(16, activation='relu'))
model_ext.add(tf.keras.layers.Dropout(0.5))
model_ext.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Alternative architecture (disabled): trainable Embedding + Conv1D on the
# padded token sequences instead of the hub embedding.
# model_ext = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.Conv1D(128, 5, activation='relu'),
#     tf.keras.layers.GlobalMaxPooling1D(),
#     tf.keras.layers.Dense(24, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

In [42]:
batch_size = 128

# Size of the training dataset; a quarter of it is used as the shuffle buffer.
num_examples = tf.data.experimental.cardinality(train_dataset_ext).numpy()
ext_batches = (
    train_dataset_ext
    .shuffle(num_examples // 4)
    .batch(batch_size)
    .prefetch(1)
)

# Validation data is batched and prefetched but not shuffled.
val_batches = (
    val_dataset_ext
    .batch(batch_size)
    .prefetch(1)
)

In [43]:
# Adam with a small learning rate; binary cross-entropy matches the single sigmoid output.
model_ext.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 15 epochs, evaluating on the validation batches after each one.
history = model_ext.fit(
    ext_batches,
    validation_data=val_batches,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
