In [1]:
# !pip install PySastrawi

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import Sastrawi

In [2]:
#DATASET SOURCE FROM https://github.com/dbrehmer/Knowself/blob/master/data/mypersonality/essays.csv"
#DATASET USED IN THIS NOTEBOOK IS DATASET FROM THE SOURCE THAT HAS BEEN TRANSLATED TO BAHASA INDONESIA USING GOOGLE TRANSLATE

DATASET_URL = "https://raw.githubusercontent.com/lazuardi100/Hexa-Engineer/ML/Dataset/dataset.csv"
df = pd.read_csv(DATASET_URL, sep =',')

df.head()

Unnamed: 0,#AUTHID,TTEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Nah, sekarang saya baru saja bangun dari tidur...",n,y,y,n,y
1,1997_605191.txt,"Nah, di sini kita pergi dengan arus kesadaran ...",n,n,y,n,n
2,1997_687252.txt,Keyboard terbuka dan tombol untuk mendorong. H...,n,y,n,y,y
3,1997_568848.txt,Aku tidak percaya itu! Ini benar-benar terjadi...,y,n,y,y,n
4,1997_688160.txt,"Nah, di sini aku pergi dengan aliran tua yang ...",y,n,y,n,y


In [3]:
def changeLabel(labels):
  for index, values in enumerate(labels.values):
    if values == 'n':
      labels[index] = 0
    else:
      labels[index] = 1
  return labels

changeLabel(df['cEXT'])
changeLabel(df['cNEU'])
changeLabel(df['cAGR'])
changeLabel(df['cCON'])
changeLabel(df['cOPN'])

0       1
1       0
2       1
3       0
4       1
       ..
2462    0
2463    1
2464    0
2465    1
2466    1
Name: cOPN, Length: 2467, dtype: object

In [4]:
#Text Preprocessing referenced from https://github.com/ksnugroho/basic-text-preprocessing/blob/master/text-preprocessing.ipynb
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def preprocess_text(text):
  #lowercase all character in the text
  text = text.lower()
  #remove stripes
  words = text.split(' ')
  text = ''
  for word in words:
    if '-' in word:
      word = word.split('-')[0]
    text = text + word + ' '
  #remove punctuation
  text = text.translate(str.maketrans("","",string.punctuation))
  #remove leading and trailing whitespace
  text = text.strip()
  #remove StopWord
  stopword = StopWordRemoverFactory().create_stop_word_remover()
  text = stopword.remove(text)
  #stemming
  stemmer = StemmerFactory().create_stemmer()
  text = stemmer.stem(text)
  return text

df['TTEXT'] = df['TTEXT'].map(preprocess_text)

In [6]:
df = df.sample(frac=1)
train_dataset, val_dataset, test_dataset = np.split(df, [int(.95 * len(df)), int(.975 * len(df))])

In [7]:
train_data = train_dataset['TTEXT']
# train_label_ext  = train_dataset['cEXT']
# train_label_neu  = train_dataset['cNEU']
train_label_agr  = train_dataset['cAGR']
train_label_con  = train_dataset['cCON']
train_label_opn  = train_dataset['cOPN']

val_data = val_dataset['TTEXT']
# val_label_ext  = val_dataset['cEXT']
# val_label_neu  = val_dataset['cNEU']
val_label_agr  = val_dataset['cAGR']
val_label_con  = val_dataset['cCON']
val_label_opn  = val_dataset['cOPN']

test_data = test_dataset['TTEXT']
# test_label_ext  = test_dataset['cEXT']
# test_label_neu  = test_dataset['cNEU']
test_label_agr  = test_dataset['cAGR']
test_label_con  = test_dataset['cCON']
test_label_opn  = test_dataset['cOPN']

In [8]:
#Create Data Pipeline

# train_dataset_ext = tf.data.Dataset.from_tensor_slices((train_data.values, train_label_ext.values.astype(dtype=np.float32)))
# train_dataset_neu = tf.data.Dataset.from_tensor_slices((train_data.values, train_label_neu.values.astype(dtype=np.float32)))
train_dataset_agr = tf.data.Dataset.from_tensor_slices((train_data.values, train_label_agr.values.astype(dtype=np.float32)))
train_dataset_con = tf.data.Dataset.from_tensor_slices((train_data.values, train_label_con.values.astype(dtype=np.float32)))
train_dataset_opn = tf.data.Dataset.from_tensor_slices((train_data.values, train_label_opn.values.astype(dtype=np.float32)))
# 
# val_dataset_ext = tf.data.Dataset.from_tensor_slices((val_data.values, val_label_ext.values.astype(dtype=np.float32)))
# val_dataset_neu = tf.data.Dataset.from_tensor_slices((val_data.values, val_label_neu.values.astype(dtype=np.float32)))
val_dataset_agr = tf.data.Dataset.from_tensor_slices((val_data.values, val_label_agr.values.astype(dtype=np.float32)))
val_dataset_con = tf.data.Dataset.from_tensor_slices((val_data.values, val_label_con.values.astype(dtype=np.float32)))
val_dataset_opn = tf.data.Dataset.from_tensor_slices((val_data.values, val_label_opn.values.astype(dtype=np.float32)))
# 
# test_dataset_ext = tf.data.Dataset.from_tensor_slices((test_data.values, test_label_ext.values.astype(dtype=np.float32)))
# test_dataset_neu = tf.data.Dataset.from_tensor_slices((test_data.values, test_label_neu.values.astype(dtype=np.float32)))
test_dataset_agr = tf.data.Dataset.from_tensor_slices((test_data.values, test_label_agr.values.astype(dtype=np.float32)))
test_dataset_con = tf.data.Dataset.from_tensor_slices((test_data.values, test_label_con.values.astype(dtype=np.float32)))
test_dataset_opn = tf.data.Dataset.from_tensor_slices((test_data.values, test_label_opn.values.astype(dtype=np.float32)))

In [10]:
batch_size = 128
num_examples = tf.data.experimental.cardinality(train_dataset_agr).numpy()

# ext_train_batches = train_dataset_ext.shuffle(num_examples // 4).batch(batch_size).prefetch(1)
# ext_val_batches = val_dataset_ext.batch(batch_size).prefetch(1)
# ext_test_batches = test_dataset_ext.batch(batch_size)

# neu_train_batches = train_dataset_neu.shuffle(num_examples // 4).batch(batch_size).prefetch(1)
# neu_val_batches = val_dataset_neu.batch(batch_size).prefetch(1)
# neu_test_batches = test_dataset_neu.batch(batch_size)

agr_train_batches = train_dataset_agr.shuffle(num_examples // 4).batch(batch_size).prefetch(1)
agr_val_batches = val_dataset_agr.batch(batch_size).prefetch(1)
agr_test_batches = test_dataset_agr.batch(batch_size)

con_train_batches = train_dataset_con.shuffle(num_examples // 4).batch(batch_size).prefetch(1)
con_val_batches = val_dataset_con.batch(batch_size).prefetch(1)
con_test_batches = test_dataset_con.batch(batch_size)

opn_train_batches = train_dataset_opn.shuffle(num_examples // 4).batch(batch_size).prefetch(1)
opn_val_batches = val_dataset_opn.batch(batch_size).prefetch(1)
opn_test_batches = test_dataset_opn.batch(batch_size)

In [11]:
#Some model are commented to save the memory
embedding = "https://tfhub.dev/google/nnlm-id-dim128/2"

hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

# model_ext = tf.keras.Sequential([
#           hub_layer,
#           tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, 1)),
#           tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16,kernel_regularizer=tf.keras.regularizers.L2(1e-5),recurrent_regularizer=tf.keras.regularizers.L2(1e-6))),
#           tf.keras.layers.Dense(8, activation='relu'),
#           tf.keras.layers.Dropout(0.4),
#           tf.keras.layers.Dense(1, activation='sigmoid')])

# model_neu = tf.keras.Sequential([
#           hub_layer,
#           tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, 1)),
#           tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16,kernel_regularizer=tf.keras.regularizers.L2(1e-5),recurrent_regularizer=tf.keras.regularizers.L2(1e-6))),
#           tf.keras.layers.Dense(8, activation='relu'),
#           tf.keras.layers.Dropout(0.4),
#           tf.keras.layers.Dense(1, activation='sigmoid')])

model_agr = tf.keras.Sequential([
          hub_layer,
          tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, 1)),
          tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16,kernel_regularizer=tf.keras.regularizers.L2(1e-5),recurrent_regularizer=tf.keras.regularizers.L2(1e-6))),
          tf.keras.layers.Dense(8, activation='relu'),
          tf.keras.layers.Dropout(0.4),
          tf.keras.layers.Dense(1, activation='sigmoid')])

model_con = tf.keras.Sequential([
          hub_layer,
          tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, 1)),
          tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16,kernel_regularizer=tf.keras.regularizers.L2(1e-5),recurrent_regularizer=tf.keras.regularizers.L2(1e-6))),
          tf.keras.layers.Dense(8, activation='relu'),
          tf.keras.layers.Dropout(0.4),
          tf.keras.layers.Dense(1, activation='sigmoid')])

model_opn = tf.keras.Sequential([
          hub_layer,
          tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, 1)),
          tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16,kernel_regularizer=tf.keras.regularizers.L2(1e-5),recurrent_regularizer=tf.keras.regularizers.L2(1e-6))),
          tf.keras.layers.Dense(8, activation='relu'),
          tf.keras.layers.Dropout(0.4),
          tf.keras.layers.Dense(1, activation='sigmoid')])

In [12]:
#Early Stopping
class highAccCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('accuracy')>0.90):
      print("\nReached 90% accuracy so cancelling training to prevent overfitting the model!")
      self.model.stop_training = True

In [14]:
# model_ext.compile(optimizer=tf.keras.optimizers.Adam(),
#               loss='binary_crossentropy',
#               metrics=['accuracy'])
# history_ext = model_ext.fit(ext_train_batches,
#                         validation_data=ext_val_batches,
#                         epochs=10,
#                         callbacks = [highAccCallback()]
#                         )

# model_neu.compile(optimizer=tf.keras.optimizers.Adam(),
#               loss='binary_crossentropy',
#               metrics=['accuracy'])
# history_neu = model_neu.fit(neu_train_batches,
#                         validation_data=neu_val_batches,
#                         epochs=10,
#                         callbacks = [highAccCallback()]
#                         )

# model_agr.compile(optimizer=tf.keras.optimizers.Adam(),
#               loss='binary_crossentropy',
#               metrics=['accuracy'])
# history_agr = model_agr.fit(agr_train_batches,
#                         validation_data=agr_val_batches,
#                         epochs=10,
#                         callbacks = [highAccCallback()]
#                         )

# model_con.compile(optimizer=tf.keras.optimizers.Adam(),
#               loss='binary_crossentropy',
#               metrics=['accuracy'])
# history_con = model_con.fit(con_train_batches,
#                         validation_data=con_val_batches,
#                         epochs=10,
#                         callbacks = [highAccCallback()]
#                         )

model_opn.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=['accuracy'])
history_opn = model_opn.fit(opn_train_batches,
                        validation_data=opn_val_batches,
                        epochs=10,
                        callbacks = [highAccCallback()]
                        )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Reached 90% accuracy so cancelling training to prevent overfitting the model!


In [15]:
# result_ext = model_ext.evaluate(ext_test_batches)
# result_neu = model_neu.evaluate(neu_test_batches)
result_agr = model_agr.evaluate(agr_test_batches)
result_con = model_con.evaluate(con_test_batches)
result_opn = model_opn.evaluate(opn_test_batches)



In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

In [16]:
# model_ext.save('ext_model.h5')
# model_neu.save('neu_model.h5')
model_agr.save('agr_model.h5')
model_con.save('con_model.h5')
model_opn.save('opn_model.h5')

In [None]:
!gsutil cp agr_model_v2.h5 gs://b21-cap0116
!gsutil cp con_model_v2.h5 gs://b21-cap0116
!gsutil cp ext_model_v2.h5 gs://b21-cap0116
!gsutil cp neu_model_v2.h5 gs://b21-cap0116
!gsutil cp opn_model_v2.h5 gs://b21-cap0116

Copying file://agr_model_v2.h5 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][  1.3 GiB/  1.3 GiB]                                                
Operation completed over 1 objects/1.3 GiB.                                      
Copying file://con_model_v2.h5 [Content-Type=application/octet-stream]...
==>