In [None]:
#Import library required for project
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
#Dataset source: https://github.com/dbrehmer/Knowself/blob/master/data/mypersonality/essays.csv
#The dataset used in this notebook is that source dataset translated into Bahasa Indonesia with Google Translate.

# Raw CSV hosted on GitHub; a comma is already read_csv's default separator.
DATASET_URL = "https://raw.githubusercontent.com/lazuardi100/Hexa-Engineer/ML/Dataset/dataset.csv"
df = pd.read_csv(DATASET_URL)

# Preview the first rows of the loaded frame.
df.head()

In [None]:
#Method to change label from y/n into 1/0
def changeLabel(labels):
  """Convert a trait-label Series from 'y'/'n' to 1/0, in place.

  Args:
    labels: pandas Series holding 'y'/'n' flags (or the 0/1 values left by
      an earlier run of this function).

  Returns:
    The same Series object, holding 0 where the value was 'n' (or already 0)
    and 1 everywhere else.
  """
  # Treat an existing 0 like 'n' so that re-running the cell does not flip
  # every already-converted negative label to 1.
  is_negative = (labels == 'n') | (labels == 0)
  # Write by position: the previous `labels[index] = ...` used the enumerate
  # counter as an index *label*, which is only right for a default RangeIndex.
  labels.iloc[:] = (~is_negative).astype(int).to_numpy()
  return labels

# Assign the converted labels back explicitly instead of relying on
# changeLabel() mutating a column view of df; with pandas Copy-on-Write
# enabled that mutation never reaches df.
for trait_column in ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']:
  df[trait_column] = changeLabel(df[trait_column])

# Show the converted label columns as a quick sanity check.
df[['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']].head()

In [None]:
#Text Preprocessing referenced from https://github.com/ksnugroho/basic-text-preprocessing/blob/master/text-preprocessing.ipynb
# Translation table that deletes every character in string.punctuation;
# built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Lazily-filled cache so the Sastrawi helpers are constructed only once
# rather than once per row of the dataset.
_SASTRAWI_TOOLS = {}

def _get_sastrawi_tools():
  """Return the shared (stopword remover, stemmer) pair, creating it on first use."""
  if not _SASTRAWI_TOOLS:
    _SASTRAWI_TOOLS['stopword'] = StopWordRemoverFactory().create_stop_word_remover()
    _SASTRAWI_TOOLS['stemmer'] = StemmerFactory().create_stemmer()
  return _SASTRAWI_TOOLS['stopword'], _SASTRAWI_TOOLS['stemmer']

def preprocess_text(text):
  """Normalize one text for the model.

  Steps, in order: lowercase, delete punctuation, strip leading/trailing
  whitespace, remove stopwords, stem.

  Args:
    text: the raw text as a str.

  Returns:
    The cleaned text as a str.
  """
  stopword, stemmer = _get_sastrawi_tools()
  #lowercase all character in the text
  text = text.lower()
  #remove punctuation
  text = text.translate(_PUNCTUATION_TABLE)
  #remove leading and trailing whitespace
  text = text.strip()
  #remove StopWord
  text = stopword.remove(text)
  #stemming
  text = stemmer.stem(text)
  return text

# Clean every text in the TTEXT column, element by element.
df['TTEXT'] = df['TTEXT'].apply(preprocess_text)

In [None]:
#Shuffle the dataset and split it for train, validation, and test

# Fixed seed so the shuffle (and therefore the train/val/test membership) is
# the same on every run.
SPLIT_SEED = 42

# Shuffle into a new frame; df itself is left untouched so that re-running
# this cell yields the same split.
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)

# First 95% of rows -> train, next 2.5% -> validation, remainder -> test.
# Positional slicing replaces np.split(), which goes through the deprecated
# DataFrame.swapaxes when handed a DataFrame.
train_end = int(.95 * len(shuffled_df))
val_end = int(.975 * len(shuffled_df))
train_dataset = shuffled_df.iloc[:train_end]
val_dataset = shuffled_df.iloc[train_end:val_end]
test_dataset = shuffled_df.iloc[val_end:]

In [None]:
#Separate the data and the label from each other

# Text column and the five trait-label columns, in the order
# ext, neu, agr, con, opn used by the variable names below.
_text_column = 'TTEXT'
_label_columns = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

# Training split
train_data = train_dataset[_text_column]
(train_label_ext, train_label_neu, train_label_agr,
 train_label_con, train_label_opn) = [train_dataset[col] for col in _label_columns]

# Validation split
val_data = val_dataset[_text_column]
(val_label_ext, val_label_neu, val_label_agr,
 val_label_con, val_label_opn) = [val_dataset[col] for col in _label_columns]

# Test split
test_data = test_dataset[_text_column]
(test_label_ext, test_label_neu, test_label_agr,
 test_label_con, test_label_opn) = [test_dataset[col] for col in _label_columns]

In [None]:
#Create Data Pipeline

def _make_dataset(texts, labels):
  """Pair the text values with their labels (cast to float32) in a tf.data.Dataset."""
  return tf.data.Dataset.from_tensor_slices(
      (texts.values, labels.values.astype(dtype=np.float32)))

# Training datasets, one per trait
train_dataset_ext = _make_dataset(train_data, train_label_ext)
train_dataset_neu = _make_dataset(train_data, train_label_neu)
train_dataset_agr = _make_dataset(train_data, train_label_agr)
train_dataset_con = _make_dataset(train_data, train_label_con)
train_dataset_opn = _make_dataset(train_data, train_label_opn)

# Validation datasets, one per trait
val_dataset_ext = _make_dataset(val_data, val_label_ext)
val_dataset_neu = _make_dataset(val_data, val_label_neu)
val_dataset_agr = _make_dataset(val_data, val_label_agr)
val_dataset_con = _make_dataset(val_data, val_label_con)
val_dataset_opn = _make_dataset(val_data, val_label_opn)

# Test datasets, one per trait
test_dataset_ext = _make_dataset(test_data, test_label_ext)
test_dataset_neu = _make_dataset(test_data, test_label_neu)
test_dataset_agr = _make_dataset(test_data, test_label_agr)
test_dataset_con = _make_dataset(test_data, test_label_con)
test_dataset_opn = _make_dataset(test_data, test_label_opn)

In [None]:
#Prepare Data for training, validation, and testing

batch_size = 128
# Number of training examples; a quarter of it is used as the shuffle buffer.
num_examples = tf.data.experimental.cardinality(train_dataset_ext).numpy()

def _train_batches(dataset):
  """Shuffle, batch and prefetch a training dataset."""
  return dataset.shuffle(num_examples // 4).batch(batch_size).prefetch(1)

def _val_batches(dataset):
  """Batch and prefetch a validation dataset (no shuffling)."""
  return dataset.batch(batch_size).prefetch(1)

def _test_batches(dataset):
  """Batch a test dataset (no shuffling, no prefetch)."""
  return dataset.batch(batch_size)

ext_train_batches = _train_batches(train_dataset_ext)
ext_val_batches = _val_batches(val_dataset_ext)
ext_test_batches = _test_batches(test_dataset_ext)

neu_train_batches = _train_batches(train_dataset_neu)
neu_val_batches = _val_batches(val_dataset_neu)
neu_test_batches = _test_batches(test_dataset_neu)

agr_train_batches = _train_batches(train_dataset_agr)
agr_val_batches = _val_batches(val_dataset_agr)
agr_test_batches = _test_batches(test_dataset_agr)

con_train_batches = _train_batches(train_dataset_con)
con_val_batches = _val_batches(val_dataset_con)
con_test_batches = _test_batches(test_dataset_con)

opn_train_batches = _train_batches(train_dataset_opn)
opn_val_batches = _val_batches(val_dataset_opn)
opn_test_batches = _test_batches(test_dataset_opn)

In [None]:
#Create one model per personality trait (5 models in total)

embedding = "https://tfhub.dev/google/nnlm-id-dim128/2"

def _new_embedding_layer():
  """Create a fresh, trainable TF-Hub text-embedding layer."""
  return hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

def _build_trait_model(embedding_layer):
  """Build one binary classifier on top of the given embedding layer.

  Args:
    embedding_layer: the hub.KerasLayer used as the model's first layer.

  Returns:
    An uncompiled tf.keras.Sequential model with a single sigmoid output.
  """
  return tf.keras.Sequential([
        embedding_layer,
        # Insert a length-1 axis at position 1 so the recurrent layers below
        # receive a 3-D input.
        tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, 1)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid')])

# Each model gets its OWN trainable embedding layer. Sharing one layer
# instance across all five models means that training any one of them rewrites
# the embedding weights the others were fitted against.
# Trade-off: five copies of the embedding are held in memory instead of one.
hub_layer = _new_embedding_layer()  # embedding used by model_ext only

model_ext = _build_trait_model(hub_layer)

model_neu = _build_trait_model(_new_embedding_layer())

model_agr = _build_trait_model(_new_embedding_layer())

model_opn = _build_trait_model(_new_embedding_layer())

model_con = _build_trait_model(_new_embedding_layer())

In [None]:
#Print the layer-by-layer summary of the EXT model.
#All five models are built from the same layer stack, so one summary is enough.
model_ext.summary()

In [None]:
# Compile and train the EXT Model
# Default Adam optimizer, binary cross-entropy loss, accuracy as the metric.
model_ext.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(),
)

# Fit for 6 epochs, with the EXT validation batches as validation data.
ext_history = model_ext.fit(
    ext_train_batches,
    epochs=6,
    validation_data=ext_val_batches,
)

In [None]:
# Compile and train the NEU Model
# Default Adam optimizer, binary cross-entropy loss, accuracy as the metric.
model_neu.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(),
)

# Fit for 6 epochs, with the NEU validation batches as validation data.
neu_history = model_neu.fit(
    neu_train_batches,
    epochs=6,
    validation_data=neu_val_batches,
)

In [None]:
# Compile and train the AGR Model
# Default Adam optimizer, binary cross-entropy loss, accuracy as the metric.
model_agr.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(),
)

# Fit for 6 epochs, with the AGR validation batches as validation data.
agr_history = model_agr.fit(
    agr_train_batches,
    epochs=6,
    validation_data=agr_val_batches,
)

In [None]:
# Compile and train the OPN Model
# Default Adam optimizer, binary cross-entropy loss, accuracy as the metric.
model_opn.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(),
)

# Fit for 6 epochs, with the OPN validation batches as validation data.
opn_history = model_opn.fit(
    opn_train_batches,
    epochs=6,
    validation_data=opn_val_batches,
)

In [None]:
# Compile and train the CON Model
# Default Adam optimizer, binary cross-entropy loss, accuracy as the metric.
model_con.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(),
)

# Fit for 6 epochs, with the CON validation batches as validation data.
con_history = model_con.fit(
    con_train_batches,
    epochs=6,
    validation_data=con_val_batches,
)

In [None]:
#Test the model
# Evaluate each trait model on its own test batches, in the order
# ext, neu, agr, con, opn.
(ext_test_result, neu_test_result, agr_test_result,
 con_test_result, opn_test_result) = [
    trait_model.evaluate(trait_batches)
    for trait_model, trait_batches in (
        (model_ext, ext_test_batches),
        (model_neu, neu_test_batches),
        (model_agr, agr_test_batches),
        (model_con, con_test_batches),
        (model_opn, opn_test_batches),
    )
]

In [None]:
#Code to visualize the accuracy and loss of training and validation
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart per epoch.

  Args:
    history: object whose `.history` dict holds the per-epoch values (for
      example the value returned by `model.fit`).
    string: metric key to plot, e.g. 'accuracy' or 'loss'; the key
      'val_' + string is plotted alongside it. (Inside this function the
      name shadows the `string` module.)
  """
  # Read from the `history` argument; the previous body always read the
  # global `ext_history`, so every call plotted the EXT model's curves.
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Draw the accuracy curve first, then the loss curve, for the EXT training run.
for metric_name in ('accuracy', 'loss'):
  plot_graphs(ext_history, metric_name)

In [None]:
#Save model into .h5 file so it can be uploaded into VM
# Write each trained model to its own .h5 file, in the order
# ext, neu, agr, con, opn.
for h5_path, trained_model in (
    ('ext_model.h5', model_ext),
    ('neu_model.h5', model_neu),
    ('agr_model.h5', model_agr),
    ('con_model.h5', model_con),
    ('opn_model.h5', model_opn),
):
  trained_model.save(h5_path)