<a href="https://colab.research.google.com/github/jhuang12/Tensorflow-for-personality-items-classification/blob/master/191216TF_IPIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#THIS NOTEBOOK WAS CREATED TO USE TF NLP FOR IPIP ITEMS

In [0]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf

In [112]:
print(tf.__version__)

2.0.0


In [113]:
!pip install tensorflow==2.0.0



DATA LOADING

In [114]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# dpath = os.path.join(r"C:/Users/amead/Google Drive/active/machine learning/big5/big_five_items.csv")
dpath = os.path.join("/content/drive/My Drive/big_five_items.csv")

In [0]:
data = pd.read_csv(dpath,low_memory = False, sep = "\t", lineterminator = '\n', encoding = 'utf-8')

DATA PROCESSING


In [0]:
#stem as one string
sentences = pd.Series(data['Stem'])

In [0]:
#scale as categorical data
data['labels'] = data['Scale'].astype('category').cat.codes

In [0]:
#make the labels into np.array
labels = np.array(pd.Series(data['labels']))

DATA CLEANING (STEMMING)

TOKENIZATION AND PADDING

In [0]:
import random 
random.seed(700)

In [0]:
#stratified random sampling training and validation sample by dimensions

training_data = []
training_labels = []
validation_data = []
validation_labels = []

for s in range(5):
  training_set = data.loc[data['labels'] == s, 'Stem'].sample(frac = 0.7, replace = False, random_state = 1)
  training_l = np.array([s]*len(training_set))

  validation_set = data.loc[(~data['Stem'].isin(training_set)) & (data['labels'] == s), 'Stem']
  validation_l = np.array([s]*len(validation_set))

  training_data.extend(training_set)
  training_labels.extend(training_l)

  validation_data.extend(validation_set)
  validation_labels.extend(validation_l)

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LENGTH = 10000

def sequence_vectorization (train_texts, val_texts): 
  """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
  """
  tokenizer = Tokenizer(oov_token = "<OOV>", num_words = MAX_LENGTH)
  # Create vocabulary with training texts.
  tokenizer.fit_on_texts(train_texts)
  x_train = tokenizer.texts_to_sequences(train_texts)
  x_val = tokenizer.texts_to_sequences(val_texts)
  
  # Get max sequence length.
  max_length = len(max(x_train, key=len))
  
  if max_length > MAX_LENGTH:
    max_length = MAX_LENGTH
 
# Fix sequence length to max value. Sequences shorter than the length are
# padded in the beginning and sequences longer are truncated
# at the beginning.
  x_train = np.asarray(pad_sequences(x_train, maxlen=max_length))
  x_val = np.asarray(pad_sequences(x_val, maxlen=max_length))

  return x_train, x_val, tokenizer.word_index

In [0]:
train_vec, val_vec, word_index = sequence_vectorization(training_data, validation_data)

In [0]:
training_labels = np.array(training_labels)
validation_labels = np.array(validation_labels)

MODEL BUILING - MLP

In [0]:
#create a callback criteria
ACCURACY_STOP = 0.95

class myCallbacks(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs = {}):
    if (logs.get('accuracy')> ACCURACY_STOP):
      self.model.stop_training = True
      print("\nReach accuracy of 95% and stop training!")

callbacks = myCallbacks()

In [0]:
#create checkponts to 
from tensorflow.keras.callbacks import ModelCheckpoint

def checkpoint(model_name):
  checkpoint_directory = '/content/drive/My Drive/NLP_tensorflow/{}'.format(model_name)

  checkpoint = ModelCheckpoint(checkpoint_directory,
                             monitor = "val_accuracy", 
                             save_best_only = True, 
                             mode = 'max')
  return checkpoint

In [0]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.

    # Arguments
        num_classes: int, number of classes.

    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [0]:
#list parameters for different models
layers = 4
units = 12
#dropout_rate = 0.1 -- could play with
num_classes = 5

In [0]:
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout

In [0]:
def mlp_model(layers, units, num_classes):
    """Creates an instance of a multi-layer perceptron model.

    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        #dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.

    # Returns
        An MLP model instance.
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    #model.add(Dropout(rate = dropout_rate)
    
    for i in range(layers-1):
      model.add(Dense(units = units, activation = 'relu'))
      #model.add(Dropout(rate = dropout_rate))

    model.add(Dense(units = op_units, activation = op_activation))
    return model

In [0]:
mlp_model = mlp_model(layers, units, num_classes)

In [0]:
mlp_model.compile(loss = 'sparse_categorical_crossentropy', 
              optimizer = 'adam', 
              metrics = ['accuracy'])

In [133]:
NUM_EPOCHS = 50
model_name = 'mlp.h5'  

mlp_history = mlp_model.fit(train_vec, training_labels,
                    validation_data = (val_vec, validation_labels),
                    epochs = NUM_EPOCHS, 
                    callbacks = [checkpoint(model_name), callbacks])

Train on 459 samples, validate on 183 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


MODEL BUILDING CNN, RNN, AND LSTM

In [0]:
#general function inputs
layers = 4
units = 256
dropout_rate = 0.1
#input_shape = 1
num_classes = 5

In [0]:
#cnn model related inputs 
embedding_input = 1000
embedding_output = 16
conv1d_filter = 64
conv1d_kernel_size = 3

In [0]:
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import Conv1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout

In [0]:
def cnn_model(layers, units, num_classes):
  """ create an instance of CNN model. 

      # Arguments
          layers: int, number of `Dense` layers in the model.
          units: int, output dimension of the layers.
          #dropout_rate: float, percentage of input to drop at Dropout layers.
          num_classes: int, number of output classes.

      # Returns
          A CNN model instance.

  """
  op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
  model = models.Sequential()
  model.add(Embedding(embedding_input, embedding_output))
  model.add(Conv1D(conv1d_filter, conv1d_kernel_size, activation = 'relu'))
  model.add(GlobalAveragePooling1D())
    
  for i in range(layers-1):
    model.add(Dense(units = units, activation ='relu'))
    #model.add(Dropout(rate = dropout_rate))
    units = units//2

  model.add(Dense(units = op_units, activation = op_activation))

  return model

In [0]:
model = cnn_model(layers, units, num_classes)

In [0]:
model.compile(loss = 'sparse_categorical_crossentropy', 
              optimizer = 'adam', 
              metrics = ['accuracy'])

In [140]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 16)          16000     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 64)          3136      
_________________________________________________________________
global_average_pooling1d_3 ( (None, 64)                0         
_________________________________________________________________
dense_36 (Dense)             (None, 256)               16640     
_________________________________________________________________
dense_37 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_38 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_39 (Dense)             (None, 5)               

In [141]:
NUM_EPOCHS = 50
model_name = 'cnn_model.h5'

cnn_history = model.fit(train_vec, training_labels,
                    validation_data = (val_vec, validation_labels),
                    epochs = NUM_EPOCHS, 
                    callbacks = [checkpoint(model_name), callbacks])

Train on 459 samples, validate on 183 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Reach accuracy of 95% and stop training!


USE SINGLE LAYER LSTM

In [0]:
model1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(1000, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [143]:
model1.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 64)          64000     
_________________________________________________________________
bidirectional_7 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_40 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_41 (Dense)             (None, 5)                 325       
Total params: 138,629
Trainable params: 138,629
Non-trainable params: 0
_________________________________________________________________


In [0]:
#lstm related inputs
lstm_n = 64

In [0]:
from tensorflow.keras.layers import Bidirectional

In [0]:
def lstm_1l(layers, units, lstm_n, num_classes):
  """create an instance of a single layer LSTM. 

    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        lstm_n: int, number of neurons in the LSTM layer
        num_classes: int, number of output classes.

    # Returns
        A single layer LSTM model instance.
  """
  op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
  model = models.Sequential()
  model.add(Embedding(embedding_input, embedding_output))
  model.add(Bidirectional(tf.keras.layers.LSTM(lstm_n)))

  for i in range(layers-1):
    model.add(Dense(units = units, activation ='relu'))
    #model.add(Dropout(rate = dropout_rate))
    units = units//2

  model.add(Dense(units = op_units, activation = op_activation))

  return model

In [0]:
model = lstm_1l(layers, units, lstm_n, num_classes)

In [148]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 16)          16000     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 128)               41472     
_________________________________________________________________
dense_42 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_43 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_44 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_45 (Dense)             (None, 5)                 325       
Total params: 131,973
Trainable params: 131,973
Non-trainable params: 0
_______________________________________________

In [0]:
model.compile(loss = 'sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [150]:
model_name = 's_lstm.h5'
single_lstm_history = model.fit(train_vec, training_labels,
                                 validation_data=(val_vec, validation_labels),
                                 epochs=NUM_EPOCHS,
                                 callbacks = [checkpoint(model_name), callbacks])

Train on 459 samples, validate on 183 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Reach accuracy of 95% and stop training!


USE MULTIPLE LAYER LSTM

In [0]:
#multiple layers lstm related inputs
lstm_layers = 3
layers = 3
units = 32
lstm_n = 32

In [188]:
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(1000, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])


model2.compile(loss = 'sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']
)


model2.fit(train_vec, training_labels,
           validation_data=(val_vec, validation_labels),
           epochs=30, 
           callbacks = [ checkpoint(model_name), callbacks])

Train on 459 samples, validate on 183 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Reach accuracy of 95% and stop training!


<tensorflow.python.keras.callbacks.History at 0x7ff79e579d68>

In [0]:
def lstm_mul(layers, units, lstm_layers, lstm_n, num_classes):
  """create an instance of a multi-layer LSTM. 

    # Arguments
        lstm_layers: int, number of Bidirectional layers in the LSTM
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        lstm_n: int, the number of nuerons in LSTM layers
        num_classes: int, number of output classes.

    # Returns
        A multi-layer LSTM model instance.
  """
  op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
  model = models.Sequential()
  model.add(Embedding(embedding_input, embedding_output))

  for i in range(lstm_layers -1):
    if i < lstm_layers-1 :
      model.add(Bidirectional(tf.keras.layers.LSTM(lstm_n, return_sequences = True)))
      lstm_n = lstm_n//2
    else:
      model.add(Bidirectional(tf.keras.layers.LSTM(lstm_n)))
    
  for i in range(layers -1):
    model.add(Dense(units, activation = 'relu'))
    units = units//2
  
  model.add(Dense(units = op_units, activation = op_activation))

  return model

In [0]:
m_lstm_model = lstm_mul(layers, units, lstm_layers, lstm_n, num_classes)

In [192]:
m_lstm_model.summary()

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, None, 16)          16000     
_________________________________________________________________
bidirectional_29 (Bidirectio (None, None, 64)          12544     
_________________________________________________________________
bidirectional_30 (Bidirectio (None, None, 32)          10368     
_________________________________________________________________
dense_78 (Dense)             (None, None, 32)          1056      
_________________________________________________________________
dense_79 (Dense)             (None, None, 16)          528       
_________________________________________________________________
dense_80 (Dense)             (None, None, 5)           85        
Total params: 40,581
Trainable params: 40,581
Non-trainable params: 0
_________________________________________________

In [0]:
m_lstm_model.compile(loss = 'sparse_categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']
)

In [194]:
model_name = 'm_lstm.h5'

multiple_lstm_history = m_lstm_model.fit(train_vec, training_labels,
                                  validation_data=(val_vec, validation_labels),
                                  epochs=NUM_EPOCHS,
                                  callbacks = [checkpoint(model_name), callbacks])

Train on 459 samples, validate on 183 samples
Epoch 1/50


TypeError: ignored

In [195]:
m_lstm_model.fit(train_vec, training_labels,
                                  validation_data=(val_vec, validation_labels),
                                  epochs=NUM_EPOCHS)

Train on 459 samples, validate on 183 samples
Epoch 1/50
 32/459 [=>............................] - ETA: 0s

InvalidArgumentError: ignored

USE GRU

In [0]:
#gru related inputs 
gru_n = 32

In [0]:
def gru_model(layers, units, gru_n, num_classes):
  """create an instance of GRU. 

    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        gru_n: int, number of neurons in the GRU layer
        num_classes: int, number of output classes.

    # Returns
        A single layer LSTM model instance.
  """
  op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
  model = models.Sequential()
  model.add(Embedding(embedding_input, embedding_output, input_length = 24))
  model.add(Bidirectional(tf.keras.layers.GRU(gru_n)))

  for i in range(layers-1):
    model.add(Dense(units, activation ='relu'))
    units = units//2
  
  model.add(Dense(units = op_units, activation = op_activation))

  return model

In [0]:
gru_model = gru_model(layers, units, gru_n, num_classes)

In [0]:
gru_model.summary()

In [0]:
gru_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [0]:
model_name = 'gru.h5'

gru_model_history = gru_model.fit(train_vec, training_labels,
                              validation_data=(val_vec, validation_labels),
                              epochs=NUM_EPOCHS, 
                              callbacks = [checkpoint(model_name), callbacks])

MODEL PERFORMANCE

In [0]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

In [0]:
plot_graphs(history, 'accuracy')

In [0]:
plot_graphs(history, 'loss')

In [0]:
#clear the trained models
tf.keras.backend.clear_session()