# Baseline LSTM MBTI Classification Model

First, load libraries and useful functions from class:

In [3]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
from sklearn.model_selection import train_test_split

# NLTK for NLP utils and corpora
# import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# Helper libraries
from w266_common import utils, vocabulary

In [4]:
def MakeFancyRNNCell(hidden_dims, keep_prob):
    """Make a fancy RNN cell.

    Use tf.nn.rnn_cell functions to construct an LSTM cell.
    Initialize forget_bias=0.0 for better training.

    Args:
      H: hidden state sizes, provided in array
      keep_prob: dropout keep prob (same for input and output)

    Returns:
      (tf.nn.rnn_cell.RNNCell) multi-layer LSTM cell with dropout
    """
    cells = []
    for H in hidden_dims:
      cell = tf.nn.rnn_cell.BasicLSTMCell(H, forget_bias=0.0)
      cell = tf.nn.rnn_cell.DropoutWrapper(
          cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
      cells.append(cell)
    return tf.nn.rnn_cell.MultiRNNCell(cells)

def matmul3d(X, W):
    """Wrapper for tf.matmul to handle a 3D input tensor X.
    Will perform multiplication along the last dimension.

    Args:
      X: [m,n,k]
      W: [k,l]

    Returns:
      XW: [m,n,l]
    """
    Xr = tf.reshape(X, [-1, tf.shape(X)[2]])
    XWr = tf.matmul(Xr, W)
    newshape = [tf.shape(X)[0], tf.shape(X)[1], tf.shape(W)[1]]
    return tf.reshape(XWr, newshape)

## Specifications for Baseline LSTM for MBTI

In this baseline, the task is to predict the MBTI type (16 types) given a text string. We will model after the A3 assignment, with Architecture and Parameters defined below.

### Pre-Processing:
* Minimial pre-processing, only separating punctuation from text and lower-case all text
* Assigning words to numerical indices based on a fixed Vocab size, defined by word fre-quency in training set

### Architecture:
* Encoder: 2 layer LSTM
* Decoder: Softmax
* Classification: 16 MBTI types

### Parameters
* Batch Size: 500
* Text length: dynamic
* Vocabulary size (V): |V| - full corpus
* Embedding Size: [128,200,256,523]
* Hidden Dimensions: [256,300]

### Training:
* Epochs = 25
* 80% train, 20% test
* Loss: Cross Entropy with Adam initialization
* Optimizers: Adam Optimizer


![Generic RNN Architecture for MBTI]()

## Load Corpus & Pre-Process

In [5]:
#load data
df = pd.read_csv('./mbti_1.csv')
df.head(5)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [144]:
# function to clean and tokenize sentence ["Hello world."] into list of words ["hello world"]
def clean_tokenize(sentence):
    ignore_words = []
    words = re.sub("[^\w]", " ",  sentence).split() #nltk.word_tokenize(sentence)
#     words = sentence.split() #nltk.word_tokenize(sentence)
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
    words_cleaned = sentence.lower()
    return words_cleaned

In [146]:
# split posts per users into separate sentences
post = []
utype = []
user = []

for index, row in df.iterrows():
    posts = row['posts'].split('|||')
    posts_clean = []
    for sentence in posts:
        posts_clean.append(clean_tokenize(sentence))
    post.extend(posts_clean)
#     post.extend(posts)
    utype.extend([row['type'] for i in range(len(posts))])
    user.extend([index for i in range(len(posts))])
    
short_posts = pd.DataFrame({"user": user,"type": utype,"post": post})
print(short_posts.shape)
short_posts.head(5)

(422845, 3)


Unnamed: 0,post,type,user
0,'http://www.youtube.com/watch?v=qsxhcwe3krw,INFJ,0
1,http://41.media.tumblr.com/tumblr_lfouy03pma1q...,INFJ,0
2,enfp and intj moments https://www.youtube.com...,INFJ,0
3,what has been the most life-changing experienc...,INFJ,0
4,http://www.youtube.com/watch?v=vxzeywwrdw8 h...,INFJ,0


In [147]:
# Split data: 80% train, 20% test
post_train, post_test, label_train, label_test = train_test_split(np.array(short_posts['post']), 
                                                    np.array(short_posts['type']), 
                                                    test_size=0.2, 
                                                    random_state=88)

print("MBIT posts", post_train[:5])
print('')
print("MBTI Labels: ",label_train[:5])

MBIT posts ['https://www.youtube.com/watch?v=bxvkaah2d7m'
 'isfjs and infps can balance each other really well, i think, if they learn to communicate - sjs choosing words (and tone of voice!) carefully, and nps learning to take things less personally. your sj...'
 "i'd seek recognition. not fame."
 "to be honest, maybe you are giving out vibes that you are not self-assured. because if you think about it, a bully is going to go for someone they don't think will fight back, someone who is weak (i..."
 "probably. any thinking, really. personally, i prefer the ax-b-c-dy function stack compared to grant's. it allows for the parts where grant gets it right while still being consistent with jung, i.e.,..."]

MBTI Labels:  ['INTP' 'INFP' 'INTP' 'ENFP' 'INTJ']


In [148]:
# Build a vocabulary (V size is defaulted to full text)
# vocab_mbti = vocabulary.Vocabulary((utils.canonicalize_word(w) for w in utils.flatten(short_posts['post'])))
vocab_mbti = vocabulary.Vocabulary((utils.canonicalize_word(w) for w in post_train))
vocab_mbti.size

333007

In [125]:
print (vocab_mbti.words_to_ids(['a','what','and','the']))
print (vocab_mbti.ids_to_words([202, 147565, 317206, 159348])) 

[202, 147565, 317206, 159348]
['a', 'what', 'and', 'the']


In [149]:
# canonicalize train and test sets
x_train = []
for post in post_train:
    x_train.append(vocab_mbti.words_to_ids(post.split()))

In [151]:
print(post_train[88])
print(x_train[88])

i don't agree with brutal..
[308, 2, 361, 187384, 2]


In [153]:
#create integer classifiers as 1 hot
type_mbti16 = np.array(short_posts["type"])
keys = list(set(type_mbti16))
values = list(range(1,len(keys)+1))
label_map = dict(zip(keys, values))
print(label_map)

y_train = np.array([label_map[label] for label in label_train])
y_test = np.array([label_map[label] for label in label_test])
print(y_train[:5])

{'ISFJ': 1, 'ESFP': 2, 'INFJ': 3, 'ENTJ': 4, 'ENFJ': 5, 'ISFP': 6, 'ESTJ': 7, 'INTP': 8, 'ESTP': 9, 'ESFJ': 10, 'INFP': 11, 'ISTP': 12, 'ENTP': 13, 'INTJ': 14, 'ENFP': 15, 'ISTJ': 16}
[ 8 11  8 15 14]


## Bulid the LSTM Model

In [154]:
# Model Parameters
V = vocab_mbti.size
classes = 16 #len(set(labels))
batch_size = 500 # this will be used for creating the batched sets (changed from 500 to mini size 10)
embed_dim = 256
hidden_dims = [256,300]
dropout_keep_prob = .5

# Training Parameters
softmax_ns = 5 # probably don't need this for 16 classes
learning_rate = .001
num_epochs = 5

In [183]:
tf.reset_default_graph()
tf.set_random_seed(88)

# Create input placeholder
with tf.name_scope("Inputs"):
    x_text_ = tf.placeholder(tf.int32, [None, None], name="x_text") #batch x text_length
    y_type_ = tf.placeholder(tf.int32, [None,classes], name="y_type") #batch x 1
    # Get dynamic shape info from inputs
    batch_size_ = tf.shape(x_text_)[0]
    text_length_ = tf.shape(x_text_)[1]
    ns_ = tf.tile([text_length_], [batch_size_, ], name="ns")
    
# Construct embedding layer
with tf.name_scope("Embedding_Layer"):
    W_in_ = tf.Variable(tf.random_uniform([V, embed_dim], -1.0, 1.0), name="W_in")
    x_ = tf.nn.embedding_lookup(W_in_, x_text_)

# Construct RNN/LSTM cell and recurrent layer.
with tf.name_scope("Recurrent_Layer"):
    cell_ = MakeFancyRNNCell(hidden_dims, dropout_keep_prob)
    initial_h_ = cell_.zero_state(batch_size_, dtype=tf.float32)
    output_, final_h_= tf.nn.dynamic_rnn(cell_, x_, sequence_length= ns_,dtype=tf.float32)

with tf.name_scope("Output_Layer"):
    W_out_ = tf.Variable(tf.random_uniform([hidden_dims[-1],classes],-1.0, 1.0), name="W_out")
    b_out_ = tf.Variable(tf.zeros([classes,], dtype=tf.float32), name="b_out")
    logits_ = tf.add(matmul3d(output_, W_out_), b_out_, name="logits")

with tf.name_scope("Prediction"):
    pred_proba_ = tf.nn.softmax(logits_, name="pred_proba")
    pred_max_ = tf.argmax(logits_, 1, name="pred_max")
    pred_samples_ = tf.reshape(tf.multinomial(tf.reshape(logits_ , [-1, classes]), 
                                                          1, 
                                                          output_dtype=tf.int32, 
                                                          name="pred_samples"),
                                           [batch_size_,1])
with tf.name_scope("Cost_Function"):
    # Sampled Softmax loss for training. Do we need this for only 16 classes?
#     train_inputs_ = tf.reshape(output_, [batch_size*text_length,-1])
#     per_example_train_loss_ = tf.nn.sampled_softmax_loss(weights=tf.transpose(W_out_),
#                                                          biases=b_out_,
#                                                          labels=tf.reshape(target_y_, [-1, 1]), 
#                                                          inputs=tf.reshape(output_, [batch_size*text_length,-1]),
#                                                          num_sampled=softmax_ns, 
#                                                          num_classes=classes,
#                                                          name="per_example_sampled_softmax_loss")

#     train_loss_ = tf.reduce_mean(per_example_train_loss_, name="sampled_softmax_loss")
    # Full softmax loss for scoriing
#     loss_ = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_type_,
#                                                                            logits=logits_,
#                                                                            name="loss"))
    per_example_loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_type_, 
                                                                       logits=logits_, 
                                                                       name="per_example_loss")
    loss_ = tf.reduce_mean(per_example_loss_, name="loss")


In [184]:
with tf.name_scope("Train"):
    learning_rate_ = tf.placeholder(tf.float32, name="learning_rate")
    optimizer_ = tf.train.AdamOptimizer(learning_rate)
#     gradients, variables = zip(*optimizer_.compute_gradients(train_loss_))
    gradients, variables = zip(*optimizer_.compute_gradients(loss_))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    train_step_ = optimizer_.apply_gradients(zip(gradients, variables))

# Initializer step
init_ = tf.global_variables_initializer()

In [185]:
# tensorboard --logdir="tf_graph" --port 6006
summary_writer = tf.summary.FileWriter("tf_graph", 
                                       tf.get_default_graph())

## Train Model

In [171]:
# create batched arrays of size batch_size based on input x and input y
def pad_np_array(example_ids, max_len=35, pad_id=0):
    """Pad a list of lists of ids into a rectangular NumPy array.
    Longer sequences will be truncated to max_len ids, while shorter ones will
    be padded with pad_id.
    Args:
        example_ids: list(list(int)), sequence of ids for each example
        max_len: maximum sequence length
        pad_id: id to pad shorter sequences with
    Returns: (x, ns)
        x: [num_examples, max_len] NumPy array of integer ids
        ns: [num_examples] NumPy array of sequence lengths (<= max_len)
    """
    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def batch_generator(x, y, batch_size):
    for i in range(0, len(x), batch_size):
        # padd the batch
        x_batch = x[i:i+batch_size]
        max_len_batch = max([len(x) for x in x_batch])
        x_padded, _ = pad_np_array(x_batch, max_len=max_len_batch, pad_id=0)
        yield (x_padded, y[i:i+batch_size]) # returns tuple of batched x, batched y

In [179]:
def train_batch(session, x, y, learning_rate):
    feed_dict = {x_text_:x, #np array of texts
                 y_type_:y, #np array of types
                 learning_rate_:learning_rate}
    print(feed_dict)
    c, _ = session.run([loss_, train_step_],
                       feed_dict=feed_dict)
    return c

In [180]:
print_interval = 1000

np.random.seed(88)

session = tf.Session()
session.run(init_)

# for testing
batch_size = 2

t0 = time.time()
for epoch in range(1,num_epochs+1):
    t0_epoch = time.time()
    epoch_cost = 0.0
    total_batches = 0
    print ("")
    for i, (x,y) in enumerate(batch_generator(x_train, y_train, batch_size)):
        if (i % print_interval == 0):
            print("[epoch %d] seen %d batches" % (epoch, i))
        
        epoch_cost += train_batch(session, x, y, learning_rate)
        break
        total_batches = i + 1

    avg_cost = epoch_cost / total_batches
    print("[epoch %d] Completed %d batches in %s" % (epoch, i, utils.pretty_timedelta(since=t0_epoch)))
    print("[epoch %d] Average cost: %.03f" % (epoch, avg_cost,))


[epoch 1] seen 0 batches
{<tf.Tensor 'Inputs/x_text:0' shape=(?, ?) dtype=int32>: array([[  1492,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0],
       [     2, 317206,      2,      2, 258702,      2,      2,      2,
             2,    308,      2,      2,      2,      2,      2,      2,
            42,      2,      2,    758,      2,      2,      2,      2,
             2, 317206,      2,      2,      2,      2,      2,      2,
             2,      2,      2]], dtype=int32), <tf.Tensor 'Inputs/y_type:0' shape=(?, ?) dtype=int32>: array([ 8, 11]), <tf.Tensor 'Train_1/learning_rate:0' shape=<unknown> dtype=float32>: 0.001}


ValueError: Cannot feed value of shape (2,) for Tensor 'Inputs/y_type:0', which has shape '(?, ?)'

In [None]:
def score_batch(session, x, y):
    feed_dict = {x_text_:x,
                 y_type_:y}
    return session.run(loss_, feed_dict=feed_dict)

def score_dataset(x, y):
    total_cost = 0.0
    total_batches = 0
    for (x,y) in batch_generator(x, y, 1000):
        total_cost += score_batch(session, batch)
        total_batches += 1

    return total_cost / total_batches

In [None]:
print "Train set perplexity: %.03f" % np.exp(score_dataset(x_train,y_train))
print "Test set perplexity: %.03f" % np.exp(score_dataset(x_test,y_test))