In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import nltk
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from collections import Counter
from importlib import reload
import os, sys, re, json, time, datetime, shutil
import patched_numpy_io


#nltk.download('stopwords')
from nltk.corpus import stopwords
from w266_common import utils, vocabulary


In [18]:
df = pd.read_csv('./mbti_1.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [19]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean_tokenize(sentence):
    ignore_words = ['a']
    words = re.sub("[^\w]", " ",  sentence).split() #nltk.word_tokenize(sentence)
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
#     stop_words = set(stopwords.words('english'))
#     words_cleaned = [w for w in words_cleaned if not w in stop_words]
    return words_cleaned

In [20]:
# split posts per users into separate sentences
post = []
utype = []
user = []

for index, row in df.iterrows():
    posts = row['posts'].split('|||')
    posts_clean = []
    for sentence in posts:
        posts_clean.append(clean_tokenize(sentence))
    post.extend(posts_clean)
    utype.extend([row['type'] for i in range(len(posts))])
    user.extend([index for i in range(len(posts))])
    
short_posts = pd.DataFrame({"user": user,"type": utype,"post": post})
print(short_posts.shape)
short_posts.head()

(422845, 3)


Unnamed: 0,post,type,user
0,"[http, www, youtube, com, watch, v, qsxhcwe3krw]",INFJ,0
1,"[http, 41, media, tumblr, com, tumblr_lfouy03p...",INFJ,0
2,"[enfp, and, intj, moments, https, www, youtube...",INFJ,0
3,"[what, has, been, the, most, life, changing, e...",INFJ,0
4,"[http, www, youtube, com, watch, v, vxzeywwrdw...",INFJ,0


In [21]:
#split corpus into training and test set
X_train, X_test, y_train, y_test = train_test_split(short_posts['post'], short_posts['type'], test_size=0.2, random_state=42)

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

338276
84569
338276
84569


In [22]:
X_train[0:4]

83632     [i, am, basically, draftsman, for, telecommuni...
326075    [this, probably, is, very, true, i, know, i, h...
85363     [be, the, change, you, want, to, see, in, the,...
104409                                               [infj]
Name: post, dtype: object

In [23]:
#create  vocabulary
# flatten text inputs into into single long list of words, feed into vocab
vocab_mbti = vocabulary.Vocabulary((utils.canonicalize_word(w) for w in utils.flatten(X_train))) 
print(vocab_mbti.size)

119420


In [24]:
print (vocab_mbti.words_to_ids(['basically']))
print (vocab_mbti.ids_to_words([405]))  

[527]
['personally']


In [25]:
def pad_np_array(example_ids, max_len=35, pad_id=0):
    """Pad a list of lists of ids into a rectangular NumPy array.
    Longer sequences will be truncated to max_len ids, while shorter ones will
    be padded with pad_id.
    Args:
        example_ids: list(list(int)), sequence of ids for each example
        max_len: maximum sequence length
        pad_id: id to pad shorter sequences with
    Returns: (x, ns)
        x: [num_examples, max_len] NumPy array of integer ids
        ns: [num_examples] NumPy array of sequence lengths (<= max_len)
    """
    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def tokenize_post(post_string):
    return vocab_mbti.words_to_ids(post_string)
   

In [26]:
def as_padded_array(post_ids, targets, max_len=40, pad_id=0,
                    root_only=False, df_idxs=None):
    """Return the dataset as a (padded) NumPy array.
    Longer sequences will be truncated to max_len ids, while shorter ones
    will be padded with pad_id.
    Args:
      split: 'train' or 'test'
      max_len: maximum sequence length
      pad_id: id to pad shorter sequences with
      root_only: if true, will only export root phrases
      df_idxs: (optional) custom list of indices to export
    Returns: (x, ns, y)
      x: [num_examples, max_len] NumPy array of integer ids
      ns: [num_examples] NumPy array of sequence lengths (<= max_len)
      y: [num_examples] NumPy array of target ids
    """
    #needs to put together the pad_np_array output with  the target labels in dataframe
    
    
    x, ns = pad_np_array(post_ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(targets)

In [27]:
#tokenize training corpus

X_train=X_train.reset_index(drop=True)
X_train_ids = []

for post in range(0,(len(X_train))):
    X_train_ids.append(tokenize_post(X_train[post]))

#tokenize test set

X_test=X_test.reset_index(drop=True)
X_test_ids = []

for post in range(0,(len(X_test))):
    X_test_ids.append(tokenize_post(X_test[post]))    
    

In [28]:
train_x, train_ns, train_y = as_padded_array(X_train_ids, y_train)
test_x, test_ns, test_y = as_padded_array(X_test_ids, y_test)

In [29]:
#create integer classifiers
classifiers = list(set(train_y))
target_y_train = []
for i in range(len(train_y)):
    target_y_train.append(classifiers.index(train_y[i]))
target_y_train=np.array(target_y_train)
    
target_y_test = []
for i in range(len(test_y)):
    target_y_test.append(classifiers.index(test_y[i]))
target_y_test=np.array(target_y_test)

print(target_y_train[:5])

[ 0  4 11 12  1]


In [30]:
#train model using tf.estimator

import MBTI_BOW_model; reload(MBTI_BOW_model)

# Specify model hyperparameters as used by model_fn
model_params = dict(V=vocab_mbti.size, embed_dim=50, hidden_dims=[25], num_classes=len(classifiers),
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)

checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
# Write vocabulary to file, so TensorBoard can label embeddings.
# creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv
vocab_mbti.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=MBTI_BOW_model.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (119,420 words) written to '/tmp/tf_bow_sst_20181207-1952/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20181207-1952/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20181207-1952', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a6347c438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20181207-1952' --port 6006

Then in your browser, open: http://localhost:6006


In [31]:
#start training

# Training params, just used in this cell for the input_fn-s
train_params = dict(batch_size=32, total_epochs=20, eval_every=2)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)

# Construct and train the model, saving checkpoints to the directory above.
# Input function for training set batches
# Do 'eval_every' epochs at once, followed by evaluating on the dev set.
# NOTE: use patch_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns}, y=target_y_train,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )

# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": test_x, "ns": test_ns}, y=target_y_test,
                    batch_size=128, num_epochs=1, shuffle=False
                )

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
    # Train for a few epochs, then evaluate on test
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")

INFO:tensorflow:Calling model_fn.
XS_shape: (?, 40, 50)
H shape:  (?, 25)
Logits: (?, 16)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20181207-1952/model.ckpt.
INFO:tensorflow:loss = 5.01354, step = 1
INFO:tensorflow:global_step/sec: 168.283
INFO:tensorflow:loss = 3.52103, step = 101 (0.600 sec)
INFO:tensorflow:global_step/sec: 287.436
INFO:tensorflow:loss = 3.15998, step = 201 (0.350 sec)
INFO:tensorflow:global_step/sec: 230.015
INFO:tensorflow:loss = 3.11642, step = 301 (0.427 sec)
INFO:tensorflow:global_step/sec: 408.076
INFO:tensorflow:loss = 3.11194, step = 401 (0.245 sec)
INFO:tensorflow:global_step/sec: 308.226
INFO:tensorflow:loss = 2.76571, step = 501 (0.325 sec)
INFO:tensorflow:global_step/sec: 261.725
INFO:tensorflow:loss = 2.56755, step = 601 (0.382 sec)
IN

In [32]:
#Evaluation

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")  

print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Calling model_fn.
XS_shape: (?, 40, 50)
H shape:  (?, 25)
Logits: (?, 16)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-08-04:04:09
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20181207-1952/model.ckpt-211430
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-08-04:04:11
INFO:tensorflow:Saving dict for global step 211430: accuracy = 0.228181, cross_entropy_loss = 2.26629, global_step = 211430, loss = 2.52781
Accuracy on test set: 22.82%


{'accuracy': 0.22818054,
 'cross_entropy_loss': 2.2662907,
 'global_step': 211430,
 'loss': 2.5278091}

In [33]:
#train model manually


import MBTI_BOW_model; reload(MBTI_BOW_model)

x, ns, y = train_x, train_ns, target_y_train
batch_size = 32

# Specify model hyperparameters as used by model_fn
model_params = dict(V=vocab_mbti.size, embed_dim=50, hidden_dims=[25], num_classes=16,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)
model_fn = MBTI_BOW_model.classifier_model_fn

total_batches = 0
total_examples = 0
total_loss = 0
loss_ema = np.log(2)  # track exponential-moving-average of loss
ema_decay = np.exp(-1/10)  # decay parameter for moving average = np.exp(-1/history_length)
with tf.Graph().as_default(), tf.Session() as sess:
    ##
    # Construct the graph here. No session.run calls - just wiring up Tensors.
    ##
    # Add placeholders so we can feed in data.
    x_ph_  = tf.placeholder(tf.int32, shape=[None, x.shape[1]])  # [batch_size, max_len]
    ns_ph_ = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    y_ph_  = tf.placeholder(tf.int32, shape=[None])              # [batch_size]
    
    # Construct the graph using model_fn
    features = {"ids": x_ph_, "ns": ns_ph_}  # note that values are Tensors
    estimator_spec = model_fn(features, labels=y_ph_, mode=tf.estimator.ModeKeys.TRAIN,
                              params=model_params)
    loss_     = estimator_spec.loss
    train_op_ = estimator_spec.train_op
    
    ##
    # Done constructing the graph, now we can make session.run calls.
    ##
    sess.run(tf.global_variables_initializer())
    
    # Run a single epoch
    t0 = time.time()
    for (bx, bns, by) in utils.multi_batch_generator(batch_size, x, ns, y):
        # feed NumPy arrays into the placeholder Tensors
        feed_dict = {x_ph_: bx, ns_ph_: bns, y_ph_: by}
        batch_loss, _ = sess.run([loss_, train_op_], feed_dict=feed_dict)
        
        # Compute some statistics
        total_batches += 1
        total_examples += len(bx)
        total_loss += batch_loss * len(bx)  # re-scale, since batch loss is mean
        # Compute moving average to smooth out noisy per-batch loss
        loss_ema = ema_decay * loss_ema + (1 - ema_decay) * batch_loss
        
        if (total_batches % 25 == 0):
            print("{:5,} examples, moving-average loss {:.2f}".format(total_examples, 
                                                                      loss_ema))    
    print("Completed one epoch in {:s}".format(utils.pretty_timedelta(since=t0)))   

XS_shape: (?, 40, 50)
H shape:  (?, 25)
Logits: (?, 16)
  800 examples, moving-average loss 4.51
1,600 examples, moving-average loss 4.07
2,400 examples, moving-average loss 3.83
3,200 examples, moving-average loss 3.70
4,000 examples, moving-average loss 3.50
4,800 examples, moving-average loss 3.42
5,600 examples, moving-average loss 3.31
6,400 examples, moving-average loss 3.24
7,200 examples, moving-average loss 3.13
8,000 examples, moving-average loss 3.05
8,800 examples, moving-average loss 2.97
9,600 examples, moving-average loss 2.95
10,400 examples, moving-average loss 2.97
11,200 examples, moving-average loss 2.83
12,000 examples, moving-average loss 2.87
12,800 examples, moving-average loss 2.77
13,600 examples, moving-average loss 2.70
14,400 examples, moving-average loss 2.73
15,200 examples, moving-average loss 2.64
16,000 examples, moving-average loss 2.67
16,800 examples, moving-average loss 2.62
17,600 examples, moving-average loss 2.63
18,400 examples, moving-average 