# MBTI Parallel Classification Model with Neural BOW (I/E Axis)

First, load libraries and useful functions from class:

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division


import os, sys, re, json, time, datetime, shutil
from importlib import reload
import collections, itertools

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
from w266_common import patched_numpy_io
assert(tf.__version__.startswith("1."))
from nltk.corpus import stopwords

# Utils and Helper libraries
# import nltk
from w266_common import utils, vocabulary
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import math
from nltk.corpus import stopwords

  from ._conv import register_converters as _register_converters


## Specifications for Binary Classification NBOW for MBTI

In this baseline, the task is to predict the first MBTI axis (I vs. E) given a text string. We will model after the A2 assignment, with Architecture and Parameters defined below.

### Pre-Processing:
* Minimial pre-processing, only separating punctuation from text and lower-case all text
* Assigning words to numerical indices based on a fixed Vocab size, defined by word frequency in training set
* Pulled out first axis of all target labels, assigned to binary (E = 0, I = 1)

### Architecture:
* Encoder: Bag of Words 
* Decoder: Softmax
* Classification: Binary (2 MBTI types - I or E)

### Parameters
* Batch Size: 25 
* Text length: 100
* Vocabulary size (V): ~328K - removed stopwords
* Embedding Size: 50
* Hidden Dimensions: 25

### Training:
* Epochs = 10 
* 80% train, 20% test
* Loss: Sparse Softmax Cross Entropy 
* Optimizers: Adagrad Optimizer

## Load Corpus & Pre-Process

In [2]:
#load data
df = pd.read_csv('../personalities_FINAL.csv')
df.head(5)

Unnamed: 0,Personality,username,user_id,tweets,followers_count,image
0,enfj,pr3achlikeagirl,2310891000.0,"['God is on the move!', 'Stop telling God what...",67,http://pbs.twimg.com/profile_images/1090079036...
1,enfj,ChurchTherapist,860692100.0,['@user It describes anxious attachment as opp...,1841,http://pbs.twimg.com/profile_images/1033441323...
2,enfj,camperry21,26539820.0,"[""@user @user He ain't winning with the Clippe...",1240,http://pbs.twimg.com/profile_images/1078547897...
3,enfj,galaxiaskykloz,2268542000.0,"['does anyone remember poopreport dot com', '@...",2227,http://pbs.twimg.com/profile_images/1001856767...
4,enfj,sarahyoung_esq,1.090463e+18,"['MeatPotatoesNBeans ', 'THE LITTLE BLEP TONGU...",76,http://pbs.twimg.com/profile_images/1090465009...


In [3]:
#remove nan values
df = df.dropna(subset=['tweets'])


In [28]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean_tokenize(sentence):
    ignore_words = ['a', 'the', 'user', 'i','is']
    sentence = re.sub("\'","",sentence)
    words = re.sub("[^\w]|[0-9]", " ",  sentence).split() #removes all non-alphanumeric words, removes all numbers
    words_cleaned = [w.lower() for w in words if w.lower() not in ignore_words]
    #stop_words = set(stopwords.words('english'))
    #words_cleaned = ' '.join(word for word in words_cleaned)
    
    return words_cleaned

In [29]:
df["clean_tweets"] = df["tweets"].apply(clean_tokenize)


In [30]:
df["clean_tweets"][0]

['god',
 'on',
 'move',
 'stop',
 'telling',
 'god',
 'what',
 'you',
 'don',
 't',
 'have',
 'and',
 'instead',
 'give',
 'christ',
 'what',
 'you',
 'do',
 'have',
 'and',
 'let',
 'him',
 'take',
 'it',
 'bless',
 'it',
 'break',
 'it',
 'and',
 'multiply',
 'it',
 'for',
 'his',
 'glory',
 'in',
 'hands',
 'of',
 'jesus',
 'all',
 'things',
 'become',
 'possible',
 'dr',
 'david',
 'busic',
 'watching',
 'people',
 'love',
 'receive',
 'their',
 'district',
 'licenses',
 'and',
 'ordination',
 'very',
 'emotional',
 'experience',
 'for',
 'me',
 'love',
 'seeing',
 'and',
 'hearing',
 'how',
 'far',
 'god',
 'has',
 'brought',
 'them',
 'and',
 'love',
 'their',
 'obedience',
 'to',
 'god',
 'as',
 'they',
 'follow',
 'his',
 'call',
 'upon',
 'their',
 'lives',
 'pghcotndistrictassembly',
 'not',
 'our',
 'own',
 'righteousness',
 'but',
 'christ',
 'within',
 'living',
 'and',
 'reigning',
 'and',
 'saving',
 'from',
 'sin',
 'holinessuntothelord',
 'calleduntoholiness',
 'pittsb

In [6]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean(sentence):
    ignore_words = ['a']
    words = re.sub("[^\w]", " ",  sentence).split() #nltk.word_tokenize(sentence)
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
    stop_words = set(stopwords.words('english'))
    words_cleaned = [w for w in words_cleaned if not w in stop_words]
    words_string = ''.join(words_cleaned)
    return words_string

In [None]:
# split posts per users into separate sentences
#post = []
#utype = []
#user = []

#for index, row in df.iterrows():
#    posts = row['posts'].split('|||')
#    posts_clean = []
##    for sentence in posts:
#       posts_clean.append(clean(sentence))
#    post.extend(posts_clean)
#     post.extend(posts)
#    utype.extend([row['type'] for i in range(len(posts))])
#    user.extend([index for i in range(len(posts))])
    
#short_posts = pd.DataFrame({"user": user,"type": utype,"post": post})
#print(short_posts.shape)
#short_posts.head(5)

In [31]:
# Split data: 80% train, 20% test
post_train, post_test, label_train, label_test = train_test_split(np.array(df['clean_tweets']), 
                                                    np.array(df['Personality']), 
                                                    test_size=0.2, 
                                                    random_state=88)


print("MBIT posts", post_train[2])
print('')
#print("MBTI Labels: ",label_train[:5])

MBIT posts ['yeess', 'am', 'an', 'intj', 'days', 'you', 'should', 'be', 'glad', 'do', 'something', 'today', 'that', 'your', 'future', 'self', 'will', 'be', 'proud', 'of', 'more', 'days', 'couldn', 't', 'be', 'prouder', 'and', 'best', 'yet', 'to', 'come', 'cutest', 'barista', 'ever', 'such', 'role', 'model', 'that', 'look', 'up', 'to', 'march', 'has', 'been', 'so', 'good', 'to', 'me', 'days', 'left', 'last', 'weekly', 'submission', 'more', 'days', 'to', 'go', 'morning', 'sunshine', 'm', 'full', 'of', 'positivity', 'these', 'days', 'couldnt', 'be', 'prouder', 'it', 'was', 'pleasant', 'visit', 'enjoyed', 'it', 'so', 'much', 'in', 'middle', 'of', 'portfolio', 'management', 'class', 'u', 'd', 'days', 'left', 'so', 'blessed', 'with', 'my', 'family', 'amp', 'friends', 'ily', 'alll', 'years', 'of', 'fabulous', 'so', 'proud', 'of', 'you', 'finally', 'what', 'goes', 'around', 'comes', 'around', 'last', 'message', 'to', 'more', 'to', 'go', 'true', 'story', 'always', 'when', 'we', 'change', 'way',

In [39]:
# Build a vocabulary (V size is defaulted to full text) for train corpus

vocab_train = []
for i in range(len(post_train)):
    for word in post_train[i]:
        vocab_train.append(word)


vocab_mbti = vocabulary.Vocabulary((w for w in vocab_train))
#need to lower case all words if they are words
vocab_mbti.size

68981

In [40]:
vocab_mbti.write_flat_file('vocab.csv')

Vocabulary (68,981 words) written to 'vocab.csv'


In [43]:
print (vocab_mbti.ids_to_words([3]))

['to']


In [41]:
print (vocab_mbti.words_to_ids(['got','what','and','the']))
#print (vocab_mbti.ids_to_words([202, 147565, 317206, 159348])) 

[104, 28, 4, 2]


In [52]:
vocab_mbti.words_to_ids(post_train[0])

[38,
 47,
 928,
 221,
 4,
 221,
 4,
 11,
 403,
 1589,
 930,
 16,
 2259,
 259,
 13,
 759,
 71,
 73,
 1117,
 7756,
 515,
 2013]

In [51]:
post_train[0][1]

'an'

In [53]:
# tokenize and canonicalize train and test sets
x_train = []
for i in range(len(post_train)):
    x_train.append(vocab_mbti.words_to_ids(post_train[i]))

x_test = []
for i in range(len(post_test)):
    x_test.append(vocab_mbti.words_to_ids(post_test[i]))

In [54]:
print("Original Text: ",post_train[88])
print("Canonicalized Text: ", x_train[88])
print("Max lengths of texts: ", max([len(x) for x in x_train+x_test]))

Canonicalized Text:  [21, 42, 7, 55982, 199, 38, 47, 821, 4, 70, 122, 494, 3, 18, 5, 9, 262, 5, 476, 15, 36, 15246, 7836, 138, 361, 44, 5, 187, 25, 41, 5, 240, 26, 3740, 5006, 37, 5, 389, 2384, 8063, 55983, 55984, 112, 5, 12, 638, 118, 883, 5821, 7, 3757, 4, 2273, 170, 369, 3, 79, 118, 21, 235, 2744, 55985, 648, 76, 21, 255, 323, 1091, 24, 4743, 3773, 3, 6782, 37, 135, 24715, 15, 20, 1099, 1417, 511, 3, 32, 245, 1157, 1575, 73, 7, 487, 32, 121, 1809, 44, 414, 1126, 10, 24, 385, 22, 144, 113, 5, 44, 30, 476, 2368, 24554, 10, 24, 28, 219, 148, 68, 23, 16, 237, 37, 170, 4363, 9, 2253, 8, 24, 216, 19, 3, 3904, 44, 5, 158, 287, 1008, 2587, 15, 5002, 3, 794, 9, 11, 3963, 205, 9, 567, 38399, 299, 286, 441, 23, 211, 11292, 77, 70, 2633, 12, 55986, 7771, 1376, 476, 80, 828, 5680, 346, 4641, 11, 327, 762, 324, 82, 170, 11012, 16, 81, 39, 762, 2958, 87, 98, 94, 39110, 469, 48, 119, 15751, 3896, 14, 225, 596, 2580, 69, 55987, 18, 24008, 24, 2146, 102, 506, 38, 11, 327, 116, 7, 13, 466, 7, 5014, 44

In [57]:
def binary_mbti(string):
    label_bin = []
    if string[0]=="e":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[1]=="n":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[2]=="f":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[3]=="j":
        label_bin.append(0)
    else:
        label_bin.append(1)
        
    assert len(label_bin) == 4,"Not a valid MBTI type"
    return label_bin

In [58]:
print(label_train[0])
print(binary_mbti(label_train[0]))

infj
[1, 0, 0, 0]


In [59]:
y_train_id = list(map(lambda x: binary_mbti(x), label_train))
y_test_id = list(map(lambda x: binary_mbti(x), label_test))

print(y_train_id[0:5])
print(label_train[0:5])

[[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 0], [1, 1, 1, 1]]
['infj' 'isfj' 'intj' 'infj' 'istp']


## Build the NBOW Model

In [60]:
def pad_np_array(example_ids, max_len=100, pad_id=0):

    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def tokenize_post(post_string):
    return vocab_mbti.words_to_ids(post_string)

In [61]:
def as_padded_array(post_ids, targets, max_len=100, pad_id=0,
                    root_only=False, df_idxs=None):

    x, ns = pad_np_array(post_ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(targets)

In [62]:
len(y_train_id)

139

In [63]:
y_train_1 = []
for i in range(len(y_train_id)):
    y_train_1.append(y_train_id[i][0])

y_test_1 = []
for i in range(len(y_test_id)):
    y_test_1.append(y_test_id[i][0])

In [64]:
train_x, train_ns, train_y = as_padded_array(x_train, y_train_1)
test_x, test_ns, test_y = as_padded_array(x_test, y_test_1)

In [65]:
len(y_train_1)
print(len(y_test_1))

35


In [66]:
#set up model using tf.estimator

import MBTI_BOW_model; reload(MBTI_BOW_model)

# Specify model hyperparameters as used by model
model_params = dict(V=vocab_mbti.size, embed_dim=50, hidden_dims=[25], num_classes=2,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)

checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)

vocab_mbti.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=MBTI_BOW_model.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (68,981 words) written to '/tmp/tf_bow_sst_20190713-1909/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20190713-1909/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20190713-1909', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12d451828>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20190713-1909' --port 6006

Then in your browser, open: http://localhost:6006


## Train Model

In [67]:
#start training


train_params = dict(batch_size=25, total_epochs=10, eval_every=2)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)


train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns}, y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )


test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": test_x, "ns": test_ns}, y=test_y,
                    batch_size=25, num_epochs=1, shuffle=False
                )

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
  
    model.train(input_fn=train_input_fn)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20190713-1909/model.ckpt.
INFO:tensorflow:loss = 1.1737149, step = 1
INFO:tensorflow:Saving checkpoints for 12 into /tmp/tf_bow_sst_20190713-1909/model.ckpt.
INFO:tensorflow:Loss for final step: 1.0998626.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190713-1909/model.ckpt-12
INFO:tensorflow:Saving checkpoints for 13 into /tmp/tf_bow_sst_20190713-1909/model.ckpt.
INFO:tensorflow:loss = 1.5724244, step = 13
INFO:tensorflow:Saving checkpoints for 24 into /tmp/tf_bow_sst_20190713-1909/model.ckpt.
INFO:tensorflow:Loss for final step: 0.7723193.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190713-1909/model.ckpt-24
INFO:tensorflow:Saving checkpoints for 25 into /tmp/tf_bow_sst_20190713-1909/model.ckpt.
INFO:tensorflow:loss = 0.83176965, step = 25
INFO:tensorflow:Saving c

## Evaluation

In [68]:
#Evaluation on test data

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")  

print ("Perplexity on test set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))

eval_metrics

INFO:tensorflow:Starting evaluation at 2019-07-14-02:11:33
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190713-1909/model.ckpt-60
INFO:tensorflow:Finished evaluation at 2019-07-14-02:11:33
INFO:tensorflow:Saving dict for global step 60: accuracy = 0.54285717, cross_entropy_loss = 1.2942256, global_step = 60, loss = 1.7624204
Perplexity on test set: 3.65
Accuracy on test set: 54.29%


{'accuracy': 0.54285717,
 'cross_entropy_loss': 1.2942256,
 'loss': 1.7624204,
 'global_step': 60}

In [69]:
#Evaluation on training data

eval_metrics = model.evaluate(input_fn=train_input_fn, name="train")  

print ("Perplexity on train set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on train set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Starting evaluation at 2019-07-14-02:11:47
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190713-1909/model.ckpt-60
INFO:tensorflow:Finished evaluation at 2019-07-14-02:11:47
INFO:tensorflow:Saving dict for global step 60: accuracy = 0.9352518, cross_entropy_loss = 0.14293686, global_step = 60, loss = 0.6450743
Perplexity on train set: 1.15
Accuracy on train set: 93.53%


{'accuracy': 0.9352518,
 'cross_entropy_loss': 0.14293686,
 'loss': 0.6450743,
 'global_step': 60}