# MBTI Parallel Classification Model with Neural BOW (I/E Axis)

First, load libraries and useful functions from class:

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division


import os, sys, re, json, time, datetime, shutil
from importlib import reload
import collections, itertools

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
from w266_common import patched_numpy_io
assert(tf.__version__.startswith("1."))
from nltk.corpus import stopwords

# Utils and Helper libraries
# import nltk
from w266_common import utils, vocabulary
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import math
from nltk.corpus import stopwords

  from ._conv import register_converters as _register_converters


## Specifications for Binary Classification NBOW for MBTI

In this baseline, the task is to predict the first MBTI axis (I vs. E) given a text string. We will model after the A2 assignment, with Architecture and Parameters defined below.

### Pre-Processing:
* Minimial pre-processing, only separating punctuation from text and lower-case all text
* Assigning words to numerical indices based on a fixed Vocab size, defined by word frequency in training set
* Pulled out first axis of all target labels, assigned to binary (E = 0, I = 1)

### Architecture:
* Encoder: Bag of Words 
* Decoder: Softmax
* Classification: Binary (2 MBTI types - I or E)

### Parameters
* Batch Size: 25 
* Text length: 100
* Vocabulary size (V): ~328K - removed stopwords
* Embedding Size: 50
* Hidden Dimensions: 25

### Training:
* Epochs = 10 
* 80% train, 20% test
* Loss: Sparse Softmax Cross Entropy 
* Optimizers: Adagrad Optimizer

## Load Corpus & Pre-Process

In [2]:
pwd

'/Users/heatherkoo/Documents/MIDS/W210 Capstone/personality/Heather/code'

In [2]:
#load data
df = pd.read_csv('../personalities_large_no_duplicates_C.csv',index_col = 0)
df.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,MBTI,created_prof,user_screen_name,tweets,id_,loc,descr,prot,followers,follow,friends,fav,timeZ,geo_en,prof_bg_col,prof_bkim_url,prof_image_url
0,0,0,ENFP,,BillTooke,"[""Yes. The College as the voting block for the...",,"New York, USA",,,,,,,,,,,http://pbs.twimg.com/profile_images/8070634314...
1,4,4,ENFP,,tanishatray4,['Are you left handed or right handed? — Left ...,,,,,,,,,,,,,http://pbs.twimg.com/profile_images/9792931110...
2,5,5,ENFP,,100YrLifeStyle,"['All that I would add is ""May you be the reas...",,Long Beach CA,,,,,,,,,,,http://pbs.twimg.com/profile_images/9691532675...
3,6,6,ENFP,,Ally_759_,"['Hahaha happy to entertain', 'Not to be a bit...",,,,,,,,,,,,,http://pbs.twimg.com/profile_images/1134708984...
4,7,7,ENFP,,craigymcmillan,"['Yum yum', 'Hot off the press, new legislatio...",,"London, England",,,,,,,,,,,http://pbs.twimg.com/profile_images/1148650599...


In [4]:
mbti_types = df.MBTI.unique().tolist()

mbti_types_low = df['MBTI'].str.lower().unique().tolist()
mbti_types = mbti_types + mbti_types_low
mbti_types

['ENFP',
 'INFP',
 'ISFP',
 'ESFP',
 'ISTP',
 'ESTP',
 'ENTP',
 'INTP',
 'INTJ',
 'ENTJ',
 'ESTJ',
 'ISTJ',
 'ESFJ',
 'ISFJ',
 'INFJ',
 'ENFJ',
 'enfp',
 'infp',
 'isfp',
 'esfp',
 'istp',
 'estp',
 'entp',
 'intp',
 'intj',
 'entj',
 'estj',
 'istj',
 'esfj',
 'isfj',
 'infj',
 'enfj']

In [5]:
#remove mbti types from tweets
for x in mbti_types:
    df['tweets'] = df['tweets'].str.replace(x,'mbti')


In [6]:
#remove nan values
df = df.dropna(subset=['tweets'])


In [7]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean_tokenize(sentence):
    ignore_words = ['a', 'the', 'user', 'i','is']
    sentence = re.sub("\'","",sentence)
    words = re.sub("[^\w]|[0-9]", " ",  sentence).split() #removes all non-alphanumeric words, removes all numbers
    words_cleaned = [w.lower() for w in words if w.lower() not in ignore_words]
    #stop_words = set(stopwords.words('english'))
    #words_cleaned = ' '.join(word for word in words_cleaned)
    
    return words_cleaned

In [8]:
df["clean_tweets"] = df["tweets"].apply(clean_tokenize)


In [9]:
df["clean_tweets"][0]

['yes',
 'college',
 'as',
 'voting',
 'block',
 'for',
 'pope',
 'didnt',
 'come',
 'around',
 'until',
 'like',
 'th',
 'century',
 'but',
 'cardinals',
 'were',
 'important',
 'romans',
 'mean',
 'lot',
 'of',
 'arabia',
 'was',
 'folk',
 'religion',
 'mesopotamia',
 'was',
 'nestorian',
 'egypt',
 'monophysite',
 'to',
 'go',
 'along',
 'with',
 'catholicism',
 'but',
 'arianism',
 'hanging',
 'around',
 'in',
 'north',
 'africa',
 'certainly',
 'helped',
 'smooth',
 'pathway',
 'for',
 'islam',
 'sorry',
 'very',
 'good',
 'ancient',
 'church',
 'taught',
 'islam',
 'was',
 'christian',
 'heresy',
 'dante',
 'put',
 'mohammad',
 'in',
 'with',
 'schismatics',
 'richard',
 'harriss',
 'kid',
 'fabulous',
 'as',
 'stellan',
 'of',
 'course',
 'piecemealed',
 'chernobyl',
 'through',
 'youtube',
 'and',
 'fuck',
 'it',
 'magnificent',
 'https',
 'www',
 'ncronline',
 'org',
 'blogs',
 'ncr',
 'toda',
 'y',
 'faith',
 'facts',
 'mike',
 'pence',
 'born',
 'again',
 'evangelical',
 'ca

In [10]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean(sentence):
    ignore_words = ['a']
    words = re.sub("[^\w]", " ",  sentence).split() #nltk.word_tokenize(sentence)
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
    stop_words = set(stopwords.words('english'))
    words_cleaned = [w for w in words_cleaned if not w in stop_words]
    words_string = ''.join(words_cleaned)
    return words_string

In [11]:
# split posts per users into separate sentences
#post = []
#utype = []
#user = []

#for index, row in df.iterrows():
#    posts = row['posts'].split('|||')
#    posts_clean = []
##    for sentence in posts:
#       posts_clean.append(clean(sentence))
#    post.extend(posts_clean)
#     post.extend(posts)
#    utype.extend([row['type'] for i in range(len(posts))])
#    user.extend([index for i in range(len(posts))])
    
#short_posts = pd.DataFrame({"user": user,"type": utype,"post": post})
#print(short_posts.shape)
#short_posts.head(5)

In [12]:
# Split data: 80% train, 20% test
post_train, post_test, label_train, label_test = train_test_split(np.array(df['clean_tweets']), 
                                                    np.array(df['MBTI']), 
                                                    test_size=0.2, 
                                                    random_state=88)


print("MBIT posts", post_train[2])
print('')
#print("MBTI Labels: ",label_train[:5])

MBIT posts ['pic', 'twitter', 'com', 'r', 'l', 'jswje']



In [13]:
# Build a vocabulary (V size is defaulted to full text) for train corpus

vocab_train = []
for i in range(len(post_train)):
    for word in post_train[i]:
        vocab_train.append(word)


vocab_mbti = vocabulary.Vocabulary((w for w in vocab_train))
vocab_mbti.size

380168

In [14]:
vocab_mbti.write_flat_file('vocab.csv')

Vocabulary (380,168 words) written to 'vocab.csv'


In [15]:
print (vocab_mbti.ids_to_words([3]))

['to']


In [16]:
print (vocab_mbti.words_to_ids(['got','what','and','the']))
#print (vocab_mbti.ids_to_words([202, 147565, 317206, 159348])) 

[99, 33, 5, 2]


In [17]:
vocab_mbti.words_to_ids(post_train[0])

[33,
 318,
 16,
 9,
 291,
 28,
 1979,
 147,
 20,
 1752,
 45,
 16,
 101,
 3,
 224,
 16,
 1565,
 115294,
 115295,
 115296,
 115297,
 70,
 48,
 11,
 300,
 3450,
 962,
 2917,
 5337,
 5905,
 115298,
 115299,
 153,
 37,
 58,
 16,
 16,
 1421,
 108,
 3,
 156,
 141,
 12,
 107,
 23,
 124,
 76,
 174,
 9514,
 5,
 17456,
 755,
 3,
 22,
 5798,
 707,
 40,
 207,
 16,
 1178,
 25,
 3206,
 5,
 32,
 8,
 526,
 24,
 108,
 3,
 22,
 53,
 12664,
 41613,
 85,
 77,
 73,
 72,
 102,
 343,
 43,
 73,
 1313,
 27,
 413,
 3,
 16865,
 5,
 78,
 391,
 387,
 50,
 9003,
 724,
 12,
 107,
 194,
 590,
 65,
 78,
 58339,
 33482,
 38,
 7,
 23,
 641,
 837,
 18,
 115300,
 8109,
 203,
 370,
 19,
 562,
 40,
 108,
 3,
 12665,
 32,
 8,
 370,
 985,
 350,
 233,
 1605,
 674,
 13,
 33482,
 164,
 38,
 16,
 27,
 370,
 674,
 194,
 18,
 59,
 19,
 37,
 459,
 73,
 550,
 16,
 674,
 21,
 36,
 973,
 96,
 12083,
 5,
 27,
 111,
 694,
 5,
 743,
 3,
 459,
 74,
 14,
 6,
 4,
 58340,
 81,
 198,
 216,
 40,
 57,
 949,
 7439,
 192,
 92,
 1571,
 672,
 5,
 318

In [18]:
post_train[0][1]

'game'

In [19]:
# tokenize and canonicalize train and test sets
x_train = []
for i in range(len(post_train)):
    x_train.append(vocab_mbti.words_to_ids(post_train[i]))

x_test = []
for i in range(len(post_test)):
    x_test.append(vocab_mbti.words_to_ids(post_test[i]))

In [20]:
print("Original Text: ",post_train[88])
print("Canonicalized Text: ", x_train[88])
print("Max lengths of texts: ", max([len(x) for x in x_train+x_test]))

Original Text:  ['steve', 'irwin', 'literally', 'spent', 'weeks', 'crying', 'and', 'carrie', 'fisher', 'https', 'twitter', 'com', 'acousticdillon', 'status', 'think', 'area', 'jokes', 'are', 'best', 'we', 'have', 'seen', 'in', 'while', 'on', 'this', 'hell', 'hole', 'hahhh', 'u', 'age', 'apparently', 'that', 'super', 'bright', 'star', 'beside', 'moon', 'last', 'night', 'was', 'actually', 'jupiter', 'wow', 'thats', 'so', 'cool', 'hello', 'aliens', 'name', 'more', 'iconic', 'duo', 'my', 'ma', 'and', 'my', 'nanny', 'fought', 'off', 'burglar', 'yesterday', 'that', 'broke', 'into', 'my', 'aunts', 'shed', 'guys', 'had', 'grand', 'total', 'of', 'four', 'drinks', 'last', 'night', 'and', 'am', 'absolutely', 'dying', 'of', 'hangover', 'this', 'must', 'be', 'what', 'getting', 'old', 'feels', 'like', 'found', '_artpoop', 'in', 'neesons', 'pic', 'twitter', 'com', 'lgu', 'txjmzr', 'it', 'probs', 'seems', 'long', 'cuz', 'have', 'it', 'up', 'like', 'of', 'time', 'it', 'shocks', 'me', 'too', 'hahaha', '

In [21]:
def binary_mbti(string):
    label_bin = []
    if string[0]=="E":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[1]=="N":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[2]=="F":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[3]=="J":
        label_bin.append(0)
    else:
        label_bin.append(1)
        
    assert len(label_bin) == 4,"Not a valid MBTI type"
    return label_bin

In [22]:
print(label_train[0])
print(binary_mbti(label_train[0]))

ISFP
[1, 1, 0, 1]


In [23]:
y_train_id = list(map(lambda x: binary_mbti(x), label_train))
y_test_id = list(map(lambda x: binary_mbti(x), label_test))

print(y_train_id[0:5])
print(label_train[0:5])

[[1, 1, 0, 1], [0, 1, 1, 0], [0, 1, 0, 1], [1, 0, 0, 1], [1, 0, 1, 0]]
['ISFP' 'ESTJ' 'ESFP' 'INFP' 'INTJ']


## Build the NBOW Model

In [24]:
def pad_np_array(example_ids, max_len=100, pad_id=0):

    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def tokenize_post(post_string):
    return vocab_mbti.words_to_ids(post_string)

In [25]:
def as_padded_array(post_ids, targets, max_len=100, pad_id=0,
                    root_only=False, df_idxs=None):

    x, ns = pad_np_array(post_ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(targets)

In [26]:
y_train_1 = []
for i in range(len(y_train_id)):
    y_train_1.append(y_train_id[i][1])

y_test_1 = []
for i in range(len(y_test_id)):
    y_test_1.append(y_test_id[i][1])

In [27]:
train_x, train_ns, train_y = as_padded_array(x_train, y_train_1)
test_x, test_ns, test_y = as_padded_array(x_test, y_test_1)

In [28]:
len(y_train_1)
print(len(y_test_1))

1918


In [29]:
#set up model using tf.estimator

import MBTI_BOW_model; reload(MBTI_BOW_model)

# Specify model hyperparameters as used by model
model_params = dict(V=vocab_mbti.size, embed_dim=50, hidden_dims=[25], num_classes=2,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)

checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)

vocab_mbti.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=MBTI_BOW_model.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (380,168 words) written to '/tmp/tf_bow_sst_20190721-1848/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20190721-1848/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20190721-1848', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12cd83d68>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20190721-1848' --port 6006

Then in your browser, open: http://localhost:6006


## Train Model

In [30]:
#start training


train_params = dict(batch_size=25, total_epochs=10, eval_every=2)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)


train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns}, y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )


test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": test_x, "ns": test_ns}, y=test_y,
                    batch_size=25, num_epochs=1, shuffle=False
                )

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
  
    model.train(input_fn=train_input_fn)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20190721-1848/model.ckpt.
INFO:tensorflow:loss = 1.1785045, step = 1
INFO:tensorflow:global_step/sec: 229.37
INFO:tensorflow:loss = 1.0460688, step = 101 (0.438 sec)
INFO:tensorflow:global_step/sec: 238.543
INFO:tensorflow:loss = 0.93105084, step = 201 (0.419 sec)
INFO:tensorflow:global_step/sec: 238.187
INFO:tensorflow:loss = 0.9668246, step = 301 (0.420 sec)
INFO:tensorflow:global_step/sec: 239.063
INFO:tensorflow:loss = 0.9376936, step = 401 (0.418 sec)
INFO:tensorflow:global_step/sec: 238.416
INFO:tensorflow:loss = 0.83370113, step = 501 (0.419 sec)
INFO:tensorflow:global_step/sec: 242.356
INFO:tensorflow:loss = 0.77402675, step = 601 (0.413 sec)
INFO:tensorflow:Saving checkpoints for 614 into /tmp/tf_bow_sst_20190721-1848/model.ckpt.
INFO:tensorflow:Loss for final step: 0.7872611.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow

## Evaluation

In [31]:
#Evaluation on test data

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")  

print ("Perplexity on test set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))

eval_metrics

INFO:tensorflow:Starting evaluation at 2019-07-22-01:49:37
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190721-1848/model.ckpt-3070
INFO:tensorflow:Finished evaluation at 2019-07-22-01:49:38
INFO:tensorflow:Saving dict for global step 3070: accuracy = 0.5542231, cross_entropy_loss = 1.0370797, global_step = 3070, loss = 1.107471
Perplexity on test set: 2.82
Accuracy on test set: 55.42%


{'accuracy': 0.5542231,
 'cross_entropy_loss': 1.0370797,
 'loss': 1.107471,
 'global_step': 3070}

In [32]:
#Evaluation on training data

eval_metrics = model.evaluate(input_fn=train_input_fn, name="train")  

print ("Perplexity on train set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on train set: {:.02%}".format(eval_metrics['accuracy']))


INFO:tensorflow:Starting evaluation at 2019-07-22-01:49:39
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190721-1848/model.ckpt-3070
INFO:tensorflow:Finished evaluation at 2019-07-22-01:49:41
INFO:tensorflow:Saving dict for global step 3070: accuracy = 0.8675358, cross_entropy_loss = 0.2087399, global_step = 3070, loss = 0.28114536
Perplexity on train set: 1.23
Accuracy on train set: 86.75%


{'accuracy': 0.8675358,
 'cross_entropy_loss': 0.2087399,
 'loss': 0.28114536,
 'global_step': 3070}

In [34]:
pred_y = model.predict(input_fn=test_input_fn)
y_pred = []
for i, p in enumerate(pred_y):
    probs = list(p['proba'])
    y_pred.append(probs.index(max(probs)))
        

INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190721-1848/model.ckpt-3070


In [37]:
y_pred_label=[]
for i in y_pred:
    if i == 0:
        y_pred_label.append('e')
    else:
        y_pred_label.append('i')

In [39]:
import collections

counter=collections.Counter(y_pred_label)
print(counter)

Counter({'i': 987, 'e': 931})


In [40]:
987/(987+931)

0.5145985401459854