# MBTI Parallel Classification Model with Neural BOW (I/E Axis)

First, load libraries and useful functions from class:

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division


import os, sys, re, json, time, datetime, shutil
from importlib import reload
import collections, itertools

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
from w266_common import patched_numpy_io
assert(tf.__version__.startswith("1."))
from nltk.corpus import stopwords

# Utils and Helper libraries
# import nltk
from w266_common import utils, vocabulary
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import math
from nltk.corpus import stopwords

  from ._conv import register_converters as _register_converters


## Specifications for Binary Classification NBOW for MBTI

In this baseline, the task is to predict the first MBTI axis (I vs. E) given a text string. We will model after the A2 assignment, with Architecture and Parameters defined below.

### Pre-Processing:
* Minimial pre-processing, only separating punctuation from text and lower-case all text
* Assigning words to numerical indices based on a fixed Vocab size, defined by word frequency in training set
* Pulled out first axis of all target labels, assigned to binary (E = 0, I = 1)

### Architecture:
* Encoder: Bag of Words 
* Decoder: Softmax
* Classification: Binary (2 MBTI types - I or E)

### Parameters
* Batch Size: 25 
* Text length: 100
* Vocabulary size (V): ~328K - removed stopwords
* Embedding Size: 50
* Hidden Dimensions: 25

### Training:
* Epochs = 10 
* 80% train, 20% test
* Loss: Sparse Softmax Cross Entropy 
* Optimizers: Adagrad Optimizer

## Load Corpus & Pre-Process

In [2]:
pwd

'/Users/heatherkoo/Documents/MIDS/W210 Capstone/personality/Heather/code'

In [3]:
#load data
df = pd.read_csv('../personalities_large_no_duplicates_C.csv',index_col = 0)
df.head(5)

Unnamed: 0,user_screen_name,tweets,prof_image_url,MBTI,orig_tweets,id_,retweets,fav
0,BillTooke,"[""Yes. The College as the voting block for the...",http://pbs.twimg.com/profile_images/8070634314...,ENFP,"""I have no shame and little guile when it come...",1123269839026642944,0,1
1,dougie0216,"['All Hail De Gendt! #TDF2019', 'He has a powe...",http://pbs.twimg.com/profile_images/1126179691...,INFP,"""@princessfemme Wow another ENFP what are th...",1123265408419733505,0,1
2,kimbetech,['Get on LIVE now. 10:30pm EST https:// share....,http://pbs.twimg.com/profile_images/1101342873...,ENFP,"""I am an ENFP . We do not take kindly to being...",988128789577240576,0,1
3,honeyBklein,"[""I agree. It's a spiritual battle. In light o...",http://pbs.twimg.com/profile_images/1130313304...,ENFJ,"""@ABeardedPoet actually I am an INFP not INF...",1123206556181573632,0,2
4,tanishatray4,['Are you left handed or right handed? — Left ...,http://pbs.twimg.com/profile_images/9792931110...,ENFP,"""Ya Absolutely that makes a lot of sense .The...",1123130637664309248,0,0


In [4]:
mbti_types = df.MBTI.unique().tolist()

mbti_types_low = df['MBTI'].str.lower().unique().tolist()
mbti_types = mbti_types + mbti_types_low
mbti_types

['ENFP',
 'INFP',
 'ENFJ',
 'ENTP',
 'ESFJ',
 'INFJ',
 'INTJ',
 'INTP',
 'ESTJ',
 'ISTJ',
 'ENTJ',
 'ISFJ',
 'ESFP',
 'ISTP',
 'ESTP',
 'ISFP',
 'enfp',
 'infp',
 'enfj',
 'entp',
 'esfj',
 'infj',
 'intj',
 'intp',
 'estj',
 'istj',
 'entj',
 'isfj',
 'esfp',
 'istp',
 'estp',
 'isfp']

In [5]:
#remove mbti types from tweets
for x in mbti_types:
    df['tweets'] = df['tweets'].str.replace(x,'mbti')


In [6]:
#remove nan values
df = df.dropna(subset=['tweets'])


In [7]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean_tokenize(sentence):
    ignore_words = ['a', 'the', 'user', 'i','is']
    sentence = re.sub("\'","",sentence)
    words = re.sub("[^\w]|[0-9]", " ",  sentence).split() #removes all non-alphanumeric words, removes all numbers
    words_cleaned = [w.lower() for w in words if w.lower() not in ignore_words]
    #stop_words = set(stopwords.words('english'))
    #words_cleaned = ' '.join(word for word in words_cleaned)
    
    return words_cleaned

In [8]:
df["clean_tweets"] = df["tweets"].apply(clean_tokenize)


In [9]:
df["clean_tweets"][0]

['yes',
 'college',
 'as',
 'voting',
 'block',
 'for',
 'pope',
 'didnt',
 'come',
 'around',
 'until',
 'like',
 'th',
 'century',
 'but',
 'cardinals',
 'were',
 'important',
 'romans',
 'mean',
 'lot',
 'of',
 'arabia',
 'was',
 'folk',
 'religion',
 'mesopotamia',
 'was',
 'nestorian',
 'egypt',
 'monophysite',
 'to',
 'go',
 'along',
 'with',
 'catholicism',
 'but',
 'arianism',
 'hanging',
 'around',
 'in',
 'north',
 'africa',
 'certainly',
 'helped',
 'smooth',
 'pathway',
 'for',
 'islam',
 'sorry',
 'very',
 'good',
 'ancient',
 'church',
 'taught',
 'islam',
 'was',
 'christian',
 'heresy',
 'dante',
 'put',
 'mohammad',
 'in',
 'with',
 'schismatics',
 'richard',
 'harriss',
 'kid',
 'fabulous',
 'as',
 'stellan',
 'of',
 'course',
 'piecemealed',
 'chernobyl',
 'through',
 'youtube',
 'and',
 'fuck',
 'it',
 'magnificent',
 'https',
 'www',
 'ncronline',
 'org',
 'blogs',
 'ncr',
 'toda',
 'y',
 'faith',
 'facts',
 'mike',
 'pence',
 'born',
 'again',
 'evangelical',
 'ca

In [10]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean(sentence):
    ignore_words = ['a']
    words = re.sub("[^\w]", " ",  sentence).split() #nltk.word_tokenize(sentence)
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
    stop_words = set(stopwords.words('english'))
    words_cleaned = [w for w in words_cleaned if not w in stop_words]
    words_string = ''.join(words_cleaned)
    return words_string

In [11]:
# split posts per users into separate sentences
#post = []
#utype = []
#user = []

#for index, row in df.iterrows():
#    posts = row['posts'].split('|||')
#    posts_clean = []
##    for sentence in posts:
#       posts_clean.append(clean(sentence))
#    post.extend(posts_clean)
#     post.extend(posts)
#    utype.extend([row['type'] for i in range(len(posts))])
#    user.extend([index for i in range(len(posts))])
    
#short_posts = pd.DataFrame({"user": user,"type": utype,"post": post})
#print(short_posts.shape)
#short_posts.head(5)

In [12]:
# Split data: 80% train, 20% test
post_train, post_test, label_train, label_test = train_test_split(np.array(df['clean_tweets']), 
                                                    np.array(df['MBTI']), 
                                                    test_size=0.2, 
                                                    random_state=88)


print("MBIT posts", post_train[2])
print('')
#print("MBTI Labels: ",label_train[:5])

MBIT posts []



In [13]:
# Build a vocabulary (V size is defaulted to full text) for train corpus

vocab_train = []
for i in range(len(post_train)):
    for word in post_train[i]:
        vocab_train.append(word)


vocab_mbti = vocabulary.Vocabulary((w for w in vocab_train))
vocab_mbti.size

391714

In [14]:
vocab_mbti.write_flat_file('vocab.csv')

Vocabulary (391,714 words) written to 'vocab.csv'


In [15]:
print (vocab_mbti.ids_to_words([3]))

['to']


In [16]:
print (vocab_mbti.words_to_ids(['got','what','and','the']))
#print (vocab_mbti.ids_to_words([202, 147565, 317206, 159348])) 

[97, 33, 5, 2]


In [17]:
vocab_mbti.words_to_ids(post_train[0])

[]

In [19]:
# tokenize and canonicalize train and test sets
x_train = []
for i in range(len(post_train)):
    x_train.append(vocab_mbti.words_to_ids(post_train[i]))

x_test = []
for i in range(len(post_test)):
    x_test.append(vocab_mbti.words_to_ids(post_test[i]))

In [20]:
print("Original Text: ",post_train[88])
print("Canonicalized Text: ", x_train[88])
print("Max lengths of texts: ", max([len(x) for x in x_train+x_test]))

Original Text:  ['whaaaat', 'buffy', 'friends', 'source', 'https', 'www', 'cracked', 'com', 'pictofacts', 'theres', 'reason', 'these', 'movie', 'tv', 'scenes', 'look', 'familiar', 'pic', 'twitter', 'com', 'norfv', 'tn', 'use', 'filters', 'to', 'push', 'contrast', 'to', 'see', 'shapes', 'and', 'planes', 'better', 'pic', 'twitter', 'com', 'z', 'ygsmyiiw', 'and', 'sorry', 'for', 'my', 'part', 'its', 'elegant', 'and', 'simple', 'to', 'have', 'one', 'rule', 'for', 'all', 'genders', 'but', 'could', 'have', 'suggested', 'it', 'better', 'fortunately', 'phones', 'now', 'have', 'voice', 'activated', 'cameras', 'makes', 'getting', 'hands', 'waaaay', 'easier', 'reminder', 'that', 'pro', 'artists', 'use', 'reference', 'it', 'never', 'cheating', 'always', 'lends', 'hand', 'pic', 'twitter', 'com', 'szkjkveur', 'by', 'replier', 'you', 'mean', 'me', 'nah', 'it', 'was', 'wrong', 'how', 'went', 'about', 'suggesting', 'it', 'posted', 'an', 'apology', 'link', 'sould', 'be', 'below', 'somewhere', 'like', 'a

In [21]:
def binary_mbti(string):
    label_bin = []
    if string[0]=="E":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[1]=="N":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[2]=="F":
        label_bin.append(0)
    else:
        label_bin.append(1)
    if string[3]=="J":
        label_bin.append(0)
    else:
        label_bin.append(1)
        
    assert len(label_bin) == 4,"Not a valid MBTI type"
    return label_bin

In [22]:
print(label_train[0])
print(binary_mbti(label_train[0]))

ESFP
[0, 1, 0, 1]


In [23]:
y_train_id = list(map(lambda x: binary_mbti(x), label_train))
y_test_id = list(map(lambda x: binary_mbti(x), label_test))

print(y_train_id[0:5])
print(label_train[0:5])

[[0, 1, 0, 1], [1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 1, 0], [1, 1, 1, 1]]
['ESFP' 'ISTP' 'ISTP' 'ENTJ' 'ISTP']


## Build the NBOW Model

In [24]:
def pad_np_array(example_ids, max_len=100, pad_id=0):

    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def tokenize_post(post_string):
    return vocab_mbti.words_to_ids(post_string)

In [25]:
def as_padded_array(post_ids, targets, max_len=100, pad_id=0,
                    root_only=False, df_idxs=None):

    x, ns = pad_np_array(post_ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(targets)

In [26]:
y_train_1 = []
for i in range(len(y_train_id)):
    y_train_1.append(y_train_id[i][1])

y_test_1 = []
for i in range(len(y_test_id)):
    y_test_1.append(y_test_id[i][1])

In [27]:
train_x, train_ns, train_y = as_padded_array(x_train, y_train_1)
test_x, test_ns, test_y = as_padded_array(x_test, y_test_1)

In [28]:
len(y_train_1)
print(len(y_test_1))

2068


In [29]:
#set up model using tf.estimator

import MBTI_BOW_model; reload(MBTI_BOW_model)

# Specify model hyperparameters as used by model
model_params = dict(V=vocab_mbti.size, embed_dim=50, hidden_dims=[25], num_classes=2,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)

checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)

vocab_mbti.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=MBTI_BOW_model.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (391,714 words) written to '/tmp/tf_bow_sst_20190803-1657/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20190803-1657/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20190803-1657', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x15a7975c0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20190803-1657' --port 6006

Then in your browser, open: http://localhost:6006


## Train Model

In [30]:
#start training


train_params = dict(batch_size=25, total_epochs=10, eval_every=2)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)


train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns}, y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )


test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": test_x, "ns": test_ns}, y=test_y,
                    batch_size=25, num_epochs=1, shuffle=False
                )

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
  
    model.train(input_fn=train_input_fn)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20190803-1657/model.ckpt.
INFO:tensorflow:loss = 1.175193, step = 1
INFO:tensorflow:global_step/sec: 214.806
INFO:tensorflow:loss = 1.0870415, step = 101 (0.467 sec)
INFO:tensorflow:global_step/sec: 227.206
INFO:tensorflow:loss = 0.9302255, step = 201 (0.442 sec)
INFO:tensorflow:global_step/sec: 228.956
INFO:tensorflow:loss = 0.8070522, step = 301 (0.435 sec)
INFO:tensorflow:global_step/sec: 226.459
INFO:tensorflow:loss = 0.7835412, step = 401 (0.441 sec)
INFO:tensorflow:global_step/sec: 223.086
INFO:tensorflow:loss = 0.7680011, step = 501 (0.448 sec)
INFO:tensorflow:global_step/sec: 200.603
INFO:tensorflow:loss = 1.0118186, step = 601 (0.498 sec)
INFO:tensorflow:Saving checkpoints for 662 into /tmp/tf_bow_sst_20190803-1657/model.ckpt.
INFO:tensorflow:Loss for final step: 0.7286373.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_ss

## Evaluation

In [31]:
#Evaluation on test data

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")  

print ("Perplexity on test set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))

eval_metrics

INFO:tensorflow:Starting evaluation at 2019-08-03-23:58:31
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190803-1657/model.ckpt-3310
INFO:tensorflow:Finished evaluation at 2019-08-03-23:58:32
INFO:tensorflow:Saving dict for global step 3310: accuracy = 0.6000967, cross_entropy_loss = 0.98044854, global_step = 3310, loss = 1.0477558
Perplexity on test set: 2.67
Accuracy on test set: 60.01%


{'accuracy': 0.6000967,
 'cross_entropy_loss': 0.98044854,
 'loss': 1.0477558,
 'global_step': 3310}

In [32]:
#Evaluation on training data

eval_metrics = model.evaluate(input_fn=train_input_fn, name="train")  

print ("Perplexity on train set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on train set: {:.02%}".format(eval_metrics['accuracy']))


INFO:tensorflow:Starting evaluation at 2019-08-04-00:00:11
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190803-1657/model.ckpt-3310
INFO:tensorflow:Finished evaluation at 2019-08-04-00:00:13
INFO:tensorflow:Saving dict for global step 3310: accuracy = 0.879792, cross_entropy_loss = 0.21594322, global_step = 3310, loss = 0.28545442
Perplexity on train set: 1.24
Accuracy on train set: 87.98%


In [None]:
pred_y = model.predict(input_fn=test_input_fn)
y_pred = []
for i, p in enumerate(pred_y):
    probs = list(p['proba'])
    y_pred.append(probs.index(max(probs)))
        

In [None]:
y_pred_label=[]
for i in y_pred:
    if i == 0:
        y_pred_label.append('e')
    else:
        y_pred_label.append('i')

In [None]:
import collections

counter=collections.Counter(y_pred_label)
print(counter)

In [None]:
987/(987+931)