# MBTI Parallel Classification Model with Neural BOW (I/E Axis)

First, load libraries and useful functions from class:

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division


import os, sys, re, json, time, datetime, shutil
from importlib import reload
import collections, itertools

# NumPy and TensorFlow
import numpy as np
import pandas as pd
import tensorflow as tf
from w266_common import patched_numpy_io
assert(tf.__version__.startswith("1."))
from nltk.corpus import stopwords

# Utils and Helper libraries
# import nltk
from w266_common import utils, vocabulary
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import math
from nltk.corpus import stopwords

  from ._conv import register_converters as _register_converters


## Specifications for Binary Classification NBOW for MBTI

In this baseline, the task is to predict the first MBTI axis (I vs. E) given a text string. We will model after the A2 assignment, with Architecture and Parameters defined below.

### Pre-Processing:
* Minimial pre-processing, only separating punctuation from text and lower-case all text
* Assigning words to numerical indices based on a fixed Vocab size, defined by word frequency in training set
* Pulled out first axis of all target labels, assigned to binary (E = 0, I = 1)

### Architecture:
* Encoder: Bag of Words 
* Decoder: Softmax
* Classification: Binary (2 MBTI types - I or E)

### Parameters
* Batch Size: 25 
* Text length: 100
* Vocabulary size (V): ~328K - removed stopwords
* Embedding Size: 50
* Hidden Dimensions: 25

### Training:
* Epochs = 10 
* 80% train, 20% test
* Loss: Sparse Softmax Cross Entropy 
* Optimizers: Adagrad Optimizer

## Load Corpus & Pre-Process

In [2]:
#load data
df = pd.read_csv('../personalities_large_no_duplicates_C.csv',index_col = 0)
df.head(5)

Unnamed: 0,user_screen_name,tweets,prof_image_url,MBTI,orig_tweets,id_,retweets,fav
0,BillTooke,"[""Yes. The College as the voting block for the...",http://pbs.twimg.com/profile_images/8070634314...,ENFP,"""I have no shame and little guile when it come...",1123269839026642944,0,1
1,dougie0216,"['All Hail De Gendt! #TDF2019', 'He has a powe...",http://pbs.twimg.com/profile_images/1126179691...,INFP,"""@princessfemme Wow another ENFP what are th...",1123265408419733505,0,1
2,kimbetech,['Get on LIVE now. 10:30pm EST https:// share....,http://pbs.twimg.com/profile_images/1101342873...,ENFP,"""I am an ENFP . We do not take kindly to being...",988128789577240576,0,1
3,honeyBklein,"[""I agree. It's a spiritual battle. In light o...",http://pbs.twimg.com/profile_images/1130313304...,ENFJ,"""@ABeardedPoet actually I am an INFP not INF...",1123206556181573632,0,2
4,tanishatray4,['Are you left handed or right handed? — Left ...,http://pbs.twimg.com/profile_images/9792931110...,ENFP,"""Ya Absolutely that makes a lot of sense .The...",1123130637664309248,0,0


In [3]:
## dimensions:
df.shape

(10339, 8)

In [4]:
mbti_types = df.MBTI.unique().tolist()

mbti_types_low = df['MBTI'].str.lower().unique().tolist()
mbti_types = mbti_types + mbti_types_low
mbti_types

['ENFP',
 'INFP',
 'ENFJ',
 'ENTP',
 'ESFJ',
 'INFJ',
 'INTJ',
 'INTP',
 'ESTJ',
 'ISTJ',
 'ENTJ',
 'ISFJ',
 'ESFP',
 'ISTP',
 'ESTP',
 'ISFP',
 'enfp',
 'infp',
 'enfj',
 'entp',
 'esfj',
 'infj',
 'intj',
 'intp',
 'estj',
 'istj',
 'entj',
 'isfj',
 'esfp',
 'istp',
 'estp',
 'isfp']

In [5]:
## Hypothesis 2: political leaning
types = set([e.lower() for e in mbti_types])
N = len(list(types))
N

16

## Skip?

In [6]:
#remove mbti types from tweets
for x in mbti_types:
    df['tweets'] = df['tweets'].str.replace(x,'mbti')


In [7]:
df.tweets.isnull().sum()

2

In [8]:
#remove nan values
df = df.dropna(subset=['tweets'])


In [9]:
df.shape

(10337, 8)

In [10]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")

import string
string.punctuation

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/heatherkoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/heatherkoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Use nltk stopwords

In [11]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean_tokenize(sentence):
    #ignore_words = ['a', 'the', 'user', 'i','is',"'",'com','pic_twitt','s','https_twitt','https_www','https_twitter','t', 'u','lol']
    custom = ["'",'com','pic_twitt','s','https_twitt','https_www','https_twitter','t', 'u','lol']
    ignore_words = nltk.corpus.stopwords.words("english") + list(string.punctuation) + custom
    sentence = re.sub("\'","",sentence)
    words = re.sub("[^\w]|[0-9]", " ",  sentence).split() #removes all non-alphanumeric words, removes all numbers
    words_cleaned = [w.lower() for w in words if w.lower() not in ignore_words]
    #stop_words = set(stopwords.words('english'))
    #words_cleaned = ' '.join(word for word in words_cleaned)
    
    return words_cleaned

In [12]:
df["clean_tweets"] = df["tweets"].apply(clean_tokenize)


In [13]:
df["clean_tweets"][0]

['yes',
 'college',
 'voting',
 'block',
 'pope',
 'didnt',
 'come',
 'around',
 'like',
 'th',
 'century',
 'cardinals',
 'important',
 'romans',
 'mean',
 'lot',
 'arabia',
 'folk',
 'religion',
 'mesopotamia',
 'nestorian',
 'egypt',
 'monophysite',
 'go',
 'along',
 'catholicism',
 'arianism',
 'hanging',
 'around',
 'north',
 'africa',
 'certainly',
 'helped',
 'smooth',
 'pathway',
 'islam',
 'sorry',
 'good',
 'ancient',
 'church',
 'taught',
 'islam',
 'christian',
 'heresy',
 'dante',
 'put',
 'mohammad',
 'schismatics',
 'richard',
 'harriss',
 'kid',
 'fabulous',
 'stellan',
 'course',
 'piecemealed',
 'chernobyl',
 'youtube',
 'fuck',
 'magnificent',
 'https',
 'www',
 'ncronline',
 'org',
 'blogs',
 'ncr',
 'toda',
 'faith',
 'facts',
 'mike',
 'pence',
 'born',
 'evangelical',
 'catholic',
 'whole',
 'midwest',
 'catholic',
 'thing',
 'pat',
 'altar',
 'boy',
 'devout',
 'family',
 'still',
 'calls',
 'catholic',
 'evangelical',
 'something',
 'like',
 'actually',
 'catho

In [14]:
# function to tokenize and clean sentence ["Hello world."] into list of words ["hello","world"]
def clean(sentence):
    ignore_words = ['a']
    words = re.sub("[^\w]", " ",  sentence).split() #nltk.word_tokenize(sentence)
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
    stop_words = set(stopwords.words('english'))
    words_cleaned = [w for w in words_cleaned if not w in stop_words]
    words_string = ''.join(words_cleaned)
    return words_string

In [15]:
# split posts per users into separate sentences
#post = []
#utype = []
#user = []

#for index, row in df.iterrows():
#    posts = row['posts'].split('|||')
#    posts_clean = []
##    for sentence in posts:
#       posts_clean.append(clean(sentence))
#    post.extend(posts_clean)
#     post.extend(posts)
#    utype.extend([row['type'] for i in range(len(posts))])
#    user.extend([index for i in range(len(posts))])
    
#short_posts = pd.DataFrame({"user": user,"type": utype,"post": post})
#print(short_posts.shape)
#short_posts.head(5)

In [16]:
df['clean_tweets'].isnull().sum()

0

In [17]:
## Remove empty tweets
df[df['clean_tweets'] == '[]'].count()

user_screen_name    0
tweets              0
prof_image_url      0
MBTI                0
orig_tweets         0
id_                 0
retweets            0
fav                 0
clean_tweets        0
dtype: int64

In [18]:
df[df['clean_tweets'] == '[]'].count()

user_screen_name    0
tweets              0
prof_image_url      0
MBTI                0
orig_tweets         0
id_                 0
retweets            0
fav                 0
clean_tweets        0
dtype: int64

In [19]:
len(df['clean_tweets'].iloc[0])

1344

### Empty lists came as a result of pre-processing!

In [20]:
empty_ind = []
for i in range(df['clean_tweets'].shape[0]):
    if len(df['clean_tweets'].iloc[i]) == 0:
        empty_ind.append(i)
    
    

In [21]:
df['clean_tweets'].shape[0]

10337

In [22]:
len(empty_ind)

2879

In [23]:
empty_ind[:12]

[77, 84, 106, 114, 115, 122, 123, 143, 154, 179, 201, 255]

In [24]:
df['clean_tweets'].iloc[77]

[]

In [25]:
new_df_L = df.shape[0]-len(empty_ind)

In [26]:
new_df_L

7458

In [27]:
dff = df.drop(empty_ind, inplace = True)


In [28]:
df.shape

(7458, 9)

## TRY Stratified sampling?

In [29]:
def stratifiedSplit(X,y,size):
    sss = StratifiedShuffleSplit(n_splits = 16, test_size=size, random_state=0)

    for train_index, test_index in sss:
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    return X_train, X_test, y_train, y_test

## about calss imbalance:
https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras

In [30]:
#np.array(df['clean_tweets'])

In [31]:
### Remob=ve emmpty []?

In [32]:
# Split data: 80% train, 20% test
post_train, post_test, label_train, label_test = train_test_split(np.array(df['clean_tweets']), 
                                                    np.array(df['MBTI']), 
                                                    test_size=0.2, 
                                                    random_state=88)


#print("MBTI posts", post_train[5])
print('')
#print("MBTI Labels: ",label_train[:5])




In [33]:
len(post_train[0])

146

In [34]:
# Build a vocabulary (V size is defaulted to full text) for train corpus

vocab_train = []
for i in range(len(post_train)):
    for word in post_train[i]:
        vocab_train.append(word)


vocab_mbti = vocabulary.Vocabulary((w for w in vocab_train))
vocab_mbti.size

372677

In [35]:
vocab_mbti.write_flat_file('vocab.csv')

Vocabulary (372,677 words) written to 'vocab.csv'


In [36]:
print (vocab_mbti.ids_to_words([0]))

['<s>']


In [37]:
print (vocab_mbti.words_to_ids(['got','what','and','the']))
#print (vocab_mbti.ids_to_words([202, 147565, 317206, 159348])) 

[31, 2, 2, 2]


In [38]:
print (vocab_mbti.words_to_ids(['intj','twitter','and','the']))

[88106, 3, 2, 2]


In [39]:
#vocab_mbti.words_to_ids(post_train[5])

In [40]:
#post_train[5]

In [41]:
# tokenize and canonicalize train and test sets
x_train = []
for i in range(len(post_train)):
    x_train.append(vocab_mbti.words_to_ids(post_train[i]))

x_test = []
for i in range(len(post_test)):
    x_test.append(vocab_mbti.words_to_ids(post_test[i]))

In [42]:
print("Original Text: ",post_train[88])
print("Canonicalized Text: ", x_train[88])
print("Max lengths of texts: ", max([len(x) for x in x_train+x_test]))

Original Text:  ['lolll', 'looks', 'tall', 'obvs', 'cause', 'stroke', 'baseline', 'stupid', 'yeah', 'prob', 'look', 'pinch', 'wide', 'sans', 'pjs', 'cause', 'formal', 'attire', 'p', 'similarly', 'manic', 'inducing', 'period', 'undiagnosed', 'medical', 'disorder', 'super', 'long', 'shot', 'info', 'maybe', 'get', 'blood', 'work', 'done', 'see', 'everything', 'comes', 'back', 'normal', 'levels', 'one', 'area', 'seeing', 'saved', 'life', 'basically', 'courier', 'bit', 'nice', 'getting', 'designs', 'program', 'useable', 'state', 'developers', 'use', 'course', 'listening', 'loud', 'sounds', 'continued', 'headphone', 'usage', 'cold', 'saturate', 'ears', 'maybe', 'aural', 'break', 'different', 'elevation', 'soft', 'neck', 'back', 'yoga', 'might', 'loosen', 'muscles', 'unkink', 'whatever', 'causing', 'ringing', 'permanent', 'tinnitus', 'right', 'ear', 'work', 'mildly', 'reduces', 'sadly', 'glad', 'worked', 'seth', 'sucks', 'places', 'rural', 'town', 'next', 'city', 'lucky', 'mpbs', 'free', 'upg

# Binary vs Cluster label assignment: choose one only

### Binary mbti (performs better!)

In [45]:
def binary_mbti(string):
    
    if string[:2]=="EN" and string[-1]=="P": #ENP
        return 0
    elif string[:2]=="EN" and string[-1]=="J": #ENJ 
        return 1
    elif string[:2]=="IS" and string[-1]=="P": #ISP
        return 2
    elif string[:2]=="IS"and string[-1]=="J": #ISJ 
        return 3
    elif string[:2]!="IS" and string[:2]!="EN" and string[-1]=="P": #other + P
        return 4
    elif string[:2]!="IS" and string[:2]!="EN" and string[-1]=="J": #other + J
        return 5

        
   # assert len(label_bin) == 4,"Not a valid MBTI type"
   # return label_bin

In [46]:
print(label_train[0])
print(binary_mbti(label_train[0]))

ESTP
4


In [47]:
## binary
y_train_id = list(map(lambda x: binary_mbti(x), label_train))
y_test_id = list(map(lambda x: binary_mbti(x), label_test))

print(y_train_id[0:5])
print(label_train[0:5])
print(y_test_id[0:5])
print(label_test[0:5])

[4, 5, 5, 4, 2]
['ESTP' 'ESFJ' 'INFJ' 'INTP' 'ISFP']
[1, 5, 0, 0, 5]
['ENFJ' 'INTJ' 'ENTP' 'ENFP' 'INFJ']


### Cluster mbti

In [174]:
def cluster_mbti(string):
    label_bin = []
    if string[0]=="E":
        label_bin.append(1)
    else:
        label_bin.append(0)
    if string[1]=="N":
        label_bin.append(1)
    else:
        label_bin.append(0)
    if string[2]=="F":
        label_bin.append(1)
    else:
        label_bin.append(0)
    if string[3]=="J":
        label_bin.append(1)
    else:
        label_bin.append(0)
        
    assert len(label_bin) == 4,"Not a valid MBTI type"
    return label_bin

In [175]:
print(label_train[0])
print(cluster_mbti(label_train[0]))

ESTP
[1, 0, 0, 0]


In [176]:
#cluster
y_train_id = list(map(lambda x: cluster_mbti(x), label_train))
y_test_id = list(map(lambda x: cluster_mbti(x), label_test))

print(y_train_id[0:5])
print(label_train[0:5])

[[1, 0, 0, 0], [1, 0, 1, 1], [0, 1, 1, 1], [0, 1, 0, 0], [0, 0, 1, 0]]
['ESTP' 'ESFJ' 'INFJ' 'INTP' 'ISFP']


## Build the NBOW Model

In [48]:
def pad_np_array(example_ids, max_len=100, pad_id=0):

    arr = np.full([len(example_ids), max_len], pad_id, dtype=np.int32)
    ns = np.zeros([len(example_ids)], dtype=np.int32)
    for i, ids in enumerate(example_ids):
        cpy_len = min(len(ids), max_len)
        arr[i,:cpy_len] = ids[:cpy_len]
        ns[i] = cpy_len
    return arr, ns

def tokenize_post(post_string):
    return vocab_mbti.words_to_ids(post_string)

In [49]:
def as_padded_array(post_ids, targets, max_len=100, pad_id=0,
                    root_only=False, df_idxs=None):

    x, ns = pad_np_array(post_ids, max_len=max_len, pad_id=pad_id)
    return x, ns, np.array(targets)

In [50]:
y_train_1 = []
for i in range(len(y_train_id)):
    y_train_1.append(y_train_id[i])

y_test_1 = []
for i in range(len(y_test_id)):
    y_test_1.append(y_test_id[i])

In [51]:
train_x, train_ns, train_y = as_padded_array(x_train, y_train_1)
test_x, test_ns, test_y = as_padded_array(x_test, y_test_1)

In [138]:
test_x.shape

(1492, 100)

In [139]:
train_x.shape

(5966, 100)

In [97]:
len(y_train_1)

5966

In [153]:
len(y_train_1)
print((y_test_1))

[1, 5, 0, 0, 5, 5, 5, 1, 5, 1, 0, 4, 1, 3, 5, 1, 4, 1, 0, 5, 5, 1, 1, 5, 4, 4, 2, 1, 5, 5, 4, 5, 4, 2, 5, 0, 1, 1, 2, 2, 5, 2, 0, 0, 4, 4, 4, 1, 1, 0, 5, 2, 5, 0, 4, 4, 5, 5, 2, 0, 1, 2, 4, 1, 1, 0, 0, 0, 1, 3, 2, 3, 1, 5, 5, 1, 2, 4, 1, 1, 3, 0, 1, 2, 1, 4, 4, 1, 5, 0, 5, 4, 1, 5, 4, 4, 5, 4, 1, 5, 5, 5, 0, 0, 5, 4, 5, 2, 5, 4, 5, 5, 4, 5, 5, 2, 3, 5, 0, 1, 2, 5, 2, 5, 0, 2, 0, 1, 4, 5, 0, 5, 3, 4, 0, 5, 0, 0, 5, 5, 5, 5, 4, 0, 5, 5, 3, 1, 1, 5, 4, 3, 2, 2, 4, 4, 0, 0, 2, 2, 1, 4, 3, 4, 4, 3, 0, 4, 5, 3, 2, 3, 0, 0, 5, 5, 3, 1, 5, 4, 3, 0, 5, 4, 4, 4, 0, 5, 2, 1, 2, 1, 5, 4, 4, 3, 1, 3, 5, 3, 4, 1, 5, 5, 0, 5, 0, 3, 4, 5, 4, 0, 1, 5, 5, 4, 4, 5, 1, 5, 1, 3, 5, 3, 2, 4, 1, 4, 1, 4, 1, 4, 3, 1, 0, 4, 5, 5, 0, 1, 0, 4, 4, 0, 5, 3, 0, 0, 3, 2, 4, 1, 0, 4, 3, 3, 4, 0, 5, 5, 4, 5, 5, 1, 1, 2, 0, 1, 0, 3, 4, 4, 1, 1, 5, 5, 5, 0, 3, 3, 4, 4, 0, 2, 0, 4, 2, 1, 3, 4, 3, 5, 2, 2, 4, 2, 0, 0, 0, 4, 3, 1, 1, 1, 4, 5, 5, 0, 4, 0, 0, 4, 2, 5, 5, 0, 0, 5, 2, 5, 5, 5, 4, 5, 4, 5, 3, 1, 4, 4, 3, 2, 5, 

In [52]:
import MBTI_BOW_model; reload(MBTI_BOW_model)


<module 'MBTI_BOW_model' from '/Users/heatherkoo/Documents/MIDS/W210 Capstone/personality/Heather/code/MBTI_BOW_model.py'>

In [53]:
#set up model using tf.estimator

import MBTI_BOW_model; reload(MBTI_BOW_model)

# Specify model hyperparameters as used by model
model_params = dict(V=vocab_mbti.size, embed_dim=50, hidden_dims=[25], num_classes=6,
                    encoder_type='bow',
                    lr=0.1, optimizer='adagrad', beta=0.01)

checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)

vocab_mbti.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=MBTI_BOW_model.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

Vocabulary (372,677 words) written to '/tmp/tf_bow_sst_20190802-0548/metadata.tsv'
Projector config written to /tmp/tf_bow_sst_20190802-0548/projector_config.pbtxt
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20190802-0548', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13f3b9908>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20190802-0548' --port 6006

Then in your browser, open: http://localhost:6006


## Train Model

In [54]:
#start training


train_params = dict(batch_size=25, total_epochs=10, eval_every=2)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)


train_input_fn = patched_numpy_io.numpy_input_fn(
                    x={"ids": train_x, "ns": train_ns}, y=train_y,
                    batch_size=train_params['batch_size'], 
                    num_epochs=train_params['eval_every'], shuffle=True, seed=42
                 )


test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": test_x, "ns": test_ns}, y=test_y,
                    batch_size=25, num_epochs=1, shuffle=False
                )

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
  
    model.train(input_fn=train_input_fn)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tf_bow_sst_20190802-0548/model.ckpt.
INFO:tensorflow:loss = 2.7754898, step = 1
INFO:tensorflow:global_step/sec: 170.467
INFO:tensorflow:loss = 2.3798978, step = 101 (0.589 sec)
INFO:tensorflow:global_step/sec: 170.046
INFO:tensorflow:loss = 2.2799463, step = 201 (0.588 sec)
INFO:tensorflow:global_step/sec: 214.757
INFO:tensorflow:loss = 1.8924954, step = 301 (0.465 sec)
INFO:tensorflow:global_step/sec: 212.635
INFO:tensorflow:loss = 2.0387695, step = 401 (0.471 sec)
INFO:tensorflow:Saving checkpoints for 478 into /tmp/tf_bow_sst_20190802-0548/model.ckpt.
INFO:tensorflow:Loss for final step: 2.0891361.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190802-0548/model.ckpt-478
INFO:tensorflow:Saving checkpoints for 479 into /tmp/tf_bow_sst_20190802-0548/model.ckpt.
INFO:tensorflow:loss = 1.0758361, step = 479
INFO:tensorflow:global_step/s

## Evaluation

In [55]:
#Evaluation on test data

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")  

print ("Perplexity on test set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))

eval_metrics

INFO:tensorflow:Starting evaluation at 2019-08-02-12:49:29
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190802-0548/model.ckpt-2390
INFO:tensorflow:Finished evaluation at 2019-08-02-12:49:30
INFO:tensorflow:Saving dict for global step 2390: accuracy = 0.22520107, cross_entropy_loss = 2.130912, global_step = 2390, loss = 2.3780413
Perplexity on test set: 8.42
Accuracy on test set: 22.52%


{'accuracy': 0.22520107,
 'cross_entropy_loss': 2.130912,
 'loss': 2.3780413,
 'global_step': 2390}

In [235]:
## Evaluation on training data

eval_metrics = model.evaluate(input_fn=train_input_fn, name="train")  

print ("Perplexity on train set: {:.03}".format(math.exp(eval_metrics['cross_entropy_loss'])))
print("Accuracy on train set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-07-29T08:09:43Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20190729-0108/model.ckpt-3310
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-07-29-08:09:45
INFO:tensorflow:Saving dict for global step 3310: accuracy = 0.9203048, cross_entropy_loss = 0.18425459, global_step = 3310, loss = 0.2531177
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3310: /tmp/tf_bow_sst_20190729-0108/model.ckpt-3310
Perplexity on train set: 1.2
Accuracy on train set: 92.03%


{'accuracy': 0.9203048,
 'cross_entropy_loss': 0.18425459,
 'loss': 0.2531177,
 'global_step': 3310}

## On data with empty tweet lists!

## On data where no empty tweets