In [1]:
import pandas as pd
import numpy as np
import re


# Load the raw dataset; later cells read its 'type' and 'posts' columns.
df = pd.read_csv('mbti_1.csv')

# --- Text-normalisation patterns and the placeholder token each one maps to ---

# MBTI type mentions; the compiled regex ignores case.
# https://stackoverflow.com/questions/16720541/python-string-replace-regular-expression/16720705
_MBTI_CODES = (
    "ISFJ", "ESFP", "ISFP", "ISTP", "ENFP", "ENFJ", "INFJ", "ESTP",
    "ESFJ", "ESTJ", "ENTP", "INFP", "INTP", "INTJ", "ISTJ", "ENTJ",
)
mbti_pat = "|".join(_MBTI_CODES)
mbti_regex = re.compile(mbti_pat, flags=re.IGNORECASE)
MBTI_REP = '$MBTI$'

# Hashtags: '#' followed by one or more ASCII letters/digits.
hashtag_pat = r"(\#[a-zA-Z0-9]+\b)"
hashtag_regex = re.compile(hashtag_pat)
HASHTAG_REP = '$HASHTAG$'

# Links / URLs (scheme and 'www.' are both optional).
# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
link_pat = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
LINK_REP = '$LINK$'

In [2]:
def _normalize_posts(text):
    """Replace links, MBTI mentions and hashtags in one user's posts with placeholders.

    Links are substituted first so that URL paths/fragments containing a type
    name or a '#' are swallowed by the link placeholder instead of being
    partially rewritten by the other two patterns.  MBTI mentions use the
    compiled, case-insensitive ``mbti_regex`` (the raw ``mbti_pat`` string
    only matches upper-case spellings).  Finally the '|||' post separator is
    turned into a space so the last word of one post and the first word of
    the next do not fuse into a single token.

    Args:
        text: the raw 'posts' string of one row.

    Returns:
        The cleaned string.
    """
    text = re.sub(link_pat, LINK_REP, text)
    text = mbti_regex.sub(MBTI_REP, text)
    text = hashtag_regex.sub(HASHTAG_REP, text)
    return text.replace('|||', ' ')


df['posts'] = df['posts'].apply(_normalize_posts)

In [3]:
# Derive one binary label column per MBTI axis from the four-letter 'type' code.
# Each spec is (new column, letter kept when it matches, letter used otherwise);
# the position of the spec is the character position inspected in 'type'.
_axis_specs = [('IE', 'I', 'E'), ('NS', 'N', 'S'), ('FT', 'F', 'T'), ('PJ', 'P', 'J')]
for _pos, (_col, _hit, _miss) in enumerate(_axis_specs):
    df[_col] = df['type'].apply(
        lambda code, pos=_pos, hit=_hit, miss=_miss: hit if code[pos] == hit else miss
    )

In [4]:
# One frame per axis, each holding the full type, the cleaned posts and that axis' label.
df_ie, df_ns, df_ft, df_pj = (
    df[['type', 'posts', axis_col]] for axis_col in ('IE', 'NS', 'FT', 'PJ')
)

# Fraction of rows used for training.
train_pct = 0.6

# Fraction of the (shuffled) rows after which the test split starts.
# Dev and test share the non-training remainder equally.
holdout_pct = 1.0 - train_pct
test_split_position = 1.0 - holdout_pct / 2

0.8

In [116]:
# IE split
import collections

def _split_train_dev_test(frame, seed=224):
    """Shuffle ``frame`` and cut it into consecutive (train, dev, test) slices.

    The cut points are ``int(train_pct * n)`` and ``int(test_split_position * n)``,
    the same boundaries the previous ``np.split`` calls used.  Positional
    ``iloc`` slicing is used instead of ``np.split`` on a DataFrame, which
    goes through the deprecated ``DataFrame.swapaxes`` in recent pandas.

    Args:
        frame: DataFrame to split.
        seed: random_state for the shuffle.  All axis frames have the same
            length and use the same seed, so they are shuffled identically.

    Returns:
        Tuple ``(train, dev, test)`` of DataFrames.
    """
    shuffled = frame.sample(frac=1, random_state=seed)
    train_end = int(train_pct * len(shuffled))
    dev_end = int(test_split_position * len(shuffled))
    return shuffled.iloc[:train_end], shuffled.iloc[train_end:dev_end], shuffled.iloc[dev_end:]


train_ie, dev_ie, test_ie = _split_train_dev_test(df_ie)
train_ns, dev_ns, test_ns = _split_train_dev_test(df_ns)
train_ft, dev_ft, test_ft = _split_train_dev_test(df_ft)
train_pj, dev_pj, test_pj = _split_train_dev_test(df_pj)

def get5050sample(train_data, train_data_col, random_state=None):
    """Downsample the more frequent label so both labels are equally represented.

    Args:
        train_data: DataFrame to rebalance.
        train_data_col: label Series aligned with ``train_data``
            (e.g. ``train_data['IE']``).
        random_state: optional seed forwarded to ``DataFrame.sample`` for both
            the downsampling and the final shuffle.  The default ``None``
            keeps the previous unseeded behaviour.

    Returns:
        A shuffled DataFrame containing every row of the second most common
        label plus an equally sized random sample of the most common label.

    Raises:
        ValueError: if fewer than two distinct labels are present, since no
            balanced sample can be drawn.
    """
    counts_train = collections.Counter(train_data_col)
    ranked = counts_train.most_common(2)
    if len(ranked) < 2:
        raise ValueError("get5050sample needs at least two distinct labels")
    (most_freq, _), (least_freq, sample_count) = ranked

    minority_rows = train_data[train_data_col == least_freq]
    majority_sample = train_data[train_data_col == most_freq].sample(
        n=sample_count, random_state=random_state
    )
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    balanced = pd.concat([minority_rows, majority_sample])
    return balanced.sample(frac=1, random_state=random_state)

# Rebalance each axis' training split to an equal class ratio, in the order IE, NS, FT, PJ.
train_ie, train_ns, train_ft, train_pj = [
    get5050sample(split, split[axis_col])
    for split, axis_col in (
        (train_ie, 'IE'), (train_ns, 'NS'), (train_ft, 'FT'), (train_pj, 'PJ'),
    )
]

In [117]:
import tensorflow as tf

In [7]:
from sklearn.feature_extraction.text import CountVectorizer as cv

In [8]:
# build vocab dict, mapping word to unique number
# keep track of vocab size (then +1 for UNK)
def build_vocab_dict():
    """Build the word -> index vocabulary from all four training splits.

    The previous version called ``df.append(...)`` and discarded the result,
    so only ``train_ie`` contributed words.  The splits are now concatenated
    explicitly before fitting.

    Returns:
        dict mapping each token found by a default ``CountVectorizer`` to a
        unique integer index.
    """
    all_train_posts = pd.concat([train_ie, train_ns, train_ft, train_pj])['posts']
    counter = cv()
    counter.fit(all_train_posts)
    return counter.vocabulary_

In [9]:
# Vocabulary from the training posts, plus one extra id for unknown words.
# vocab_size counts the real words only, so the 'UNK' id equals vocab_size.
vocab_dict = build_vocab_dict()
vocab_size = len(vocab_dict)
vocab_dict.update({'UNK': vocab_size})

In [11]:
DEFAULT_FILE_PATH = "./glove.6B.50d.txt"
def loadWordVectors(tokens, filepath=DEFAULT_FILE_PATH, dimensions=50):
    """Read pretrained GloVe vectors for the words in ``tokens``.

    Args:
        tokens: dict mapping word (str) -> row index in the returned matrix.
        filepath: path to a GloVe text file, one "word v1 v2 ..." entry per line.
        dimensions: expected number of vector components per word.

    Returns:
        ``np.ndarray`` of shape ``(len(tokens), dimensions)``.  Rows of words
        that do not occur in the file stay all-zero.

    Raises:
        RuntimeError: if a matched line does not carry ``dimensions`` values.
    """
    wordVectors = np.zeros((len(tokens), dimensions))
    # The file is read as bytes so lines are split on ASCII whitespace only;
    # the word itself must be decoded, otherwise a bytes token can never
    # match the str keys of ``tokens`` on Python 3 and every row stays zero.
    with open(filepath, 'rb') as ifs:
        for line in ifs:
            row = line.split()
            if not row:
                continue
            token = row[0].decode('utf-8', errors='replace')
            if token not in tokens:
                continue
            data = [float(x) for x in row[1:]]
            if len(data) != dimensions:
                raise RuntimeError("wrong number of dimensions")
            wordVectors[tokens[token]] = np.asarray(data)
    return wordVectors

In [12]:
# Embedding matrix has one row per vocab_dict entry (including 'UNK') and
# glove_dimensions columns.
glove_dimensions = 50
def build_embeddings_matrix():
    """Load the GloVe vectors for every word in ``vocab_dict``.

    The earlier post-processing loop compared ``len(row)`` with
    ``glove_dimensions``; every row of the matrix already has exactly that
    many columns, so the branch could never run and has been removed.
    Words without a GloVe entry (and 'UNK') keep the all-zero row that
    ``loadWordVectors`` initialises.

    Returns:
        ``np.ndarray`` of shape ``(len(vocab_dict), glove_dimensions)``.
    """
    return loadWordVectors(vocab_dict, dimensions=glove_dimensions)

In [13]:
embed_matrix = build_embeddings_matrix()

In [61]:
import itertools

def build_data_matrices(data, column, type_that_should_be_one):
    """Convert posts into a padded matrix of word ids and a 0/1 label vector.

    Posts are tokenised the same way the vocabulary was built (a default
    ``CountVectorizer``: lower-cased, tokens of two or more word characters).
    Splitting on whitespace alone left capitalised or punctuated words out of
    the vocabulary, so they were all mapped to 'UNK'.

    Args:
        data: DataFrame with a 'posts' column and the label column ``column``.
        column: name of the label column (e.g. 'IE').
        type_that_should_be_one: label value encoded as 1; every other value
            is encoded as 0.

    Returns:
        ``(data_posts, data_labels)`` where ``data_posts`` has one row per
        post, padded on the right with ``vocab_size`` (the 'UNK' id) up to the
        longest post, and ``data_labels`` is a 1-D array of 0/1.
    """
    unk_id = vocab_dict['UNK']
    token_ids = [
        [vocab_dict.get(word, unk_id) for word in re.findall(r"(?u)\b\w\w+\b", post.lower())]
        for post in data['posts']
    ]
    # zip_longest pads column-wise; transposing restores one row per post.
    data_posts = np.array(list(itertools.zip_longest(*token_ids, fillvalue=vocab_size))).T
    data_labels = np.asarray([1 if label == type_that_should_be_one else 0 for label in data[column]])
    return data_posts, data_labels

In [118]:
# Encode each balanced training split; the third item is the label value encoded as 1.
_train_specs = [(train_ie, 'IE', 'E'), (train_ns, 'NS', 'N'), (train_ft, 'FT', 'F'), (train_pj, 'PJ', 'P')]
(
    (train_ie_data, train_ie_labels),
    (train_ns_data, train_ns_labels),
    (train_ft_data, train_ft_labels),
    (train_pj_data, train_pj_labels),
) = [build_data_matrices(split, axis_col, positive) for split, axis_col, positive in _train_specs]

In [119]:
# Quick check: number of label entries in the balanced I/E training set.
train_ie_labels.shape

(2356,)

In [17]:
# Number of word ids per row fed to the model: batches are cut to this many
# columns, and the model's input placeholder has this fixed width.
max_post_len = 300

In [79]:
def build_model(data_matrix, data_labels, hidden_size=128, lr=0.005):
    """Build the graph: fixed embedding lookup -> LSTM -> linear 2-class classifier.

    Unused locals and the commented-out gradient-clipping optimizer were
    removed; the graph that is built is unchanged.

    Args:
        data_matrix: not used by the graph; kept so existing callers work.
        data_labels: not used by the graph; kept so existing callers work.
        hidden_size: number of LSTM units.
        lr: learning rate for the Adam optimizer.

    Returns:
        Tuple ``(pred, input_placeholder, labels_placeholder, train_op, loss_op)``
        where ``pred`` holds the unnormalised class scores (logits) and
        ``loss_op`` is the mean softmax cross-entropy over the batch.
    """
    n_classes = 2

    # Inputs: a batch of word-id rows of width max_post_len and one label id per row.
    input_placeholder = tf.placeholder(tf.int32, shape=(None, max_post_len))
    labels_placeholder = tf.placeholder(tf.int32, shape=(None, ))

    # embed_matrix is passed as a plain array, not a variable, so it is not trained.
    x = tf.nn.embedding_lookup(embed_matrix, input_placeholder)

    # Output projection from the LSTM state to the two class scores.
    U = tf.get_variable("U", shape=[hidden_size, n_classes], dtype=tf.float64, initializer=tf.contrib.layers.xavier_initializer())
    b = tf.get_variable("b", shape=[1, n_classes], dtype=tf.float64, initializer=tf.constant_initializer(0.0))

    rnn_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size)
    # NOTE(review): output_keep_prob drops units of the per-step outputs, but the
    # classifier below reads the final state rather than those outputs -- confirm
    # whether this dropout has the intended effect.
    rnn_cell = tf.nn.rnn_cell.DropoutWrapper(rnn_cell, output_keep_prob=0.8)
    _, final_state = tf.nn.dynamic_rnn(rnn_cell, x, dtype=tf.float64)

    # Second element of the LSTM state tuple is used as the sequence summary.
    h = final_state[1]
    pred = tf.matmul(h, U) + b

    labels_one_hot = tf.one_hot(labels_placeholder, n_classes)
    loss_op = tf.nn.softmax_cross_entropy_with_logits(labels=labels_one_hot, logits=pred)
    loss_op = tf.reduce_mean(loss_op, 0)

    train_op = tf.train.AdamOptimizer(learning_rate = lr).minimize(loss_op)
    return pred, input_placeholder, labels_placeholder, train_op, loss_op

In [68]:
def get_minibatches(data_matrix, data_labels, batch_size, max_sequence_length):
    """Slice the data into consecutive ``(inputs, labels)`` batches.

    Args:
        data_matrix: 2-D array with one row per example.
        data_labels: 1-D array of labels aligned with the rows.
        batch_size: maximum number of rows per batch (the last may be smaller).
        max_sequence_length: number of leading columns kept in each batch.

    Returns:
        List of ``(batch, batch_label)`` tuples in row order.
    """
    row_count = data_matrix.shape[0]
    return [
        (
            data_matrix[start:start + batch_size, :max_sequence_length],
            data_labels[start:start + batch_size],
        )
        for start in range(0, row_count, batch_size)
    ]

In [69]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


def get_accuracy(pred, labels, classes):
    """Print accuracy, micro/macro F1 and per-class F1 for 0/1 predictions.

    Args:
        pred: predicted label ids.
        labels: true label ids.
        classes: display names ordered as ``[name of label 1, name of label 0]``,
            which is how the callers in this notebook pass them
            (e.g. ``['E', 'I']`` with 'E' encoded as 1).
    """
    prec = 2
    accuracy = accuracy_score(labels, pred)
    print ("Accuracy: " + str(round(accuracy * 100, prec)) + "%")
    micro_f1 = f1_score(labels, pred, average="micro")
    macro_f1 = f1_score(labels, pred, average="macro")
    # With average=None the scores come back in sorted label order (0, 1) by
    # default, which would pair each score with the wrong class name.  Ask for
    # the order (1, 0) explicitly so it lines up with ``classes``.
    class_f1 = f1_score(labels, pred, average=None, labels=[1, 0])
    print ("Micro F1 score: " + str(round(micro_f1 * 100, prec)) + "%")
    print ("Macro F1 score: " + str(round(macro_f1 * 100, prec)) + "%")
    for class_name, score in zip(classes, class_f1):
        print("F1 score for " + class_name + ": ", str(round(score*100, 3)) + "%")

In [70]:
def train(data_matrix, data_labels, save_path, hidden_size=128, lr=0.005, saved_model_path=None, RESUME=False, batch_size=256, n_epochs=30):
    """Train the classifier and write checkpoints.

    Two checkpoints are written: ``save_path + "--smallest loss"`` whenever an
    epoch reaches a new lowest average loss (including the first epoch), and
    ``save_path`` after the last epoch.  Previously the "smallest loss"
    checkpoint was written on any decrease relative to the preceding epoch,
    so it could end up holding a model worse than an earlier one, and it was
    never written if the loss did not drop after epoch 1.

    Args:
        data_matrix: 2-D array of word ids, one row per example.
        data_labels: 1-D array of 0/1 labels.
        save_path: checkpoint path prefix.
        hidden_size: LSTM size passed to ``build_model``.
        lr: learning rate passed to ``build_model``.
        saved_model_path: checkpoint to restore when ``RESUME`` is True.
        RESUME: continue training from ``saved_model_path``.
        batch_size: number of rows per minibatch.
        n_epochs: number of passes over the data.

    Raises:
        ValueError: if ``RESUME`` is True but ``saved_model_path`` is None.
    """
    tf.reset_default_graph()
    _, input_placeholder, labels_placeholder, train_op, loss_op = build_model(data_matrix, data_labels, hidden_size=hidden_size, lr=lr)
    saver = tf.train.Saver()
    avg_loss_list = []
    best_loss = float('inf')
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        if RESUME:
            if saved_model_path is None:
                raise ValueError("saved_model_path is required when RESUME=True")
            saver.restore(sess, saved_model_path)
            print("Model restored.")

        # Batches are built once and replayed in the same order every epoch.
        minibatches = get_minibatches(data_matrix, data_labels, batch_size, max_post_len)
        for i in range(n_epochs):
            batch_loss_list = []
            print ("Epoch " + str(i+1) + ": ")
            for batch_inputs, batch_labels in minibatches:
                _, loss = sess.run([train_op, loss_op], feed_dict={input_placeholder: batch_inputs, labels_placeholder: batch_labels})
                batch_loss_list.append(loss)
            epoch_loss = np.mean(batch_loss_list)
            avg_loss_list.append(epoch_loss)
            print ("=====>loss: " + str(epoch_loss) + " ")
            if epoch_loss < best_loss:
                best_loss = epoch_loss
                tmp_path = save_path + "--smallest loss"
                saver.save(sess, tmp_path)
                print ("New min loss at epoch %s! Model saved in path %s" % (str(i+1), tmp_path))
        saver.save(sess, save_path)
        print("Final model saved in path: %s" % save_path)

In [71]:
def test(data_matrix, data_labels, saved_model_path, classes, hidden_size=128, batch_size=256):
    """Restore a checkpoint, predict every row and print loss and metrics.

    Args:
        data_matrix: 2-D array of word ids, one row per example.
        data_labels: 1-D array of 0/1 labels.
        saved_model_path: checkpoint path prefix to restore.
        classes: class display names forwarded to ``get_accuracy``.
        hidden_size: LSTM size; must match the restored checkpoint.
        batch_size: number of rows per evaluation batch.
    """
    tf.reset_default_graph()
    pred, input_placeholder, labels_placeholder, _, loss_op = build_model(data_matrix, data_labels, hidden_size=hidden_size)
    saver = tf.train.Saver()
    loss_list = []
    pred_list = []
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, saved_model_path)
        print("Model restored.")

        minibatches = get_minibatches(data_matrix, data_labels, batch_size, max_post_len)
        for batch_inputs, batch_labels in minibatches:
            pred_temp, loss = sess.run([pred, loss_op], feed_dict={input_placeholder: batch_inputs, labels_placeholder: batch_labels})
            # Row-wise argmax picks the first highest score, the same tie-break
            # as the earlier per-row max/where scan, without a Python loop.
            pred_list.extend(np.argmax(pred_temp, axis=1))
            loss_list.append(loss)
        # Unweighted mean over batches (the last batch may be smaller).
        print ("Loss: " + str(np.mean(loss_list)) + "\n")

    get_accuracy(pred_list, data_labels, classes)
    print("predictions: ", pred_list)
    print("labels: ", data_labels)

In [None]:
# Train the I/E classifier for 15 epochs; checkpoints are saved under the "ie" prefix.
train(train_ie_data, train_ie_labels, "ie", n_epochs=15)

In [None]:
# Train the N/S classifier for 15 epochs; checkpoints are saved under the "ns" prefix.
train(train_ns_data, train_ns_labels, "ns", n_epochs=15)

In [None]:
# Train the F/T classifier for 15 epochs; checkpoints are saved under the "ft" prefix.
train(train_ft_data, train_ft_labels, "ft", n_epochs=15)

In [None]:
# Train the P/J classifier for 15 epochs; checkpoints are saved under the "pj" prefix.
train(train_pj_data, train_pj_labels, "pj", n_epochs=15)

In [77]:
# Encode each dev split; the third item is the label value encoded as 1.
_dev_specs = [(dev_ie, 'IE', 'E'), (dev_ns, 'NS', 'N'), (dev_ft, 'FT', 'F'), (dev_pj, 'PJ', 'P')]
(
    (dev_ie_data, dev_ie_labels),
    (dev_ns_data, dev_ns_labels),
    (dev_ft_data, dev_ft_labels),
    (dev_pj_data, dev_pj_labels),
) = [build_data_matrices(split, axis_col, positive) for split, axis_col, positive in _dev_specs]

In [None]:
# Evaluate the "ie--smallest loss" checkpoint on the dev split (label 1 = 'E', label 0 = 'I').
test(dev_ie_data, dev_ie_labels, "ie--smallest loss", ['E', 'I'])

In [None]:
# Evaluate the "ns--smallest loss" checkpoint on the dev split (label 1 = 'N', label 0 = 'S').
test(dev_ns_data, dev_ns_labels, "ns--smallest loss", ['N', 'S'])

In [None]:
# Evaluate the "ft--smallest loss" checkpoint on the dev split (label 1 = 'F', label 0 = 'T').
test(dev_ft_data, dev_ft_labels, "ft--smallest loss", ['F', 'T'])

In [None]:
# Evaluate the "pj--smallest loss" checkpoint on the dev split (label 1 = 'P', label 0 = 'J').
test(dev_pj_data, dev_pj_labels, "pj--smallest loss", ['P', 'J'])

In [None]:
# Encode each held-out test split; the third item is the label value encoded as 1.
_test_specs = [(test_ie, 'IE', 'E'), (test_ns, 'NS', 'N'), (test_ft, 'FT', 'F'), (test_pj, 'PJ', 'P')]
(
    (test_ie_data, test_ie_labels),
    (test_ns_data, test_ns_labels),
    (test_ft_data, test_ft_labels),
    (test_pj_data, test_pj_labels),
) = [build_data_matrices(split, axis_col, positive) for split, axis_col, positive in _test_specs]

In [None]:
# Evaluate the "ie--smallest loss" checkpoint on the test split (label 1 = 'E', label 0 = 'I').
test(test_ie_data, test_ie_labels, "ie--smallest loss", ['E', 'I'])

In [None]:
# Evaluate the "ns--smallest loss" checkpoint on the test split (label 1 = 'N', label 0 = 'S').
test(test_ns_data, test_ns_labels, "ns--smallest loss", ['N', 'S'])

In [None]:
# Evaluate the "ft--smallest loss" checkpoint on the test split (label 1 = 'F', label 0 = 'T').
test(test_ft_data, test_ft_labels, "ft--smallest loss", ['F', 'T'])

In [None]:
# Evaluate the "pj--smallest loss" checkpoint on the test split (label 1 = 'P', label 0 = 'J').
test(test_pj_data, test_pj_labels, "pj--smallest loss", ['P', 'J'])