To Add:
    * Extract more features
    * add more to writeup sections after the train and eval section

Joey Demple<br>
Prof Hulden<br>
LING 5800<br>
Final Project<br>
Dec 17, 2017

# Myers-Briggs Personality Type Classifier
Earlier, I trained a classifier to predict which of the 16 possible MBTI personality types a person belongs to, based on their last 50 online posts. The accuracy is about 27% for this holistic model with all 16 labels. Although this sounds like pretty bad accuracy, picking one of the 16 personality types uniformly at random would give only 6.25% accuracy, so we are doing quite a bit better than chance.
However, what I want to do here is create four separate classifiers and output a prediction for each one. Breaking this up should provide some interesting insights.

In [1]:
import pandas as pd
import numpy as np
import sklearn
from nltk import word_tokenize
import re

# Classifiers to use in this notebook
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn import tree
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction import DictVectorizer
# Shared module-level vectorizer: vectorize_all() fits it on the training
# feature dicts and reuses it to transform the test feature dicts.
vectorizer = DictVectorizer(sparse=True)

## Set Parameters for this notebook

In [2]:
# Classifiers to be fitted and evaluated.
# fit_and_evaluate_all() loops over this list and stores one result entry per
# estimator; the SVC entry below is commented out and therefore not evaluated.
classifiers = [MultinomialNB(), 
               LogisticRegression(), 
               # svm.SVC(kernel='rbf'), 
               Perceptron(), 
               tree.DecisionTreeClassifier()]

In [3]:
# ratio of train/test data: the fraction of rows sampled into the training
# set; the rows that are not sampled form the test set (see load_and_extract)
split_ratio = 0.85

# seed for random split of train/test data
# (load_and_extract uses its seed argument as random_state for DataFrame.sample)
seed = 1

In [4]:
# Location of data (CSV file, path relative to the working directory).
# It is read with pandas.read_csv in load_and_extract; the code below uses
# its `type` and `posts` columns.
file = './data/mbti_1.csv'

# Our data is the MBTI-Type dataset from Kaggle.com
# https://www.kaggle.com/datasnaek/mbti-type/data

## Define Our Functions

Preprocessing

In [5]:
def load_and_extract(filepath, split_ratio=0.85, seed=1):
    """Read the CSV at *filepath* and split its rows into train and test frames.

    A fraction ``split_ratio`` of the rows is drawn at random, using ``seed``
    as the random state so the draw is reproducible, to form the training
    frame.  Every row that was not drawn ends up in the test frame.

    Returns a ``(train, test)`` tuple of DataFrames.
    """
    frame = pd.read_csv(filepath)
    train_part = frame.sample(frac=split_ratio, random_state=seed)
    # Whatever was not sampled for training is held out for testing.
    held_out = frame.drop(train_part.index)
    return train_part, held_out

In [6]:
def split_into_dichotomies(df):
    """Add one column per letter of the four-letter ``type`` string.

    Columns ``dicho1`` .. ``dicho4`` receive the 1st .. 4th character of
    ``type`` (e.g. ENTP -> E, N, T, P).  ``df`` is modified in place and is
    also returned.
    """
    for position in range(4):
        df['dicho' + str(position + 1)] = df.type.str[position]
    return df

In [7]:
def remove_double_period(clean_str):
    """Return *clean_str* with every ``..`` pair replaced by ``. ``.

    Pairs are consumed left to right without overlap, so ``...`` becomes
    ``. .``.
    """
    pieces = clean_str.split('..')
    return '. '.join(pieces)

In [8]:
def remove_delim(clean_str):
    """Return *clean_str* with each ``|||`` delimiter replaced by ``. ``."""
    segments = clean_str.split('|||')
    return '. '.join(segments)

In [9]:
def replace_links(clean_str):
    """Return *clean_str* with each http:// or https:// URL replaced by ``*LINK*``."""
    url_matcher = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_matcher.sub('*LINK*', clean_str)

In [10]:
def clean_posts(posts_raw):
    """Normalise one raw ``posts`` string into a list of lowercase tokens.

    Steps, in order: strip a wrapping quote character, rewrite ``..`` and the
    ``|||`` delimiter via remove_double_period / remove_delim, replace URLs
    via replace_links, tokenize with NLTK's ``word_tokenize`` and lowercase
    every token.

    Returns a list of lowercase token strings.
    """
    clean = posts_raw
    if clean.startswith(('"', "'")):
        # remove extra string quotes if exist
        opening_quote = clean[0]
        clean = clean[1:]
        # Drop the final character only when it is the matching closing quote.
        # Slicing [1:-1] unconditionally would eat a real character from text
        # that merely begins with a quote or apostrophe.
        if clean.endswith(opening_quote):
            clean = clean[:-1]
    clean = remove_double_period(clean)
    clean = remove_delim(clean)
    clean = replace_links(clean)
    return [token.lower() for token in word_tokenize(clean)]
    

Vectorization

In [11]:
def get_vocab(df):
    """Return the set of all distinct tokens in the ``post_list`` column of *df*.

    This is the vocabulary of the dataset, i.e. all possible features.
    """
    return {token for row in df.itertuples() for token in row.post_list}

In [12]:
def vectorize_one(x, vocab, dichotomy=False):
    """Turn one dataset row into a ``(label, feature_dict)`` pair.

    Parameters:
        x: a row object (e.g. from ``DataFrame.itertuples()``) exposing the
            attributes ``type``, ``dicho1`` .. ``dicho4`` and ``post_list``.
        vocab: container of known tokens; tokens outside it are ignored.
        dichotomy: 1, 2, 3 or 4 selects the matching ``dichoN`` attribute as
            the label.  Any other value, including the default ``False``,
            selects the full ``type`` label (e.g. INTJ).

    Returns:
        ``(label, feature_dict)`` where ``feature_dict`` maps every
        in-vocabulary token of ``x.post_list`` to 1.  This records presence
        only, so repeated tokens are not counted.
    """
    # Tuple membership compares with ==, exactly like the equality checks it
    # replaces, so 1.0 or True still select the first dichotomy.
    if dichotomy in (1, 2, 3, 4):
        label = getattr(x, 'dicho%d' % dichotomy)
    else:
        label = x.type

    feature_dict = {fx: 1 for fx in x.post_list if fx in vocab}
    return label, feature_dict

In [13]:
def get_feature_dict(df, vocab, dichotomy=False):
    """Vectorize every row of *df* and return ``(labels, feature_dicts)``.

    Each row is passed to vectorize_one together with *vocab*.  A truthy
    *dichotomy* is forwarded as the label selector; a falsy one means the
    full type label is used.  Both returned lists follow the row order of
    *df*.
    """
    # A falsy selector is normalised to False, the default of vectorize_one.
    selector = dichotomy or False
    pairs = [vectorize_one(row, vocab, selector) for row in df.itertuples()]
    labels = [pair[0] for pair in pairs]
    features = [pair[1] for pair in pairs]
    return labels, features

In [14]:
def vectorize_all(train, test, dichotomy=False):
    """Build feature matrices and label lists for the train and test sets.

    The vocabulary is taken from *train* only, so test tokens that never
    occur in training are dropped.  The module-level ``vectorizer`` is
    refitted on the training feature dicts on every call and then reused to
    transform the test feature dicts.

    *dichotomy* is forwarded to get_feature_dict to choose the label: 1-4 for
    a single letter of the type, falsy for the full type.

    Returns ``(X_train, y_train, X_test, y_test)``.
    """
    vocab = get_vocab(train)

    # training data: fitting here fixes the feature-to-column mapping
    y_train, X_train_dict = get_feature_dict(train, vocab, dichotomy)
    X_train = vectorizer.fit_transform(X_train_dict)

    # testing data: transform only, so columns line up with the training matrix
    y_test, X_test_dict = get_feature_dict(test, vocab, dichotomy)
    X_test = vectorizer.transform(X_test_dict)

    return X_train, y_train, X_test, y_test

Train and Evaluate

In [15]:
def fit_and_evaluate_one(classifier, X_train, y_train, X_test, y_test):
    """Fit *classifier* on the training data and score it on both splits.

    Returns a dict with the keys ``accur_train`` and ``accur_test``: the
    fraction of training / test samples whose predicted label equals the
    true label.
    """
    classifier.fit(X_train, y_train)

    def fraction_correct(features, gold):
        # element-wise comparison of the predictions against the true labels
        predicted = classifier.predict(features)
        return sum(predicted == gold) / len(gold)

    return {
        'accur_train': fraction_correct(X_train, y_train),
        'accur_test': fraction_correct(X_test, y_test),
    }

In [16]:
def fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test):
    """Fit and score every classifier, returning results keyed by model name.

    The name is the text before the first '(' in ``str(classifier)``.  Two
    classifiers that produce the same name share a key, so the later one
    overwrites the earlier one's entry.  A progress line is printed before
    each model is trained.
    """
    scores_by_model = {}
    for model in classifiers:
        model_name = str(model).partition('(')[0]
        print('Training ' + model_name + '...')
        scores = fit_and_evaluate_one(model, X_train, y_train, X_test, y_test)
        scores_by_model[model_name] = scores
    return scores_by_model

## Load and Preprocess the Data

In [17]:
# Pass the notebook-level `seed` parameter rather than repeating the literal 1,
# so that changing the seed in the parameters cell actually changes the split.
train, test = load_and_extract(file, split_ratio, seed=seed)

The four personality dichotomies, i.e. the two possible labels for each of the four binary classifiers

In [18]:
# The four MBTI dichotomies, keyed by letter position (1-4) in the type string;
# each entry maps its two possible letters to the value 0.
# NOTE(review): not referenced by any other cell visible in this notebook.
Personality_dichotomies = {1:{'E':0, 'I':0},
                           2:{'N':0, 'S':0},
                           3:{'F':0, 'T':0},
                           4:{'J':0, 'P':0}}

Split type into four dichotomies

In [19]:
# Add the dicho1..dicho4 label columns (one per letter of `type`) to both splits
train = split_into_dichotomies(train)
test = split_into_dichotomies(test)

Get rid of the extra string quotes and tokenize each row of `posts` into a list of lowercase word tokens

In [20]:
# post_list holds each row's cleaned, lowercased token list;
# get_vocab and vectorize_one read this column
train['post_list'] = train['posts'].apply(clean_posts)
test['post_list'] = test['posts'].apply(clean_posts)

## Vectorize Train and Evaluate
I put all of these steps into one cell per model so that the models run one at a time and each run overwrites the previous feature vectors. This should prevent using too much memory.

### Holistic Model<br>
16 types

In [21]:
# No dichotomy argument: the labels are the full four-letter types
X_train, y_train, X_test, y_test = vectorize_all(train, test)

In [22]:
# Fit every classifier, then tabulate one row per model,
# sorted by test accuracy (best first)
holistic_result_dict = fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)
holistic_result = pd.DataFrame.from_dict(holistic_result_dict).transpose().sort_values('accur_test', ascending=False)
holistic_result  # last expression of the cell: displays the table

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


Unnamed: 0,accur_test,accur_train
LogisticRegression,0.540354,1.0
Perceptron,0.500384,0.993355
DecisionTreeClassifier,0.388163,1.0
MultinomialNB,0.273636,0.621779


The best accuracy is about 54% for the holistic model with all 16 labels (using Logistic Regression). Although this sounds like pretty bad accuracy, picking one personality type uniformly at random would give only 6.25% accuracy, so we are doing quite a bit better than chance.

### Dichotomized Model<br>
First Dichotomy: E or I

In [23]:
# dichotomy=1: the labels are the first letter of the type (E or I).
# The feature matrices from the previous model are overwritten here.
X_train, y_train, X_test, y_test = vectorize_all(train, test, dichotomy=1)
dicho1_result_dict = fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)
# one row per model, sorted by test accuracy (best first)
dicho1_result = pd.DataFrame.from_dict(dicho1_result_dict).transpose().sort_values('accur_test', ascending=False)
dicho1_result  # last expression of the cell: displays the table

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


Unnamed: 0,accur_test,accur_train
LogisticRegression,0.833205,1.0
Perceptron,0.795542,0.971793
MultinomialNB,0.790161,0.897884
DecisionTreeClassifier,0.759416,1.0


Second Dichotomy: N or S

In [24]:
# dichotomy=2: the labels are the second letter of the type (N or S).
# The feature matrices from the previous model are overwritten here.
X_train, y_train, X_test, y_test = vectorize_all(train, test, dichotomy=2)
dicho2_result_dict = fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)
# one row per model, sorted by test accuracy (best first)
dicho2_result = pd.DataFrame.from_dict(dicho2_result_dict).transpose().sort_values('accur_test', ascending=False)
dicho2_result  # last expression of the cell: displays the table

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


Unnamed: 0,accur_test,accur_train
LogisticRegression,0.85857,1.0
Perceptron,0.848578,0.989829
MultinomialNB,0.840123,0.890155
DecisionTreeClassifier,0.814758,1.0


Third Dichotomy: F or T

In [25]:
# dichotomy=3: the labels are the third letter of the type (F or T).
# The feature matrices from the previous model are overwritten here.
X_train, y_train, X_test, y_test = vectorize_all(train, test, dichotomy=3)
dicho3_result_dict = fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)
# one row per model, sorted by test accuracy (best first)
dicho3_result = pd.DataFrame.from_dict(dicho3_result_dict).transpose().sort_values('accur_test', ascending=False)
dicho3_result  # last expression of the cell: displays the table

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


Unnamed: 0,accur_test,accur_train
LogisticRegression,0.80784,1.0
Perceptron,0.787087,0.983184
MultinomialNB,0.782475,0.972471
DecisionTreeClassifier,0.670254,1.0


Fourth Dichotomy: J or P

In [26]:
# dichotomy=4: the labels are the fourth letter of the type (J or P).
# The feature matrices from the previous model are overwritten here.
X_train, y_train, X_test, y_test = vectorize_all(train, test, dichotomy=4)
dicho4_result_dict = fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)
# one row per model, sorted by test accuracy (best first)
dicho4_result = pd.DataFrame.from_dict(dicho4_result_dict).transpose().sort_values('accur_test', ascending=False)
dicho4_result  # last expression of the cell: displays the table

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


Unnamed: 0,accur_test,accur_train
LogisticRegression,0.713297,1.0
Perceptron,0.705611,0.916056
DecisionTreeClassifier,0.67794,1.0
MultinomialNB,0.657955,0.980336
