## Summary
A machine learning model that infers personality insights (MBTI type) from text. It is trained on this [Kaggle dataset](https://www.kaggle.com/datasnaek/mbti-type).

In [1]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the MBTI dataset directly from its zip archive; later cells use the
# 'posts' column as input text and the 'type' column as the target label.
train = pd.read_csv('../data/mbti-myers-briggs-personality-type-dataset.zip', 
                   compression='zip')

In [3]:
# Full trait name for each single MBTI letter.
# NOTE(review): not referenced again in the cells visible here.
mbti = {'I':'Introversion', 'E':'Extroversion', 'N':'Intuition', 
        'S':'Sensing', 'T':'Thinking', 'F': 'Feeling', 
        'J':'Judging', 'P': 'Perceiving'}

In [4]:
def cleanText(text):
    """Normalize one raw string before it is vectorized.

    The steps run in this order:

    1. Parse the string with BeautifulSoup (lxml parser) and keep only its
       text content.
    2. Replace every ``|||`` sequence with a single space.
    3. Replace each token starting with ``http`` (up to the next whitespace)
       with the placeholder ``<URL>``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    markup_free = BeautifulSoup(text, "lxml").text
    # Remove the '|||' sequences first so a URL directly followed by one
    # does not swallow the text after it.
    unpiped = re.sub(r'\|\|\|', r' ', markup_free)
    return re.sub(r'http\S+', r'<URL>', unpiped)

In [5]:
# Clean every row of 'posts' once up front; the cross-validation cell below
# reads the resulting 'clean_posts' column.
train['clean_posts'] = train['posts'].apply(cleanText)

In [6]:
# Five shuffled, stratified folds; the fixed random_state keeps the split
# the same from run to run.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [7]:
# Seed NumPy's global random generator before running cross-validation.
np.random.seed(1)

# Metrics computed on every fold; each key shows up as `test_<key>` in the
# dictionary returned by cross_validate.
scoring = {
    'acc': 'accuracy',
    'neg_log_loss': 'neg_log_loss',
    'f1_micro': 'f1_micro',
}

# NOTE: despite the `tfidf` names used for this variable and for the pipeline
# step, this is a plain token-count CountVectorizer, not a TfidfVectorizer.
tfidf2 = CountVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=True,
    max_features=5000,
)

# Baseline pipeline: token counts followed by a class-balanced logistic
# regression.
model_lr = Pipeline(steps=[
    ('tfidf1', tfidf2),
    ('lr', LogisticRegression(class_weight="balanced", C=0.005)),
])

# Evaluate on the pre-cleaned posts using all available cores.
results_lr = cross_validate(
    model_lr,
    train['clean_posts'],
    train['type'],
    cv=kfolds,
    scoring=scoring,
    n_jobs=-1,
)

In [8]:
# One (format template, per-fold scores) pair per reported metric. The log
# loss is stored negated by the scorer, so flip its sign before summarizing.
report = [
    ("CV Accuracy: {:0.4f} (+/- {:0.4f})", results_lr['test_acc']),
    ("CV F1: {:0.4f} (+/- {:0.4f})", results_lr['test_f1_micro']),
    ("CV Logloss: {:0.4f} (+/- {:0.4f})", -1*results_lr['test_neg_log_loss']),
]

# Print the mean and standard deviation across folds for each metric.
for template, fold_scores in report:
    print(template.format(np.mean(fold_scores), np.std(fold_scores)))

CV Accuracy: 0.6659 (+/- 0.0090)
CV F1: 0.6659 (+/- 0.0090)
CV Logloss: 1.2501 (+/- 0.0238)


Let's integrate the preprocessing into the pipeline, so that the fitted model can be applied directly to raw text (in this case, an easier way would have been to use the `preprocessor` parameter of the `CountVectorizer` class):

In [9]:
def pipelinize(function, active=True):
    """Wrap an element-wise function as a scikit-learn pipeline step.

    Parameters
    ----------
    function : callable
        Called once on each element of the collection handed to the step.
    active : bool, default True
        When falsy, the step returns its input unchanged instead of applying
        ``function``.

    Returns
    -------
    FunctionTransformer
        A transformer (created with ``validate=False``) that applies
        ``function`` to every element and returns the results as a list.
    """
    def list_comprehend_a_function(list_or_series, active=True):
        # Disabled step: hand the input straight back.
        if not active:
            return list_or_series
        return list(map(function, list_or_series))

    return FunctionTransformer(
        list_comprehend_a_function,
        validate=False,
        kw_args={'active': active},
    )

In [10]:
# Full pipeline that starts from raw posts: cleaning -> token counts ->
# classifier. `tfidf2` is the same CountVectorizer object created for the
# cross-validated pipeline, and the classifier uses the same settings.
model = Pipeline([('prepro', pipelinize(cleanText)), 
                  ('tfidf1', tfidf2), 
                  ('model', LogisticRegression(class_weight="balanced", C=0.005))])

In [11]:
# Fit on the raw 'posts' column; cleaning now happens inside the pipeline.
model.fit(train['posts'], train['type'])

Pipeline(memory=None,
     steps=[('prepro', FunctionTransformer(accept_sparse=False,
          func=<function pipelinize.<locals>.list_comprehend_a_function at 0x110db5730>,
          inv_kw_args=None, inverse_func=None, kw_args={'active': True},
          pass_y='deprecated', validate=False)), ('tfidf1', CountVectorizer(ana...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [12]:
# Quick sanity check on one short raw string; [0] picks the single predicted label.
model.predict(["hello my friend"])[0]

'ISFJ'

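Let's save the model so it can be used later. Note that `dill` is imported under the name `pickle` below: the pipeline's preprocessing step wraps a function defined inside `pipelinize`, and the standard `pickle` module cannot serialize such locally defined functions: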

In [19]:
import dill as pickle

In [14]:
# Serialize the fitted pipeline to disk. `pickle` is bound to dill by the
# preceding import, not to the standard-library module.
with open('../model/model.pkl', 'wb') as file:
    pickle.dump(model, file)

Let's check that the saved model can be loaded and still works:

In [15]:
# Reload the pipeline that was just written to disk.
# SECURITY: unpickling can execute arbitrary code, so only load model files
# that you created yourself or otherwise trust.
with open('../model/model.pkl', 'rb') as file:
    model_loaded = pickle.load(file)

In [16]:
# Same input as the check made before saving; the reloaded pipeline should
# return the same label.
model_loaded.predict(["hello my friend"])[0]

'ISFJ'

References:

- https://towardsdatascience.com/a-flask-api-for-serving-scikit-learn-models-c8bcdaa41daa
- https://www.analyticsvidhya.com/blog/2017/09/machine-learning-models-as-apis-using-flask/