## Summary
A machine learning model that predicts a Myers-Briggs (MBTI) personality type from text. It is trained on this [Kaggle dataset](https://www.kaggle.com/datasnaek/mbti-type).

In [1]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the raw data set straight from the zipped CSV (no manual extraction step).
train = pd.read_csv(
    filepath_or_buffer='../data/mbti-myers-briggs-personality-type-dataset.zip',
    compression='zip',
)

In [3]:
# Lookup table: single-letter MBTI code -> full name of the preference.
# Pairs are listed dichotomy by dichotomy (I/E, N/S, T/F, J/P).
mbti = dict([
    ('I', 'Introversion'),
    ('E', 'Extroversion'),
    ('N', 'Intuition'),
    ('S', 'Sensing'),
    ('T', 'Thinking'),
    ('F', 'Feeling'),
    ('J', 'Judging'),
    ('P', 'Perceiving'),
])

In [6]:
# Cross-validation splitter: 5 shuffled folds with a fixed random_state so the
# same splits are produced on every run.
kfolds = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

In [7]:
# Seed NumPy's global random number generator.
np.random.seed(1)

# Metrics handed to cross_validate: key = short label used in the result
# dictionary (as 'test_<key>'), value = scikit-learn scorer name.
scoring = dict(
    acc='accuracy',
    neg_log_loss='neg_log_loss',
    f1_micro='f1_micro',
)

# Text vectorizer shared by the pipelines below.
# Despite the variable name this is a CountVectorizer (raw term counts), not a
# TF-IDF vectorizer: lowercased unigrams, English stop words removed, vocabulary
# capped at 5000 features.
tfidf2 = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000,
)

In [8]:
# Pipeline evaluated by cross-validation: term counts -> logistic regression
# with balanced class weights and C=0.005.
model_lr = Pipeline(steps=[
    ('tfidf1', tfidf2),
    ('lr', LogisticRegression(C=0.005, class_weight="balanced")),
])

In [9]:
# Score the pipeline on every fold of `kfolds`, using all available cores.
# The result maps 'test_<metric>' to an array with one score per fold.
results_lr = cross_validate(
    estimator=model_lr,
    X=train['posts'],
    y=train['type'],
    scoring=scoring,
    cv=kfolds,
    n_jobs=-1,
)

In [13]:
# Summarise each metric as "mean (+/- std)" across the folds.
# The log-loss scorer is negated by scikit-learn, so flip its sign before reporting.
cv_report = [
    ("CV Accuracy: {:0.4f} (+/- {:0.4f})", results_lr['test_acc']),
    ("CV F1: {:0.4f} (+/- {:0.4f})", results_lr['test_f1_micro']),
    ("CV Logloss: {:0.4f} (+/- {:0.4f})", -1*results_lr['test_neg_log_loss']),
]

for template, fold_scores in cv_report:
    print(template.format(np.mean(fold_scores), np.std(fold_scores)))

CV Accuracy: 0.6726 (+/- 0.0105)
CV F1: 0.6726 (+/- 0.0105)
CV Logloss: 1.2285 (+/- 0.0233)


In [14]:
# Final model: same configuration as `model_lr` (term counts -> balanced
# logistic regression, C=0.005), built as a separate Pipeline object.
# NOTE(review): the recorded fit output shows solver='liblinear', i.e. the
# library default at the time; newer scikit-learn releases may default to a
# different solver and give different numbers - confirm before re-running.
model = Pipeline(steps=[
    ('tfidf1', tfidf2),
    ('lr', LogisticRegression(C=0.005, class_weight="balanced")),
])

In [15]:
# Fit on the complete data set: posts are the input text, 'type' is the label.
# The fitted pipeline is the cell's last expression, so its repr is displayed.
model.fit(X=train['posts'], y=train['type'])

Pipeline(memory=None,
     steps=[('tfidf1', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
      ...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [17]:
# Smoke test: predict expects a sequence of documents, so wrap the single
# sentence in a list and take the first (only) prediction.
sample_posts = ["hello my friend"]
model.predict(sample_posts)[0]

'ISTJ'

Let's save the model to be used later:

In [18]:
# NOTE(review): `dill` (aliased as `pickle`) is not referenced by any visible
# cell; the import is kept unchanged in case something outside this view needs it.
import dill as pickle

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so importing it fails on current installs. Prefer the standalone
# `joblib` package and only fall back to the vendored copy on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline to disk. `dump` returns the list of files it
# wrote, which is displayed as the cell output.
joblib.dump(model, '../model/model.pkl')

['../model/model.pkl']

In [19]:
# Read the persisted pipeline back from disk to verify the round trip.
# Only load pickle-based files that come from a trusted source.
model_loaded = joblib.load(filename='../model/model.pkl')

Let's check that the loaded model works:

In [20]:
# Same smoke test as for the in-memory model; the reloaded pipeline should
# return the same label for the same sentence.
reload_check_posts = ["hello my friend"]
model_loaded.predict(reload_check_posts)[0]

'ISTJ'

References:

- https://towardsdatascience.com/a-flask-api-for-serving-scikit-learn-models-c8bcdaa41daa
- https://www.analyticsvidhya.com/blog/2017/09/machine-learning-models-as-apis-using-flask/