# MBTI Prediction

In [3]:
import pandas as pd
import numpy as np

# Load the MBTI CSV; row 0 of the file supplies the column names.
csv_path = "./data/mbti_1.csv"
df = pd.read_csv(csv_path, header=0)

# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## Dataset Analysis

## Learning

In [4]:
# `data` is the name the rest of the notebook uses; it refers to the same frame as `df`.
data = df

# Show (rows, columns) of the full dataset.
dataset_shape = data.shape
print(dataset_shape)

(8675, 2)


### Split the dataset into train and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows as the test set; random_state=0 keeps the split repeatable.
split_frames = train_test_split(data, test_size=0.4, random_state=0)
train, test = split_frames

# Sanity-check the sizes of both partitions, then preview the training rows.
print(train.shape, test.shape)
train.head()

(5205, 2) (3470, 2)


Unnamed: 0,type,posts
4233,ISTP,'boring|||http://www.youtube.com/watch?v=c1KNZ...
1273,INFP,'The Americans It's a new show and I really ...
3092,INTJ,No one under stands your sense of humor. 8653...
6376,ISTP,"'I admire you humor, INTP. So, it would be gre..."
7643,INFP,'I was wondering if anyone would take the time...


### Preprocessing

In [6]:
from preprocessing import format_text

# Run format_text over every post and attach the result as a new column.
# assign() returns a new frame, so the object produced by the split is not modified.
train = train.assign(preprocessed_posts=train['posts'].apply(format_text))
train.head()

Unnamed: 0,type,posts,preprocessed_posts
4233,ISTP,'boring|||http://www.youtube.com/watch?v=c1KNZ...,boring link mhm cap and carpenter s glasses t...
1273,INFP,'The Americans It's a new show and I really ...,the americans it s a new show and i really do...
3092,INTJ,No one under stands your sense of humor. 8653...,no one under stands your sense of humor for th...
6376,ISTP,"'I admire you humor, INTP. So, it would be gre...",i admire you humor intp so it would be great ...
7643,INFP,'I was wondering if anyone would take the time...,i was wondering if anyone would take the time...


### Classification

#### Naive Bayes

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


# Token counts (English stop words removed) -> TF-IDF weighting -> multinomial Naive Bayes.
nb_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

# Fit on the preprocessed training posts with the `type` column as the label;
# fit() returns the fitted pipeline itself.
text_clf = Pipeline(nb_steps).fit(train['preprocessed_posts'], train['type'])

### Prediction

In [8]:
from preprocessing import format_text

# Apply the same format_text step to the held-out posts and attach the result as a new column.
# assign() returns a new frame, so the object produced by the split is not modified.
test = test.assign(preprocessed_posts=test['posts'].apply(format_text))
test.head()

Unnamed: 0,type,posts,preprocessed_posts
4587,ISFP,"'Dear ISFJ Mother, I wish you were less of a w...",dear isfj mother i wish you were less of a wo...
2786,INFJ,"'To me, I think you guys may be over analyzing...",to me i think you guys may be over analyzing ...
2813,ENFP,"'NIHM While NIHM has her INTJ husband, I've go...",nihm while nihm has her intj husband i ve got...
3705,INTP,'I want 5 kids: - an astro/nuclear/theoretical...,i want kids an astro nuclear theoretical phys...
5957,ISFP,'I have the same thing as well. I've noticed t...,i have the same thing as well i ve noticed th...


In [10]:
# Predict a type for every held-out post and report the fraction of exact matches.
predicted = text_clf.predict(test['preprocessed_posts'])
is_correct = predicted == test['type']
accuracy = np.mean(is_correct)
print("Accuracy: {}".format(accuracy))

# Accuracy recorded for each configuration tried:
#   default:                   0.2072
#   stop_words:                0.2069
#   stop_words & preprocessed: 0.2069

Accuracy: 0.2069164265129683


In [40]:
from sklearn.model_selection import cross_validate

# Seed NumPy's global RNG before running the cross-validation.
np.random.seed(1)

# Metrics collected per fold; each key shows up in the results as 'test_<key>'.
# NOTE(review): for single-label multiclass targets micro-averaged F1 equals accuracy,
# so these two metrics report the same number — consider a macro average instead.
scoring = {'acc': 'accuracy', 'f1_micro': 'f1_micro'}

# 5-fold cross-validation of the Naive Bayes pipeline on the training split (n_jobs=-1: all CPUs).
features = train['preprocessed_posts']
labels = train['type']
results = cross_validate(text_clf, features, labels,
                         cv=5, scoring=scoring, n_jobs=-1)

In [41]:
# Print mean and standard deviation across the folds for each collected metric.
report_rows = (
    ("Accuracy: {:0.4f} (+/- {:0.4f})", 'test_acc'),
    ("F1: {:0.4f} (+/- {:0.4f})", 'test_f1_micro'),
)
for template, key in report_rows:
    fold_scores = results[key]
    print(template.format(np.mean(fold_scores), np.std(fold_scores)))

# Results recorded from earlier runs:
#   Accuracy: 0.2140 (+/- 0.0004)
#   F1: 0.2140 (+/- 0.0004)
#
# Preprocessed:
#   Accuracy: 0.2144 (+/- 0.0006)
#   F1: 0.2144 (+/- 0.0006)

Accuracy: 0.2144 (+/- 0.0006)
F1: 0.2144 (+/- 0.0006)


#### SVM

In [11]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent on the same
# count -> TF-IDF features as the Naive Bayes pipeline. SGDClassifier shuffles the
# training data between epochs, so without a fixed random_state the fitted model
# (and the accuracy printed below) changes from run to run. The seed is pinned to 0,
# matching the seed used for the train/test split.
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)),
                        ])

# Fit on the preprocessed training posts with the `type` column as the label.
text_clf_svm = text_clf_svm.fit(train['preprocessed_posts'], train['type'])

# Fraction of held-out posts whose predicted type matches the true type.
predicted_svm = text_clf_svm.predict(test['preprocessed_posts'])
print("Accuracy: {}".format(np.mean(predicted_svm == test['type'])))

# Accuracy recorded for each configuration tried. These figures come from runs made
# before the seed was pinned, so differences of this size may be run-to-run noise:
#   default:                   0.6585
#   stop_words:                0.6571
#   stop_words & preprocessed: 0.6591

Accuracy: 0.6590778097982709
