In [29]:
import pandas as pd
import numpy as np
import re


df = pd.read_csv('mbti_1.csv')

# replace URLs

# replace MBTI
# https://stackoverflow.com/questions/16720541/python-string-replace-regular-expression/16720705
mbti_pat = r"ISFJ|ESFP|ISFP|ISTP|ENFP|ENFJ|INFJ|ESTP|ESFJ|ESTJ|ENTP|INFP|INTP|INTJ|ISTJ|ENTJ"
mbti_regex = re.compile(mbti_pat, re.IGNORECASE)
MBTI_REP = '$MBTI$'

# replace hashtags
hashtag_pat = r"(\#[a-zA-Z0-9]+\b)"
hashtag_regex = re.compile(hashtag_pat)
HASHTAG_REP = '$HASHTAG$'

# Replace links with $link$
# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
link_pat = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
LINK_REP = '$LINK$'

In [30]:
df['posts'] = df['posts'].apply(lambda x: re.sub(mbti_pat, MBTI_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(hashtag_pat, HASHTAG_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(link_pat, LINK_REP, x))


df['posts'] = df['posts'].apply(lambda x: x.replace('|||', ''))

In [31]:
df['IE'] = df['type'].apply(lambda x: 'I' if x[0] == 'I' else 'E')
df['NS'] = df['type'].apply(lambda x: 'N' if x[1] == 'N' else 'S')
df['FT'] = df['type'].apply(lambda x: 'F' if x[2] == 'F' else 'T')
df['PJ'] = df['type'].apply(lambda x: 'P' if x[3] == 'P' else 'J')

In [32]:
list(df.columns)

['type', 'posts', 'IE', 'NS', 'FT', 'PJ']

In [33]:
train, dev, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

In [34]:
df_ie = df[['type', 'posts', 'IE']]
df_ns = df[['type', 'posts', 'NS']]
df_ft = df[['type', 'posts', 'FT']]
df_pj = df[['type', 'posts', 'PJ']]

In [35]:
train_pct = 0.6

# indicates the location to split the data along
# since dev/test are the same size
test_split_position = 1.0 - (1.0 - train_pct) / 2
test_split_position

0.8

In [36]:
train_ie, dev_ie, test_ie = np.split(df_ie.sample(frac=1), [int(train_pct*len(df_ie)), int(test_split_position*len(df_ie))])
train_ns, dev_ns, test_ns = np.split(df_ns.sample(frac=1), [int(train_pct*len(df_ns)), int(test_split_position*len(df_ns))])
train_ft, dev_ft, test_ft = np.split(df_ft.sample(frac=1), [int(train_pct*len(df_ft)), int(test_split_position*len(df_ft))])
train_pj, dev_pj, test_pj = np.split(df_pj.sample(frac=1), [int(train_pct*len(df_pj)), int(test_split_position*len(df_pj))])

In [37]:
dev_ie.head()

Unnamed: 0,type,posts,IE
6669,INTJ,"'I was certain that Peter Pan was real, and sl...",I
2254,INFJ,Hi everyonex85This is the first time I visited...,I
8640,INTP,'This is a great thread... I find it interesti...,I
3601,ENFP,"'Also, sometimes I find myself very dependent ...",E
1389,ISTP,"'I can't stand pretentious people, Chatty Cath...",I


In [50]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

# baseline for IE
train_counts_ie = count_vect.fit_transform(list(train_ie['posts']))
train_counts_ie.shape

(5205, 95901)

In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

In [55]:
#IE dev set
text_clf.fit(train_ie['posts'], train_ie['IE'])  
predicted_ie = text_clf.predict(dev_ie['posts'])
np.mean(predicted_ie == dev_ie['IE'])

0.7855907780979827

In [60]:
#NS dev set
text_clf.fit(train_ns['posts'], train_ns['NS'])  
predicted_ns = text_clf.predict(dev_ns['posts'])
np.mean(predicted_ns == dev_ns['NS'])

0.8668587896253602

In [61]:
#FT dev set
text_clf.fit(train_ft['posts'], train_ft['FT'])  
predicted_ft = text_clf.predict(dev_ft['posts'])
np.mean(predicted_ft == dev_ft['FT'])

0.7602305475504323

In [62]:
#PJ dev set
text_clf.fit(train_pj['posts'], train_pj['PJ'])  
predicted_pj = text_clf.predict(dev_pj['posts'])
np.mean(predicted_pj == dev_pj['PJ'])

0.6484149855907781

In [71]:
dev_ft['posts']

3471    'I broke the windshield.  With my foot.  When ...
5765    'From the place of the collective unconscious,...
8486    'Me.  For me i will attempt to take interest i...
1675    'Great! I sent you a challenge. :)I'm a marrie...
4270    'Heart of Darkness On the RoadI like to use pa...
7075    '29 out of 36 as well.  Which is actually rema...
1484    'I do love to stumble upon fascinating connect...
4548    'From my experience this is very common with N...
7968    'Dear $MBTI$,   I don't like how you feel the ...
977     '4, 22, 31, 34, 37, 42, 54, 64, 72, 84  17 is ...
5128    'calm pleasant  soothing relaxing soft gentle ...
5558    'Ti - Ni - Te - Ne - Se - Si - Fi - Fe  If we ...
5043    'House is more of our goal. There, apparently,...
4306    'The first, no doubt. Explaining why is somewh...
7555    'That happens to me all the time, most annoyin...
8190    'This got a little bit off-topic but I don't m...
3809    'I dunno, I mean, where are their egos, LOL? C...
5630    'jamie