In [55]:
import pandas as pd
import numpy as np
import re


df = pd.read_csv('mbti_1.csv')

# replace URLs

# replace MBTI
# https://stackoverflow.com/questions/16720541/python-string-replace-regular-expression/16720705
mbti_pat = r"ISFJ|ESFP|ISFP|ISTP|ENFP|ENFJ|INFJ|ESTP|ESFJ|ESTJ|ENTP|INFP|INTP|INTJ|ISTJ|ENTJ"
mbti_regex = re.compile(mbti_pat, re.IGNORECASE)
MBTI_REP = '$MBTI$'

# replace hashtags
hashtag_pat = r"(\#[a-zA-Z0-9]+\b)"
hashtag_regex = re.compile(hashtag_pat)
HASHTAG_REP = '$HASHTAG$'

# Replace links with $link$
# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
link_pat = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
LINK_REP = '$LINK$'

In [56]:
df['posts'] = df['posts'].apply(lambda x: re.sub(mbti_pat, MBTI_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(hashtag_pat, HASHTAG_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(link_pat, LINK_REP, x))


df['posts'] = df['posts'].apply(lambda x: x.replace('|||', ''))

In [57]:
df['IE'] = df['type'].apply(lambda x: 'I' if x[0] == 'I' else 'E')
df['NS'] = df['type'].apply(lambda x: 'N' if x[1] == 'N' else 'S')
df['FT'] = df['type'].apply(lambda x: 'F' if x[2] == 'F' else 'T')
df['PJ'] = df['type'].apply(lambda x: 'P' if x[3] == 'P' else 'J')

In [58]:
list(df.columns)

['type', 'posts', 'IE', 'NS', 'FT', 'PJ']

In [59]:
df_ie = df[['type', 'posts', 'IE']]
df_ns = df[['type', 'posts', 'NS']]
df_ft = df[['type', 'posts', 'FT']]
df_pj = df[['type', 'posts', 'PJ']]

In [60]:
# # Double data values for underrepresented traits
# df_ie = df_ie.append(df_ie[df_ie['IE'] == 'E'])
# df_ns = df_ns.append(df_ns[df_ns['NS'] == 'S'])

In [61]:
train_pct = 0.6

# indicates the location to split the data along
# since dev/test are the same size
test_split_position = 1.0 - (1.0 - train_pct) / 2
test_split_position

0.8

In [62]:
train_ie, dev_ie, test_ie = np.split(df_ie.sample(frac=1, random_state = 224), [int(train_pct*len(df_ie)), int(test_split_position*len(df_ie))])
train_ns, dev_ns, test_ns = np.split(df_ns.sample(frac=1, random_state = 224), [int(train_pct*len(df_ns)), int(test_split_position*len(df_ns))])
train_ft, dev_ft, test_ft = np.split(df_ft.sample(frac=1, random_state = 224), [int(train_pct*len(df_ft)), int(test_split_position*len(df_ft))])
train_pj, dev_pj, test_pj = np.split(df_pj.sample(frac=1, random_state = 224), [int(train_pct*len(df_pj)), int(test_split_position*len(df_pj))])

In [63]:
dev_ie.head()

Unnamed: 0,type,posts,IE
3325,INTP,"'Well, I thought this would never happen but I...",I
43,INFP,'I'm Type 9 and people in my family (who aren'...,I
902,INTP,'He mentioned extroversion and then you mentio...,I
8340,ENTP,'I wish to change my name to War pigs and than...,E
3392,INFJ,'Some just talk and want to be heard. My 11th...,I


In [64]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

# baseline for IE
train_counts_ie = count_vect.fit_transform(list(train_ie['posts']))
train_counts_ie.shape

(5205, 95312)

In [65]:
downsize_factor_ie = sum(train_ie['IE'] == 'E')/len(train_ie['IE'])
train_weights_ie = [1 if ie == 'E' else downsize_factor_ie for ie in train_ie['IE']]

In [66]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

In [67]:
#IE dev set
text_clf.fit(train_ie['posts'], train_ie['IE'], **{'clf__sample_weight': train_weights_ie})  
predicted_ie = text_clf.predict(dev_ie['posts'])
np.mean(predicted_ie == dev_ie['IE'])



0.547550432276657

In [68]:
from sklearn.metrics import f1_score
print("f1 score IE")

f1_score(dev_ie['IE'], predicted_ie, labels=['I', 'E'], average=None)

f1 score IE


array([0.59928535, 0.48047651])

In [69]:
downsize_factor_ns = sum(train_ns['NS'] == 'S')/len(train_ns['NS'])
train_weights_ns = [1 if ns == 'S' else downsize_factor_ns for ns in train_ns['NS']]

In [70]:
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(train_ns['posts'], train_ns['NS'], **{'clf__sample_weight': train_weights_ns})  
predicted_ns = text_clf.predict(dev_ns['posts'])
np.mean(predicted_ns == dev_ns['NS'])

0.6968299711815562

In [71]:
print("f1 score NS")
f1_score(dev_ns['NS'], predicted_ns, labels=['N', 'S'], average=None)

f1 score NS


array([0.8       , 0.37380952])

In [72]:
downsize_factor_ft = sum(train_ft['FT'] == 'T')/len(train_ft['FT'])
train_weights_ft = [1 if ft == 'T' else downsize_factor_ft for ft in train_ft['FT']]

In [73]:

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(train_ft['posts'], train_ft['FT'], **{'clf__sample_weight': train_weights_ft})  
predicted_ft = text_clf.predict(dev_ft['posts'])
np.mean(predicted_ft == dev_ft['FT'])

0.7175792507204611

In [74]:
print("f1 score FT")
f1_score(dev_ft['FT'], predicted_ft, labels=['F', 'T'], average=None)

f1 score FT


array([0.67506631, 0.75025484])

In [75]:
downsize_factor_pj = sum(train_pj['PJ'] == 'J')/len(train_pj['PJ'])
train_weights_pj = [1 if pj == 'J' else downsize_factor_pj for pj in train_pj['PJ']]

In [76]:

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(train_pj['posts'], train_pj['PJ'], **{'clf__sample_weight': train_weights_pj})  
predicted_pj = text_clf.predict(dev_pj['posts'])
np.mean(predicted_pj == dev_pj['PJ'])

0.4795389048991354

In [77]:
print("f1 score PJ")
f1_score(dev_pj['PJ'], predicted_pj, labels=['P', 'J'], average=None)

f1 score PJ


array([0.312262, 0.581363])

In [78]:
predicted_pj.shape

(1735,)

In [79]:
unique_elements, counts_elements = np.unique(predicted_ie, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[['E' 'I']
 ['1094' '641']]


In [80]:
print("Raw counts")
print(df_ie['IE'].value_counts())
print(df_ns['NS'].value_counts())
print(df_ft['FT'].value_counts())
print(df_pj['PJ'].value_counts())


import collections
print("\n\nPrediction distribution")
print(collections.Counter(predicted_ie))
print(collections.Counter(predicted_ns))
print(collections.Counter(predicted_ft))
print(collections.Counter(predicted_pj))

Raw counts
I    6676
E    1999
Name: IE, dtype: int64
N    7478
S    1197
Name: NS, dtype: int64
F    4694
T    3981
Name: FT, dtype: int64
P    5241
J    3434
Name: PJ, dtype: int64


Prediction distribution
Counter({'E': 1094, 'I': 641})
Counter({'N': 1107, 'S': 628})
Counter({'T': 1158, 'F': 577})
Counter({'J': 1481, 'P': 254})
