In [5]:
import pandas as pd
import numpy as np
import re

In [6]:
# found here: https://gist.github.com/jlln/338b4b0b55bd6984f883
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

In [7]:
df_orig = pd.read_csv('mbti_1.csv')

df = splitDataFrameList(df_orig, 'posts', '|||')


# replace URLs

# replace MBTI
# https://stackoverflow.com/questions/16720541/python-string-replace-regular-expression/16720705
mbti_pat = r"ISFJ|ESFP|ISFP|ISTP|ENFP|ENFJ|INFJ|ESTP|ESFJ|ESTJ|ENTP|INFP|INTP|INTJ|ISTJ|ENTJ"
mbti_regex = re.compile(mbti_pat, re.IGNORECASE)
MBTI_REP = '$MBTI$'

# replace hashtags
hashtag_pat = r"(\#[a-zA-Z0-9]+\b)"
hashtag_regex = re.compile(hashtag_pat)
HASHTAG_REP = '$HASHTAG$'

# Replace links with $link$
# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
link_pat = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
LINK_REP = '$LINK$'

In [8]:
df['posts'] = df['posts'].apply(lambda x: re.sub(mbti_pat, MBTI_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(hashtag_pat, HASHTAG_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(link_pat, LINK_REP, x))

In [9]:
df['IE'] = df['type'].apply(lambda x: 'I' if x[0] == 'I' else 'E')
df['NS'] = df['type'].apply(lambda x: 'N' if x[1] == 'N' else 'S')
df['FT'] = df['type'].apply(lambda x: 'F' if x[2] == 'F' else 'T')
df['PJ'] = df['type'].apply(lambda x: 'P' if x[3] == 'P' else 'J')

In [10]:
list(df.columns)

['posts', 'type', 'IE', 'NS', 'FT', 'PJ']

In [11]:
df_ie = df[['type', 'posts', 'IE']]
df_ns = df[['type', 'posts', 'NS']]
df_ft = df[['type', 'posts', 'FT']]
df_pj = df[['type', 'posts', 'PJ']]

In [12]:
# # Double data values for underrepresented traits
# df_ie = df_ie.append(df_ie[df_ie['IE'] == 'E'])
# df_ns = df_ns.append(df_ns[df_ns['NS'] == 'S'])

In [48]:
train_pct = 0.6

# indicates the location to split the data along
# since dev/test are the same size
test_split_position = 1.0 - (1.0 - train_pct) / 2
test_split_position

0.8

In [49]:
train_ie, dev_ie, test_ie = np.split(df_ie.sample(frac=1, random_state = 224), [int(train_pct*len(df_ie)), int(test_split_position*len(df_ie))])
train_ns, dev_ns, test_ns = np.split(df_ns.sample(frac=1, random_state = 224), [int(train_pct*len(df_ns)), int(test_split_position*len(df_ns))])
train_ft, dev_ft, test_ft = np.split(df_ft.sample(frac=1, random_state = 224), [int(train_pct*len(df_ft)), int(test_split_position*len(df_ft))])
train_pj, dev_pj, test_pj = np.split(df_pj.sample(frac=1, random_state = 224), [int(train_pct*len(df_pj)), int(test_split_position*len(df_pj))])

In [50]:
dev_ie.head()

Unnamed: 0,type,posts,IE
28170,INTP,Turnitin is now going to crawl this website an...,I
379609,INTP,... I don't get it. It's supposed to be shocki...,I
94405,ENFP,"Thanks guys, I'll just keep on going and see w...",E
155565,ENTP,"I wouldn't dare, dude. I would feel bad to hur...",E
419648,ISFP,I think you discover your values by witnessing...,I


In [51]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

# baseline for IE
train_counts_ie = count_vect.fit_transform(list(train_ie['posts']))
train_counts_ie.shape

train_weights_ie = [1.0/2]*len(train_ie)

In [52]:
downsize_factor_ie = sum(train_ie['IE'] == 'E')/len(train_ie['IE'])
train_weights_ie = [1 if ie == 'E' else downsize_factor_ie for ie in train_ie['IE']]
# train_weights_ie

In [53]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

In [54]:
#IE dev set
text_clf.fit(train_ie['posts'], train_ie['IE'], **{'clf__sample_weight': train_weights_ie})  
predicted_ie = text_clf.predict(dev_ie['posts'])
np.mean(predicted_ie == dev_ie['IE'])

0.5034154777345439

In [55]:
from sklearn.metrics import f1_score
print("f1 score IE")

f1_score(dev_ie['IE'], predicted_ie, labels=['I', 'E'], average=None)

f1 score IE


array([0.42479885, 0.563126  ])

In [56]:
downsize_factor_ns = sum(train_ns['NS'] == 'S')/len(train_ns['NS'])
train_weights_ns = [1 if ns == 'S' else downsize_factor_ns for ns in train_ns['NS']]
# train_weights_ns

In [57]:
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(train_ns['posts'], train_ns['NS'], **{'clf__sample_weight': train_weights_ns})  
predicted_ns = text_clf.predict(dev_ns['posts'])
np.mean(predicted_ns == dev_ns['NS'])

0.49041320939131156

In [58]:
print("f1 score NS")
f1_score(dev_ns['NS'], predicted_ns, labels=['N', 'S'], average=None)

f1 score NS


array([0.5372362 , 0.43304837])

In [59]:
downsize_factor_ft = sum(train_ft['FT'] == 'T')/len(train_ft['FT'])
train_weights_ft = [1 if ft == 'T' else downsize_factor_ft for ft in train_ft['FT']]

In [60]:

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(train_ft['posts'], train_ft['FT'], **{'clf__sample_weight': train_weights_ft})  
predicted_ft = text_clf.predict(dev_ft['posts'])
np.mean(predicted_ft == dev_ft['FT'])

0.5566697016637302

In [61]:
print("f1 score FT")
f1_score(dev_ft['FT'], predicted_ft, labels=['F', 'T'], average=None)

f1 score FT


array([0.43933004, 0.63339461])

In [62]:
downsize_factor_pj = sum(train_pj['PJ'] == 'J')/len(train_pj['PJ'])
train_weights_pj = [1 if pj == 'J' else downsize_factor_pj for pj in train_pj['PJ']]

In [63]:

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(train_pj['posts'], train_pj['PJ'], **{'clf__sample_weight': train_weights_pj})  
predicted_pj = text_clf.predict(dev_pj['posts'])
np.mean(predicted_pj == dev_pj['PJ'])

0.4559826886920739

In [64]:
print("f1 score PJ")
f1_score(dev_pj['PJ'], predicted_pj, labels=['P', 'J'], average=None)

f1 score PJ


array([0.29733486, 0.55618687])

In [65]:
predicted_pj.shape

(84569,)

In [66]:
unique_elements, counts_elements = np.unique(predicted_ie, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[['E' 'I']
 ['79367' '24718']]


In [67]:
print("Raw counts")
print(df_ie['IE'].value_counts())
print(df_ns['NS'].value_counts())
print(df_ft['FT'].value_counts())
print(df_pj['PJ'].value_counts())


import collections
print("\n\nPrediction distribution")
print(collections.Counter(predicted_ie))
print(collections.Counter(predicted_ns))
print(collections.Counter(predicted_ft))
print(collections.Counter(predicted_pj))

Raw counts
I    325263
E    195164
Name: IE, dtype: int64
N    364822
S    116046
Name: NS, dtype: int64
F    229312
T    193533
Name: FT, dtype: int64
P    255735
J    167110
Name: PJ, dtype: int64


Prediction distribution
Counter({'E': 79367, 'I': 24718})
Counter({'S': 63369, 'N': 32805})
Counter({'T': 63513, 'F': 21056})
Counter({'J': 70277, 'P': 14292})
