In [1]:
import pandas as pd
import numpy as np
import re


df = pd.read_csv('mbti_1.csv')

# replace URLs

# replace MBTI
# https://stackoverflow.com/questions/16720541/python-string-replace-regular-expression/16720705
mbti_pat = r"ISFJ|ESFP|ISFP|ISTP|ENFP|ENFJ|INFJ|ESTP|ESFJ|ESTJ|ENTP|INFP|INTP|INTJ|ISTJ|ENTJ"
mbti_regex = re.compile(mbti_pat, re.IGNORECASE)
MBTI_REP = '$MBTI$'

# replace hashtags
hashtag_pat = r"(\#[a-zA-Z0-9]+\b)"
hashtag_regex = re.compile(hashtag_pat)
HASHTAG_REP = '$HASHTAG$'

# Replace links with $link$
# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
link_pat = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
LINK_REP = '$LINK$'

In [2]:
df['posts'] = df['posts'].apply(lambda x: re.sub(mbti_pat, MBTI_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(hashtag_pat, HASHTAG_REP, x))
df['posts'] = df['posts'].apply(lambda x: re.sub(link_pat, LINK_REP, x))


df['posts'] = df['posts'].apply(lambda x: x.replace('|||', ''))

In [3]:
df['IE'] = df['type'].apply(lambda x: 'I' if x[0] == 'I' else 'E')
df['NS'] = df['type'].apply(lambda x: 'N' if x[1] == 'N' else 'S')
df['FT'] = df['type'].apply(lambda x: 'F' if x[2] == 'F' else 'T')
df['PJ'] = df['type'].apply(lambda x: 'P' if x[3] == 'P' else 'J')

In [4]:
list(df.columns)

['type', 'posts', 'IE', 'NS', 'FT', 'PJ']

In [5]:
df_ie = df[['type', 'posts', 'IE']]
df_ns = df[['type', 'posts', 'NS']]
df_ft = df[['type', 'posts', 'FT']]
df_pj = df[['type', 'posts', 'PJ']]

In [6]:
# # Double data values for underrepresented traits
# df_ie = df_ie.append(df_ie[df_ie['IE'] == 'E'])
# df_ns = df_ns.append(df_ns[df_ns['NS'] == 'S'])

In [7]:
train_pct = 0.6

# indicates the location to split the data along
# since dev/test are the same size
test_split_position = 1.0 - (1.0 - train_pct) / 2
test_split_position

0.8

In [8]:
train_ie, dev_ie, test_ie = np.split(df_ie.sample(frac=1, random_state = 224), [int(train_pct*len(df_ie)), int(test_split_position*len(df_ie))])
train_ns, dev_ns, test_ns = np.split(df_ns.sample(frac=1, random_state = 224), [int(train_pct*len(df_ns)), int(test_split_position*len(df_ns))])
train_ft, dev_ft, test_ft = np.split(df_ft.sample(frac=1, random_state = 224), [int(train_pct*len(df_ft)), int(test_split_position*len(df_ft))])
train_pj, dev_pj, test_pj = np.split(df_pj.sample(frac=1, random_state = 224), [int(train_pct*len(df_pj)), int(test_split_position*len(df_pj))])

In [9]:
dev_ie.head()

Unnamed: 0,type,posts,IE
3325,INTP,"'Well, I thought this would never happen but I...",I
43,INFP,'I'm Type 9 and people in my family (who aren'...,I
902,INTP,'He mentioned extroversion and then you mentio...,I
8340,ENTP,'I wish to change my name to War pigs and than...,E
3392,INFJ,'Some just talk and want to be heard. My 11th...,I


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

# baseline for IE
train_counts_ie = count_vect.fit_transform(list(train_ie['posts']))
train_counts_ie.shape

(5205, 95312)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

In [12]:
def average_post_length(post):
        """Helper code to compute average word length of a post"""
        return int(len(post.split()))

testData = train_ie['posts']
testData.apply(average_post_length)

5800     965
5750    1401
3823    1083
6183    1247
5009     842
8123    1244
1027    1500
7278    1245
1334    1369
370     1461
3804      86
678     1432
116     1277
3480    1379
8109    1411
4912    1643
7482    1321
5668    1249
6486    1102
8263    1103
4063     465
6336     717
5480    1314
3203     798
8115    1416
4500    1559
8003    1771
7023     906
105     1488
7652     826
        ... 
6070     501
7706    1167
4856    1312
7750    1060
5828    1577
578     1132
7227    1276
6584    1764
2503    1359
2266    1533
3901    1584
6602    1367
7241    1652
367     1421
7997    1280
2415     842
8530     811
8014    1102
5294     938
282      935
2741     802
2902    1238
7661     974
2460    1359
586     1577
5996     820
1740    1326
4595    1081
6587    1334
3011    1503
Name: posts, Length: 5205, dtype: int64

In [13]:
#

from sklearn.base import BaseEstimator, TransformerMixin

class AveragePostLengthExtractor():
    """Takes in dataframe, extracts road name column, outputs average word length"""

    def __init__(self):
        pass

    def average_post_length(self, post):
        """Helper code to compute average word length of a post"""
        return round(len(post.split()))

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        print(type(df))
        return (df.apply(self.average_post_length)).values.reshape(-1, 1)

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        print("fitting")
        return self

In [14]:
test = "I\'m"
print(test in "Hello I'm")

True


In [15]:
# first-person features: I, me, my, us, we, mine, our, I'm
# two person: you, your, you're, you've
# third person: he, him, she, her, they, them, their, his

from sklearn.pipeline import Pipeline, FeatureUnion


class FirstPersonExtractor():
    """Takes in dataframe, extracts road name column, outputs average word length"""

    def __init__(self):
        self.first_person_words = set(['I', 'me', 'my', 'us', 'we', 'mine', "our", "I\'m"])

    def count_first_person(self, post):
        val = 0
        for word in post:
            if word in self.first_person_words:
                val += 1
        return val

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        return (df.apply(self.count_first_person)).values.reshape(-1, 1)

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
# first-person features: I, me, my, us, we, mine, our, I'm
# two person: you, your, you're, you've
# third person: he, him, she, her, they, them, their, his

class SecondPersonExtractor():
    """Takes in dataframe, extracts road name column, outputs average word length"""

    def __init__(self):
        self.second_person_words = set(["you", "your", "you\'re", "you\'ve"])

    def count_second_person(self, post):
        val = 0
        for word in post:
            if word in self.second_person_words:
                val += 1
        return val

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        return (df.apply(self.count_second_person)).values.reshape(-1, 1)

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class ThirdPersonExtractor():
    """Takes in dataframe, extracts road name column, outputs average word length"""

    def __init__(self):
        self.third_person_words = set(["he", "him", "she", "her", "they", "them", "their", "his"])

    def count_third_person(self, post):
        val = 0
        for word in post:
            if word in self.third_person_words:
                val += 1
        return val

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        return (df.apply(self.count_third_person)).values.reshape(-1, 1)

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

In [19]:
class count_vec():
    def __init__(self):
        self.count = CountVectorizer(ngram_range = (1, 2))
    def transform(self, data):
        print("yah")
        return self.count.transform(data)
    def fit(self, data, labels):
        self.count.fit(data)
        return self

In [28]:
class AvgLengthDeviation():

    def __init__(self):
        pass

    def average_post_length(self, post):
        """Helper code to compute average word length of a post"""
        return round(len(post.split()))

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        deviations = []
        for posts in df:
            avg = np.mean([len(post.split()) for post in posts.split("|||") if post != ""])
            deviations.append([avg - self.avgs[0], avg - self.avgs[1]])

    def fit(self, df, y=None):
        """Calculates average post length for each class"""
        labels = list(np.unique(y))
        avgs = [0, 0]
        labelCounts = [0, 0]
        for (posts, userLabel) in zip(df, y):
            avgs[labels.index(userLabel)] += np.mean([len(post.split()) for post in posts.split("|||") if post != ""])
            labelCounts[labels.index(userLabel)] += 1

        avgs[0],avgs[1] = avgs[0] / labelCounts[0], avgs[1] / labelCounts[1]
        self.avgs = avgs
        self.labels = labels
        return self

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
pipeline = Pipeline([
    ('feats', FeatureUnion([
        ('avg_len', AvgLengthDeviation()),
        ('first_per', FirstPersonExtractor()),
        ('second_per', SecondPersonExtractor()),
        ('third_per', ThirdPersonExtractor()),
        ('count_vec', count_vec())
    ])),
    ('clf', SVC())
])

In [30]:
# Train weight downweight factors
downsize_factor_ie = sum(train_ie['IE'] == 'E')/len(train_ie['IE'])
train_weights_ie = [1 if ie == 'E' else downsize_factor_ie for ie in train_ie['IE']]

downsize_factor_ns = sum(train_ns['NS'] == 'S')/len(train_ns['NS'])
train_weights_ns = [1 if ns == 'S' else downsize_factor_ns for ns in train_ns['NS']]

downsize_factor_ft = sum(train_ft['FT'] == 'T')/len(train_ft['FT'])
train_weights_ft = [1 if ft == 'T' else downsize_factor_ft for ft in train_ft['FT']]

downsize_factor_pj = sum(train_pj['PJ'] == 'J')/len(train_pj['PJ'])
train_weights_pj = [1 if pj == 'J' else downsize_factor_pj for pj in train_pj['PJ']]

#IE dev set
pipeline.fit(train_ie['posts'], train_ie['IE'], **{'clf__sample_weight': train_weights_ie})  
predicted_ie = pipeline.predict(dev_ie['posts'])
print("IE accuracy", np.mean(predicted_ie == dev_ie['IE']))

pipeline.fit(train_ns['posts'], train_ns['NS'], **{'clf__sample_weight': train_weights_ns})  
predicted_ns = pipeline.predict(dev_ns['posts'])
print("NS accuracy", np.mean(predicted_ns == dev_ns['NS']))

pipeline.fit(train_ft['posts'], train_ft['FT'], **{'clf__sample_weight': train_weights_ft})  
predicted_ft = pipeline.predict(dev_ft['posts'])
print("FT accuracy", np.mean(predicted_ft == dev_ft['FT']))

pipeline.fit(train_pj['posts'], train_pj['PJ'], **{'clf__sample_weight': train_weights_pj})  
predicted_pj = pipeline.predict(dev_pj['posts'])
print("PJ accuracy", np.mean(predicted_pj == dev_pj['PJ']))

yah
yah
IE accuracy 0.2403458213256484
yah
yah
NS accuracy 0.12219020172910663
yah
yah
FT accuracy 0.4634005763688761
yah
yah
PJ accuracy 0.38962536023054756


In [31]:
from sklearn.metrics import f1_score
print("f1 score IE", f1_score(dev_ie['IE'], predicted_ie, labels=['I', 'E'], average=None))

print("f1 score NS", f1_score(dev_ns['NS'], predicted_ns, labels=['N', 'S'], average=None))

print("f1 score FT", f1_score(dev_ft['FT'], predicted_ft, labels=['F', 'T'], average=None))

print("f1 score PJ", f1_score(dev_pj['PJ'], predicted_pj, labels=['P', 'J'], average=None))

f1 score IE [0.         0.38754647]
f1 score NS [0.         0.21777093]
f1 score FT [0.        0.6333202]
f1 score PJ [0.         0.56076317]


  'precision', 'predicted', average, warn_for)


In [32]:
# with 1st, 2nd, 3rd person, countvec

#IE accuracy: .247
#NS accuracy: .346
#FT accuracy: 0.494
#PJ accuracy 0.390

# f1 score IE [0.01950488 0.38839495]
# f1 score NS [0.43072289 0.23274696]
# f1 score FT [0.12723658 0.64366883]
# f1 score PJ [0.         0.56076317]

# bigrams: identical numbers
#IE accuracy: .247
#NS accuracy: .346
#FT accuracy: 0.494
#PJ accuracy 0.390


# using avg post length deviations and count vec(no 1st, 2nd, 3rd): yah
# yah
# IE accuracy 0.2426512968299712
# yah
# yah
# NS accuracy 0.22074927953890489
# yah
# yah
# FT accuracy 0.47723342939481267
# yah
# yah
# PJ accuracy 0.38962536023054756


# f1 score IE [0.00605144 0.38826816]
# f1 score NS [0.20935673 0.23181818]
# f1 score FT [0.05026178 0.63936382]
# f1 score PJ [0.         0.56076317]

# using avg post length deviations, count vec, 1st, 2nd, 3rd with bigrams, unigrams
#IE accuracy: .240
#NS accuracy: .122
#FT accuracy: .463
#PJ accuracy: .390

# f1 score IE [0.         0.38754647]
# f1 score NS [0.         0.21777093]
# f1 score FT [0.        0.6333202]
# f1 score PJ [0.         0.56076317]

# Prediction distribution
# Counter({'E': 1735})
# Counter({'S': 1735})
# Counter({'T': 1735})
# Counter({'J': 1735})

In [33]:
unique_elements, counts_elements = np.unique(predicted_ie, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[['E']
 [1735]]


In [34]:
print("Raw counts")
print(df_ie['IE'].value_counts())
print(df_ns['NS'].value_counts())
print(df_ft['FT'].value_counts())
print(df_pj['PJ'].value_counts())


import collections
print("\n\nPrediction distribution")
print(collections.Counter(predicted_ie))
print(collections.Counter(predicted_ns))
print(collections.Counter(predicted_ft))
print(collections.Counter(predicted_pj))

Raw counts
I    6676
E    1999
Name: IE, dtype: int64
N    7478
S    1197
Name: NS, dtype: int64
F    4694
T    3981
Name: FT, dtype: int64
P    5241
J    3434
Name: PJ, dtype: int64


Prediction distribution
Counter({'E': 1735})
Counter({'S': 1735})
Counter({'T': 1735})
Counter({'J': 1735})
