In [1]:
import csv
import random
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## Load data

In [2]:
class Personality:
    """Label strings used for the two personality classes."""

    # Both labels are declared together so the pair is visible at a glance.
    FIRST_PERSONALITY, SECOND_PERSONALITY = 'Introvert', 'Extrovert'

    
class Persona:
    """One labelled collection of posts.

    Attributes:
        personality: ``'Introvert'`` or ``'Extrovert'``, derived from the
            first character of the ``personality`` constructor argument.
        texts: list of posts.
    """

    # Separator between the individual posts inside one raw string.
    _POST_SEPARATOR = '|||'

    def __init__(self, personality, texts):
        """Normalise the label and the posts.

        Args:
            personality: string whose first character is ``'I'`` or ``'E'``.
            texts: either a list of posts or one string in which the posts
                are separated by ``'|||'``.
        """
        self.personality = self._assign_personality(personality)
        self.texts = self._assign_texts(texts)

    def _assign_personality(self, personality):
        """Map the first character of ``personality`` to a label string.

        Raises:
            AssertionError: if the first character is neither ``'I'`` nor
                ``'E'`` (only while assertions are enabled).
            IndexError: if ``personality`` is empty.
        """
        # first letter identifies if a person is an
        # introvert or extrovert
        persona = personality[0]
        message = 'Personality of I (Introvert) or E (Extrovert) is expected.'
        assert persona in ('I', 'E'), message
        if persona == 'I':
            return 'Introvert'
        elif persona == 'E':
            return 'Extrovert'

    def _assign_texts(self, texts):
        """Return ``texts`` as a list of posts.

        Raises:
            ValueError: if ``texts`` is neither a list nor a string that
                contains the ``'|||'`` separator.
        """
        # The list check has to come first: on a list, ``'|||' in texts`` is
        # an element-membership test, so a list holding the bare separator
        # would otherwise be routed to ``str.split`` and fail.
        if isinstance(texts, list):
            return texts
        # Restricting the separator test to strings makes every other input
        # type end up in the ValueError below instead of a TypeError.
        if isinstance(texts, str) and self._POST_SEPARATOR in texts:
            return texts.split(self._POST_SEPARATOR)
        raise ValueError('Can not handle the texts input.')


class PersonaContainer:
    """Wraps a list of personas and offers bulk operations on it."""

    def __init__(self, personas):
        self.personas = personas

    def get_personality(self):
        """Return the personality label of every persona, in order."""
        labels = []
        for persona in self.personas:
            labels.append(persona.personality)
        return labels

    def get_texts(self):
        """Return the ``texts`` attribute of every persona, in order."""
        all_texts = []
        for persona in self.personas:
            all_texts.append(persona.texts)
        return all_texts

    def evenly_distribute(self):
        """Trim the larger class to the size of the smaller one and shuffle.

        The larger class keeps its leading entries. ``self.personas`` is
        shuffled in every case, also when nothing had to be trimmed.
        """
        first_group = [
            p for p in self.personas
            if p.personality == Personality.FIRST_PERSONALITY]
        second_group = [
            p for p in self.personas
            if p.personality == Personality.SECOND_PERSONALITY]
        n_first, n_second = len(first_group), len(second_group)

        if n_first == n_second:
            print("Both personalities already have an even distribution.")
        elif n_first > n_second:
            self.personas = second_group + first_group[:n_second]
        else:
            self.personas = first_group + second_group[:n_first]
        random.shuffle(self.personas)

    def _chunks(self, lst, n):
        """Yield consecutive slices of ``lst`` holding at most ``n`` items."""
        for start in range(0, len(lst), n):
            stop = start + n
            yield lst[start:stop]

    def split_texts(self, chunk_size):
        """Replace every persona by one persona per full chunk of its texts.

        Chunks shorter than ``chunk_size`` are dropped; the number of
        dropped texts is printed when it is non-zero.
        """
        kept = []
        dropped = 0
        for persona in self.personas:
            source_texts = persona.texts
            label = persona.personality
            for piece in self._chunks(source_texts, chunk_size):
                if len(piece) != chunk_size:
                    # Incomplete trailing chunk: count it and move on.
                    dropped += len(piece)
                    continue
                kept.append(Persona(label, piece))
        if dropped:
            print("Ignored %d texts because of too small chunks" % (dropped))
        self.personas = kept

In [3]:
# newline='' is what the csv module asks for when handing it a file object, so
# that newlines embedded inside quoted fields are parsed correctly.
# NOTE(review): assumes the CSV is UTF-8 encoded -- confirm for this data set.
with open('../data/mbti_1.csv', newline='', encoding='utf-8') as file:
    data = csv.reader(file, delimiter=',')
    # ignore header
    next(data, None)
    personas = []
    for row in data:
        # Drop the first and last character of the posts column once and
        # reuse the result for both the check and the Persona.
        raw_posts = row[1][1:-1]
        # Only rows whose posts contain the '|||' separator are kept.
        if '|||' in raw_posts:
            personas.append(Persona(row[0], raw_posts))

## Prep data

In [4]:
persona_cont = PersonaContainer(personas)
# Class balance of the loaded data, one entry per persona.
personality_counts = Counter(persona_cont.get_personality())
print(personality_counts)

Counter({'Introvert': 6676, 'Extrovert': 1998})


### Generate more data by splitting texts into smaller chunks

In [5]:
# Number of posts that make up one training example after splitting.
POSTS_PER_CHUNK = 10
persona_cont.split_texts(POSTS_PER_CHUNK)
chunk_counts = Counter(persona_cont.get_personality())
print(chunk_counts)

Ignored 5624 texts because of too small chunks
Counter({'Introvert': 32111, 'Extrovert': 9611})


### Distribute data evenly

In [6]:
# evenly_distribute() shuffles with the global `random` generator. Seed it
# first: otherwise the list order -- and with it the seeded train/test split
# performed on that list -- changes on every run.
random.seed(123)
persona_cont.evenly_distribute()
print(Counter(persona_cont.get_personality()))

Counter({'Introvert': 9611, 'Extrovert': 9611})


### Split data into train and test sets

In [7]:
# Stratify on the label so both partitions keep the same class ratio.
# NOTE(review): the personas were chunked before this split, so chunks that
# came from the same original row can land in both train and test -- confirm
# that this is acceptable for the evaluation.
split_labels = persona_cont.get_personality()
train, test = train_test_split(
    persona_cont.personas,
    test_size=0.3,
    random_state=123,
    stratify=split_labels,
)

In [8]:
train_cont = PersonaContainer(train)
test_cont = PersonaContainer(test)

# Print the class balance of the training set first, then of the test set.
for container in (train_cont, test_cont):
    print(Counter(container.get_personality()))

Counter({'Introvert': 6728, 'Extrovert': 6727})
Counter({'Extrovert': 2884, 'Introvert': 2883})


In [9]:
# One document per persona: its posts joined by single spaces.
train_x = list(map(' '.join, train_cont.get_texts()))
test_x = list(map(' '.join, test_cont.get_texts()))

# Matching label lists, in the same order as the documents.
train_y = train_cont.get_personality()
test_y = test_cont.get_personality()

## Feature engineering

In [10]:
# max_df=0.8 ignores terms that occur in more than 80% of the training
# documents; min_df=0. applies no lower document-frequency cut-off.
tfidf_options = {'max_df': 0.8, 'min_df': 0.}
vect = TfidfVectorizer(**tfidf_options)

# Fit on the training documents only; the test documents are merely
# transformed with the fitted vectoriser.
train_x_vect = vect.fit_transform(train_x)
test_x_vect = vect.transform(test_x)

## Modelling

In [11]:
# The sklearn imports this cell used to carry (LogisticRegression,
# confusion_matrix, f1_score, SVC) live in the import cell at the top of the
# notebook, so all dependencies are visible in one place.

# Logistic regression with a fixed random_state.
clf_log = LogisticRegression(random_state=123)
clf_log.fit(train_x_vect, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Score of the fitted classifier on the held-out test set.
test_score = clf_log.score(test_x_vect, test_y)
print(test_score)

0.717357378186232


In [13]:
# Fix the label order so the two scores are reported as
# [FIRST_PERSONALITY, SECOND_PERSONALITY].
f1_labels = [Personality.FIRST_PERSONALITY, Personality.SECOND_PERSONALITY]
f1_predictions = clf_log.predict(test_x_vect)
# average=None yields one F1 score per label instead of a single aggregate.
per_class_f1 = f1_score(test_y, f1_predictions, average=None, labels=f1_labels)
print(per_class_f1)

[0.72778891 0.70609448]


In [14]:
# Same explicit label order as used for the F1 scores.
cm_labels = [Personality.FIRST_PERSONALITY, Personality.SECOND_PERSONALITY]
cm_predictions = clf_log.predict(test_x_vect)
cm = confusion_matrix(test_y, cm_predictions, labels=cm_labels)
print(cm)

[[2179  704]
 [ 926 1958]]


### Play with own posts

TODO
- Build a dedicated predict class that checks whether a post's words are in the vocabulary of `vect` before classifying it.

In [17]:
posts = ["Have a look at this", "Yes I love all of you so much", "blub", "I had a wonderful day with my friends"]
for post in posts:
    # Vectorise the post once and reuse the result for both the vocabulary
    # check and the prediction. Asking the vectoriser itself means the check
    # uses the same tokenisation as the model input; a plain
    # `post.split(' ')` is case- and punctuation-sensitive and can wrongly
    # reject posts. It also avoids rebuilding the whole feature-name set on
    # every iteration.
    post_vect = vect.transform([post])
    # nnz is the number of stored entries of the sparse row: zero means no
    # term of the post is part of the fitted vocabulary.
    if post_vect.nnz:
        pred = clf_log.predict(post_vect)[0]
        print(post + ' -> ' + pred)
    else:
        print('I can not classify the post because do not know any word in the post: ' + post)
    print('')

Have a look at this -> Introvert

Yes I love all of you so much -> Extrovert

I can not classify the post because do not know any word in the post: blub

I had a wonderful day with my friends -> Extrovert

