In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# the project source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Put the parent directory first on the import path; presumably that is where
# the `hatstall` package lives -- confirm against the repository layout.
sys.path.insert(0, '../')

In [2]:
from collections import Counter

In [3]:
from hatstall.pipes.loader import (
    Personality, PersonalityPostLoader)
from hatstall.pipes.preparator import (
    PersonContainer, PostsSplitter, EvenlyDistributor)

In [4]:
# Shared dict handed to every pipe constructed below; later cells read
# payload['persons'] and payload['persons_container'] from it.
payload = {}

## Pipes

In [5]:
# Build the three pipes in execution order; all of them receive the same
# `payload` dict.
loader, splitter, evenfier = (
    PersonalityPostLoader(payload),
    PostsSplitter(payload),
    EvenlyDistributor(payload),
)

## Load data

In [6]:
# Run the loading pipe. The next code cell reads payload['persons'],
# presumably filled in by this call -- confirm in PersonalityPostLoader.
loader.run()

## Prepare data

### check amount of data and distribution

In [7]:
# Wrap the loaded persons in a container and print how many entries carry
# each personality label.
person_cont = PersonContainer(payload['persons'])
personality_counts = Counter(person_cont.get_personality())
print(personality_counts)

Counter({'Introvert': 6676, 'Extrovert': 1998})


### generate more data by splitting posts into smaller chunks

In [8]:
# Run the splitting pipe. It reports how many texts it ignored for producing
# too-small chunks; the following cell reads payload['persons_container'].
splitter.run()

Ignored 5624 texts because of too small chunks


In [9]:
# Label counts after splitting, taken from the container stored in the payload.
print(Counter(payload['persons_container'].get_personality()))

Counter({'Introvert': 32111, 'Extrovert': 9611})


### distribute data evenly

In [10]:
# Run the even-distribution pipe; the counts printed by the next cell are
# equal for both labels afterwards.
evenfier.run()

In [11]:
# Label counts after the even-distribution step.
print(Counter(payload['persons_container'].get_personality()))

Counter({'Introvert': 9611, 'Extrovert': 9611})


## Feature engineering and modelling

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the entries for testing. Stratifying on the personality
# labels keeps the class proportions the same in both parts, and the fixed
# random_state makes the split repeatable.
persons_container = payload['persons_container']
train, test = train_test_split(
    persons_container.persons,
    test_size=0.3,
    random_state=123,
    stratify=persons_container.get_personality(),
)

# Wrap both parts in containers and print their label counts, train first.
train_cont, test_cont = PersonContainer(train), PersonContainer(test)
for split_cont in (train_cont, test_cont):
    print(Counter(split_cont.get_personality()))

Counter({'Introvert': 6728, 'Extrovert': 6727})
Counter({'Extrovert': 2884, 'Introvert': 2883})


In [14]:
# Features: every entry returned by get_posts() is joined with single spaces
# into one string. Targets: the matching personality labels.
train_x = list(map(' '.join, train_cont.get_posts()))
train_y = train_cont.get_personality()

test_x = list(map(' '.join, test_cont.get_posts()))
test_y = test_cont.get_personality()

### pipeline

In [15]:
# Baseline model: TF-IDF features feeding a logistic regression classifier.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(max_df=0.8, min_df=0.)),
    ('log', LogisticRegression(random_state=123)),
])

pipeline.fit(train_x, train_y)

# Predict once and reuse the result for every metric below; each extra
# pipeline.predict() call would re-vectorize the complete test set.
test_pred = pipeline.predict(test_x)

# One explicit label order shared by the per-class F1 scores and the
# confusion matrix, so their rows/entries line up with each other.
personality_labels = [
    Personality.FIRST_PERSONALITY,
    Personality.SECOND_PERSONALITY]

# Pipeline.score delegates to the final estimator's score method.
print(pipeline.score(test_x, test_y))
# average=None yields one F1 value per label instead of a single aggregate.
print(f1_score(
    test_y, test_pred, average=None, labels=personality_labels))
cm = confusion_matrix(test_y, test_pred, labels=personality_labels)
print(cm)

0.7230795907750998
[0.73334446 0.71199279]
[[2196  687]
 [ 910 1974]]


## pipeline system

In [18]:
from sklearn.pipeline import Pipeline as ModelPipeline

from hatstall.pipes import Pipeline as CustomPipeline, PipelineSystem
from hatstall.pipes.engineer import PostsJoiner
from hatstall.pipes.model import Evaluator
from hatstall.pipes.loader import PersonalityPostLoader
from hatstall.pipes.preparator import PostsSplitter, EvenlyDistributor, TrainTestSplitter

In [19]:
# Both project-side stages use the same pipeline class; the two aliases only
# name the role each instance plays.
PreparationPipeline = EvaluationPipeline = CustomPipeline

# The project pipelines are given pipe classes (not instances), whereas the
# scikit-learn pipeline below is given ready-made step instances.
preparation_stage = PreparationPipeline([
    ('loader', PersonalityPostLoader),
    ('splitter', PostsSplitter),
    ('evenfier', EvenlyDistributor),
    ('traintest', TrainTestSplitter),
])

modelling_stage = ModelPipeline([
    ('joiner', PostsJoiner()),
    ('vect', TfidfVectorizer(max_df=0.8, min_df=0.)),
    ('log', LogisticRegression(random_state=123)),
])

evaluation_stage = EvaluationPipeline([
    ('eval', Evaluator),
])

# NOTE(review): the first stage key is spelled 'preperation' while the
# recorded log line reads "preparation" -- confirm whether PipelineSystem
# uses these keys before renaming it.
psystem = PipelineSystem(
    [
        ('preperation', preparation_stage),
        ('modelling', modelling_stage),
        ('evaluation', evaluation_stage),
    ],
    mode='train_test',
)
psystem.run()

--- Running preparation pipeline ---
Running loader pipe --> PersonalityPostLoader
Running splitter pipe --> PostsSplitter
Ignored 5624 texts because of too small chunks
Running evenfier pipe --> EvenlyDistributor
Running traintest pipe --> TrainTestSplitter
Train size: Counter({'Introvert': 6728, 'Extrovert': 6727})
Test size: Counter({'Extrovert': 2884, 'Introvert': 2883})
--- Running modelling pipeline ---
--- Running evaluation pipeline ---
Running eval pipe --> Evaluator
0.7194381827640021
[0.7295219  0.70857349]
[[2182  701]
 [ 917 1967]]


## TODO
- feature union: take the list of messages as input
  - calculate average word in message
  - check if link in message
  - combine messages and use vectorizer