In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../')

## pipeline system

In [142]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline as ModelPipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from hatstall.pipes import Pipeline as CustomPipeline, PipelineSystem
from hatstall.pipes.engineer import PostsJoiner, AverageWordCalculator, Debugger
from hatstall.pipes.model import Evaluator
from hatstall.pipes.loader import PersonalityPostLoader
from hatstall.pipes.preparator import (
    PostsSplitter, EvenlyDistributor, TrainTestSplitter, DigitReplacer)

In [3]:
PreparationPipeline = CustomPipeline
EvaluationPipeline = CustomPipeline

psystem = PipelineSystem([
    ('preperation', PreparationPipeline([
        ('loader', PersonalityPostLoader),
        ('splitter', PostsSplitter),
        ('evenfier', EvenlyDistributor),
        ('traintest', TrainTestSplitter)
    ])),
    ('modelling', ModelPipeline([
        ('joiner', PostsJoiner()),
        ('vect', TfidfVectorizer(max_df=0.8, min_df=0.)),
        ('log', LogisticRegression(random_state=123)),
    ])),
    ('evaluation', EvaluationPipeline([
        ('eval', Evaluator)
    ]))
], mode='train_test')
psystem.run()

--- Running preparation pipeline ---
Running loader pipe --> PersonalityPostLoader
Running splitter pipe --> PostsSplitter
Ignored 5624 texts because of too small chunks
Running evenfier pipe --> EvenlyDistributor
Running traintest pipe --> TrainTestSplitter
Train size: Counter({'Introvert': 6728, 'Extrovert': 6727})
Test size: Counter({'Extrovert': 2884, 'Introvert': 2883})
--- Running modelling pipeline ---
--- Running evaluation pipeline ---
Running eval pipe --> Evaluator
0.7144095717010578
[0.72435146 0.70372369]
[[2164  719]
 [ 928 1956]]


In [242]:
# PreparationPipeline = CustomPipeline
EvaluationPipeline = CustomPipeline

psystem = PipelineSystem([
    ('preperation', PreparationPipeline([
        ('loader', PersonalityPostLoader),
        ('splitter', PostsSplitter),
        ('digit', DigitReplacer),
        ('evenfier', EvenlyDistributor),
        ('traintest', TrainTestSplitter)
    ])),
    ('modelling', ModelPipeline([
        ('features', FeatureUnion([
            ('tifdf', ModelPipeline([
                ('joiner', PostsJoiner()),
                ('vect', CountVectorizer(
                    max_df=1.0, min_df=0.01, 
                    token_pattern='(?u)\\$?\\b\\w\\w+\\b')),
            ])),
            ('mean_word', ModelPipeline([
                ('average', AverageWordCalculator()),
                ('scaler', MinMaxScaler())
            ])),
        ])),
        ('log', LogisticRegression(random_state=123)),
    ])),
    ('evaluation', EvaluationPipeline([
        ('eval', Evaluator)
    ]))
], mode='train_test')
psystem.run()

--- Running preparation pipeline ---
Running loader pipe --> PersonalityPostLoader
Running splitter pipe --> PostsSplitter
Ignored 5624 texts because of too small chunks
Running digit pipe --> DigitReplacer
Running evenfier pipe --> EvenlyDistributor
Running traintest pipe --> TrainTestSplitter
Train size: Counter({'Introvert': 6728, 'Extrovert': 6727})
Test size: Counter({'Extrovert': 2884, 'Introvert': 2883})
--- Running modelling pipeline ---
--- Running evaluation pipeline ---
Running eval pipe --> Evaluator
0.6781688919715624
[0.68142808 0.67484233]
[[1985  898]
 [ 958 1926]]


## pipeline analysis

todo
- function to evaluate feature importance using chi2  
- if link in string -> $link (LinkReplacer) 
- better pipeline naming

In [243]:
temp = psystem.pipelines[2]
model = temp.pipeline.payload['model']
X, y, _, _ = temp.pipeline.payload['train_test']

In [244]:
X[5]

['If you want to know the difference between an ENFP and an INFP, watch these two in Nacho Libre $digit',
 'SPOILERS FOR SEASONS $digit  Let’s start with King Ecbert. At first glance, his whole “plan to conquer England” might SOUND like dominant Te. But it isn’t as much an action derived from organization...',
 '*Cracks knuckles and grins*  ISTP: Ragnar ESTP: King Ecbert (Am I the only one who sees this??), Jarl Borg INFJ: Helga, Yidu (?), Harbard (a very, very mature INFJ) ENFJ: Athelstan ISFP: Bjorn...',
 "Hi, everyone! I'm trying to figure out my little brother's type, and I'm pretty sure it's ENTJ, but I'm not totally sure because he only exhibits ENTJ-ness half the time. I'm probably going to go...",
 "Alright, come on, you two. Let's play nice. I'm glad you're both passionate about the characters *to self* though, it was kind of intended to be a Dick Grayson thread...* but let's be fair. And...",
 'Okay. That makes sense. The whole wrapping things around a central idea thing, rig

In [255]:
[y for x in X for y in x if 'www.' in y]

['Here are some examples that I have found of different ENFJ Enneagram types through celebrity videos.   Type $digit Striving to be Perfect  Brene Brown   https://www.youtube.com/watch?v=hYRtdoBMQyk',
 'http://www.youtube.com/watch?v=FUmrBNmhvJM',
 'I used to love it when this came on:   https://www.youtube.com/watch?v=SPlQpGeTbIE  And apparently, I used to love this as a baby lol   $digit',
 'https://www.youtube.com/watch?v=cntvEDbagAw',
 'Bienvenidos  https://www.youtube.com/watch?v=FKYavPyurPw&feature=player_detailpage',
 'http://www.youtube.com/watch?v=ZgOJZdlSUCA',
 'https://www.youtube.com/watch?v=DpelSFTFkHc',
 'https://www.youtube.com/watch?v=wZngpLsUQos',
 'intj $digit > $digit > $digit sx/sp.    https://www.youtube.com/watch?v=fnfztDYfxqQ',
 'Awww. :D  PS: Your new avatar scares me.',
 'kinda repetitive but lol, so ESTP  http://www.youtube.com/watch?v=ABhDiXbUaBE',
 'Awwww... Thanks one of my favorite perC members, I enjoy your company as well ;)',
 "https://www.youtube.com/w

In [250]:
matrix = model.named_steps['features'].transform([X[1]])
matrix

<1x1810 sparse matrix of type '<class 'numpy.float64'>'
	with 119 stored elements in Compressed Sparse Row format>

In [253]:
fu = model.named_steps['features']
feature_names = fu.transformer_list[0][1].named_steps['vect'].get_feature_names()
[x for x in feature_names if 'www' in x]

['www']

In [252]:
fu = model.named_steps['features']
feature_names = fu.transformer_list[0][1].named_steps['vect'].get_feature_names()
feature_names

['$digit',
 'ability',
 'able',
 'about',
 'above',
 'absolute',
 'absolutely',
 'abstract',
 'accept',
 'according',
 'account',
 'accurate',
 'achieve',
 'across',
 'act',
 'acting',
 'action',
 'actions',
 'active',
 'activities',
 'actual',
 'actually',
 'add',
 'admit',
 'advice',
 'affect',
 'afraid',
 'after',
 'again',
 'against',
 'age',
 'aggressive',
 'ago',
 'agree',
 'agreed',
 'ah',
 'ahead',
 'air',
 'alcohol',
 'alive',
 'all',
 'allow',
 'allowed',
 'almost',
 'alone',
 'along',
 'alot',
 'already',
 'alright',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'an',
 'analysis',
 'and',
 'anger',
 'angry',
 'animal',
 'animals',
 'anime',
 'annoyed',
 'annoying',
 'another',
 'answer',
 'answers',
 'anti',
 'anxiety',
 'any',
 'anybody',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apologize',
 'apparently',
 'appear',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'are',
 '

In [131]:
len(feature_names)

71799

In [88]:
matrix.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.86190476]])

In [132]:
len(matrix.toarray())

1

In [141]:
X[1]

["I agree with this... I don't keep friends without trying it seems like, well more so maintaining closeness. But this year I'm making a few new friends and joined a band.",
 "I don't think through my feelings, I normal do nothing and just feel them raw, then sort them out later.Sign Up for Facebook | Facebook",
 "People just don't agree with how to be spiritual without religion, and some people need to know how because we aren't all christians. Plus I'm being attacked multiple times saying it's some morality...",
 "I wouldn't say I'm a total outside but most of my friends aren't normal or kinda nerdy.",
 'Another friend I used to hang out a ton like avoids me or is obsessed since I told him I was an atheist (probably a few months ago?) I was christian until the beginning of this year or so but even...',
 "Once your somewhere new they always do that... hahaha. A lot of times you meet someone just as good or better though. But it sucks when you lose them.  I think I'm lucky to be introv

In [140]:
import pandas as pd

feat_frame = pd.DataFrame(list(zip(feature_names, matrix.toarray()[0])), columns=['tok', 'nr'])
feat_frame[feat_frame.nr != 0]

Unnamed: 0,tok,nr
5789,95,1.0
6824,acquaintances,1.0
7425,ago,1.0
7439,agree,2.0
7854,all,1.0
8052,always,1.0
8229,amount,1.0
8281,an,1.0
8629,another,1.0
9206,aren,2.0


In [77]:
from sklearn.feature_selection import chi2
chi2(matrix, y)

(array([0.00343467, 0.03347686, 0.15374042, ..., 0.13024521, 0.30932743,
        0.22906454]),
 array([0.95326589, 0.85482404, 0.69498621, ..., 0.71817793, 0.5780932 ,
        0.63221833]))