In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../')

## pipeline system

In [341]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline as ModelPipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from hatstall.pipes import Pipeline as CustomPipeline, PipelineSystem
from hatstall.pipes.engineer import PostsJoiner, AverageWordCalculator, Debugger
from hatstall.pipes.model import Evaluator, TestDataTokenChecker
from hatstall.pipes.loader import PersonalityPostLoader
from hatstall.pipes.preparator import (
    PostsSplitter, EvenlyDistributor, TrainTestSplitter, DigitReplacer, LinkReplacer)

In [342]:
PreparationPipeline = CustomPipeline
EvaluationPipeline = CustomPipeline

psystem = PipelineSystem([
    ('preperation', PreparationPipeline([
        ('loader', PersonalityPostLoader),
        ('splitter', PostsSplitter),
        ('evenfier', EvenlyDistributor),
        ('traintest', TrainTestSplitter)
    ])),
    ('modelling', ModelPipeline([
        ('joiner', PostsJoiner()),
        ('vect', TfidfVectorizer(max_df=0.8, min_df=0.)),
        ('log', LogisticRegression(random_state=123)),
    ])),
    ('evaluation', EvaluationPipeline([
        ('eval', Evaluator)
    ]))
], mode='train_test')
psystem.run()

TypeError: 'tuple' object is not callable

In [359]:
# PreparationPipeline = CustomPipeline
EvaluationPipeline = CustomPipeline

psystem = PipelineSystem([
    ('preperation', PreparationPipeline([
        ('loader', PersonalityPostLoader),
        ('splitter', PostsSplitter),
        ('digit', DigitReplacer),
        ('link', LinkReplacer), 
        ('evenfier', EvenlyDistributor),
        ('traintest', TrainTestSplitter)
    ])),
    ('modelling', ModelPipeline([
        ('features', FeatureUnion([
            ('token', ModelPipeline([
                ('joiner', PostsJoiner()),
                ('vect', CountVectorizer(
                    max_df=1.0, min_df=0.01, 
                    token_pattern='(?u)\\$?\\b\\w\\w+\\b')),
            ])),
            ('mean_word', ModelPipeline([
                ('average', AverageWordCalculator()),
                ('scaler', MinMaxScaler())
            ])),
        ])),
        ('log', LogisticRegression(random_state=123)),
    ])),
    ('evaluation', EvaluationPipeline([
        ('eval', Evaluator)
    ]))
], mode='train_test')
psystem.run()

--- Running preparation pipeline ---
Running loader pipe --> PersonalityPostLoader
Running splitter pipe --> PostsSplitter
Ignored 5624 texts because of too small chunks
Running digit pipe --> DigitReplacer
Running link pipe --> LinkReplacer
Running evenfier pipe --> EvenlyDistributor
Running traintest pipe --> TrainTestSplitter
Train size: Counter({'Introvert': 6728, 'Extrovert': 6727})
Test size: Counter({'Extrovert': 2884, 'Introvert': 2883})
--- Running modelling pipeline ---
--- Running evaluation pipeline ---
Running eval pipe --> Evaluator
0.6823305011271025
[0.68543956 0.67915937]
[[1996  887]
 [ 945 1939]]


In [363]:
# PreparationPipeline = CustomPipeline
EvaluationPipeline = CustomPipeline

psystem = PipelineSystem([
    ('preperation', PreparationPipeline([
        ('loader', PersonalityPostLoader),
        ('splitter', PostsSplitter),
        ('digit', DigitReplacer),
        ('link', LinkReplacer), 
        ('evenfier', EvenlyDistributor),
        ('traintest', TrainTestSplitter)
    ])),
    ('modelling', ModelPipeline([
        ('features', FeatureUnion([
            ('token', ModelPipeline([
                ('joiner', PostsJoiner()),
                ('vect', CountVectorizer(
                    max_df=1.0, min_df=0.01,
                    token_pattern='(?u)\\$?\\b\\w\\w+\\b')),
            ])),
            ('mean_word', ModelPipeline([
                ('average', AverageWordCalculator()),
                ('scaler', MinMaxScaler())
            ])),
        ])),
        ('log', LogisticRegression(random_state=123)),
    ])),
    ('evaluation', EvaluationPipeline([
        ('token_checker', TestDataTokenChecker),
        ('eval', Evaluator)
    ]))
], mode='train_test')
psystem.run()

--- Running preparation pipeline ---
Running loader pipe --> PersonalityPostLoader
Running splitter pipe --> PostsSplitter
Ignored 5624 texts because of too small chunks
Running digit pipe --> DigitReplacer
Running link pipe --> LinkReplacer
Running evenfier pipe --> EvenlyDistributor
Running traintest pipe --> TrainTestSplitter
Train size: Counter({'Introvert': 6728, 'Extrovert': 6727})
Test size: Counter({'Extrovert': 2884, 'Introvert': 2883})
--- Running modelling pipeline ---
--- Running evaluation pipeline ---
Running token_checker pipe --> TestDataTokenChecker
Removing 2346 out of 5767 test examples because model knows to few words in test example.
Running eval pipe --> Evaluator
0.6471791873721134
[0.6522616  0.64194601]
[[1132  595]
 [ 612 1082]]


## pipeline analysis

todo
- function to evaluate feature importance using chi2  
- if link in string -> $link (LinkReplacer) 
- better pipeline naming

In [357]:
import numpy as np
aa = np.array([1,2,3,4,5,6,7, 8,1,2])
sum(aa < 5)
list(aa[aa < 5])

[1, 2, 3, 4, 1, 2]

In [272]:
temp = psystem.pipelines[2]
model = temp.pipeline.payload['model']
X, y, _, _ = temp.pipeline.payload['train_test']

In [273]:
X[5]

["NewMango  I don't really know what that means or what the difference is, but.. Do explain, please.   I know that he told me that he was honest and open (the whole just ask kind of thing), so...",
 "Not sure if anyone was keeping up with my questions about an INTJ romantic relationship prospect, but......  UPDATE: Two weeks and totally ghosted.  So.. Woohoo. There's that. I thought you...",
 "I want to know about each encounter. How did it work out? Let's compare and contrast! We've all read the stereotypes.. The most common being that INTJs, ENTJs, and (for whatever reason) ISTJs are our...",
 "Right now, I'm feeling very frustrated. Slightly restless.  I'm tired of being let down by people.  I met an INTJ a few weeks ago that described himself as being honest and open (the whole just...",
 "That's kind of interesting.. I too feel like I can often switch between a few different modes, but I think it's more of a reaction to the general vibe that we pick up on. My family also perceives

In [274]:
[z for x in X for y in x for z in y.split() if 'www.' in z]

['Awww.',
 'Awww.',
 'awww.',
 'awwwwwwwww.',
 'awww.',
 "D'awwww...",
 'Awww...',
 'awwww....',
 'Awwww...',
 'Wowwwww.',
 'Awww..',
 'Awww....paranoid,',
 'Awwww.',
 'Awww....',
 'Awww.',
 'Owwwww....sorry',
 'Awww.',
 'awww.',
 'Awwwww.',
 'awwww.',
 'ewww...babies.',
 'AWwwwwwww....',
 'Awww.',
 'Awwww...',
 "D'awww....",
 'awwwwwwwww.',
 'awww...']

In [275]:
matrix = model.named_steps['features'].transform([X[1]])
matrix

<1x1820 sparse matrix of type '<class 'numpy.float64'>'
	with 86 stored elements in Compressed Sparse Row format>

In [276]:
fu = model.named_steps['features']
feature_names = fu.transformer_list[0][1].named_steps['vect'].get_feature_names()
[x for x in feature_names if 'www' in x]

[]

In [314]:
test = [x for x in fu.transformer_list if x[0] == 'tifdf'][0]
test[1].named_steps['vect'].get_feature_names()

['$digit',
 '$link',
 'ability',
 'able',
 'about',
 'above',
 'absolute',
 'absolutely',
 'abstract',
 'accept',
 'according',
 'account',
 'accurate',
 'across',
 'act',
 'acting',
 'action',
 'actions',
 'active',
 'activities',
 'actual',
 'actually',
 'add',
 'admire',
 'admit',
 'adult',
 'advice',
 'affect',
 'afraid',
 'after',
 'again',
 'against',
 'age',
 'aggressive',
 'ago',
 'agree',
 'agreed',
 'ah',
 'ahead',
 'air',
 'alcohol',
 'alive',
 'all',
 'allow',
 'allowed',
 'almost',
 'alone',
 'along',
 'alot',
 'already',
 'alright',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'an',
 'analysis',
 'and',
 'anger',
 'angry',
 'animal',
 'animals',
 'anime',
 'annoyed',
 'annoying',
 'another',
 'answer',
 'answering',
 'answers',
 'anxiety',
 'anxious',
 'any',
 'anybody',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apologize',
 'apparently',
 'appear',
 'apply',
 'appreciate',
 'a

In [316]:
jep = test[1].named_steps['vect']
jep.token_pattern

'(?u)\\$?\\b\\w\\w+\\b'

In [336]:
test1 = ['the', 'yes', 'nice']

test2 = [['the, yes', 'nice! day blub'], ['the', 'yes', 'nice']]

test2 = [' '.join(x) for x in test2]

In [337]:
test2

['the, yes nice! day blub', 'the yes nice']

In [338]:
import re
test2 = [re.findall('(?u)\\$?\\b\\w\\w+\\b', x) for x in test2]
#test2 = [x for y in test2 for x in y]

In [339]:
test2

[['the', 'yes', 'nice', 'day', 'blub'], ['the', 'yes', 'nice']]

In [335]:
set(test2)

{'blub', 'day', 'nice', 'the', 'yes'}

In [328]:
len(set(test2) - set(test1))

2

In [330]:
(len(set(test2) - set(test1))) / len(test2)

0.4

In [278]:
fu.transformer_list

[('tifdf', Pipeline(memory=None,
       steps=[('joiner', PostsJoiner()), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
          ngram_range=(1, 1), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\$?\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None))])),
 ('mean_word', Pipeline(memory=None,
       steps=[('average', AverageWordCalculator()), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1)))]))]

In [277]:
fu = model.named_steps['features']
feature_names = fu.transformer_list[0][1].named_steps['vect'].get_feature_names()
feature_names

['$digit',
 '$link',
 'ability',
 'able',
 'about',
 'above',
 'absolute',
 'absolutely',
 'abstract',
 'accept',
 'according',
 'account',
 'accurate',
 'across',
 'act',
 'acting',
 'action',
 'actions',
 'active',
 'activities',
 'actual',
 'actually',
 'add',
 'admire',
 'admit',
 'adult',
 'advice',
 'affect',
 'afraid',
 'after',
 'again',
 'against',
 'age',
 'aggressive',
 'ago',
 'agree',
 'agreed',
 'ah',
 'ahead',
 'air',
 'alcohol',
 'alive',
 'all',
 'allow',
 'allowed',
 'almost',
 'alone',
 'along',
 'alot',
 'already',
 'alright',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'an',
 'analysis',
 'and',
 'anger',
 'angry',
 'animal',
 'animals',
 'anime',
 'annoyed',
 'annoying',
 'another',
 'answer',
 'answering',
 'answers',
 'anxiety',
 'anxious',
 'any',
 'anybody',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apologize',
 'apparently',
 'appear',
 'apply',
 'appreciate',
 'a

In [131]:
len(feature_names)

71799

In [88]:
matrix.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.86190476]])

In [132]:
len(matrix.toarray())

1

In [141]:
X[1]

["I agree with this... I don't keep friends without trying it seems like, well more so maintaining closeness. But this year I'm making a few new friends and joined a band.",
 "I don't think through my feelings, I normal do nothing and just feel them raw, then sort them out later.Sign Up for Facebook | Facebook",
 "People just don't agree with how to be spiritual without religion, and some people need to know how because we aren't all christians. Plus I'm being attacked multiple times saying it's some morality...",
 "I wouldn't say I'm a total outside but most of my friends aren't normal or kinda nerdy.",
 'Another friend I used to hang out a ton like avoids me or is obsessed since I told him I was an atheist (probably a few months ago?) I was christian until the beginning of this year or so but even...',
 "Once your somewhere new they always do that... hahaha. A lot of times you meet someone just as good or better though. But it sucks when you lose them.  I think I'm lucky to be introv

In [140]:
import pandas as pd

feat_frame = pd.DataFrame(list(zip(feature_names, matrix.toarray()[0])), columns=['tok', 'nr'])
feat_frame[feat_frame.nr != 0]

Unnamed: 0,tok,nr
5789,95,1.0
6824,acquaintances,1.0
7425,ago,1.0
7439,agree,2.0
7854,all,1.0
8052,always,1.0
8229,amount,1.0
8281,an,1.0
8629,another,1.0
9206,aren,2.0


In [77]:
from sklearn.feature_selection import chi2
chi2(matrix, y)

(array([0.00343467, 0.03347686, 0.15374042, ..., 0.13024521, 0.30932743,
        0.22906454]),
 array([0.95326589, 0.85482404, 0.69498621, ..., 0.71817793, 0.5780932 ,
        0.63221833]))