# Pipelines

In [1]:
from dao import DataAccess

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
import numpy as np

In [3]:
# Load the working dataset through the project's DataAccess helper.
# NOTE(review): presumably a pandas DataFrame (the cell below calls .head()) — confirm.
X = DataAccess.as_dataframe()

In [4]:
# Preview the first rows to check which columns are available to the pipelines.
X.head()

Unnamed: 0_level_0,created_at,labels,predict,text,user
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
556e0ee3d6dfbb462880f0a5,Tue Jun 02 20:16:08 +0000 2015,{'alcohol': 0},0.52605,Impatiently waiting to get our hands on the ne...,"{'statuses_count': 823, 'favourites_count': 39..."
556e128ad6dfbb46288111e4,Tue Jun 02 20:31:44 +0000 2015,{'alcohol': 1},0.516649,Beer fans need their @ColumbusBrewing Bodhi. I...,"{'statuses_count': 10442, 'favourites_count': ..."
556e1464d6dfbb4628812330,Tue Jun 02 20:39:37 +0000 2015,{'alcohol': 1},0.502633,Stone Cold use to be the baddest MF in my book...,"{'statuses_count': 89573, 'favourites_count': ..."
556e15f1d6dfbb4628813236,Tue Jun 02 20:46:14 +0000 2015,{'alcohol': 1},0.535758,Now @iamjohnoliver has to drink a Bud Light Li...,"{'statuses_count': 16435, 'favourites_count': ..."
556e1adcd6dfbb50e34a1ed6,Tue Jun 02 21:07:13 +0000 2015,{'alcohol': 0},0.533892,I'm ready for a yard sale and to sell all the...,"{'statuses_count': 32154, 'favourites_count': ..."


In [5]:
class ItemGetter(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects one item from its input by key.

    ``transform`` returns ``X[self.key]``, so it can be used as the first
    step of a pipeline to pick a single column out of the input.

    Parameters
    ----------
    key
        Value used to index the input, i.e. ``X[key]``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so calls can be chained.

        Returning ``self`` (rather than ``None``) is what the scikit-learn
        estimator contract expects and makes ``fit(X).transform(X)`` work.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X[self.key]``."""
        return X[self.key]

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return ``X[self.key]``."""
        return self.fit(X, y).transform(X)

## Text Pipeline

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from twokenize import tokenize

# Candidate steps of the text feature pipeline, in execution order.
text_pipe = [
    # Select the "text" column from the input.
    ("text", ItemGetter("text")),
    # TF-IDF over character n-grams of length 2 to 5.
    (
        "tfidf",
        TfidfVectorizer(
            analyzer="char",
            ngram_range=(2, 5),
            min_df=10,
            max_df=.98,
        ),
    ),
    # Truncated SVD step; defined here but not included in the pipeline below.
    ("lsi", TruncatedSVD(n_components=3000)),
]

# Only the first two steps ("text", "tfidf") are assembled into the pipeline.
text_pipeline = Pipeline(text_pipe[:2])

In [32]:
%time
t = text_pipeline.fit_transform(X)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs


In [34]:
%time
t = text_pipeline.transform(X)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.91 µs


## Time Vectorizers

In [44]:
import pandas as pd

class DateTimeIndexTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that wraps its input in a ``pandas.DatetimeIndex``."""

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so calls can be chained.

        Returning ``self`` (rather than ``None``) is what the scikit-learn
        estimator contract expects and makes ``fit(X).transform(X)`` work.
        """
        return self

    def transform(self, X, y=None):
        """Return ``pd.DatetimeIndex(X)``."""
        return pd.DatetimeIndex(X)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return ``pd.DatetimeIndex(X)``."""
        return self.fit(X, y).transform(X)

In [45]:
# Steps of the time feature pipeline, in execution order.
time_pipe = [
    # Select the "created_at" column from the input.
    ("get_created_at", ItemGetter("created_at")),
    # Convert it to a DatetimeIndex. The transformer defined in this notebook is
    # DateTimeIndexTransformer; the previous name, DateTimeTransformer, is not
    # defined anywhere here and raised NameError on a fresh kernel.
    ("to_datetime", DateTimeIndexTransformer()),
]

time_pipeline = Pipeline(time_pipe)

In [49]:
%time

x = time_pipeline.fit_transform(X)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs


In [104]:
from scipy.sparse import csc_matrix

import pandas as pd

class DatetimeVectorizor(BaseEstimator, TransformerMixin):
    """One-hot encode a calendar component of a datetime index as a sparse matrix.

    The input to ``transform`` must expose ``dayofweek`` and ``hour``
    attributes and support ``len()`` (e.g. a ``pandas.DatetimeIndex``).

    Parameters
    ----------
    kind : {"dayofweek", "hour", "hour*dayofweek"}
        Which component to encode. ``"hour*dayofweek"`` encodes the combined
        slot ``dayofweek * 24 + hour``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the allowed values.
    """

    allowed_kinds = {"dayofweek", "hour", "hour*dayofweek"}

    def __init__(self, kind):
        if kind not in self.allowed_kinds:
            # ValueError is a subclass of Exception, so existing handlers still work.
            raise ValueError(
                'Kind must be one of "dayofweek", "hour", or "hour*dayofweek"'
            )
        self.kind = kind

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return an ``(len(X), width)`` CSC matrix with a single 1 per row.

        ``width`` is fixed by ``kind`` (7, 24 or 168) instead of being derived
        from the largest value present in ``X``. A data-dependent width gives
        different column counts for different inputs (e.g. train vs. test)
        and requires calling ``max()`` on the column, which fails when ``X``
        is empty.
        """
        n = len(X)
        if self.kind == "dayofweek":
            col, width = X.dayofweek, 7
        elif self.kind == "hour":
            col, width = X.hour, 24
        else:
            # Combined slot index: 24 hourly slots for each of the 7 days.
            col, width = X.dayofweek * 24 + X.hour, 7 * 24
        col = np.asarray(col)
        row = np.arange(n)
        data = np.ones(n)
        return csc_matrix((data, (row, col)), shape=(n, width))

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return ``transform(X)``."""
        return self.fit(X, y).transform(X)
    

