# Scikit-Learn PIPELINES: variable selection (categorical / numerical)

In [4]:
import os
import sys
sys.path.append('../')  
from tools.reader import csv2df, get_dcol

### read data

In [3]:
# Datetime options handed to csv2df: the column(s) to treat as timestamps and
# their format string (presumably strptime-style -- confirm in tools.reader).
ddt = dict(
    lcol=['dt'],
    sformat='%Y-%m-%d %H:%M:%S',
)

# Load the solar dataset; 'y' and 'cy' are passed as targets and 'dt' as index.
data, dcol = csv2df(
    '../../datasets/dataset.solar.csv',
    ltarget=['y', 'cy'],
    ddt=ddt,
    lindex=['dt'],
)

# Quick overview of columns, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 34775 entries, 2012-01-01 00:00:00 to 2015-12-31 23:00:00
Data columns (total 24 columns):
hforecast     34775 non-null int64
VGRD267       34775 non-null float64
UGRD267       34775 non-null float64
LCDC267       34775 non-null float64
MCDC267       34775 non-null float64
HCDC267       34775 non-null float64
TCDC267       34775 non-null float64
PRES267       34775 non-null float64
RH267         34775 non-null float64
TMP267        34775 non-null float64
APCP267       34775 non-null float64
HWS267        34775 non-null float64
cLCDC267      34775 non-null int64
cMCDC267      34775 non-null int64
cHCDC267      34775 non-null int64
cTCDC267      34775 non-null int64
logAPCP267    34775 non-null float64
cAPCP267      34775 non-null category
year          34775 non-null int64
month         34775 non-null int64
hour          34775 non-null int64
doy           34775 non-null int64
y             34775 non-null float64
cy            34775 no

## PIPELINES

In [27]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from pipelines.custom import *
import numpy as np

### only numeric

In [12]:
# Pipeline restricted to boolean and numerical columns.
# Two branches run side by side inside a FeatureUnion and their outputs are
# concatenated column-wise:
#   * 'boolean'    -> selects bool columns and passes them through untouched
#   * 'numericals' -> selects numeric columns and standardises them
num_pipeline = Pipeline(steps=[
    ('features', FeatureUnion(
        transformer_list=[
            ('boolean', Pipeline(steps=[
                ('selector', TypeSelector('bool')),
            ])),
            ('numericals', Pipeline(steps=[
                ('selector', TypeSelector(np.number)),
                ('scaler', StandardScaler()),
            ])),
        ],
        n_jobs=-1,  # use every available core for the branches
    )),
])

In [37]:
# 'cAPCP267' is categorical and is included to show that the numeric pipeline
# ignores it: the output below has only two columns (the scaled numeric ones).
df = data.loc[:, ['cAPCP267', 'TMP267', 'PRES267']]
X = num_pipeline.fit_transform(df)

# Peek at (up to) the first five output columns.
X[:, :5]

array([[-1.36419905,  1.40019561],
       [-1.39772874,  1.38778641],
       [-1.43423885,  1.35469522],
       ..., 
       [-0.99872543,  0.9837981 ],
       [-1.07733393,  1.03895008],
       [-1.12998796,  1.07617768]])

### only categorical (Option 1)

In [23]:
# Option 1: categorical columns only, using scikit-learn's OneHotEncoder.
#   selector -> keep 'category' dtype columns
#   labeler  -> StringIndexer from pipelines.custom (presumably maps labels to
#               integer codes -- confirm against its implementation)
#   encoder  -> one-hot encode; categories unseen at fit time are ignored
cat_pipeline = Pipeline(steps=[
    ('selector', TypeSelector('category')),
    ('labeler', StringIndexer()),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [26]:
# Fit and apply the categorical pipeline on the single categorical column.
df = data.loc[:, ['cAPCP267']]
X = cat_pipeline.fit_transform(df)

# The encoder output supports .toarray() (sparse); densify a slice of up to
# five columns for display.
X[:, :5].toarray()

array([[ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       ..., 
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.]])

### only categorical (Option 2)

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame with ``pandas.get_dummies``, pipeline-friendly.

    ``fit`` records the dummy columns (and their dtypes) produced on the
    training data.  ``transform`` then aligns its output to that layout, so the
    number and order of output columns stay stable even when the data being
    transformed contains fewer or different category values than the training
    data.  Columns for values unseen during ``fit`` are dropped; columns for
    values missing at transform time are added and filled with 0.

    When the layout already matches (e.g. ``fit_transform`` on the same frame),
    the result is exactly ``pd.get_dummies(X, ...).values``.  Calling
    ``transform`` on an instance that was never fitted keeps the stateless
    behaviour: the raw ``get_dummies`` output is returned.

    Parameters
    ----------
    prefix_sep : str, default '_'
        Forwarded to ``pd.get_dummies``.
    dummy_na : bool, default False
        Forwarded to ``pd.get_dummies``.
    drop_first : bool, default True
        Forwarded to ``pd.get_dummies``.

    Attributes
    ----------
    columns_ : pandas.Index
        Dummy column labels seen during ``fit``.
    dtypes_ : dict
        Mapping column label -> dtype of the dummy frame seen during ``fit``.
    """

    def __init__(self, prefix_sep='_',dummy_na=False, drop_first=True):
        self.prefix_sep = prefix_sep
        self.dummy_na = dummy_na
        self.drop_first = drop_first

    def _encode(self, X):
        """Validate ``X`` and return its ``get_dummies`` DataFrame."""
        # Explicit raise instead of ``assert``: asserts vanish under ``python -O``.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                'MyOneHotEncoder expects a pandas DataFrame, got %s'
                % type(X).__name__
            )
        return pd.get_dummies(
            X,
            prefix_sep=self.prefix_sep,
            dummy_na=self.dummy_na,
            drop_first=self.drop_first,
        )

    def fit(self, X, y=None):
        """Learn the dummy-column layout of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        encoded = self._encode(X)
        self.columns_ = encoded.columns
        self.dtypes_ = encoded.dtypes.to_dict()
        return self

    def transform(self, X):
        """One-hot encode ``X`` and return the result as a NumPy array.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode.

        Returns
        -------
        numpy.ndarray
            Encoded values, aligned to the columns learned in ``fit`` when the
            encoder has been fitted.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        encoded = self._encode(X)
        fitted_columns = getattr(self, 'columns_', None)
        # Only realign when the layout actually differs, so the common case
        # returns the untouched get_dummies output.
        if fitted_columns is not None and not encoded.columns.equals(fitted_columns):
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
            # Newly added columns come back as plain ints; restore the fitted
            # dtypes so ``.values`` does not degrade to an object array.
            encoded = encoded.astype(self.dtypes_)
        return encoded.values
        

In [34]:
# Option 2: categorical columns only, encoded with the pandas-based
# MyOneHotEncoder instead of StringIndexer + OneHotEncoder.
cat_pipeline = Pipeline(steps=[
    ('selector', TypeSelector('category')),
    ('encoder', MyOneHotEncoder()),
])

In [36]:
# 'TMP267' is numeric and is included to show that the categorical pipeline
# keeps only 'cAPCP267' (the output below has just the dummy columns).
df = data[['cAPCP267', 'TMP267']]
X = cat_pipeline.fit_transform(df)

# Peek at (up to) the first five output columns.
X[:, :5]

array([[0, 0],
       [0, 0],
       [0, 0],
       ..., 
       [0, 1],
       [0, 0],
       [0, 0]], dtype=uint8)

### numerical + categorical

In [39]:
# Combine both pipelines: each one selects its own column types from the same
# input frame, and FeatureUnion concatenates the two outputs column-wise
# (numerical/boolean features first, then the encoded categorical ones).
full_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [40]:
# One categorical and one numerical column, processed by the combined pipeline.
df = data.loc[:, ['cAPCP267', 'TMP267']]
X = full_pipeline.fit_transform(df)

# Peek at (up to) the first five output columns.
X[:, :5]

array([[-1.36419905,  0.        ,  0.        ],
       [-1.39772874,  0.        ,  0.        ],
       [-1.43423885,  0.        ,  0.        ],
       ..., 
       [-0.99872543,  0.        ,  1.        ],
       [-1.07733393,  0.        ,  0.        ],
       [-1.12998796,  0.        ,  0.        ]])