## Understanding Feature Union Techniques in sklearn

In [1]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as panda
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [2]:
##lets create a simple dataframe

countries = ['Spain','India','China','USA','England','Australia','Burma','Japan','Norway','Germany']

average_height = [5.8,5.7,5.4,6.1,5.8,6.2,5.2,5.3,6.1,6.]

std_deviation = [2,4,5,2,1.2,2,2,1,1.2,4]

population_studied = [20000,540000,1200000,670000,500000,12450,12000,120000,100000,10000]

gender_studied = ['M','M','F','M','M','F','M','M','F','M']


height_data = panda.DataFrame({
    
                'countries':countries,
                'avg_height': average_height,
                'std':std_deviation,
                'count': population_studied,
                'gender':gender_studied
    
})

height_data

Unnamed: 0,countries,avg_height,std,count,gender
0,Spain,5.8,2.0,20000,M
1,India,5.7,4.0,540000,M
2,China,5.4,5.0,1200000,F
3,USA,6.1,2.0,670000,M
4,England,5.8,1.2,500000,M
5,Australia,6.2,2.0,12450,F
6,Burma,5.2,2.0,12000,M
7,Japan,5.3,1.0,120000,M
8,Norway,6.1,1.2,100000,F
9,Germany,6.0,4.0,10000,M


#### We will take the following actions in each column to prepare it for the machine learning model

##### 1. one hot encoding the column gender
##### 2. standardize the column count, avg_height and std
##### 3. drop the column countries

### \** We will perform all these actions in one shot using Pipeline, FeatureUnions and Transformers in sci-kit learn




In [3]:

## handle all numeric columns at one shot since their expected behavior is the same

transformers_for_numeric_columns = [('numerical_standard_scaling',StandardScaler(),['avg_height','std','count'])]

column_transformer_numeric = ColumnTransformer(
                                    transformers = transformers_for_numeric_columns,
#                                     remainder = 'passthrough'
                                )


In [10]:

## drop the countries columns
## we will not need this since our remainder in above first applied feature will be default of drop. so all other
## columns not mentioned will be dropped
drop_countries_transformers = ColumnTransformer(

                                transformers = [('drop_countries', 'drop',['countries'])],
#                                 remainder = 'passthrough'

)


In [5]:

'''
we are going to enable pipelines to work off a single columns in a panda datafram
this particular transformer will enable us to hook this into a feature union which
will return the column which has been mentioned in the contructor
'''

class MyItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.key = column_name

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return df[self.key]
    
    
'''
the normal LabelEncoder in scikit learn only takes one argument in fit,fit_transform which is y.
however our pipelines and our feature unions all expect two columns, x and y.
we will provide this behavior with an extension of the labelencoder class

'''
class MyLabelEncoder(LabelEncoder):
    
    def fit(self, X, y, **fit_params):
        
        # reshape because we are going to append it to a matrix
        # adding a new axis so that we convert a row only data dim (x,)
        # to a two dim array (x,1)
        return super().fit(X)[:,np.newaxis]
    
    def fit_transform(self, X, y, **fit_params):
        
        return super().fit_transform(X)[:,np.newaxis]
    

In [6]:
label = LabelEncoder()
ohe = OneHotEncoder()

pipeline = Pipeline([
    
            ('select_gender_column', MyItemSelector(column_name='gender')),
            ('labelize_gender', MyLabelEncoder()),
            ('encode_gender' , OneHotEncoder())
    
    
])


# # feature_transformer_gender = 
# gender_col_transformer = ColumnTransformer(

#     transformers = [
        
#         ('label', LabelEncoder(), ['gender']),
#         ('encode', OneHotEncoder(), ['gender']),
        
#     ]
#     ,
#     remainder= 'passthrough'
# )

In [7]:

features =[
    
    ('gender_transformation', pipeline),
    ('transform_numeric_columns', column_transformer_numeric),
]


features

[('gender_transformation', Pipeline(memory=None,
       steps=[('select_gender_column', MyItemSelector(column_name=None)), ('labelize_gender', MyLabelEncoder()), ('encode_gender', OneHotEncoder(categorical_features=None, categories=None,
         dtype=<class 'numpy.float64'>, handle_unknown='error',
         n_values=None, sparse=True))])),
 ('transform_numeric_columns',
  ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
           transformer_weights=None,
           transformers=[('numerical_standard_scaling', StandardScaler(copy=True, with_mean=True, with_std=True), ['avg_height', 'std', 'count'])]))]

In [8]:
feature_union = FeatureUnion(features)

In [9]:
##lets do a trial run and see what happens

feature_union.fit_transform(height_data).toarray()

array([[ 0.        ,  1.        ,  0.11826248, -0.33410121, -0.78602775],
       [ 0.        ,  1.        , -0.17739372,  1.18454064,  0.58351917],
       [ 1.        ,  0.        , -1.06436231,  1.94386157,  2.32179027],
       [ 0.        ,  1.        ,  1.00523107, -0.33410121,  0.9259059 ],
       [ 0.        ,  1.        ,  0.11826248, -0.94155795,  0.47816941],
       [ 1.        ,  0.        ,  1.30088727, -0.33410121, -0.80591252],
       [ 0.        ,  1.        , -1.65567471, -0.33410121, -0.80709771],
       [ 0.        ,  1.        , -1.36001851, -1.09342213, -0.52265334],
       [ 1.        ,  0.        ,  1.00523107, -0.94155795, -0.57532823],
       [ 0.        ,  1.        ,  0.70957488,  1.18454064, -0.81236519]])