# Transformers

Real-life datasets often have many features whose values span very different ranges. For example, consider a house price prediction dataset: it will have features such as the number of bedrooms, the area of the house in square feet, etc.

As you can guess, the number of bedrooms will vary between 1 and 5, while the area in square feet will range from 500 to 2000. That is a huge difference between the ranges of the two features.

Many machine learning algorithms that use Euclidean distance to measure similarity will fail to give reasonable weight to the feature with the smaller range (in this case, the number of bedrooms), even though in reality it may turn out to be an important predictor.

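E.g.: KNN (distance-based), and also Linear Regression and Logistic Regression, which become sensitive to feature scale when they are regularized or trained with gradient descent.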

### Importing the necessary libraries

In [10]:
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier,GradientBoostingClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute._base import SimpleImputer as Imputer
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [11]:
# Load the Titanic training data and drop PassengerId, since it is only a
# unique identifier.

titanic_df = pd.read_csv('train.csv').drop(columns='PassengerId')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Show the dtype of every remaining column.
titanic_df.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [13]:
# Split the data into train and test halves.
# random_state is fixed so that the split -- and therefore every fitted
# statistic and output below -- is the same after Restart Kernel -> Run All.
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

In [14]:
# Columns treated as continuous numeric features.
# (Pclass, SibSp and Parch are also integer-typed, but they are handled as
# categories in the list below.)

numeric_features = ['Age', 'Fare']


# Qualitative columns
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']


# Text columns, each handled by its own custom transformer.
name_feature = ['Name']
cabin_feature = ['Cabin']

In [15]:
# Numeric columns: replace nulls with the column median, then apply
# standard scaling.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)


# Categorical columns: replace nulls with the constant 'missing', then
# one-hot encode the categories.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

Use only the first character of the cabin value as the cabin information, replacing missing values with 'U'.  
Then create dummy variables to encode this categorical column.

In [27]:
# creating a custom transformer for the Cabin column

class CabinFeatureTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the first character of the ``Cabin`` column.

    Missing cabins are replaced with ``'U'`` (unknown) before the first
    character is taken.  ``fit`` records the resulting ``Cabin_<letter>``
    dummy columns in ``cabin_columns``; ``transform`` reindexes to that
    set, so every transformed frame has the same columns in the same
    order.  Letters not seen during ``fit`` are dropped, and letters seen
    during ``fit`` but absent from the batch become all-zero columns.

    Neither ``fit`` nor ``transform`` modifies the frame passed in.
    """

    def __init__(self):
        print('in the CabinFeatureTransformer init method: ')

    @staticmethod
    def _cabin_dummies(x):
        """Return the ``Cabin_<letter>`` indicator frame built from ``x['Cabin']``."""
        # Derive a new Series instead of filling/assigning on ``x`` itself:
        # the chained ``x.Cabin.fillna(..., inplace=True)`` form mutates the
        # caller's data on older pandas and silently does nothing under
        # Copy-on-Write, which leaves NaN values behind.
        cabin_letters = x['Cabin'].fillna('U').str[0]
        # An explicit integer dtype keeps the indicator columns the same
        # dtype as the zero-filled columns that ``reindex`` may add later.
        return pd.get_dummies(cabin_letters, prefix='Cabin', dtype=int)

    def fit(self, x, y=None):
        """Record which cabin dummy columns exist in the training data.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Cabin`` column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.cabin_columns = self._cabin_dummies(x).columns
        return self

    def transform(self, x):
        """Replace ``Cabin`` with the dummy columns learned in ``fit``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Cabin`` column.

        Returns
        -------
        pandas.DataFrame
            A new frame: ``x`` without ``Cabin``, followed by one indicator
            column per entry of ``cabin_columns``.
        """
        cabin_dummies = self._cabin_dummies(x).reindex(
            columns=self.cabin_columns, fill_value=0)

        return pd.concat([x.drop(columns='Cabin'), cabin_dummies], axis=1)

Creating a custom transformer for the Name column.  
We extract the title from each name (e.g. Major, Capt, Mme) and map it to a broader group (e.g. Officer, Royalty, Mrs).

In [28]:
class NameFeatureTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the title group extracted from the ``Name`` column.

    The title is the text between the first comma and the following period
    of each name (``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).  It is mapped
    through ``TITLE_GROUPS`` to a coarser group; titles missing from that
    mapping produce an all-zero row.

    ``fit`` records the ``Title_<group>`` dummy columns in
    ``title_columns`` and ``transform`` reindexes to them, so train and
    test batches always yield the same number of columns in the same
    order even when a group is absent from one of them.

    Neither ``fit`` nor ``transform`` modifies the frame passed in.
    """

    # Raw title parsed from the name -> title group used for encoding.
    TITLE_GROUPS = {
        "Capt": "Officer", "Col": "Officer", "Major": "Officer", "Jonkheer": "Royalty",
        "Don": "Royalty", "Sir": "Royalty", "Dr": "Officer", "Rev": "Officer",
        "the Countess": "Royalty", "Mme": "Mrs", "Mlle": "Miss", "Ms": "Mrs",
        "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master",
        "Lady": "Royalty"}

    def __init__(self):
        print('in the NameFeatureTransformer Init method: ')

    @classmethod
    def _title_dummies(cls, x):
        """Return the ``Title_<group>`` indicator frame built from ``x['Name']``."""
        raw_titles = x['Name'].map(
            lambda name: name.split(',')[1].split('.')[0].strip())
        title_groups = raw_titles.map(cls.TITLE_GROUPS)
        # An explicit integer dtype keeps the indicator columns the same
        # dtype as the zero-filled columns that ``reindex`` may add later,
        # so ``.values`` yields a plain numeric array.
        return pd.get_dummies(title_groups, prefix='Title', dtype=int)

    def fit(self, x, y=None):
        """Record which title dummy columns exist in the training data.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Name`` column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.title_columns = self._title_dummies(x).columns
        return self

    def transform(self, x):
        """Replace ``Name`` with title-group indicator columns.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame containing a ``Name`` column.

        Returns
        -------
        numpy.ndarray
            Values of ``x`` without ``Name``, followed by the title
            indicator columns.
        """
        titles_dummies = self._title_dummies(x)

        # Align to the columns learned in ``fit``.  An instance that was
        # never fitted keeps the batch's own columns, which is what this
        # transformer produced before it learned anything in ``fit``.
        title_columns = getattr(self, 'title_columns', None)
        if title_columns is not None:
            titles_dummies = titles_dummies.reindex(
                columns=title_columns, fill_value=0)

        encoded = pd.concat([x.drop(columns='Name'), titles_dummies], axis=1)
        return encoded.values

A single `ColumnTransformer` combines all the transformers created above, each mapped to its respective columns.

In [29]:
# Each entry is (step name, transformer, columns that transformer receives).
preprocessing_steps = [
    ('numeric_data_preprocessing', numeric_transformer, numeric_features),
    ('categorical_data_preprocessing', categorical_transformer, categorical_features),
    ('cabin_data_preprocessing', CabinFeatureTransformer(), cabin_feature),
    ('name_data_preprocessing', NameFeatureTransformer(), name_feature),
]

transformer = ColumnTransformer(transformers=preprocessing_steps)

in the CabinFeatureTransformer init method: 
in the NameFeatureTransformer Init method: 


In [30]:
# Fit the transformer on the training split and return the transformed
# training data.

transformer.fit_transform(X_train)

in the CabinFeatureTransformer init method: 
in the NameFeatureTransformer Init method: 


array([[-0.79693172, -0.47182993,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.87433616,  0.38903233,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.1002918 ,  0.69775126,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.87433616, -0.45642589,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.36413482,  1.08153862,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-1.26135834,  0.1373185 ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [31]:
import pickle

In [35]:
# Serializing trained transformer object for further use

In [36]:
# Write the fitted transformer to disk in binary mode.
with open('transformer.pkl', mode='wb') as pickle_file:
    pickle.dump(transformer, pickle_file)

In [37]:
# reading back the serialized object 

In [33]:
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise trust.
with open('transformer.pkl', mode='rb') as pickle_file:
    saved_transformer = pickle.load(pickle_file)

In [38]:
# transforming the test data from serialized object

In [39]:
# Only transform() is called here: the test split is not used for fitting.
saved_transformer.transform(X_test)

array([[-0.40990954,  0.43297927,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05451708,  1.51834306,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.87433616, -0.4166246 ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.26135834, -0.46010048,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.44153926, -0.6075011 ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.79693172, -0.47519299,  0.        , ...,  0.        ,
         0.        ,  0.        ]])