# Advanced Feature Engineering
<hr style="border:2px solid black">

## 1. Example: Titanic Data

**load packages**

In [568]:
# data analysis stack
import numpy as np
import pandas as pd

# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

**read data**

In [569]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 2.1 Train-Test split

In [570]:
train,test = train_test_split(df, test_size=0.2, random_state=101)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

### 2.2 Quick exploration

In [571]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
1,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S
2,82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29.0,0,0,345779,9.5,,S
3,320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corn...",female,40.0,1,1,16966,134.5,E34,C
4,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S


In [572]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          577 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        157 non-null    object 
 11  Embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.9+ KB


### 2.3 Feature-Target Separation

In [573]:
num_features = [
    'SibSp',
    'Pclass',
    'Age',
    'Fare'
]

cat_features = ['Sex',  
                #'Name', 
                #'Cabin',
                'Embarked']

features = num_features + cat_features

target = 'Survived'

# feature and target columns
X_train,y_train = train[features], train[target]

In [574]:
X_train.head()

Unnamed: 0,SibSp,Pclass,Age,Fare,Sex,Embarked
0,0,2,23.0,13.0,male,S
1,0,1,51.0,26.55,male,S
2,0,3,29.0,9.5,male,S
3,1,1,40.0,134.5,female,C
4,0,2,6.0,33.0,female,S


In [575]:
y_train

0      0
1      1
2      1
3      1
4      1
      ..
707    0
708    1
709    1
710    1
711    0
Name: Survived, Length: 712, dtype: int64

### 2.3 Feature Engineering

**numerical columns**

In [576]:
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaling', StandardScaler()),
    ])

**categorical columns**

In [577]:
# column transformation
cat_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(drop='first'))
    ])

**total preprocessing**

In [578]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features)
    ])

### 2.4 Model Building

**instantiate model**

In [579]:
classifier_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])

**train model**

In [580]:
classifier_model.fit(X_train,y_train)

**model validation**

In [581]:
training_acccuracy = classifier_model.score(X_train,y_train)
print(f"training accuracy: {round(training_acccuracy, 6)}")

training accuracy: 0.801966


### 2.5 Model Evaluation

**feature-target separation**

**model performance**

In [582]:
X_test, y_test = test[features], test[target]
X_test

Unnamed: 0,SibSp,Pclass,Age,Fare,Sex,Embarked
0,0,1,45.5,28.5000,male,S
1,1,1,18.0,227.5250,female,C
2,1,1,19.0,53.1000,male,S
3,0,3,6.0,12.4750,male,S
4,0,2,,0.0000,male,S
...,...,...,...,...,...,...
174,0,3,,7.3125,male,S
175,0,3,,8.0500,male,S
176,0,1,56.0,83.1583,female,C
177,0,3,48.0,7.8542,male,S


In [583]:
classifier_model.fit(X_train,y_train)

In [584]:
test_acccuracy = classifier_model.score(X_test,y_test)
print(f"test accuracy: {round(test_acccuracy, 6)}")


test accuracy: 0.810056


In [585]:

from scipy import sparse as sp

<hr style="border:2px solid black">

## 3. Extra Challenge

### 3.1 Custom Imputer

In [586]:
def title_norm(str_t) -> int:
    '''
    Function does the following transformations:
    ['mrs','mr','miss','master','dr','rev'] remain the same
    ['mlle','ms'] become 'miss'
    'mme' becomes 'mrs'
    ['col','major','capt'] become 'army'
    ['don','lady','the countess','sir','the count','madam','lord'] become 'nobl'
    other titles become 'unknown'
    '''
    title_name = { 1 : ['mlle','ms', 'miss'],
                   2 : ['mme', 'mrs'], 
                   3 : ['mr'],
                   4 : ['col','major','capt', 'army'],
                   5 : ['don','lady','the countess','sir','the count','madam','lord', 'nobl'],
                   6 : ['master'],
                   7 : ['dr'],
                   8 : ['rev'],
    }

    for item in title_name:
        if str_t in title_name[item]:
           
            return item
        
    return 0


In [587]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

In [588]:
class CatergoryImputer(BaseEstimator, TransformerMixin):
    '''
    Parameters
    ----------    
    group_cols : list
        List of columns used for calculating the aggregated value 
    target : str
        The name of the column to impute
    metric : str
        The metric to be used for remplacement, can be one of ['name', 'cabin']
    Returns
    -------
    X : array-like
        The array with imputed values in the target column
    '''
    def __init__(self, group_cols, target, metric='count'):
        
        #assert metric in ['max', 'min', 'count'], 'Unrecognized value for metric, should be mean/median'
        assert type(group_cols) == list, 'group_cols should be a list of columns'
        assert type(target) == str, 'target should be a string'
        
        self.group_cols = group_cols
        self.target = target
        self.metric = metric
    
    def fit(self, X, y=None):
       
       # print(X[self.group_cols])
       # assert pd.isnull(X[self.group_cols]).any(axis=1) == False, 'There are missing values in group_cols'
       # print('Ok')
        #impute_map = X[self.group_cols]
       # X.groupby(self.group_cols)[self.target].agg(self.metric).reset_index(drop=False)
        
        #self.impute_map_ = impute_map
        
       return self 

    
    def transform(self, X, y=None):
        
        # make sure that the imputer was fitted
       # check_is_fitted(self, 'impute_map_')
        
      #  X = X.copy()
      #  X['title'] = X[self.group_cols].apply(lambda x: x.split(',')[1].split('.')[0].lower().strip()) 
      #  X[self.group_cols] = X['title'].apply(title_norm

        return X[self.group_cols]


In [589]:

class CategoryMaker(BaseEstimator, TransformerMixin): 
    
    def __init__(self):
        
        print('\n>>>>>>>Categor init() called.\n')

    def fit(self, X, y = None):

        print('\n>>>>>>>Categor fit() called.\n')
        return self

    def transform(self, X, y = None):
        
        X_ = X.copy() # creating a copy to avoid changes to original dataset
        X_.Name = X.Name.apply(lambda x: x.split(',')[1].split('.')[0].lower().strip()) 
        X_.Name = X_.Name.apply(title_norm)
        print('\n>>>>>>>Categor transform() called.',  X_.Name)
        return X_
        

In [590]:
class ColumnsSelector(BaseEstimator, TransformerMixin):
    # initializer 
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns
        
    def fit(self, X, y = None):
        return self    
    
    def transform(self, X, y = None):
        # return the dataframe with the specified features
        return X[self.columns]
    

class FareScaler(BaseEstimator, TransformerMixin): 
    def __init__(self):
        print('\n>>>>>>>Numeric init() called.\n')

    def fit(self, X, y = None):
        print('\n>>>>>>>Numeric fit() called.\n')
        return self

    def transform(self, X, y = None):
        
        X_ = X.copy() # creating a copy to avoid changes to original dataset
        X_.Fare = (X_.Fare)/(X_.SibSp + 1 + X_.Parch)
        print('\n>>>>>>>Numeric transform() called.\n', X_)
        return X_
     

### 3.1 Feature Engineering

In [591]:
num_features = [
    'SibSp',
    'Pclass',
    'Age',
    'Fare',
    'Parch'
]

cat_features = ['Sex',  
                'Name', 
               # 'Cabin',
                'Embarked']

features = num_features + cat_features

target = 'Survived'

# feature and target columns
X_train,y_train = train[features], train[target]

**numerical columns**

In [592]:
eng_transformer = Pipeline(
    steps=[
        ('scale_fare', FareScaler()),
        ('imputer', SimpleImputer(strategy='median')),
       
])


>>>>>>>Numeric init() called.



In [593]:
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaling', StandardScaler()),
    ])

**categorical columns**

In [594]:
# column transformation
cat_transformer = Pipeline(
    steps=[
        ('reorder', CategoryMaker()),
        ('onehot', OneHotEncoder(drop='first')),
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]
)


>>>>>>>Categor init() called.



**total preprocessing**

In [595]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
       # ('eng_transformer', eng_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features)
    ])

In [596]:
pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### 3.2 Model Building

In [597]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import classification_report 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.tree import DecisionTreeClassifier

from IPython.display import IFrame
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

**instantiate model**

In [598]:
classifier_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier',  CatBoostClassifier(random_state=12, depth=4, iterations=90))   
        #('classifier',  LogisticRegression())   
        #('classifier',  RandomForest (max_depth=20,  n_estimators=200))   
    ])




**train model**

In [599]:
rfc = classifier_model.fit(X_train, y_train)


>>>>>>>Categor init() called.


>>>>>>>Categor fit() called.


>>>>>>>Categor transform() called. 0      3
1      3
2      3
3      2
4      1
      ..
707    3
708    3
709    1
710    2
711    1
Name: Name, Length: 712, dtype: int64
Learning rate set to 0.081078
0:	learn: 0.6598336	total: 2.02ms	remaining: 180ms
1:	learn: 0.6375619	total: 2.26ms	remaining: 99.6ms
2:	learn: 0.6134767	total: 2.47ms	remaining: 71.5ms
3:	learn: 0.5914124	total: 2.65ms	remaining: 57.1ms
4:	learn: 0.5718821	total: 2.81ms	remaining: 47.7ms
5:	learn: 0.5547683	total: 2.96ms	remaining: 41.4ms
6:	learn: 0.5396879	total: 3.11ms	remaining: 36.9ms
7:	learn: 0.5273671	total: 3.26ms	remaining: 33.5ms
8:	learn: 0.5162645	total: 3.41ms	remaining: 30.7ms
9:	learn: 0.5073640	total: 3.52ms	remaining: 28.2ms
10:	learn: 0.4976568	total: 3.71ms	remaining: 26.6ms
11:	learn: 0.4885868	total: 3.85ms	remaining: 25ms
12:	learn: 0.4802937	total: 3.99ms	remaining: 23.7ms
13:	learn: 0.4730574	total: 4.19ms	remaining: 22.7ms
14:	l

**model validation**

In [600]:

training_acccuracy = rfc.score(X_train,y_train)
print(f"training accuracy: {round(training_acccuracy, 6)}")


>>>>>>>Categor transform() called. 0      3
1      3
2      3
3      2
4      1
      ..
707    3
708    3
709    1
710    2
711    1
Name: Name, Length: 712, dtype: int64
training accuracy: 0.855337


### 2.5 Model Evaluation

**feature-target separation**

In [601]:
X_test, y_test = test[features], test[target]

**model performance**

In [602]:
test_acccuracy = classifier_model.score(X_test,y_test)
print(f"test accuracy: {round(test_acccuracy, 6)}")


>>>>>>>Categor transform() called. 0      3
1      2
2      3
3      6
4      3
      ..
174    3
175    3
176    2
177    3
178    3
Name: Name, Length: 179, dtype: int64
test accuracy: 0.826816


In [603]:
y_test.info

<bound method Series.info of 0      0
1      1
2      0
3      1
4      0
      ..
174    0
175    0
176    1
177    0
178    0
Name: Survived, Length: 179, dtype: int64>

<hr style="border:2px solid black">

## References

- [How to add feature engineering to a scikit-learn pipeline](https://practicaldatascience.co.uk/machine-learning/how-to-add-feature-engineering-to-a-scikit-learn-pipeline)

- [Coding a custom imputer in scikit-learn](https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de)

In [604]:
predict_data = pd.read_csv("../data/test.csv")
# Get feature columns
X_test = predict_data[features]
# Predict
result = classifier_model.predict(X_test)
# Export to CSV
prediction = pd.DataFrame(result.ravel(), columns=["Survived"])
prediction = pd.concat([predict_data["PassengerId"], prediction], axis=1)
prediction.to_csv("../data/prediction-pipeline.csv", index=False)




>>>>>>>Categor transform() called. 0      3
1      2
2      3
3      3
4      2
      ..
413    3
414    0
415    3
416    3
417    6
Name: Name, Length: 418, dtype: int64


# Best result on Kaggle by CatBoostClassifier
prediction-pipeline.csv

Complete · 2m ago
# Score: 0.78468