# Advanced Feature Engineering
<hr style="border:2px solid black">

## 1. Example: Titanic Data

**load packages**

In [53]:
# data analysis stack
import numpy as np
import pandas as pd

# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

**read data**

In [54]:
# location of the training CSV, given relative to the current working directory
csv_path = '../data/train_new.csv'
df = pd.read_csv(csv_path)

# preview the first rows of the raw table
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/train_new.csv'

### 2.1 Train-Test split

In [27]:
# hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible, and each part gets a fresh 0..n-1 index
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=101)
)

### 2.2 Quick exploration

In [28]:
# preview the first rows of the training split
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
1,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S
2,82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29.0,0,0,345779,9.5,,S
3,320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corn...",female,40.0,1,1,16966,134.5,E34,C
4,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S


In [29]:
# column dtypes and non-null counts; columns with fewer non-null rows than the
# frame length contain missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          577 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        157 non-null    object 
 11  Embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.9+ KB


### 2.3 Feature-Target Separation

In [30]:
# numeric predictor columns
num_features = ['SibSp', 'Pclass', 'Age', 'Fare']

# categorical predictor columns
cat_features = ['Sex']

# full predictor list: numeric columns first, then categorical
features = [*num_features, *cat_features]

# label column
target = 'Survived'

# split the training frame into predictors and label
X_train = train[features]
y_train = train[target]

In [31]:
# preview the selected predictor columns
X_train.head()

Unnamed: 0,SibSp,Pclass,Age,Fare,Sex
0,0,2,23.0,13.0,male
1,0,1,51.0,26.55,male
2,0,3,29.0,9.5,male
3,1,1,40.0,134.5,female
4,0,2,6.0,33.0,female


In [32]:
# display the label series for the training split
y_train

0      0
1      1
2      1
3      1
4      1
      ..
707    0
708    1
709    1
710    1
711    0
Name: Survived, Length: 712, dtype: int64

### 2.3 Feature Engineering

**numerical columns**

In [33]:
# numeric columns: fill missing values with the column mean, then standardise
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler()),
    # alternative scaler: ('scaling', MinMaxScaler())
])

**categorical columns**

In [34]:
# categorical columns: one-hot encode with drop='first'
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first')),
])

**total preprocessing**

In [35]:
# route each group of columns through its own transformer pipeline
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features),
])

### 2.4 Model Building

**instantiate model**

In [36]:
# end-to-end model: preprocessing followed by a logistic-regression classifier
classifier_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

**train model**

In [37]:
# fit the preprocessing steps and the classifier together on the training data
classifier_model.fit(X_train,y_train)

**model validation**

In [38]:
# score of the fitted pipeline on the same data it was trained on
training_acccuracy = classifier_model.score(X_train, y_train)
rounded_training_score = round(training_acccuracy, 6)
print(f"training accuracy: {rounded_training_score}")

training accuracy: 0.800562


### 2.5 Model Evaluation

**feature-target separation**

In [39]:
# same predictor/label selection as for training, applied to the held-out split
X_test = test[features]
y_test = test[target]

**model performance**

In [40]:
# score of the fitted pipeline on the held-out split
test_acccuracy = classifier_model.score(X_test, y_test)
rounded_test_score = round(test_acccuracy, 6)
print(f"test accuracy: {rounded_test_score}")

test accuracy: 0.787709


<hr style="border:2px solid black">

## 3. Extra Challenge

### 3.1 Custom Imputer

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted


class WeightImputer(BaseEstimator, TransformerMixin):
    """
    Column-wise imputer for missing numeric values.

    One fill value per column is learned in ``fit`` and reused unchanged in
    ``transform``, so new data is filled with statistics from the data the
    imputer was fitted on. Every column passed in is imputed; the class name
    is kept so existing pipelines that reference it keep working.

    Parameters
    ----------
    strategy : {"mean", "median"}, default="mean"
        Statistic used as the fill value for each column.

    Attributes
    ----------
    statistics_ : pandas.Series
        Fill value per column, indexed by column label; set by ``fit``.
    """
    def __init__(self, strategy="mean"):
        # store the argument untouched so BaseEstimator.get_params / clone work;
        # validation is deferred to fit
        self.strategy = strategy

    def fit(self, X, y=None):
        """
        Learn the per-column fill values.

        Parameters
        ----------
        X : DataFrame or 2-D array-like of numeric columns.
        y : ignored; present so the transformer can be used in a Pipeline.

        Returns
        -------
        self : the fitted imputer (required for ``fit(...).transform(...)``
            chaining, which ``TransformerMixin.fit_transform`` relies on).

        Raises
        ------
        ValueError
            If ``strategy`` is not "mean" or "median".
        """
        if self.strategy not in ("mean", "median"):
            raise ValueError(
                f"strategy must be 'mean' or 'median', got {self.strategy!r}"
            )

        frame = pd.DataFrame(X)
        # pandas skips missing values by default when computing the statistic
        if self.strategy == "mean":
            self.statistics_ = frame.mean()
        else:
            self.statistics_ = frame.median()
        return self

    def transform(self, X):
        """
        Replace missing values with the statistics learned in ``fit``.

        Parameters
        ----------
        X : DataFrame or 2-D array-like with the same columns as seen in fit.

        Returns
        -------
        pandas.DataFrame
            A new frame with missing entries filled; ``X`` is not modified.
            A column that was entirely missing during fit has a NaN
            statistic and is therefore left unfilled.
        """
        check_is_fitted(self, "statistics_")

        frame = pd.DataFrame(X)
        # array input has positional column labels; relabel it so the fill
        # values line up with the columns seen during fit
        if not isinstance(X, pd.DataFrame) and frame.shape[1] == len(self.statistics_):
            frame.columns = self.statistics_.index
        return frame.fillna(self.statistics_)

### 3.1 Feature Engineering

**numerical columns**

In [50]:
# numeric columns: custom imputation step, then standardise
num_transformer = Pipeline([
    ('imputer', WeightImputer()),
    ('scaling', StandardScaler()),
    # alternative scaler: ('scaling', MinMaxScaler())
])

**categorical columns**

In [51]:
# categorical columns: one-hot encode with drop='first'
cat_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first')),
])

**total preprocessing**

In [44]:
# rebuild the column router so it picks up the transformers defined above
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features),
])

### 3.2 Model Building

**instantiate model**

In [23]:
# end-to-end model: preprocessing followed by a logistic-regression classifier
classifier_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

**train model**

In [45]:
# fit the preprocessing steps and the classifier together on the training data
classifier_model.fit(X_train,y_train)

**model validation**

In [49]:
# display the training predictors (pandas truncates the middle rows of long frames)
X_train

Unnamed: 0,SibSp,Pclass,Age,Fare,Sex
0,0,2,23.0,13.0000,male
1,0,1,51.0,26.5500,male
2,0,3,29.0,9.5000,male
3,1,1,40.0,134.5000,female
4,0,2,6.0,33.0000,female
...,...,...,...,...,...
707,0,3,19.0,14.5000,male
708,0,3,32.0,56.4958,male
709,0,1,41.0,134.5000,female
710,0,1,44.0,57.9792,female


In [None]:

# score of the fitted pipeline on the same data it was trained on
training_acccuracy = classifier_model.score(X_train, y_train)
rounded_training_score = round(training_acccuracy, 6)
print(f"training accuracy: {rounded_training_score}")

### 3.3 Model Evaluation

**feature-target separation**

In [47]:
# same predictor/label selection as for training, applied to the held-out split
X_test = test[features]
y_test = test[target]

**model performance**

In [48]:
# score of the fitted pipeline on the held-out split
test_acccuracy = classifier_model.score(X_test, y_test)
rounded_test_score = round(test_acccuracy, 6)
print(f"test accuracy: {rounded_test_score}")

test accuracy: 0.787709


<hr style="border:2px solid black">

## References

- [How to add feature engineering to a scikit-learn pipeline](https://practicaldatascience.co.uk/machine-learning/how-to-add-feature-engineering-to-a-scikit-learn-pipeline)

- [Coding a custom imputer in scikit-learn](https://towardsdatascience.com/coding-a-custom-imputer-in-scikit-learn-31bd68e541de)