### Module 13 Lab: Practice Column transformation and Pipelines

**Data:**
    
Our simple Titanic dataset. We are trying to predict if they survived or not.
    
**Method:**
1. Create a new binary column from Cabin
2. Convert Gender from male/female to 0/1
3. Fill missing values in age with the mean (impute)
4. Create dummy variables from embarked and deal with missing values (small Pipeline)
5. Using all the transformers, create the Master ColumnTransformer
6. Create the Pipeline
7. Your turn: Use the Pipeline for different algorithms

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer,ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import boto3
import pandas as pd
import numpy as np
import pickle
import time
# Prevent pandas from displaying in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### 0. Load the data

In [4]:
# Setup boto3
sess = boto3.session.Session()
s3 = sess.client('s3') 
source_bucket = 'machinelearning-read-only'
source_key = 'data/titanic_simple.csv'
response = s3.get_object(Bucket = source_bucket, Key = source_key)
df = pd.read_csv(response.get("Body"))
df.head(5)

Unnamed: 0,Gender,Age,Cabin,Embarked,Survived
0,male,22.0,,S,0
1,female,38.0,C85,C,1
2,female,26.0,,S,1
3,female,35.0,C123,S,1
4,male,35.0,,S,0


In [5]:
# Discuss problems with the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Gender    891 non-null    object 
 1   Age       714 non-null    float64
 2   Cabin     204 non-null    object 
 3   Embarked  889 non-null    object 
 4   Survived  891 non-null    int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 34.9+ KB


### 1. Create a new binary column from Cabin

In [6]:
# Create a copy of df to work with
df_cleaned = df
# Code from previous module:
df_cleaned['HasCabin'] = np.where(df_cleaned['Cabin'].isnull(), 0, 1) # Build a new column in the DataFrame
# Drop the old 'Cabin' column
df_cleaned = df_cleaned.drop('Cabin',axis = 1)
# Reorder the columns so 'Survived' is at the end
df_cleaned = df_cleaned[['Gender','Age','Embarked','HasCabin','Survived']]
df_cleaned.head(6)

Unnamed: 0,Gender,Age,Embarked,HasCabin,Survived
0,male,22.0,S,0,0
1,female,38.0,C,1,1
2,female,26.0,S,0,1
3,female,35.0,S,1,1
4,male,35.0,S,0,0
5,male,,Q,0,0


### 2. Create OrdinalEncoder to convert Gender from male/female to 0/1

In [7]:
category_columns = ['Gender']
# create just the transformer
cat_transformer = OrdinalEncoder(categories = [['male','female']])

### 3. Fill missing values in age with the mean (impute)

In [8]:
# impute age
impute_columns = ['Age']
# Create just the transformer
imp_transformer = SimpleImputer(missing_values=np.nan, strategy='mean')

### 4. Create dummy variables from embarked and deal with missing values
This will be a 2-step Pipeline

In [9]:
dummy_columns = ['Embarked']
# 2 transformers in a pipeline:
# If there are missing values, then replace with the word 'missing'
missing_transformer = SimpleImputer(strategy='constant', fill_value='missing')
# This creates 4 dummy columns for 'C', 'S', 'Q' and 'missing'
dummy_transformer = OneHotEncoder() # Now use OneHotEncoder() to create dummy variables
#
# Create the 2-step transformer using a pipeline
embarked_transformer = Pipeline(steps=[
    ('missing', missing_transformer),
    ('dummy', dummy_transformer)])
embarked_transformer

Pipeline(memory=None,
         steps=[('missing',
                 SimpleImputer(add_indicator=False, copy=True,
                               fill_value='missing', missing_values=nan,
                               strategy='constant', verbose=0)),
                ('dummy',
                 OneHotEncoder(categories='auto', drop=None,
                               dtype=<class 'numpy.float64'>,
                               handle_unknown='error', sparse=True))],
         verbose=False)

### 5. Using all the transformers, create the Master ColumnTransformer

In [10]:
# Use all your transformers from above to create this preprocessor ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_transformer, category_columns), 
        ("impute", imp_transformer, impute_columns), 
        ("emb", embarked_transformer, dummy_columns) 
    ]
)
preprocessor

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cat',
                                 OrdinalEncoder(categories=[['male', 'female']],
                                                dtype=<class 'numpy.float64'>),
                                 ['Gender']),
                                ('impute',
                                 SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               missing_values=nan,
                                               strategy='mean', verbose=0),
                                 ['Age']),
                                ('emb',
                                 Pipeline(memory=None,
                                          steps=[('missing',
                                                  SimpleImputer(add_indicator=False,
                         

### 6. Create the Pipeline

In [11]:
# First, split the data into training/test sets
# Features
X = df.drop(['Survived'],axis = 1)
# Target
y = df['Survived']
# Split into train/test
# Reserve 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,random_state = 42)
# Verify the sizes of the split datasets
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (712, 5)
y_train: (712,)
X_test: (179, 5)
y_test: (179,)


In [12]:
# Create the pipeline with our preprocessor and a new classifier model
#
lr = LogisticRegression() # Create a new model
# 
# Use the preprocessor with the model
#
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("LogisticRegressor", lr)]
)
#
# Perform the preprocessing and the training of the model
pipe.fit(X_train, y_train)
#
# Treat the pipe object just like trained model
y_pred = pipe.predict(X_test)
# Report the performance
print('Logistic Regression Accuracy:', pipe.score(X_test, y_test))
confusion_matrix(y_test, y_pred)

Logistic Regression Accuracy: 0.7821229050279329


array([[88, 17],
       [22, 52]])

### 7. Your turn: Use the Pipeline for different algorithms
Now, use the Pipeline created above to train and evaluate a new algorithm: Gradient Boosting Classifier

In [None]:
# your code here
#
gbc = 'create a gbc model here'
#
# Use the model with the preprocessor
#
pipe = 'Create a new pipeline here. Use our preprocessor from above and your gbc model.'

#
# Perform the preprocessing and the training of the model
pipe.fit(X_train, y_train)
#
# Treat the pipe object just like trained model
y_pred = pipe.predict(X_test)
# Report the performance
print('Gradient Boosting  Accuracy:', pipe.score(X_test, y_test))
confusion_matrix(y_test, y_pred)