# Classification Workflow with Pipelines

Let's add pipelines into our workflow!

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, \
FunctionTransformer, PolynomialFeatures

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from sklearn.dummy import DummyClassifier


from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

## Bringing in Our Modeling Class from before

In [2]:
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation.

    Attributes:
      model: Estimator (or pipeline) handed to `cross_val_score`.
      name: Label used in printed summaries and plot titles.
      X, y: Default data used when `cross_validate` gets no explicit data.
      cv_results: Array of per-fold scores (None until CV has been run).
      cv_mean, cv_median, cv_std: Summary statistics of `cv_results`.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and data; optionally run cross-validation right away.

        Args:
          model: Estimator or pipeline to evaluate.
          model_name: Human-readable label for summaries/plots.
          X: Feature data used by default for cross-validation.
          y: Target data used by default for cross-validation.
          cv_now: If True (default), run `cross_validate()` immediately.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and return results.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)

        Returns:
          The array of per-fold scores (also stored in `self.cv_results`).
        '''
        # Compare against None explicitly: the truth value of a DataFrame or
        # ndarray is ambiguous, so `X if X else ...` raises a ValueError.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        return self.cv_results

    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.

        Args:
          ax: Matplotlib Axes to draw on.

        Returns:
          The same Axes, with a violin plot and the individual fold scores
          overlaid as a swarm plot.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [3]:
titanic = pd.read_csv('./data/titanic.csv')
titanic.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/titanic.csv'

In [None]:
# Target first, then predictors: identifier-like columns and the target
# itself are removed from the feature frame.
y = titanic['Survived']
X = titanic.drop(columns=['PassengerId', 'Name', 'Survived'])

# Fixed seed keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Using `Pipeline` and `ColumnTransformer`

When we use the `ColumnTransformer` we'll want to choose the relevant column numbers, so let's remind ourselves which columns are where:

In [None]:
X.head()

In [None]:
# We'll throw these mini-pipelines into our ColumnTransformer: numeric and categorical

# Numeric branch: impute missing values (SimpleImputer's default strategy is
# the mean) and append missing-value indicator columns, then standardize.
subpipe_numerics = Pipeline(steps=[
    ('num_impute',SimpleImputer(add_indicator=True)),
#     ('poly',PolynomialFeatures()),   # disabled: no `poly` step exists, so `poly__*` params cannot be tuned
    ('ss', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value (again adding
# indicator columns), then one-hot encode to a dense array. Categories not
# seen during fit are encoded as all zeros instead of raising.
subpipe_cat = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

### Selector!!!!

[This](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html#sklearn.compose.make_column_selector) will return a callable that, when given a DataFrame, returns the list of column names matching the requested dtypes

In [None]:
# Selector!

test = selector(dtype_include=object)

In [None]:
test(X)

In [None]:
numtest = selector(dtype_include=np.number)

In [None]:
numtest(X)

In [None]:
# Route columns by dtype: numeric columns go through the numeric sub-pipeline,
# object (string) columns through the categorical one. The selectors are
# evaluated on the data passed to fit, so no column names are hard-coded.
# Columns matched by neither selector are passed through untouched.
CT = ColumnTransformer(transformers=[
    ('subpipe_num',subpipe_numerics, selector(dtype_include=np.number)),
    ('subpipe_cat', subpipe_cat, selector(dtype_include=object))
], remainder='passthrough')

In [None]:
# The "remainder='passthrough'" bit tells the ColumnTransformer to leave
# the other df columns unchanged.


In [None]:
# The `ColumnTransformer` will take care of our preprocessing,
# so now we can add our model at the end of the pipeline.


In [None]:
# Create Dummy/Baseline

dummy_model_pipe = Pipeline(steps=[
    ('ct',CT),
    ('dummy',DummyClassifier(strategy='most_frequent'))
])

In [None]:
# Fit the dummy pipeline on the training data

dummy_model_pipe.fit(X_train, y_train)

In [None]:
dummy_model_pipe.score(X_train, y_train)

In [None]:
dummy_model_pipe.score(X_test, y_test)

In [None]:
# Cross-validate on the training split only: the test split must stay
# untouched until the final evaluation so it is not used for model comparison.
dummy_pipe = ModelWithCV(
    dummy_model_pipe,
    model_name='dummy',
    X=X_train,
    y=y_train,
)

In [None]:
fig, ax =plt.subplots()

dummy_pipe.print_cv_summary()

dummy_pipe.plot_cv(ax=ax)

In [None]:
# Create FSM (First Simple Model)

fsm_model_pipe = Pipeline(steps=[
    ('ct',CT),
    ('fsm',LogisticRegression())
])

In [None]:
# Use the class with our logreg pipe.
# Cross-validate on the training split only: the test split must stay
# untouched until the final evaluation so it is not used for model comparison.
fsm_pipe = ModelWithCV(
    fsm_model_pipe,
    model_name='fsm',
    X=X_train,
    y=y_train,
)

fig, ax = plt.subplots()

fsm_pipe.print_cv_summary()

fsm_pipe.plot_cv(ax=ax)

In [None]:
dum_model = DummyClassifier(strategy='most_frequent')
lr_model = LogisticRegression()
tree_model = DecisionTreeClassifier()
# gbc_model = GradientBoostingClassifier()

In [None]:
model_list = [('dummy',dum_model), ('logreg',lr_model), ('dtc',tree_model), ('gbc',GradientBoostingClassifier())]

In [None]:
# Compare every candidate model behind the same preprocessing.
for name, model in model_list:
    loop_model_pipe = Pipeline(steps=[
        ('ct', CT),
        (name, model)
    ])

    loop_model_pipe.fit(X_train, y_train)

    # Cross-validate on the training split only: comparing models on the
    # test split would use up the held-out data before the final evaluation.
    loop_pipe = ModelWithCV(
        loop_model_pipe,
        model_name=name,
        X=X_train,
        y=y_train)

    loop_pipe.print_cv_summary()
    

## Trying Other Models at the End of the Pipeline

Can I have multiple models in a single pipeline? Yes. We'll forgo this here, but for more on this see [here](https://stackoverflow.com/questions/48507651/multiple-classification-models-in-a-scikit-pipeline-python).

### Random Forest

In [None]:
# rfc!

### Gradient Booster

In [None]:
# gbc!

## Tuning and Cross-Validating

In [None]:
fsm_model_pipe

In [None]:
fsm_model_pipe.get_params().keys()

In [None]:
# Hyperparameter grid for `fsm_model_pipe`. Keys follow the
# `<step>__<substep>__<param>` convention and must name steps that actually
# exist in the pipeline (see `fsm_model_pipe.get_params().keys()`).
# NOTE: there is no `poly` step in the numeric sub-pipeline (it is commented
# out), so a `ct__subpipe_num__poly__degree` entry would make
# `GridSearchCV.fit` fail with "Invalid parameter"; it is left out here.
params = {
    'ct__subpipe_num__num_impute__strategy' : ['mean','median'],
    'fsm__solver' : ['liblinear','lbfgs'],
    'fsm__max_iter': [10, 100,1000,10_000],
    'fsm__C' : [0.0001, 0.001, 0.01, 0.1, 1],
    'fsm__tol' : [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
# GridSearch?

gs = GridSearchCV(
    estimator = fsm_model_pipe,
    param_grid=params,
    cv=5,
    verbose=1
)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.best_params_

In [None]:
gs.best_params_

In [None]:
gs_best = gs.best_estimator_

In [None]:
# Cross-validate the tuned pipeline on the training split only: the test
# split is reserved for the final `score` calls that follow.
gs_pipe = ModelWithCV(
    gs_best,
    model_name='gs',
    X=X_train,
    y=y_train,
)

fig, ax = plt.subplots()

gs_pipe.print_cv_summary()

gs_pipe.plot_cv(ax=ax)

In [None]:
gs_best.score(X_train, y_train)

In [None]:
gs_best.score(X_test, y_test)

## `imblearn` Pipelines

### Dealing with the Target Imbalance

Let's use `SMOTE()`:

In [None]:
y_train.value_counts()

In [None]:
sm = SMOTE(sampling_strategy='auto', random_state=42)

In [None]:
sm2 = SMOTE(sampling_strategy=0.8, random_state=42)

To make things simple (and avoid errors!), let's just grab the numeric types and eliminate the NaNs from X_train:

In [None]:
# Keep every numeric column (np.number covers all int/float widths, whereas
# the 'int' alias maps to a platform-dependent width) and drop rows with NaNs.
X_train_clean = X_train.select_dtypes(include=np.number).dropna()
# Align the target with the surviving rows by index label.
y_train_clean = y_train.loc[X_train_clean.index]

In [None]:
y_train_clean.value_counts()

In [None]:
# Even distribution

X_clean_resmp, y_clean_resmp = sm.fit_resample(X_train_clean, y_train_clean)

y_clean_resmp.value_counts()

In [None]:
# Distribution where count of 1's = 0.8 * count of 0's

X_clean_resmp2, y_clean_resmp2 = sm2.fit_resample(X_train_clean, y_train_clean)

y_clean_resmp2.value_counts()

### `imblearn` Pipeline

Of course, we want to be able to perform all of our preprocessing steps from above, but now also add `SMOTE`. Good thing we can throw it all into a pipeline!

In [None]:
# imbpipe!
# Preprocess -> oversample with SMOTE -> logistic regression.
# SMOTE sits after the ColumnTransformer so it receives imputed, fully
# numeric features. The LogisticRegression hyperparameters are copied by
# hand from a grid-search result (see the dict printed below this cell).

imbpipe = ImPipeline(steps=[
    ('ct', CT),
    ('sm', SMOTE(random_state=42)),
    ('gs_best', LogisticRegression(C=1,max_iter=10, solver='liblinear',tol=.0001))
])

{'ct__subpipe_num__num_impute__strategy': 'mean',
 'fsm__C': 1,
 'fsm__max_iter': 10,
 'fsm__solver': 'liblinear',
 'fsm__tol': 0.0001}

In [None]:
imbpipe.fit(X_train, y_train)

In [None]:
imbpipe.score(X_train, y_train)

In [None]:
imbpipe.score(X_test, y_test)

### Gridsearching

In [None]:
# Seriously? Again?

## Evaluation on Test Set

Let's suppose that we choose our final model from this last GridSearch run. Note that some of the optimal parameters (`C=1` and `tol=0.0001`) are also the default values!

In [None]:
final_model = imbpipe

In [None]:
plot_confusion_matrix(final_model, X_test, y_test);

In [None]:
y_hat = final_model.predict(X_test)

In [None]:
# Compute each test-set metric once, rounded to two decimals, then report.
test_accuracy = round(accuracy_score(y_test, y_hat), 2)
test_recall = round(recall_score(y_test, y_hat), 2)
test_precision = round(precision_score(y_test, y_hat), 2)
test_f1 = round(f1_score(y_test, y_hat), 2)

print(f"""
Our final model's accuracy on the test set is {test_accuracy}. \n
Our final model's recall on the test set is {test_recall} \n
Our final model's precision on the test set is {test_precision} \n
Our final model's f1-score on the test is {test_f1}.
""")

## Exercise: Your Turn!

Use SMOTE and an estimator (model) of your choice in a pipeline to model the exoplanets' method of discovery ("method"). You can build a model one feature at a time or just throw them all in from the beginning. Consider using the LabelEncoder (from sklearn.preprocessing) to code up the target. You'll also need to make a choice about how to handle the null values.

In [None]:
exos = sns.load_dataset('planets')

In [None]:
### Your work here






