# Pipelines in procesi

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Algorithm Chains and Pipelines

In [None]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the breast-cancer data and hold out a test split (fixed seed).
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# Learn the per-feature minimum and maximum from the training split only ...
scaler = MinMaxScaler()
scaler.fit(X_train)

# ... and apply that one fitted scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM on the scaled training data.
svm = SVC()
svm.fit(X_train_scaled, y_train)

# Score on the scaled test data.
print(f"Test score: {svm.score(X_test_scaled, y_test):.2f}")

### Parameter Selection with Preprocessing

In [None]:
from sklearn.model_selection import GridSearchCV

# for illustration purposes only, don't use this code!
# The scaler was fit on all of X_train before cross-validation starts, so every
# validation fold has already influenced the scaling of its training folds.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)

print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
# this line reports the score on the held-out test set, so label it as such
print("Test set score: {:.2f}".format(grid.score(X_test_scaled, y_test)))
print("Best parameters: ", grid.best_params_)

In [None]:
# helper_plots is not a scikit-learn module; it has to be importable from the notebook's directory.
# NOTE(review): presumably this draws the "scale first, cross-validate afterwards" data flow
# used in the cell above — confirm against helper_plots.
from helper_plots import plot_improper_processing
plot_improper_processing()

### Building Pipelines

In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

In [None]:
pipe.fit(X_train, y_train)

In [None]:
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))

### Using Pipelines in Grid Searches

In [None]:
# Parameters of a pipeline step are addressed as "<step name>__<parameter name>".
param_grid = dict(
    svm__C=[0.001, 0.01, 0.1, 1, 10, 100],
    svm__gamma=[0.001, 0.01, 0.1, 1, 10, 100],
)

In [None]:
# Search over the whole pipeline, starting from the unscaled training data.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print(f"Best cross-validation accuracy: {grid.best_score_:.2f}")
print(f"Test set score: {grid.score(X_test, y_test):.2f}")
print(f"Best parameters: {grid.best_params_}")

In [None]:
# helper_plots is not a scikit-learn module; it has to be importable from the notebook's directory.
# NOTE(review): presumably this draws the data flow when scaling happens inside the
# cross-validated pipeline, as in the cell above — confirm against helper_plots.
from helper_plots import plot_proper_processing
plot_proper_processing()

### The General Pipeline Interface

```python
def fit(self, X, y):
    """Fit all steps in order, passing each step the output of the one before.

    Every step except the last is fit with ``fit_transform(data, y)`` and its
    result becomes the input of the next step; the last step is fit with
    ``fit(data, y)`` on the fully transformed data. Returns ``self``.
    """
    data = X
    # all steps but the final one both learn from the data and transform it
    for _, transformer in self.steps[:-1]:
        data = transformer.fit_transform(data, y)
    # the final step is only fit, on the output of the last transformer
    final_estimator = self.steps[-1][1]
    final_estimator.fit(data, y)
    return self
```

```python
def predict(self, X):
    """Transform X with every step but the last, then predict with the last step.

    Nothing is fit here: the intermediate steps only call ``transform`` and the
    final step only calls ``predict``. Returns whatever the final step's
    ``predict`` returns.
    """
    X_transformed = X
    for name, estimator in self.steps[:-1]:
        # iterate over all but the final step
        # transform the data
        X_transformed = estimator.transform(X_transformed)
    # predict using the last step
    return self.steps[-1][1].predict(X_transformed)
```

### Convenient Pipeline Creation with make_pipeline

In [None]:
from sklearn.pipeline import make_pipeline

# standard syntax
pipe_long = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))])

# abbreviated syntax
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))

In [None]:
print("Pipeline steps:\n{}".format(pipe_short.steps))

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# The same estimator class (StandardScaler) is used twice in this pipeline.
pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    StandardScaler(),
)
print(f"Pipeline steps:\n{pipe.steps}")

### Accessing Step Attributes

In [None]:
# Fit the scaler / PCA / scaler pipeline from the previous cell on the cancer
# data; no target is passed.
pipe.fit(cancer.data)

# Look up the fitted PCA step by its name and read its components.
components = pipe.named_steps["pca"].components_

print(f"components.shape: {components.shape}")

### Accessing Attributes in a Pipeline inside GridSearchCV

In [None]:
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=4)

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

In [None]:
print("Best estimator:\n{}".format(grid.best_estimator_))

In [None]:
print("Logistic regression step:\n{}".format(grid.best_estimator_.named_steps["logisticregression"]))

In [None]:
print("Logistic regression coefficients:\n{}".format(grid.best_estimator_.named_steps["logisticregression"].coef_))

### Grid-Searching Preprocessing Steps and Model Parameters

In [None]:
# `load_boston` was removed in scikit-learn 1.2, where importing it raises
# ImportError. Fall back to the original source of the data, using the recipe
# from scikit-learn's own removal notice, so this cell still runs on current
# releases. Note that the fallback needs network access.
try:
    from sklearn.datasets import load_boston

    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # The raw file stores every record on two physical lines after a 22-line header.
    boston_raw = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                             sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]]),
        target=boston_raw.values[1::2, 2],
    )

X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=0)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())

In [None]:
param_grid = {'polynomialfeatures__degree': [1, 2, 3], 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

In [None]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

In [None]:
# Heat map of the mean cross-validation score: one row per polynomial degree,
# one column per alpha. The shape is taken from param_grid instead of a
# hard-coded 3, so the reshape fails loudly if the grid and the results do not
# match rather than silently mislabelling the axes.
# NOTE(review): relies on the search enumerating parameter names in sorted order
# with the last name ('ridge__alpha') varying fastest — see ParameterGrid docs.
degrees = param_grid['polynomialfeatures__degree']
alphas = param_grid['ridge__alpha']
mean_scores = grid.cv_results_['mean_test_score'].reshape(len(degrees), len(alphas))

plt.matshow(mean_scores, vmin=0, cmap="viridis")
plt.xlabel("ridge__alpha")
plt.ylabel("polynomialfeatures__degree")
plt.xticks(range(len(alphas)), alphas)
plt.yticks(range(len(degrees)), degrees)
plt.colorbar()
plt.show()

In [None]:
print("Best parameters: {}".format(grid.best_params_))

In [None]:
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

In [None]:
# Comparison run: the same alpha search, but without the polynomial-features step.
param_grid = dict(ridge__alpha=[0.001, 0.01, 0.1, 1, 10, 100])
pipe = make_pipeline(StandardScaler(), Ridge())
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print(f"Score without poly features: {grid.score(X_test, y_test):.2f}")

### Grid-Searching Which Model To Use

In [None]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())])

In [None]:
from sklearn.ensemble import RandomForestClassifier

param_grid = [{'classifier': [SVC()], 'preprocessing': [StandardScaler(), None],
                'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
               {'classifier': [RandomForestClassifier(n_estimators=100)],
                'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

## Choosing the right estimator

<img src="https://scikit-learn.org/stable/_static/ml_map.png" class="map" alt="scikit-learn flowchart for choosing the right estimator">

## Primer 1: Titanic

In [None]:
import numpy as np
import pandas as pd

**Dataset details**

In [None]:
from sklearn.datasets import fetch_openml

# Download version 1 of the "titanic" dataset from OpenML; as_frame=True asks
# for pandas objects and return_X_y=True for a (features, target) pair.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

**Profiling Report using Pandas Profiling**

In [None]:
from pandas_profiling import ProfileReport

### Creating Data Transformer Pipeline

In [None]:
# columns routed through the numerical branch of the transformer
numerical_features = ["age", "fare"]

# columns routed through the categorical branch of the transformer
categorical_features = ["embarked", "sex", "pclass"]

**Imputing Data**

In [None]:
# Copy of the feature frame, so experiments do not modify X itself.
# NOTE(review): X_copy is not referenced again in the visible part of the
# notebook — confirm whether it is still needed.
X_copy = X.copy()

**Scaling data**

**One Hot Encoding**

**Building Pipeline**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Numerical branch: impute missing values, then scale. The 'scaler' slot holds
# the 'passthrough' placeholder here; the grid search further down supplies the
# actual scalers to try.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),
        ('scaler', 'passthrough'),
    ]
)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder

In [None]:
# Categorical branch: impute missing values, then one-hot encode.
# SimpleImputer defaults to strategy='mean', which only works on numeric data,
# so fitting this transformer as written would fail on non-numeric columns such
# as 'sex'. 'most_frequent' is valid for categorical data. The grid search
# further down still overrides the strategy with each of its candidates.
categorical_transformer = Pipeline(steps=[
                                        ('imputer', SimpleImputer(strategy='most_frequent')),
                                        # categories unseen during fit are encoded as all zeros instead of raising
                                        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
from sklearn.compose import ColumnTransformer 

# Send each group of columns through its own sub-pipeline:
# (branch name, transformer, columns it receives).
data_transformer = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, numerical_features),
    ('categorical', categorical_transformer, categorical_features),
])

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Preprocessing pipeline: column-wise transformation first, then PCA on the
# transformed data.
preprocessor = Pipeline(
    steps=[
        ('data_transformer', data_transformer),
        ('reduce_dim', PCA()),
    ]
)

**Adding Classifier to the Pipeline**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Full model: the preprocessing pipeline followed by logistic regression.
classifier = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=0, max_iter=10000)),
    ]
)

**Applying GridSearchCV**

In [None]:
from sklearn.model_selection import train_test_split            
from sklearn.model_selection import GridSearchCV

In [None]:
# Hold out 25% of the rows for testing, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

In [None]:
# Each key walks down the nested estimators:
# <step>__<sub-step>__ ... __<parameter>.
param_grid = {
    # imputation strategy per branch
    'preprocessor__data_transformer__numerical__imputer__strategy': ['mean', 'median'],
    'preprocessor__data_transformer__categorical__imputer__strategy': ['constant', 'most_frequent'],
    # candidates for the 'scaler' step of the numerical branch
    'preprocessor__data_transformer__numerical__scaler': [
        StandardScaler(),
        RobustScaler(),
        MinMaxScaler(),
    ],
    'classifier__C': [0.1, 1.0, 10, 100],
    # number of principal components kept
    'preprocessor__reduce_dim__n_components': [2, 5, 10],
    'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
}

In [None]:
grid_search = GridSearchCV(classifier, param_grid=param_grid, verbose=4, n_jobs=-1, cv=5)

In [None]:
grid_search.fit(X_train, y_train)

**Visualizing Pipeline**

In [None]:
from sklearn import set_config                      
from sklearn.utils import estimator_html_repr      

In [None]:
# Switch scikit-learn's estimator display to the 'diagram' mode.
set_config(display='diagram')

In [None]:
# Last expression of the cell, so the notebook displays the best pipeline found by the search.
grid_search.best_estimator_

In [None]:
from pathlib import Path

from sklearn.utils import estimator_html_repr

# Save the HTML diagram of the best pipeline to a file.
html_path = Path('data/titanic_data_pipeline_estimator.html')
# open() does not create missing directories, so make sure 'data/' exists first
html_path.parent.mkdir(parents=True, exist_ok=True)
# write as UTF-8 explicitly instead of relying on the platform default encoding
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(grid_search.best_estimator_))

**Evaluating Model**

In [None]:
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# `plot_confusion_matrix` was removed in scikit-learn 1.2. When the import
# fails, define a function of the same name on top of its replacement, so that
# calls of the form plot_confusion_matrix(estimator, X, y) keep working.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of `estimator` evaluated on (X, y_true).

        Forwards all arguments to ``ConfusionMatrixDisplay.from_estimator``
        and returns the display object it creates.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# Predict on the held-out test split with the fitted grid search.
y_pred = grid_search.predict(X_test)

In [None]:
# Text report comparing the true test labels with the predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the fitted search on the test split.
plot_confusion_matrix(grid_search, X_test, y_test)
plt.show()

In [None]:
# Score of the fitted search on the test split, displayed as the cell's output.
grid_search.score(X_test, y_test)