## Pipelines

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
sklearn.set_config(print_changed_only=True)

In [None]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used in the cells below.
warnings.filterwarnings("ignore")

### Cadena de algoritmos y Pipelines

In [None]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Learn the per-feature minimum and maximum from the training split only.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train on the scaled training data and score on the scaled test data.
svm = SVC().fit(X_train_scaled, y_train)
svm.score(X_test_scaled, y_test)

### Construyendo pipelines

In [None]:
from sklearn.pipeline import Pipeline
# Chain the scaler and the classifier. The step names ("scaler", "svm") are the
# prefixes used later to address step parameters, e.g. "svm__C".
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

In [None]:
# Fit the whole pipeline on the unscaled training split.
pipe.fit(X_train, y_train)

In [None]:
# Score on the unscaled test split; the pipeline applies its own scaling step.
pipe.score(X_test, y_test)

### Usando pipelines y búsqueda grid

In [None]:
# Search space for the "svm" step; keys follow "<step name>__<parameter>".
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross-validate the whole pipeline (5 folds) for every parameter combination,
# so the scaler is refitted on the training part of each fold.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)

best_cv_accuracy = grid.best_score_
test_score = grid.score(X_test, y_test)
print("Exactitud de la mejor validacion cruzada:", best_cv_accuracy)
print("Puntaje de conjunto de prueba: ", test_score)
print("Mejores parametros: ", grid.best_params_)

### No usar Pipelines vs Selección de características

In [None]:
# Pure-noise regression problem: features and target are drawn independently
# from the same seeded generator.
N_SAMPLES, N_FEATURES = 100, 10000
rnd = np.random.RandomState(0)
X = rnd.normal(size=(N_SAMPLES, N_FEATURES))
y = rnd.normal(size=N_SAMPLES)

In [None]:
from sklearn.feature_selection import SelectPercentile, f_regression

# Keep the top 5% of features by univariate F-score. The selector is fitted on
# every row of X and y, i.e. outside of any cross-validation loop.
select = SelectPercentile(percentile=5, score_func=f_regression)
X_selected = select.fit(X, y).transform(X)
print(X_selected.shape)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
# Cross-validate Ridge on features that were already selected using all rows.
scores_preselected = cross_val_score(Ridge(), X_selected, y)
scores_preselected.mean()

In [None]:
# Put the selection inside the pipeline so that cross-validation refits it on
# the training part of every fold.
selection_step = SelectPercentile(score_func=f_regression, percentile=5)
pipe = Pipeline([("select", selection_step), ("ridge", Ridge())])
scores_in_pipeline = cross_val_score(pipe, X, y)
scores_in_pipeline.mean()

### Interfaz general de un Pipeline

In [None]:
def fit(self, X, y=None):
    """Sketch of how a pipeline fits its steps in sequence.

    Every step except the last is fitted and applied through
    ``fit_transform``; the last step is then fitted on the output of the
    preceding ones.

    Parameters
    ----------
    X : training data handed to the first step.
    y : targets forwarded unchanged to every step. Defaults to ``None`` so a
        pipeline can also be fitted without targets.

    Returns
    -------
    self
    """
    X_transformed = X
    # self.steps holds (name, estimator) pairs; only the estimator is needed.
    for step in self.steps[:-1]:
        X_transformed = step[1].fit_transform(X_transformed, y)
    final_estimator = self.steps[-1][1]
    final_estimator.fit(X_transformed, y)
    return self

In [None]:
def predict(self, X):
    """Sketch of how a pipeline predicts.

    The data is passed through ``transform`` of every step except the last,
    and the last step's ``predict`` is called on the result.
    """
    # self.steps holds (name, estimator) pairs; keep only the estimators.
    transformers = [entry[1] for entry in self.steps[:-1]]
    final_estimator = self.steps[-1][1]

    data = X
    for transformer in transformers:
        data = transformer.transform(data)
    return final_estimator.predict(data)

### Pipeline con make_pipeline

In [None]:
from sklearn.pipeline import make_pipeline
# Standard syntax: every step is given an explicit name
pipe_long = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))])
# Short syntax: make_pipeline generates the step names itself
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))

In [None]:
# Show the (name, estimator) pairs that make_pipeline generated.
pipe_short.steps

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Scale, project onto two principal components, then scale the projection.
# The same transformer class appears twice on purpose.
unsupervised_steps = [StandardScaler(), PCA(n_components=2), StandardScaler()]
pipe = make_pipeline(*unsupervised_steps)
pipe.steps

### Accediendo atributos 

In [None]:
# No targets are passed: the pipeline is fitted on the features only.
pipe.fit(cancer.data)
# Fitted attributes of a step are reachable through named_steps.
components = pipe.named_steps.pca.components_
print(components.shape)

In [None]:
# Look a step up by its name.
pipe['pca']

In [None]:
# Look a step up by position: the first step.
pipe[0]

In [None]:
# The second step.
pipe[1]

In [None]:
# Slice the pipeline, keeping only its first two steps.
pipe[:2]

¿Qué resulta de hacer ``pipe.named_steps['standardscaler-1']``?

In [None]:
### Tu respuesta

### Accediendo atributos en pipeline con búsqueda grid

In [None]:
from sklearn.linear_model import LogisticRegression

# Standardize the features, then fit a logistic regression (iteration cap raised to 1000).
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
# The step is addressed by the name make_pipeline generated for it ("logisticregression").
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

In [None]:
# Fresh split of the breast-cancer data (different seed than earlier cells);
# this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=4)
# Search over C using GridSearchCV's default cross-validation.
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)

In [None]:
# The best estimator is itself a pipeline (scaler + logistic regression).
print(grid.best_estimator_)

In [None]:
# Two ways to reach the logistic-regression step: attribute access on
# named_steps, or indexing the pipeline by step name.
print(grid.best_estimator_.named_steps.logisticregression)
print(grid.best_estimator_['logisticregression'])

In [None]:
# Coefficients of the fitted step, reached through named_steps.
print(grid.best_estimator_.named_steps.logisticregression.coef_)

In [None]:
# The same coefficients, reached by indexing the pipeline with the step name.
print(grid.best_estimator_['logisticregression'].coef_)

### Parámetros de modelo y pasos de preprocesamiento en la búsqueda grid

In [None]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
# Rebinds the train/test names to the diabetes regression data.
diabetes_split = train_test_split(diabetes.data, diabetes.target, random_state=0)
X_train, X_test, y_train, y_test = diabetes_split

from sklearn.preprocessing import PolynomialFeatures
# Standardize, expand into polynomial terms, then fit a ridge regression.
regression_steps = (StandardScaler(), PolynomialFeatures(), Ridge())
pipe = make_pipeline(*regression_steps)

In [None]:
# Tune a preprocessing parameter (polynomial degree) together with a model
# parameter (ridge alpha) in one grid.
param_grid = {'polynomialfeatures__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

In [None]:
# n_jobs=-1 runs the fits in parallel; return_train_score=True additionally
# records training scores in cv_results_.
grid = GridSearchCV(pipe, param_grid=param_grid,
                    n_jobs=-1, return_train_score=True)
grid.fit(X_train, y_train)

In [None]:
# One row per parameter combination that was tried.
res = pd.DataFrame(grid.cv_results_)
res.head()

In [None]:
# Mean train/test scores indexed by (degree, alpha). Note that this rebinds
# `res` to the pivoted table.
res = res.pivot_table(
    index=['param_polynomialfeatures__degree', 'param_ridge__alpha'],
    values=['mean_train_score', 'mean_test_score'])

In [None]:
# Mean training score: one row per degree, one column per alpha.
res['mean_train_score'].unstack()

In [None]:
# Mean validation score: one row per degree, one column per alpha.
res['mean_test_score'].unstack()

In [None]:
# Parameter combination with the best mean validation score.
print(grid.best_params_)

In [None]:
# Names of the polynomial terms, built from the original diabetes column names.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
poly_step = grid.best_estimator_['polynomialfeatures']
if hasattr(poly_step, "get_feature_names_out"):
    poly_feature_names = poly_step.get_feature_names_out(diabetes.feature_names)
else:
    poly_feature_names = poly_step.get_feature_names(diabetes.feature_names)
poly_feature_names

In [None]:
# Score of the selected pipeline on the held-out diabetes test split.
grid.score(X_test, y_test)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedKFold

In [None]:
# The steps themselves are searched over: the grid swaps in different scalers
# (or 'passthrough' to skip scaling) and different regressors, and tunes the
# regressor's alpha at the same time.
pipe = Pipeline([('scaler', StandardScaler()), ('regressor', Ridge())])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), 'passthrough'],
              'regressor': [Ridge(), Lasso()],
              'regressor__alpha': np.logspace(-3, 3, 7)}

# Seed the repeated splitter so the folds, and therefore the selected
# parameters and scores, are the same on every run.
repeated_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=repeated_cv)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)

In [None]:
# Best mean cross-validated score found by the search.
grid.best_score_

In [None]:
# Winning combination of scaler, regressor and alpha.
grid.best_params_

In [None]:
from sklearn.tree import DecisionTreeRegressor
# A list of grids: each dict pairs one regressor with only the parameters that
# apply to it (max_depth for the tree, alpha for ridge).
# NOTE(review): in the cells shown here this grid is defined but never passed
# to a search; `param_grid` is reassigned further down.
param_grid = [{'regressor': [DecisionTreeRegressor()],
               'regressor__max_depth': [2, 3, 4]},
              {'regressor': [Ridge()],
               'regressor__alpha': [0.1, 1]}
             ]

### ColumnTransformer

In [2]:
from sklearn.compose import make_column_transformer, ColumnTransformer
import pandas as pd
# Load the bike table from a CSV path relative to the working directory and
# preview its first rows.
bike = pd.read_csv("datos/bike_day_raw.csv")
bike.head()

Unnamed: 0,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,1,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,1,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,1,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,1,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600


In [3]:
# Every column is numeric, including the integer-coded categorical ones.
bike.dtypes

season          int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
cnt             int64
dtype: object

In [4]:
# Separate the features from the target column "cnt".
bike_data = bike.drop("cnt", axis=1)
# The first six feature columns are the integer-coded categorical ones. Take
# them from the feature frame itself so the list cannot depend on where the
# target sits in the raw table.
cat_features = bike_data.columns[:6]
cat_features

Index(['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit'], dtype='object')

In [6]:
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4; try the new name first
# so the cell runs on both old and new releases.
try:
    dense_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False)

# One-hot encode the categorical columns; every other column goes through the
# `remainder` transformer (standard scaling).
ct = make_column_transformer((dense_ohe, cat_features),
                             remainder=StandardScaler())
ct.transformers

[('onehotencoder',
  OneHotEncoder(sparse=False),
  Index(['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit'], dtype='object'))]

In [7]:
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4; pick whichever keyword
# the installed release accepts.
try:
    OneHotEncoder(sparse_output=False)
    dense_ohe_kwargs = {"sparse_output": False}
except TypeError:
    dense_ohe_kwargs = {"sparse": False}

# Two spellings with explicit step names. Both are bound to a name so neither
# expression is silently discarded; only the last one is displayed.
# Spelling 1: scale whatever columns are left over via `remainder`.
ct_remainder = ColumnTransformer(
    [('ohe', OneHotEncoder(**dense_ohe_kwargs), cat_features)],
    remainder=StandardScaler())

# Spelling 2: list the columns to scale explicitly by position.
# NOTE(review): positions 6-9 are the non-categorical columns of bike_data;
# confirm if the frame layout changes.
ct_explicit = ColumnTransformer(
    [('ohe', OneHotEncoder(**dense_ohe_kwargs), cat_features),
     ('scaler', StandardScaler(), [6, 7, 8, 9])])
ct_explicit

ColumnTransformer(transformers=[('ohe', OneHotEncoder(sparse=False),
                                 Index(['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit'], dtype='object')),
                                ('scaler', StandardScaler(), [6, 7, 8, 9])])

In [None]:
# Fit the encoder and the remainder scaler on the feature frame.
ct.fit(bike_data)

In [None]:
# Shape before the transformation.
bike_data.shape

In [None]:
# Shape after one-hot encoding and scaling.
ct.transform(bike_data).shape

In [None]:
# The transformed feature matrix itself.
ct.transform(bike_data)

In [None]:
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4; try the new name first.
try:
    dense_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False)

# Column-wise preprocessing followed by a ridge regression, as one estimator.
ct = make_column_transformer((dense_ohe, cat_features),
                             remainder=StandardScaler())
ohe_pipe = make_pipeline(ct, Ridge())

In [None]:
# Split the bike features and the "cnt" target; rebinds the train/test names.
X_train, X_test, y_train, y_test = train_test_split(bike_data, bike.cnt, random_state=42)

In [None]:
# Per-fold scores of the full preprocessing + ridge pipeline.
cross_val_score(ohe_pipe, X_train, y_train)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
from sklearn.preprocessing import PowerTransformer

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4; try the new name first.
try:
    dense_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False)

# No remainder is set here: the grid search below fills it in.
ct = make_column_transformer((dense_ohe, cat_features))
ohe_pipe = make_pipeline(ct, Ridge())

# Search over which transformer handles the non-categorical columns, together
# with the ridge regularization strength.
param_grid = {'columntransformer__remainder':
              [StandardScaler(), PowerTransformer(method='yeo-johnson')],
              'ridge__alpha': np.logspace(-3, 2, 6)}
grid = GridSearchCV(ohe_pipe, param_grid)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)

In [None]:
# Winning remainder transformer and ridge alpha.
grid.best_params_

In [None]:
# Full cross-validation results, one row per (remainder, alpha) candidate.
res = pd.DataFrame(grid.cv_results_)
res

In [None]:
# Label every candidate by the class of the remainder transformer it used, so
# each curve is selected by value rather than by row position in `res`.
remainder_name = res["param_columntransformer__remainder"].map(
    lambda est: type(est).__name__)

fig, ax = plt.subplots()
for name, rows in res.groupby(remainder_name, sort=False):
    # Plot against the actual alpha values instead of the row position.
    ax.plot(rows["param_ridge__alpha"].astype(float),
            rows["mean_test_score"], label=name)
ax.set_xscale("log")  # the alphas were generated with np.logspace
ax.set_xlabel("ridge alpha")
ax.set_ylabel("mean cross-validated score")
ax.legend();