# Pipelines

 * Machine learning applications require not only the application of a single algorithm, but the chaining together of many different processing steps and machine learning models.
 * We go all the way from preprocessing to model predictions.
 * **NB!** In a pipeline each step, but the last, must be a transformer and the last one must be an estimator, such as a classifier, regressor or transformer.

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import ElasticNet

## Imputing missing data & logistic regression

In [3]:
diab = pd.read_csv('supervised/classification/data/diabetes.csv', header=0)
diab.head(1)

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1


In [5]:
diab.replace({'triceps':0, 'insulin':0, 'bmi':0}, np.nan, inplace=True)
diab.head(1)

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35.0,,33.6,0.627,50,1


In [9]:
X = diab.drop('diabetes', axis=1).values
y = diab.diabetes.values

# init imputer object
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# init estimator object
logistic_reg = LogisticRegression(solver='liblinear')

# build the pipeline object. List of tuples. [('name', object), (), ..., ()]
steps = [('imputer', imp), ('logistic_regression', logistic_reg)]

# pipeline object
pipe = Pipeline(steps=steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Logistic Regression R^2 = {:.3f}".format(pipe.score(X_test, y_test)))

Logistic Regression R^2 = 0.771


## Dummifying Imputing, scaling, regression + GridSearch

 * When specifying the parameter grid in Pipeline, there is a slight change.
 * need to specify for each parameter which step of the pipeline it belongs to.
 * Paramter we want to adjust: 'l1_ratio' are parameter of ElasticNet, 3rd step. We give it a name 'elasticnet'.
 * The syntax to define a parameter grid for a pipeline is to specify for each parameter the step name, followed by __ (a double underscore), followed by the parameter name.

In [14]:
gapmind = pd.read_csv('supervised/regression/data/gm_2008_region.csv', header=0)
gapmind.head(2)

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
0,34811059.0,2.73,0.1,3.328945,24.5962,12314.0,129.9049,75.3,29.5,Middle East & North Africa
1,19842251.0,6.43,2.0,1.474353,22.25083,7103.0,130.1247,58.3,192.0,Sub-Saharan Africa


In [16]:
# dummify region column
gapmind_dummies = pd.get_dummies(gapmind, drop_first=True)
gapmind_dummies.head(2)

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region_East Asia & Pacific,Region_Europe & Central Asia,Region_Middle East & North Africa,Region_South Asia,Region_Sub-Saharan Africa
0,34811059.0,2.73,0.1,3.328945,24.5962,12314.0,129.9049,75.3,29.5,0,0,1,0,0
1,19842251.0,6.43,2.0,1.474353,22.25083,7103.0,130.1247,58.3,192.0,0,0,0,0,1


In [28]:
X = gapmind_dummies.drop('life', axis=1).values
y = gapmind_dummies.life.values

# Setup the pipeline steps: steps
steps = [('imp', SimpleImputer(np.nan, 'mean')),
        ('scaler', StandardScaler()),
        ('elasticnet', ElasticNet())]

# init pipeline object
pipe = Pipeline(steps)

# Specify the hyperparameter space
param_grid = {'elasticnet__l1_ratio' : np.linspace(0, 1, 30)}

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# fit pipeline
pipe.fit(X_train, y_train)

# init gridsearch obj
pipe_cv = GridSearchCV(pipe, param_grid, cv=5)
pipe_cv.fit(X_train, y_train)

# compute and print the metrics
print("R^2 = {:.3f}".format(pipe.score(X_test, y_test)))
print("ElasticNet Parameters: {}".format(pipe_cv.best_params_))



R^2 = 0.850
ElasticNet Parameters: {'elasticnet__l1_ratio': 1.0}
