# Pipelines


## Setting up

In [1]:
import pandas as pd
import numpy as np

# Load the Breast Cancer Wisconsin dataset
from sklearn.datasets import load_breast_cancer

# Fetch the dataset bundle and unpack the feature matrix and the labels
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target

# Build a DataFrame whose columns carry the dataset's feature names,
# then place the target in front of them as the "Class" column
df = pd.DataFrame(X, columns=dataObj.feature_names)
df.insert(loc=0, column="Class", value=y)

# Quick look: first rows, dimensions, summary statistics, class counts
display(df.head())
print(df.shape)
display(df.describe())
print(df['Class'].value_counts())

Unnamed: 0,Class,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


(569, 31)


Unnamed: 0,Class,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.627417,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,1.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


1    357
0    212
Name: Class, dtype: int64


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. stratify=y requests splits that
# follow the class labels, and random_state=1 fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=1,
)

## Method 1: Without using a pipeline

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Standardize: fit the scaler on the training split only, then reuse the
# fitted scaler on the test split
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# PCA: reduce the standardized features to 2 components, again fitting on
# the training split only
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Logistic Regression trained on the PCA-transformed training data
lr = LogisticRegression(random_state=1)
lr.fit(X_train_pca, y_train)

# Making prediction from training data
y_pred_train = lr.predict(X_train_pca)
print(y_pred_train)

# Making prediction from testing data
y_pred_test = lr.predict(X_test_pca)
print(y_pred_test)

# Calculating accuracy with accuracy_score. Its signature is
# accuracy_score(y_true, y_pred), so the ground-truth labels go first.
training_accuracy = accuracy_score(y_train, y_pred_train)
testing_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Training Accuracy:{training_accuracy:6.3f}")
print(f"Testing Accuracy:{testing_accuracy:6.3f}")

# Calculating accuracy with the classifier's own score() method, which
# predicts internally from the transformed features
training_accuracy = lr.score(X_train_pca, y_train)
testing_accuracy = lr.score(X_test_pca, y_test)
print(f"Training Accuracy:{training_accuracy:6.3f}")
print(f"Testing Accuracy:{testing_accuracy:6.3f}")

[1 1 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1
 0 1 1 0 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1
 1 1 1 0 1 1 1 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0
 1 1 0 1 1 1 1 0 1 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 0
 1 1 0 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1
 1 0 0 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 1 1 1 0 0 1
 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0
 1 0 1 1 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1
 1 1 0 0 1 1 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 0
 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1
 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 1 1 0 0 0 1 1 0]
[0 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0 1 1 1
 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1
 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 1 0

## Method 2: Using a pipeline


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Chain scaling, PCA and the classifier into a single estimator; the step
# names ('scl', 'pca', 'clf') become the prefixes of the parameter names
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=1)),
])

# Fit all steps on the raw training split in one call
pipe_lr.fit(X_train, y_train)

# Predict labels for the held-out split straight from the raw features
y_pred = pipe_lr.predict(X_test)
print(y_pred)

# Score the fitted pipeline on both splits, then report them
training_accuracy = pipe_lr.score(X_train, y_train)
testing_accuracy = pipe_lr.score(X_test, y_test)
print(f"Training Accuracy:{training_accuracy:6.3f}")
print(f"Testing Accuracy:{testing_accuracy:6.3f}")

[0 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0 1 1 1
 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1
 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 0 1 1
 1 1 0 0 0 1 0 0 1 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 0 0 0 1 1 0 1 1 1
 1 0 1 0 1 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 1 0 0]
Training Accuracy: 0.960
Testing Accuracy: 0.947


In [5]:
# List every parameter exposed by the pipeline, one per line, with the name
# padded/truncated to 25 characters so the values line up
pipeline_params = pipe_lr.get_params()
for k, v in pipeline_params.items():
    print(f"{k:25.25s}:", v)

memory                   : None
steps                    : [('scl', StandardScaler()), ('pca', PCA(n_components=2)), ('clf', LogisticRegression(random_state=1))]
verbose                  : False
scl                      : StandardScaler()
pca                      : PCA(n_components=2)
clf                      : LogisticRegression(random_state=1)
scl__copy                : True
scl__with_mean           : True
scl__with_std            : True
pca__copy                : True
pca__iterated_power      : auto
pca__n_components        : 2
pca__random_state        : None
pca__svd_solver          : auto
pca__tol                 : 0.0
pca__whiten              : False
clf__C                   : 1.0
clf__class_weight        : None
clf__dual                : False
clf__fit_intercept       : True
clf__intercept_scaling   : 1
clf__l1_ratio            : None
clf__max_iter            : 100
clf__multi_class         : auto
clf__n_jobs              : None
clf__penalty             : l2
clf__random_state    