In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

In [3]:
wine = datasets.load_wine()
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [4]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [5]:
features = pd.DataFrame(wine.data, columns=wine.feature_names)
label = pd.DataFrame(wine.target, columns=['wine_class'])
df = pd.concat([features, label], axis=1)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,wine_class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [6]:
df.wine_class.value_counts()

1    71
0    59
2    48
Name: wine_class, dtype: int64

In [7]:
x = df[df.drop('wine_class', axis=1).columns]
y = df['wine_class']

## Imputing within a pipeline

In [8]:
# SVC

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
         ('SVM', SVC())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the train set
pipeline.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(x_test)
print(y_pred[:5])

# Compute metrics
print(pipeline.score(x_test, y_test))
print(classification_report(y_test, y_pred))

[0 0 1 0 1]
0.7592592592592593
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.63      0.90      0.75        21
           2       0.60      0.21      0.32        14

    accuracy                           0.76        54
   macro avg       0.74      0.71      0.69        54
weighted avg       0.75      0.76      0.72        54



In [9]:
# KNeighborsClassifier

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
         ('KNeighborsClassifier', KNeighborsClassifier())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the train set
pipeline.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(x_test)
print(y_pred[:5])

# Compute metrics
print(pipeline.score(x_test, y_test))
print(classification_report(y_test, y_pred))

[2 0 2 0 1]
0.7407407407407407
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        19
           1       0.75      0.71      0.73        21
           2       0.53      0.57      0.55        14

    accuracy                           0.74        54
   macro avg       0.73      0.73      0.73        54
weighted avg       0.74      0.74      0.74        54



In [10]:
# DecisionTreeClassifier

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
         ('DecisionTreeClassifier', DecisionTreeClassifier())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the train set
pipeline.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(x_test)
print(y_pred[:5])

# Compute metrics
print(pipeline.score(x_test, y_test))
print(classification_report(y_test, y_pred))

[0 0 2 0 1]
0.9629629629629629
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.95      1.00      0.98        21
           2       1.00      0.93      0.96        14

    accuracy                           0.96        54
   macro avg       0.97      0.96      0.96        54
weighted avg       0.96      0.96      0.96        54



## Scaling in a pipeline

In [11]:
# from sklearn.preprocessing import StandardScaler

# steps = [('scaler', StandardScaler()), 
#          ('KNeighborsClassifier', KNeighborsClassifier())]
# pipeline = Pipeline(steps)

# pipeline.fit(x_train, y_train)
# y_pred = pipeline.predict(x_test)

In [12]:
# SVC

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), 
         ('scaler', StandardScaler()),
         ('SVM', SVC())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the train set
pipeline.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(x_test)
print(y_pred[:5])

# Compute metrics
print(pipeline.score(x_test, y_test))
print(classification_report(y_test, y_pred))

[0 0 2 0 1]
0.9814814814814815
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.95      1.00      0.98        21
           2       1.00      0.93      0.96        14

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



In [13]:
# KNeighborsClassifier

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), 
         ('scaler', StandardScaler()),
         ('KNeighborsClassifier', KNeighborsClassifier())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the train set
pipeline.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(x_test)
print(y_pred[:5])

# Compute metrics
print(pipeline.score(x_test, y_test))
print(classification_report(y_test, y_pred))

[0 0 2 0 1]
0.9629629629629629
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.90      0.95        21
           2       0.93      1.00      0.97        14

    accuracy                           0.96        54
   macro avg       0.96      0.97      0.96        54
weighted avg       0.97      0.96      0.96        54



In [14]:
# DecisionTreeClassifier

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), 
         ('scaler', StandardScaler()),
         ('DecisionTreeClassifier', DecisionTreeClassifier())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Fit the pipeline to the train set
pipeline.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(x_test)
print(y_pred[:5])

# Compute metrics
print(pipeline.score(x_test, y_test))
print(classification_report(y_test, y_pred))

[0 0 2 0 1]
0.9629629629629629
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        19
           1       0.95      1.00      0.98        21
           2       1.00      0.93      0.96        14

    accuracy                           0.96        54
   macro avg       0.97      0.96      0.96        54
weighted avg       0.96      0.96      0.96        54



## Cross-validation and scaling in a pipeline

In [15]:
# from sklearn.model_selection import GridSearchCV

# steps = [('scaler', StandardScaler()), 
#          ('KNeighborsClassifier', KNeighborsClassifier())]
# pipeline = Pipeline(steps)

# parameters = {'KNeighborsClassifier__n_neighbors': np.arange(1, 50)}
# cv = GridSearchCV(pipeline, param_grid = parameters)

# cv.fit(x_train, y_train)
# y_pred = cv.predict(x_test)

In [16]:
# KNeighborsClassifier

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Create training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), 
         ('scaler', StandardScaler()),
         ('KNeighborsClassifier', KNeighborsClassifier())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

parameters = {'KNeighborsClassifier__n_neighbors': np.arange(1, 50)}  # 'pipeline step name__hyperparameter name' 
cv = GridSearchCV(pipeline, param_grid = parameters)

# Fit the cv to the train set
cv.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = cv.predict(x_test)
print(y_pred[:5])

# Compute metrics
print(cv.score(x_test, y_test))
print(classification_report(y_test, y_pred))
print(cv.best_params_)

[0 0 2 0 1]
0.9629629629629629
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.90      0.95        21
           2       0.93      1.00      0.97        14

    accuracy                           0.96        54
   macro avg       0.96      0.97      0.96        54
weighted avg       0.97      0.96      0.96        54

{'KNeighborsClassifier__n_neighbors': 19}


## Scaled vs. unscaled

In [17]:
# Import the necessary modules
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Setup the pipeline steps: steps
steps = [('scaler', StandardScaler()),
        ('SVC', SVC())]
        
# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Create train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Fit the pipeline to the training set: knn_scaled
SVC_scaled = pipeline.fit(x_train, y_train)

# Instantiate and fit a k-NN classifier to the unscaled data
SVC_unscaled = KNeighborsClassifier().fit(x_train, y_train)

# Compute and print metrics
print('Accuracy with Scaling: {}'.format(SVC_scaled.score(x_test, y_test)))
print('Accuracy without Scaling: {}'.format(SVC_unscaled.score(x_test, y_test)))

Accuracy with Scaling: 0.9814814814814815
Accuracy without Scaling: 0.7407407407407407


## Bringing it all together I: Pipeline for classification

In [18]:
# Import the necessary modules
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'SVM__C':[1, 10, 100],
              'SVM__gamma':[0.1, 0.01]}

# Create train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)

# Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)

# Fit to the training set
cv.fit(x_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(x_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(x_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        11

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

Tuned Model Parameters: {'SVM__C': 10, 'SVM__gamma': 0.1}


## Bringing it all together II: Pipeline for regression

In [19]:
diabetes = datasets.load_diabetes()
x = diabetes.data
y = diabetes.target

In [20]:
# Import the necessary modules
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values=np.NaN, strategy='mean')),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)}

# Create train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)

# Create the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)

# Fit to the training set
gm_cv.fit(x_train, y_train)

# Compute and print the metrics
r2 = gm_cv.score(x_test, y_test)
print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))

Tuned ElasticNet Alpha: {'elasticnet__l1_ratio': 1.0}
Tuned ElasticNet R squared: 0.508459571308439
