# **Pipelines Activity**

_John Andrew Dixon_

---

**Setup**


In [2]:
# Import necessary modules

# For working with the data
import pandas as pd

# For working with NumPY arrays, the main output of sklearn
import numpy as np

# For performing a TTS
from sklearn.model_selection import train_test_split

# For scaling numerical features and encoding nominal features
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# For creating column selectors and column transformers
from sklearn.compose import make_column_selector, make_column_transformer

# For simple imputation on missing data
from sklearn.impute import SimpleImputer

# For creation of preprocessing pipelines
from sklearn.pipeline import make_pipeline

In [3]:
# Remote URL that has the data
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSdnb9XcAnl91bdZYxoJQgIapMW6SLkfr3DYGwnpBOIw-rkw-5j_3b0JLx01282OBAKVUCUJnq8OAUR/pub?output=xlsx"

# Read in the data and verify
df = pd.read_excel(url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   name                                             77 non-null     object 
 1   Manufacturer                                     77 non-null     object 
 2   type                                             68 non-null     object 
 3   calories per serving                             70 non-null     float64
 4   grams of protein                                 77 non-null     int64  
 5   grams of fat                                     69 non-null     float64
 6   milligrams of sodium                             76 non-null     float64
 7   grams of dietary fiber                           77 non-null     float64
 8   grams of complex carbohydrates                   77 non-null     float64
 9   grams of sugars                   

---

## **Tasks**

> **Question**: _How well can the calories be predicted based on the Manufacturer, cereal type, grams of fat, grams of sugars, and weight in ounces per one serving of the cereal?_

### **Define features (X) and target (y).**
- `X` should only include the `Manufacturer`, `type`, `grams of fat`, `grams of sugars`, and `Weight in ounces per one serving` columns.
- `y` should be `calories per serving`

In [4]:
# Create the feature matrix (X)
columns = ["Manufacturer", "type", "grams of fat", "grams of sugars", "Weight in ounces per one serving"]
X = df.loc[:, columns]
X

Unnamed: 0,Manufacturer,type,grams of fat,grams of sugars,Weight in ounces per one serving
0,General Mills,Cold,2.0,10.0,1.00
1,General Mills,Cold,2.0,,1.33
2,General Mills,Cold,2.0,1.0,1.00
3,General Mills,Cold,3.0,9.0,1.00
4,General Mills,Cold,2.0,7.0,1.00
...,...,...,...,...,...
72,Ralston Purina,Cold,,2.0,1.00
73,Ralston Purina,Cold,1.0,3.0,1.00
74,American Home Food Products,Hot,1.0,,1.00
75,Nabisco,Hot,0.0,0.0,1.00


In [5]:
# Create the target vector (y)
y = df["calories per serving"]
y
y.value_counts(dropna=False)

110.0    27
100.0    16
120.0     9
NaN       7
90.0      5
140.0     3
50.0      3
70.0      2
150.0     2
130.0     1
160.0     1
80.0      1
Name: calories per serving, dtype: int64

### **Identify each feature as numerical, ordinal, or nominal.**
**Numerical**: `grams of fat`, `grams of sugars`, `Weight in ounces per one serving`

**Ordinal**: `None`

**Nominal**: `Manufacturer`, `type`

### **Train test split the data to prepare for machine learning.**

In [6]:
# Perform the Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### **Use pipelines and column transformers to complete the following tasks:**
- Impute any missing values. Use the ‘mean’ strategy for numeric columns and the ‘most_frequent’ strategy for categorical columns.
- One-hot encode the nominal features.
    - Be sure to include the arguments: sparse=False AND handle_unknown='ignore' when creating your OneHotEncoder.
- Scale the numeric columns.

> **Note:** _In newer versions of scikit-learn, sparse is now called sparse_output._

In [7]:
# Instantiate all transformers

# Instantiate the Simple imputers for both types of columns
# Most Fequent for Nominal columns and Mean for numeric columns
most_frequent_imputer = SimpleImputer(strategy="most_frequent")
mean_imputer = SimpleImputer(strategy="mean")

# Instantiate the Scaler for scaling numerical features
scaler = StandardScaler()

# Instantiate the One-Hot Encoder
# Note: In newer versions of sklearn, sparse is now called sparse_output
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [8]:
# Instantiate the numeric pipeline
numeric_pipeline = make_pipeline(mean_imputer, scaler)
numeric_pipeline

In [9]:
# Instantiate the nomical/categorical pipeline
nominal_pipeline = make_pipeline(most_frequent_imputer, one_hot_encoder)
nominal_pipeline


### **All preprocessing steps should be contained within a single preprocessing object.**
- Include the arguments: remainder='drop' OR remainder='passthrough' when creating your ColumnTransformer

In [10]:
# Instantiate columns selectors
numeric_selector = make_column_selector(dtype_include="number")
nominal_selector = make_column_selector(dtype_include="object")

In [11]:
# Instantiate tuples for ColumnTransformer
numeric_tuple = (numeric_pipeline, numeric_selector)
nominal_tuple = (nominal_pipeline, nominal_selector)

In [12]:
# Instantiate a ColumnTransformer as a single preprocessing object for
# all column transformations
preprocessor = make_column_transformer(nominal_tuple, numeric_tuple, remainder="passthrough", verbose_feature_names_out=False)

### **Use your preprocessing object to transform your data appropriately, avoiding data leakage, to make it ready for modeling. Show the resulting NumPy array output.**
- The .fit() and .transform() methods should ONLY be used with the resulting preprocessing object, NOT with any individual transformer.

In [13]:
# Fit the preprocessor to ONLY the training data to prevent data leaks
preprocessor.fit(X_train)

In [14]:
# Transform the X_train and X_test data
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [15]:
print("X_train_processed data missing values:", np.isnan(X_train_processed).sum().sum())
print("X_train_processed datatypes:", X_train_processed.dtype)
print("X_train_processed shape:", X_train_processed.shape)

print()

print("X_test_processed data missing values:", np.isnan(X_test_processed).sum().sum())
print("X_test_processed datatypes:", X_test_processed.dtype)
print("X_test_processed shape:", X_test_processed.shape)

# Output the first 5 values of the processed training data
X_train_processed

X_train_processed data missing values: 0
X_train_processed datatypes: float64
X_train_processed shape: (61, 11)

X_test_processed data missing values: 0
X_test_processed datatypes: float64
X_test_processed shape: (16, 11)


array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.73371631,
        -0.15718492],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.42401746,
        -0.15718492],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.82659717],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        -0.15718492],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        , -1.0082989 ,  0.96381669,
        -0.15718492],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.19391708,
         2.024

### **Bonus: Make them into readable DataFrames**

> **Note:** _This is more for myself. It uses something I learned about in the previous core exercise after perusing some of Scikit-learn's documentation._

In [16]:
# Get the names of the features as created by the column transformer
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
features = preprocessor.get_feature_names_out()

# Make each a DataFrame
X_train_processed_df = pd.DataFrame(X_train_processed, columns=features)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=features)

# Display each DataFrame
display(X_train_processed_df)
X_test_processed_df

Unnamed: 0,Manufacturer_General Mills,Manufacturer_Kelloggs,Manufacturer_Nabisco,Manufacturer_Post,Manufacturer_Quaker Oats,Manufacturer_Ralston Purina,type_Cold,type_Hot,grams of fat,grams of sugars,Weight in ounces per one serving
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.733716,-0.157185
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,1.424017,-0.157185
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,1.826597
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,-0.157185
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.008299,0.963817,-0.157185
...,...,...,...,...,...,...,...,...,...,...,...
56,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,-0.876986,-0.157185
57,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.008299,1.193917,-0.157185
58,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.016598,0.963817,-0.157185
59,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.008299,0.273516,-0.157185


Unnamed: 0,Manufacturer_General Mills,Manufacturer_Kelloggs,Manufacturer_Nabisco,Manufacturer_Post,Manufacturer_Quaker Oats,Manufacturer_Ralston Purina,type_Cold,type_Hot,grams of fat,grams of sugars,Weight in ounces per one serving
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.008299,0.043415,-0.157185
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.008299,1.424017,3.149119
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.876986,-0.157185
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.008299,0.733716,-0.157185
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.186685,-0.157185
5,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-1.008299,-1.567288,-0.157185
6,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.008299,0.273516,-0.157185
7,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.008299,1.884218,-0.157185
8,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.008299,-0.416786,-0.157185
9,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.008299,-0.876986,-0.157185
