Steps of the Process:

    1. Import necessary libraries.
    2. Load data.
    3. Explore data.
    4. Validation split.
    5. Instantiate column selectors.
    6. Instantiate transformers.
    7. Instantiate pipelines.
    8. Instantiate ColumnTransformer.
    9. Transform data.
    10. Inspect results.

# 1. Import Necessary Libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')

# 2. Load Data

In [2]:
# Location of the cereal CSV, relative to the kernel's working directory
path = './cereal_missing_values.csv'
# Read the whole file into a DataFrame using pandas' default parsing options
crl_df = pd.read_csv(path)
# Preview the first rows to confirm the file was parsed as expected
crl_df.head()

Unnamed: 0,name,Manufacturer,type,calories per serving,grams of protein,grams of fat,milligrams of sodium,grams of dietary fiber,grams of complex carbohydrates,grams of sugars,milligrams of potassium,vitamins and minerals (% of FDA recommendation),Display shelf,Weight in ounces per one serving,Number of cups in one serving,Rating of cereal
0,Apple Cinnamon Cheerios,General Mills,Cold,110.0,2,2.0,180.0,1.5,10.5,10.0,70,25.0,1,1.0,0.75,29.509541
1,Basic 4,General Mills,Cold,130.0,3,2.0,,2.0,18.0,,100,25.0,3,1.33,0.75,37.038562
2,Cheerios,General Mills,Cold,,6,2.0,290.0,2.0,17.0,1.0,105,25.0,1,1.0,1.25,50.764999
3,Cinnamon Toast Crunch,General Mills,Cold,120.0,1,3.0,210.0,0.0,13.0,9.0,45,25.0,2,1.0,0.75,19.823573
4,Clusters,General Mills,Cold,110.0,3,2.0,140.0,2.0,13.0,7.0,105,25.0,3,1.0,0.5,40.400208


# 3. Explore Data

In [4]:
# Summarize each column's dtype and non-null count
crl_df.info()
# Any column whose non-null count is below the total row count has missing values
# (e.g. `type`, `calories per serving`, `grams of fat`)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   name                                             77 non-null     object 
 1   Manufacturer                                     77 non-null     object 
 2   type                                             68 non-null     object 
 3   calories per serving                             70 non-null     float64
 4   grams of protein                                 77 non-null     int64  
 5   grams of fat                                     69 non-null     float64
 6   milligrams of sodium                             76 non-null     float64
 7   grams of dietary fiber                           77 non-null     float64
 8   grams of complex carbohydrates                   77 non-null     float64
 9   grams of sugars                   

In [7]:
# Total number of missing cells across the whole frame (sum per column, then overall)
n_missing_cells = crl_df.isna().sum().sum()
# Number of rows that exactly repeat an earlier row
n_duplicated_rows = crl_df.duplicated().sum()

print(n_missing_cells, 'missing values')
print(n_duplicated_rows, 'duplicated values')

35 missing values
0 duplicated values


In [8]:
# Helper for eyeballing each column's distinct values (typos, inconsistent labels, ...)
def check_inconsistent_values(df):
    """Print the unique values of every column in ``df``.

    One line is printed per column, followed by a blank line, so the output
    can be scanned for misspelled or inconsistently formatted entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        The function only prints; nothing is returned.
    """
    for column_label in df:
        distinct_values = df[column_label].unique()
        print(f" Series -->{column_label} unique values are {distinct_values} \n")

In [9]:
# Print every column's unique values to look for typos or inconsistent labels
check_inconsistent_values(crl_df)
# Observation: the printed values look consistent enough

 Series -->name unique values are ['Apple Cinnamon Cheerios' 'Basic 4' 'Cheerios' 'Cinnamon Toast Crunch'
 'Clusters' 'Cocoa Puffs' 'Count Chocula' 'Crispy Wheat & Raisins'
 'Golden Grahams' 'Honey Nut Cheerios' 'Kix' 'Lucky Charms'
 'Multi-Grain Cheerios' 'Oatmeal Raisin Crisp' 'Raisin Nut Bran'
 'Total Corn Flakes' 'Total Raisin Bran' 'Total Whole Grain' 'Triples'
 'Trix' 'Wheaties' 'Wheaties Honey Gold' 'All-Bran'
 'All-Bran with Extra Fiber' 'Apple Jacks' 'Corn Flakes' 'Corn Pops'
 "Cracklin' Oat Bran" 'Crispix' 'Froot Loops' 'Frosted Flakes'
 'Frosted Mini-Wheats' 'Fruitful Bran' 'Just Right Crunchy  Nuggets'
 'Just Right Fruit & Nut' 'Mueslix Crispy Blend' 'Nut&Honey Crunch'
 'Nutri-Grain Almond-Raisin' 'Nutri-grain Wheat' 'Product 19'
 'Raisin Bran' 'Raisin Squares' 'Rice Krispies' 'Smacks' 'Special K'
 '100% Bran' 'Shredded Wheat' "Shredded Wheat 'n'Bran"
 'Shredded Wheat spoon size' 'Strawberry Fruit Wheats' 'Bran Flakes'
 'Fruit & Fibre Dates; Walnuts; and Oats' 'Fruity Pebbl

# 4. Split Data

In [11]:
target = 'calories per serving'
# The target column itself contains missing entries (fewer non-null values than rows
# in `crl_df.info()`). A row without a label cannot be used to train or evaluate a
# model, so such rows are dropped before splitting. A new frame is created so that
# `crl_df` stays untouched and this cell remains idempotent on re-run.
crl_labeled_df = crl_df.dropna(subset=[target])
# Features: every column except the target
X = crl_labeled_df.drop(columns=[target])
# Labels: the target column only
y = crl_labeled_df[target]
# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 5. Instantiate Column Selectors

In [13]:
# Select columns by dtype instead of hard-coding column names
cat_selector = make_column_selector(dtype_include='object')  # object-dtype (text) columns
num_selector = make_column_selector(dtype_include='number')  # numeric columns

In [14]:
# Apply each selector to the training features to see which columns it picks up
categorical_columns = cat_selector(X_train)
numeric_columns = num_selector(X_train)

print(f"Categorical columns are ---> {categorical_columns} \n")
print(f"Numbers columns are ---> {numeric_columns}")

Categorical columns are ---> ['name', 'Manufacturer', 'type'] 

Numbers columns are ---> ['grams of protein', 'grams of fat', 'milligrams of sodium', 'grams of dietary fiber', 'grams of complex carbohydrates', 'grams of sugars', 'milligrams of potassium', 'vitamins and minerals (% of FDA recommendation)', 'Display shelf', 'Weight in ounces per one serving', 'Number of cups in one serving', 'Rating of cereal']


# 6. Instantiate Transformers

In [15]:
# Imputers
freq_imputer = SimpleImputer(strategy='most_frequent')  # for categorical data
mean_imputer = SimpleImputer(strategy='mean')  # for numeric data
# Scaler
scaler = StandardScaler()  # for numeric data
# One-hot encoder for nominal categorical data; categories not seen during fit
# are ignored (encoded as all zeros) instead of raising an error.
# Dense output is requested through `sparse_output` (scikit-learn >= 1.2). The old
# `sparse` keyword was deprecated in 1.2 and removed in 1.4, where passing it raises
# TypeError, so it is only used as a fallback on older releases.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# 7. Instantiate Pipelines 

In [16]:
# Numeric pipeline: fill missing values with the column mean, then standardize
numeric_pipe = make_pipeline(mean_imputer, scaler)
# Display the pipeline
numeric_pipe

In [17]:
# Categorical pipeline: fill missing values with the most frequent category, then one-hot encode
categorical_pipe = make_pipeline(freq_imputer, ohe)
# Display the pipeline
categorical_pipe

# 8. Instantiate ColumnTransformer

In [18]:
# (transformer, columns) pairs in the form make_column_transformer expects
num_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)

# Combine both pipelines so each column type is routed to its own preprocessing steps
preprocessor = make_column_transformer(num_tuple, category_tuple)
# Display the assembled transformer
preprocessor


# 9. Transform Data

In [19]:
# Fit on the training split only, so imputation values, scaling statistics and
# one-hot categories are learned without looking at the test data
preprocessor.fit(X_train)



In [21]:
# Apply the already-fitted preprocessor to both splits (nothing is refit here)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# 10. Inspect the Result