# Data Pipelines

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LogisticRegression
import pickle

## Pipeline

Pipeline คือ ลำดับของ data processing step ที่ถูกรวมเข้าด้วยกัน มีองค์ประกอบ (component) หลักคือ <b>transformer</b> (ตัวแปลงข้อมูล)

Transformer มี method ที่สำคัญคือ
- `.fit` = คำนวณ
- `.transform` = แปลงข้อมูลโดยใช้สิ่งที่คำนวณได้จาก `.fit`
- `.fit_transform` = `.fit` + `.transform`

## Creating simple pipelines

เราสามารถสร้าง pipeline โดยเอา transformer มาต่อกัน

ใช้ `sklearn.pipeline.Pipeline` หรือ `sklearn.pipeline.make_pipeline` (ใช้อันหลังจะเขียน code สั้นกว่า แต่ไม่สามารถตั้งชื่อ transformer ได้)

In [2]:
# Import data and drop duplicates
data = pd.read_csv('../data/India_air_quality_light.csv').drop_duplicates()
# Separate features (X) and target (y)
X, y = data.drop(columns="high_level"), data["high_level"]
# Drop row containing non-sense values
X = X.drop(X[(X['so2'] < 0) | (X['no2'] < 0) | (X['rainfall'] < 0)].index)
y = y[X.index]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
X_train

Unnamed: 0,stn_code,state,location,type,so2,no2,date,rainfall
11510,72,Tamil Nadu,Chennai,Industrial,14.0,15.0,14/08/2015,80.492450
27225,,Andhra Pradesh,Tirupati,Sensitive,4.0,9.0,06/10/2009,68.307701
2952,327,Goa,Panaji,Residential,2.0,9.0,20/05/2015,59.263047
21706,,Uttar Pradesh,Allahabad,Others,4.3,17.9,09/11/2008,45.476771
15548,,Maharashtra,Pune,Industrial,21.0,36.7,06/11/2008,91.507578
...,...,...,...,...,...,...,...,...
23093,705,Maharashtra,Nanded,Industrial,87.0,91.0,16/12/2014,76.647294
3579,607,Assam,Silcher,Residential,6.0,14.0,13/10/2013,88.402010
3313,,Karnataka,Bangalore,Industrial,15.3,35.9,08/11/2009,73.028840
14397,,Odisha,Angul,Industrial,6.3,12.8,04/11/2005,


In [3]:
from sklearn import set_config
# Turn on the diagram visualization to have a nice view of our pipelines
set_config(display='diagram')
# Set pipelines to return their transform outputs as Pandas DataFrames
set_config(transform_output="pandas")

In [4]:
# Create a simple pipeline that impute missing values and standardise values

# Use "sklearn.pipeline.Pipeline"
pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])
# Fit the "so2" and "no2" features with the pipeline
pipe.fit(X_train[['so2', 'no2']])

In [5]:
# Use "sklearn.pipeline.make_pipeline"
pipe = make_pipeline(SimpleImputer(), StandardScaler())
# Fit the "so2" and "no2" features with the pipeline
pipe.fit(X_train[['so2', 'no2']])

In [6]:
# Transform the "so2" and "no2" features of the test set
pipe.transform(X_test[['so2', 'no2']])

Unnamed: 0,so2,no2
8373,-0.017697,-0.469072
17381,0.107097,-0.039262
16139,-0.113693,-0.078336
23512,0.174294,3.047552
13170,-0.497675,-0.469072
...,...,...
22847,0.000000,-1.110995
20781,-0.209688,-0.245794
13514,-0.785662,-0.413252
16520,-0.382480,-0.720259


## FeatureUnion

ใช้ transformer หลายอันพร้อมกัน (ขนานกัน) ในการแปลงทุก columns แล้วเอาผลลัพธ์จาก transformer แต่ละอันมาต่อกัน

ใช้ `sklearn.pipeline.FeatureUnion`

In [7]:
union = FeatureUnion([
    ('pipeline', pipe),
    ('not_scaled', SimpleImputer())
])
union.fit(X_train[['so2', 'no2']])

In [8]:
# Transform the "so2" and "no2" features of the test set
# using the "pipeline" and "not_scaled" transformers in parallel
# and concatenate the output of each of the transformers
union.transform(X_test[['so2', 'no2']])

Unnamed: 0,pipeline__so2,pipeline__no2,not_scaled__so2,not_scaled__no2
8373,-0.017697,-0.469072,11.000000,18.0
17381,0.107097,-0.039262,12.300000,25.7
16139,-0.113693,-0.078336,10.000000,25.0
23512,0.174294,3.047552,13.000000,81.0
13170,-0.497675,-0.469072,6.000000,18.0
...,...,...,...,...
22847,0.000000,-1.110995,11.184355,6.5
20781,-0.209688,-0.245794,9.000000,22.0
13514,-0.785662,-0.413252,3.000000,19.0
16520,-0.382480,-0.720259,7.200000,13.5


## ColumnTransformer

ใช้ transformer หลายอันพร้อมกัน (ขนานกัน) โดยที่เราสามารถเลือก column ที่จะถูก transformed ด้วยแต่ละ transformer ได้ แล้วเอาผลลัพธ์ที่ได้ทั้งหมดมาต่อกัน (ต่างจาก `FeatureUnion` ซึ่งทุก column ที่ส่งเข้าไปจะถูก transformed ด้วยทุก transformer พร้อมกัน)

ใช้ `sklearn.compose.ColumnTransformer`

In [9]:
# Create a transformer to impute and scale numerical features 
num_transformer = Pipeline([
    ('num_imputer', SimpleImputer()),
    ('num_scaler', StandardScaler())
])
# Create a transformer to impute and encode categorical features
cat_transformer = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [10]:
# Numerical features
num_cols = X_train.select_dtypes(include=np.number).columns

In [11]:
# Apply "num_transformer" to numerical features and
# apply "cat_transformer" to the "state" and "type" features in parallel
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_cols),
    ('cat_transformer', cat_transformer, ["state", "type"])
])
preprocessor

In [12]:
preprocessor.fit_transform(X_test)

Unnamed: 0,num_transformer__so2,num_transformer__no2,num_transformer__rainfall,cat_transformer__state_Andhra Pradesh,cat_transformer__state_Arunachal Pradesh,cat_transformer__state_Assam,cat_transformer__state_Bihar,cat_transformer__state_Chandigarh,cat_transformer__state_Chhattisgarh,cat_transformer__state_Dadra & Nagar Haveli,...,cat_transformer__state_Rajasthan,cat_transformer__state_Tamil Nadu,cat_transformer__state_Telangana,cat_transformer__state_Uttar Pradesh,cat_transformer__state_Uttarakhand,cat_transformer__state_West Bengal,cat_transformer__type_Industrial,cat_transformer__type_Others,cat_transformer__type_Residential,cat_transformer__type_Sensitive
8373,-0.023994,-0.487788,-7.344619e-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17381,0.093238,-0.059271,-7.511473e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
16139,-0.114173,-0.098227,7.645589e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23512,0.156364,3.018262,7.645589e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13170,-0.474889,-0.487788,7.645589e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22847,0.000000,-1.127781,-1.362842e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20781,-0.204352,-0.265182,7.744864e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13514,-0.745427,-0.432136,-1.159736e+00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16520,-0.366675,-0.738220,-1.741181e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Accessing individual transformers

เราสามารถเข้าถึงแต่ละ transformer ใน pipeline ด้วย `transformers_` attribute

In [13]:
preprocessor.transformers_

[('num_transformer',
  Pipeline(steps=[('num_imputer', SimpleImputer()),
                  ('num_scaler', StandardScaler())]),
  Index(['so2', 'no2', 'rainfall'], dtype='object')),
 ('cat_transformer',
  Pipeline(steps=[('cat_imputer', SimpleImputer(strategy='most_frequent')),
                  ('cat_encoder',
                   OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
  ['state', 'type']),
 ('remainder', 'drop', [0, 2, 6])]

In [14]:
preprocessor.transformers_[0][2]

Index(['so2', 'no2', 'rainfall'], dtype='object')

In [15]:
preprocessor.transformers_[1][1][1].get_feature_names_out()

array(['state_Andhra Pradesh', 'state_Arunachal Pradesh', 'state_Assam',
       'state_Bihar', 'state_Chandigarh', 'state_Chhattisgarh',
       'state_Dadra & Nagar Haveli', 'state_Daman & Diu', 'state_Delhi',
       'state_Goa', 'state_Gujarat', 'state_Haryana',
       'state_Himachal Pradesh', 'state_Jammu & Kashmir',
       'state_Jharkhand', 'state_Karnataka', 'state_Kerala',
       'state_Madhya Pradesh', 'state_Maharashtra', 'state_Manipur',
       'state_Meghalaya', 'state_Mizoram', 'state_Nagaland',
       'state_Odisha', 'state_Puducherry', 'state_Punjab',
       'state_Rajasthan', 'state_Tamil Nadu', 'state_Telangana',
       'state_Uttar Pradesh', 'state_Uttarakhand', 'state_West Bengal',
       'type_Industrial', 'type_Others', 'type_Residential',
       'type_Sensitive'], dtype=object)

## Custom transformers

เราสามารถสร้าง transformer ที่ไม่มีใน `scikit-learn` (เรากำหนดเอง) ได้ 2 วิธี คือ ใช้ `FunctionTransformer` กับสร้าง class ของ transformer ขึ้นมาเอง

### FunctionTransformer

เราจะใช้ `sklearn.preprocessing.FunctionTransformer` เพื่อแปลง function/callable เป็น transformer

ใช้ได้เฉพาะ stateless transformation เท่านั้น (transformation ที่ไม่ต้องใช้ statistics ของข้อมูล เช่น `round`, `log`) 

In [16]:
# Create a transformer that rounds data to 2 decimal places
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))
# Create a transformer to impute, scale, and then round numerical features 
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder)
])
# Apply "num_transformer" to numerical features and
# apply "cat_transformer" to the "state" and "type" features in parallel
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_cols),
    ('cat_transformer', cat_transformer, ["state", "type"])
])
preprocessor

In [17]:
preprocessor.fit_transform(X_train)

Unnamed: 0,num_transformer__so2,num_transformer__no2,num_transformer__rainfall,cat_transformer__state_Andhra Pradesh,cat_transformer__state_Arunachal Pradesh,cat_transformer__state_Assam,cat_transformer__state_Bihar,cat_transformer__state_Chandigarh,cat_transformer__state_Chhattisgarh,cat_transformer__state_Dadra & Nagar Haveli,...,cat_transformer__state_Tamil Nadu,cat_transformer__state_Telangana,cat_transformer__state_Uttar Pradesh,cat_transformer__state_Uttarakhand,cat_transformer__state_Uttaranchal,cat_transformer__state_West Bengal,cat_transformer__type_Industrial,cat_transformer__type_Others,cat_transformer__type_Residential,cat_transformer__type_Sensitive
11510,0.27,-0.64,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
27225,-0.69,-0.97,-0.63,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2952,-0.88,-0.97,-1.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
21706,-0.66,-0.47,-1.84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15548,0.94,0.57,0.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23093,7.28,3.61,-0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3579,-0.50,-0.69,0.44,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3313,0.40,0.53,-0.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14397,-0.47,-0.76,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Custom transformer classes

หากต้องการทำ memory-dependent transformation ต้องเขียน Class ขึ้นมาเอง
- Class ที่จะสร้างต้องสืบทอดมาจาก `sklearn.base.TransformerMixin` (มี `.fit`, `.transform`, `.fit_transform` method) และ `sklearn.base.BaseEstimator` (มี `get_params`, `set_params` method)
- ต้อง override `__init__` (constructor), `.fit` กับ `.transform` method สำหรับ Transformer ของตัวเอง
- `.fit` กับ `.transform` method ต้องมี `y` เป็น parameter ตัวหนึ่ง หากไม่ใช้ ให้ใส่ค่า default เป็น `None`
- `.fit` method ต้อง return self เพื่อให้ transformer object ไปต่อกับ method อื่นได้อีก

In [18]:
class CustomScaler(TransformerMixin, BaseEstimator): 
    """Center data around 0 and shrink it by a fixed factor"""
    
    def __init__(self, shrink_factor=3):
        self.shrink_factor = shrink_factor
    
    def fit(self, X, y=None):
        self.means = X.mean()
        return self
    
    def transform(self, X, y=None):
        X_transformed = (X - self.means) / self.shrink_factor
        # Return result as dataframe for integration into ColumnTransformer
        return X_transformed

In [19]:
# Create a transformer to impute, scale (using our CustomScaler), and then round numerical features 
num_transformer_with_custom_scaler = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', CustomScaler(shrink_factor=3)),
    ('rounder', rounder)
])
# Apply "num_transformer" to numerical features and
# apply "cat_transformer" to the "state" and "type" features in parallel
preprocessor_with_custom_scaler = ColumnTransformer([
    ('num_transformer', num_transformer_with_custom_scaler, num_cols),
    ('cat_transformer', cat_transformer, ["state", "type"])
])
preprocessor_with_custom_scaler

In [20]:
preprocessor_with_custom_scaler.fit_transform(X_train)

Unnamed: 0,num_transformer__so2,num_transformer__no2,num_transformer__rainfall,cat_transformer__state_Andhra Pradesh,cat_transformer__state_Arunachal Pradesh,cat_transformer__state_Assam,cat_transformer__state_Bihar,cat_transformer__state_Chandigarh,cat_transformer__state_Chhattisgarh,cat_transformer__state_Dadra & Nagar Haveli,...,cat_transformer__state_Tamil Nadu,cat_transformer__state_Telangana,cat_transformer__state_Uttar Pradesh,cat_transformer__state_Uttarakhand,cat_transformer__state_Uttaranchal,cat_transformer__state_West Bengal,cat_transformer__type_Industrial,cat_transformer__type_Others,cat_transformer__type_Residential,cat_transformer__type_Sensitive
11510,0.94,-3.80,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
27225,-2.39,-5.80,-3.93,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2952,-3.06,-5.80,-6.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
21706,-2.29,-2.83,-11.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15548,3.27,3.43,3.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23093,25.27,21.53,-1.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3579,-1.73,-4.13,2.77,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3313,1.37,3.17,-2.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14397,-1.63,-4.53,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Including models in pipelines

เอา model ไปต่อกับ pre-processing pipeline ได้ โดย model ที่เราจะใช้ มาจาก `scikit-learn` หรือสร้าง class ขึ้นมาเองก็ได้ (custom model) ซึ่งเราต้อง override `.fit`, `.predict` และ `.score` method

<img src="../images/pipeline.png" width="500" /><br />

เมื่อสร้าง pipeline แล้ว เราต้องตรวจสอบว่าแต่ละ component ของ pipeline ทำงานเหมือนกับเมื่อไม่ได้อยู่ใน pipeline หรือไม่ การเข้าถึงแต่ละ component สามารถทำได้โดยใช้ `.named_steps` attribute

In [21]:
# Combine preprocessor and linear model in pipeline
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=2500))])
final_pipe

In [22]:
# Train the pipeline
final_pipe_trained = final_pipe.fit(X_train, y_train)

In [23]:
# Make predictions
final_pipe_trained.predict(X_test)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [24]:
# Score the pipeline
final_pipe_trained.score(X_test, y_test)

0.7308934337997847

In [25]:
# Compute the cross-validated score of the pipeline
cross_val_score(final_pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.7246019286835614

In [26]:
# Test if the "preprocessing" component works identically to
# when it has not been included in the pipeline
final_pipe.named_steps["preprocessing"].fit_transform(X_train)

Unnamed: 0,num_transformer__so2,num_transformer__no2,num_transformer__rainfall,cat_transformer__state_Andhra Pradesh,cat_transformer__state_Arunachal Pradesh,cat_transformer__state_Assam,cat_transformer__state_Bihar,cat_transformer__state_Chandigarh,cat_transformer__state_Chhattisgarh,cat_transformer__state_Dadra & Nagar Haveli,...,cat_transformer__state_Tamil Nadu,cat_transformer__state_Telangana,cat_transformer__state_Uttar Pradesh,cat_transformer__state_Uttarakhand,cat_transformer__state_Uttaranchal,cat_transformer__state_West Bengal,cat_transformer__type_Industrial,cat_transformer__type_Others,cat_transformer__type_Residential,cat_transformer__type_Sensitive
11510,0.27,-0.64,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
27225,-0.69,-0.97,-0.63,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2952,-0.88,-0.97,-1.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
21706,-0.66,-0.47,-1.84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15548,0.94,0.57,0.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23093,7.28,3.61,-0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3579,-0.50,-0.69,0.44,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3313,0.40,0.53,-0.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14397,-0.47,-0.76,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Exporting pipelines

เราสามารถ export pipeline ได้ 2 วิธี คือ
- Export ในรูปของ pickle (`.pkl`) file ถ้ามี custom transformer จะต้องสร้าง module แยกออกไป
- เขียน pipeline เป็น class/package

### Exporting pipelines as pickle files

In [27]:
# Create a new pipeline
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())])
cat_transformer = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_cols),
    ('cat_transformer', cat_transformer, ['state', 'type'])
])
final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=2500))
])
final_pipe.fit(X_train, y_train)

In [28]:
# Export the pipeline as a pickle file
pkl_path = "../data/pipeline.pkl"
with open(pkl_path, "wb") as file:
    pickle.dump(final_pipe, file)

In [29]:
# Load the pipeline from the pickle file
pipeline = pickle.load(open(pkl_path, "rb"))

In [30]:
# Import new samples
new_samples = pd.read_csv('../data/new_samples.csv')
new_samples

Unnamed: 0,stn_code,state,location,type,so2,no2,date,rainfall
0,527,Madhya Pradesh,Ujjain,Industrial,7.8,9.1,13/10/2011,98.903760
1,258,Uttar Pradesh,Ghaziabad,Industrial,35.0,39.0,24/05/2011,126.244780
2,638,Maharashtra,Chandrapur,Industrial,15.0,10.0,21/07/2011,105.816791
3,537,Assam,Sivasagar,Residential,6.0,14.5,08/05/2013,49.449315
4,376,Rajasthan,Jodhpur,Residential,4.0,19.0,23/07/2004,86.035515
...,...,...,...,...,...,...,...,...
30999,457,Karnataka,Bangalore,Residential,14.0,28.0,19/05/2014,
31000,565,Himachal Pradesh,Nalagarh,Residential,1.0,10.0,17/01/2011,
31001,574,Maharashtra,Sangli,Residential,14.0,36.0,21/03/2011,131.366458
31002,,Maharashtra,Pune,Others,18.0,49.3,16/11/2007,94.813520


In [31]:
# Make predictions using the pipeline
pipeline.predict(new_samples)

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

### Pipeline classes

We can write the following class as a package in a `.py` file. Don't forget to import relevant libraries/classes/methods in the file. After that, the class can be imported to be used later. For example, if the class named `PollutionModel` is written in `pollution_model.py` file, it can be imported by `from pollution_model import PollutionModel`.

If you update your pipeline class in a `.py` file, and you are working on a Jupyter notebook, you can include the following code in the notebook so that your notebook doesn't need to be restarted to see the update.

`%load_ext autoreload`<br />
`%autoreload 2`

In [32]:
# Relevant libraries/classes/methods have been imported in this notebook.
# If you write this class in a .py file, don't forget to import them.

class PollutionModel:

    def __init__(self, path):
        self.X_train, self.X_test, self.y_train, self.y_test = self.load_data(path)
        self.model = self.create_pipeline()
        self.train()

    def load_data(self, path):
        """Import data, drop duplicates, separate X and y, and do train-test split."""
        data = pd.read_csv(path).drop_duplicates()
        X, y = data.drop(columns="high_level"), data["high_level"]
        return train_test_split(X, y, train_size=0.8)

    def create_pipeline(self):
        # Create a transformer to impute and scale numerical features 
        num_transformer = Pipeline([
            ('num_imputer', SimpleImputer()),
            ('num_scaler', StandardScaler())
        ])
        # Create a transformer to impute and encode categorical features
        cat_transformer = Pipeline([
            ('cat_imputer', SimpleImputer(strategy='most_frequent')),
            ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        # Apply "num_transformer" to numerical features and
        # apply "cat_transformer" to the "state" and "type" features in parallel
        preprocessor = ColumnTransformer([
            ('num_transformer', num_transformer, self.X_train.select_dtypes(include=np.number).columns),
            ('cat_transformer', cat_transformer, ['state', 'type'])
        ])
        # Combine preprocessor and linear model in pipeline
        final_pipe = Pipeline([
            ('preprocessing', preprocessor),
            ('classifier', LogisticRegression(max_iter=2500))])

        return final_pipe

    def train(self):
        self.model.fit(self.X_train, self.y_train)

    def score(self):
        return self.model.score(self.X_test, self.y_test)

    def predict(self, path):
        new_samples = pd.read_csv(path)
        return self.model.predict(new_samples)

In [33]:
# If you write the class in a .py file, don't forget to import it to be used in this notebook.

model = PollutionModel('../data/India_air_quality_light.csv')

In [34]:
model.score()

0.7260495156081809

In [35]:
model.predict('../data/new_samples.csv')

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)