# **MLProcess - Air Quality**
---
**3 - Data Preprocessing**

In [1]:
# Import the required libraries.
import yaml
import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import (
    OneHotEncoder,
    LabelEncoder,
    StandardScaler
)

from imblearn.under_sampling import RandomUnderSampler as RUS
from imblearn.over_sampling import (
    RandomOverSampler as ROS,
    SMOTE
)

## **1 - Configuration File**
---

In [2]:
# Function to load the configuration parameters.
def load_config(path_config):
    """
    Load the configuration file (config.yaml).

    Parameters:
    ----------
    path_config : str
        Configuration file location.

    Returns:
    -------
    params : dict
        The configuration parameters.

    Raises:
    ------
    RuntimeError
        If no file exists at `path_config`. The underlying
        FileNotFoundError is attached as the explicit cause.
    """

    # Try to load config.yaml file.
    try:
        with open(path_config, 'r') as file:
            params = yaml.safe_load(file)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback reports the missing file as the
        # direct cause instead of as an incidental error during handling.
        raise RuntimeError(f"Configuration file not found in {path_config}") from err

    return params

In [3]:
# Function to update configuration parameter.
def update_config(key, value, params, path_config):
    """
    Update one configuration parameter, persist it, and reload the file.

    Parameters:
    ----------
    key : str
        The key to be updated.

    value : any type that YAML can represent safely
        The updated value (plain dict, list, str, int, float, bool, None).

    params : dict
        Loaded configuration parameters.

    path_config : str
        Configuration file location.

    Returns:
    -------
    config : dict
        Updated configuration parameters, re-read from `path_config`.
    """

    # Shallow copy: the caller's dict does not gain the new key.
    params = params.copy()

    # Update the configuration parameters.
    params[key] = value

    # Serialize BEFORE opening the file. Opening with 'w' truncates it, so a
    # serialization error after that point would leave an empty config file.
    # safe_dump is the counterpart of the safe_load used by load_config: it
    # refuses objects that safe_load would not be able to read back.
    serialized = yaml.safe_dump(params)

    with open(path_config, 'w') as file:
        file.write(serialized)

    print(f"Params Updated! \nKey: {key} \nValue: {value}\n")

    # Reload the updated configuration parameters.
    config = load_config(path_config)

    return config

In [4]:
# Load the configuration file.
PATH_CONFIG = "../config/config.yaml"
config = load_config(PATH_CONFIG)

In [5]:
# Check the configuration parameters.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'path_data_joined': '../data/interim/joined_dataset.pkl',
 'path_data_raw': '../data/raw/',
 'path_data_test': ['../data/interim/X_test.pkl',
  '../data/interim/y_test.pkl'],
 'path_data_train': ['../data/interim/X_train.pkl',
  '../data/interim/y_train.pkl'],
 'path_data_valid': ['../data/interim/X_valid.pkl',
  '../data/interim/y_valid.pkl'],
 'path_data_validated': '../data/interim/validated_data.pkl',
 'range_co': [-1, 47],
 'range_no2': [-1, 65],
 'range_o3': [-1, 151],
 'range_pm10': [-1, 179],
 'range_pm25': [-1, 174],
 'range_so2': [-1, 82],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jaka

## **2 - Load Data**
---

In [6]:
# Function to load the data.
def load_data(config):
    """
    Load the train, valid, and test sets and join each X with its y.

    Parameters:
    ----------
    config : dict
        The loaded configuration file. The keys `path_data_train`,
        `path_data_valid`, and `path_data_test` each hold a two-item
        sequence: the features path followed by the label path.

    Returns:
    -------
    data_train, data_valid, data_test : pd.DataFrame
        The loaded data, features and label side by side.
    """

    # Deserialize every (X, y) pair in train -> valid -> test order.
    train_paths = config["path_data_train"]
    X_train, y_train = joblib.load(train_paths[0]), joblib.load(train_paths[1])

    valid_paths = config["path_data_valid"]
    X_valid, y_valid = joblib.load(valid_paths[0]), joblib.load(valid_paths[1])

    test_paths = config["path_data_test"]
    X_test, y_test = joblib.load(test_paths[0]), joblib.load(test_paths[1])

    # Column-wise concat aligns X and y on their index.
    data_train = pd.concat([X_train, y_train], axis=1)
    data_valid = pd.concat([X_valid, y_valid], axis=1)
    data_test = pd.concat([X_test, y_test], axis=1)

    # Report the share of each split relative to all rows.
    num_all_data = len(data_train) + len(data_valid) + len(data_test)
    print(f"Data train proportion : {len(X_train) / num_all_data}")
    print(f"Data valid proportion : {len(X_valid) / num_all_data}")
    print(f"Data test proportion  : {len(X_test) / num_all_data}")

    return data_train, data_valid, data_test

In [7]:
# Load the data.
data_train, data_valid, data_test = load_data(config)

Data train proportion : 0.7997793712079426
Data valid proportion : 0.09983452840595698
Data test proportion  : 0.10038610038610038


In [8]:
# Sanity check the train data.
data_train.head()

Unnamed: 0,stasiun,pm10,pm25,so2,co,o3,no2,category
320,DKI1 (Bunderan HI),65,95,27,14,17,32,SEDANG
1685,DKI1 (Bunderan HI),55,72,30,13,22,40,SEDANG
1093,DKI1 (Bunderan HI),50,64,31,19,18,35,SEDANG
31,DKI2 (Kelapa Gading),33,50,39,7,47,15,BAIK
1051,DKI5 (Kebon Jeruk) Jakarta Barat,67,118,29,16,28,31,TIDAK SEHAT


## **3 - Join Categories**
---
`SEDANG` + `TIDAK SEHAT` => `TIDAK BAIK`

In [9]:
# Function to join categories.
def join_categories(data, config):
    """
    Join categories SEDANG & TIDAK SEHAT -> TIDAK BAIK.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data.

    config : dict
        The loaded configuration file. `config["label"]` names the
        label column to rewrite.

    Returns:
    -------
    data : pd.DataFrame
        A copy of the data with the categories joined.

    Raises:
    ------
    RuntimeError
        If the label column is not present in `data`.
    """

    label = config["label"]

    # Check if label found in data.
    if label not in data.columns:
        raise RuntimeError("Label is not detected in the dataset.")

    # Ensure raw data immutable.
    data = data.copy()

    # Rewrite the column named in the config (not a hard-coded "category"),
    # so the check above and the column actually modified always agree.
    data[label] = data[label].replace(
        {"SEDANG": "TIDAK BAIK", "TIDAK SEHAT": "TIDAK BAIK"}
    )

    return data

In [10]:
# Update the configuration parameter.
config = update_config(
    key = "label_categories_new",
    value = ["BAIK", "TIDAK BAIK"],
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: label_categories_new 
Value: ['BAIK', 'TIDAK BAIK']



Join categories in train data.

In [11]:
data_train["category"].value_counts()

category
SEDANG         1044
TIDAK SEHAT     255
BAIK            151
Name: count, dtype: int64

In [12]:
data_train = join_categories(data_train, config)

In [13]:
data_train["category"].value_counts()

category
TIDAK BAIK    1299
BAIK           151
Name: count, dtype: int64

Join categories in valid data.

In [14]:
data_valid["category"].value_counts()

category
SEDANG         130
TIDAK SEHAT     32
BAIK            19
Name: count, dtype: int64

In [15]:
data_valid = join_categories(data_valid, config)

In [16]:
data_valid["category"].value_counts()

category
TIDAK BAIK    162
BAIK           19
Name: count, dtype: int64

Join categories in test data.

In [17]:
data_test["category"].value_counts()

category
SEDANG         131
TIDAK SEHAT     32
BAIK            19
Name: count, dtype: int64

In [18]:
data_test = join_categories(data_test, config)

In [19]:
data_test["category"].value_counts()

category
TIDAK BAIK    163
BAIK           19
Name: count, dtype: int64

## **4 - Handling Missing Value**
---

- Create the `nan_replace()` function.

In [20]:
# Function to replace -1 with NaN.
def nan_replace(data):
    """
    Return a copy of the data where every -1 is turned into NaN.

    The replacement is applied to the whole frame, not to a subset
    of columns.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data.

    Returns:
    -------
    data : pd.DataFrame
        The processed data; the input frame is left untouched.
    """

    # Copy first so the caller's frame stays as it was, then swap the marker.
    return data.copy().replace(to_replace=-1, value=np.nan)

In [21]:
# Replace the -1.
data_train = nan_replace(data_train)
data_train.isnull().sum()

stasiun      0
pm10        39
pm25        66
so2         76
co          13
o3          39
no2         14
category     0
dtype: int64

In [22]:
# Replace the -1.
data_valid = nan_replace(data_valid)
data_valid.isnull().sum()

stasiun     0
pm10        7
pm25        7
so2         9
co          3
o3          4
no2         3
category    0
dtype: int64

In [23]:
# Replace the -1.
data_test = nan_replace(data_test)
data_test.isnull().sum()

stasiun      0
pm10         7
pm25        13
so2         12
co           0
o3           5
no2          2
category     0
dtype: int64

### **4.1. `pm10` Imputation**

- Create the `calculate_class_mean()` and `impute_class_mean()` function.

In [24]:
# Function to calculate class mean for pm10 and pm25.
def calculate_class_mean(data, column):
    """
    Calculate the per-class mean of a column (used for pm10 and pm25).

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data. Must contain a `category` column holding the
        values "BAIK" and "TIDAK BAIK".

    column : str
        The column name.

    Returns:
    -------
    impute_baik, impute_tidak_baik : int
        The mean of `column` for each class, truncated to an integer
        with int() (not rounded).

    Raises:
    ------
    ValueError
        If a class has no observed (non-NaN) value in `column`, so no
        mean can be computed for it.
    """

    class_means = {}

    for class_name in ("BAIK", "TIDAK BAIK"):
        # Series.mean() skips NaN, so missing values do not bias the result.
        class_mean = data.loc[data["category"] == class_name, column].mean()

        # An empty or all-NaN class yields NaN; int(NaN) would fail with an
        # unhelpful message, so report which class and column are affected.
        if pd.isna(class_mean):
            raise ValueError(
                f"Cannot compute the mean of {column} for class {class_name}: "
                "no observed values."
            )

        class_means[class_name] = int(class_mean)

    impute_baik = class_means["BAIK"]
    impute_tidak_baik = class_means["TIDAK BAIK"]

    print(f"Mean {column} class BAIK       : {impute_baik}")
    print(f"Mean {column} class TIDAK BAIK : {impute_tidak_baik}\n")    

    return impute_baik, impute_tidak_baik

In [25]:
# Function to impute missing values in column pm10 and pm25 using class mean.
def impute_class_mean(data, column, impute_baik, impute_tidak_baik):
    """
    Impute the missing values of a column (pm10 / pm25) per class.

    NOTE(review): the fill value is chosen from each row's `category`,
    i.e. from the label. Confirm how rows without a known label are
    meant to be imputed at prediction time.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data. Must contain a `category` column holding the
        values "BAIK" and "TIDAK BAIK".

    column : str
        The column name.

    impute_baik : int or float
        The fill value for class BAIK.

    impute_tidak_baik : int or float
        The fill value for class TIDAK BAIK.
    
    Returns:
    -------
    data : pd.DataFrame
        A copy of the data with the missing values of `column` filled.
    """

    # Ensure raw data immutable.
    data = data.copy()

    # Boolean condition for each class.
    is_baik = data["category"] == "BAIK"
    is_tidak_baik = data["category"] == "TIDAK BAIK"

    # Boolean condition for missing values.
    is_missing = data[column].isnull()

    # Rows to fill for each class.
    fill_baik = is_baik & is_missing
    fill_tidak_baik = is_tidak_baik & is_missing

    print(f"Num of missing value in {column} class BAIK before imputation       : {int(fill_baik.sum())}")
    print(f"Num of missing value in {column} class TIDAK BAIK before imputation : {int(fill_tidak_baik.sum())}\n")

    # Assign through the boolean masks (positional), not through index labels:
    # with repeated index labels a label-based assignment would also overwrite
    # observed values that share a label with a missing row.
    data.loc[fill_baik, column] = impute_baik
    data.loc[fill_tidak_baik, column] = impute_tidak_baik

    print(f"Num of missing value in {column} class BAIK after imputation        : {data.loc[is_baik, column].isnull().sum()}")
    print(f"Num of missing value in {column} class TIDAK BAIK after imputation  : {data.loc[is_tidak_baik, column].isnull().sum()}\n")

    return data

Impute the `pm10` column in train, valid, and test set.

In [26]:
# Calculate the class mean.
# The statistics come from data_train only.
# NOTE: `column`, `impute_baik`, and `impute_tidak_baik` are notebook globals;
# the valid/test imputation cells that follow read them, so this cell must run first.
column = "pm10"

impute_baik, impute_tidak_baik = calculate_class_mean(
    data = data_train,
    column = column
)

# Update the configuration parameter.
# Stored under the key "impute_pm10" as a {class name: fill value} mapping.
config = update_config(
    key = f"impute_{column}",
    value = {"BAIK": impute_baik,
             "TIDAK BAIK": impute_tidak_baik},
    params = config,
    path_config = PATH_CONFIG
)

# Impute the missing values.
data_train = impute_class_mean(
    data = data_train,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Mean pm10 class BAIK       : 28
Mean pm10 class TIDAK BAIK : 55

Params Updated! 
Key: impute_pm10 
Value: {'BAIK': 28, 'TIDAK BAIK': 55}

Num of missing value in pm10 class BAIK before imputation       : 9
Num of missing value in pm10 class TIDAK BAIK before imputation : 30

Num of missing value in pm10 class BAIK after imputation        : 0
Num of missing value in pm10 class TIDAK BAIK after imputation  : 0



In [27]:
# Impute the missing values.
data_valid = impute_class_mean(
    data = data_valid,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing value in pm10 class BAIK before imputation       : 0
Num of missing value in pm10 class TIDAK BAIK before imputation : 7

Num of missing value in pm10 class BAIK after imputation        : 0
Num of missing value in pm10 class TIDAK BAIK after imputation  : 0



In [28]:
# Impute the missing values.
data_test = impute_class_mean(
    data = data_test,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing value in pm10 class BAIK before imputation       : 1
Num of missing value in pm10 class TIDAK BAIK before imputation : 6

Num of missing value in pm10 class BAIK after imputation        : 0
Num of missing value in pm10 class TIDAK BAIK after imputation  : 0



### **4.2. `pm25` Imputation**

Impute the `pm25` column in train, valid, and test set.

In [29]:
# Calculate the class mean.
# The statistics come from data_train only.
# NOTE: this rebinds the notebook globals `column`, `impute_baik`, and
# `impute_tidak_baik`; the valid/test cells that follow read the pm25 values.
column = "pm25"

impute_baik, impute_tidak_baik = calculate_class_mean(
    data = data_train,
    column = column
)

# Update the configuration parameter.
# Stored under the key "impute_pm25" as a {class name: fill value} mapping.
config = update_config(
    key = f"impute_{column}",
    value = {"BAIK": impute_baik,
             "TIDAK BAIK": impute_tidak_baik},
    params = config,
    path_config = PATH_CONFIG
)

# Impute the missing values.
data_train = impute_class_mean(
    data = data_train,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Mean pm25 class BAIK       : 39
Mean pm25 class TIDAK BAIK : 82

Params Updated! 
Key: impute_pm25 
Value: {'BAIK': 39, 'TIDAK BAIK': 82}

Num of missing value in pm25 class BAIK before imputation       : 35
Num of missing value in pm25 class TIDAK BAIK before imputation : 31

Num of missing value in pm25 class BAIK after imputation        : 0
Num of missing value in pm25 class TIDAK BAIK after imputation  : 0



In [30]:
# Impute the missing values.
data_valid = impute_class_mean(
    data = data_valid,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing value in pm25 class BAIK before imputation       : 6
Num of missing value in pm25 class TIDAK BAIK before imputation : 1

Num of missing value in pm25 class BAIK after imputation        : 0
Num of missing value in pm25 class TIDAK BAIK after imputation  : 0



In [31]:
# Impute the missing values.
data_test = impute_class_mean(
    data = data_test,
    column = column,
    impute_baik = impute_baik,
    impute_tidak_baik = impute_tidak_baik
)

Num of missing value in pm25 class BAIK before imputation       : 9
Num of missing value in pm25 class TIDAK BAIK before imputation : 4

Num of missing value in pm25 class BAIK after imputation        : 0
Num of missing value in pm25 class TIDAK BAIK after imputation  : 0



### **4.3. `so2`, `co`, `o3`, and `no2` Imputation**

- Create the `calculate_impute_values()` and `impute_missing_values()` function.

In [32]:
# Function to calculate impute values for the other columns.
def calculate_impute_values(data):
    """
    Derive one fill value per column for so2, co, o3, and no2.
        - so2 uses the column mean.
        - co, o3, and no2 use the column median.
    Every statistic is truncated to an integer with int().

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data.

    Returns:
    -------
    impute_values : dict
        Column name -> integer fill value.
    """

    # Work on a private copy of the input.
    snapshot = data.copy()

    # Which pandas reduction to apply to each column, in output order.
    statistic_per_column = {
        "so2": "mean",
        "co": "median",
        "o3": "median",
        "no2": "median"
    }

    impute_values = {
        name: int(getattr(snapshot[name], statistic)())
        for name, statistic in statistic_per_column.items()
    }

    return impute_values

In [33]:
# Function to impute missing values for the other columns.
def impute_missing_values(data, impute_values):
    """
    Fill missing values column by column (so2, co, o3, and no2).

    Only the columns that appear as keys in `impute_values` are filled;
    the null counts of all columns are printed before and after.

    Parameters:
    ----------
    data : pd.DataFrame
        The loaded data.

    impute_values : dict
        The calculated impute values, keyed by column name.

    Returns:
    -------
    data : pd.DataFrame
        The imputed data; the input frame is left untouched.
    """

    # Private copy so the caller's frame is not modified.
    working = data.copy()

    nulls_before = working.isnull().sum()
    print(f"Num of missing values before imputation :\n{nulls_before}\n")
    
    # A dict passed to fillna maps each column to its own fill value.
    imputed = working.fillna(value = impute_values)

    nulls_after = imputed.isnull().sum()
    print(f"Num of missing values after imputation  :\n{nulls_after}")

    return imputed

Impute the other columns in the train, valid, and test sets.

In [34]:
# Calculate the impute values.
# Computed from data_train only.
impute_values = calculate_impute_values(data_train)

# Update the configuration parameter.
# `cols` and `param_keys` are parallel lists: position i of one pairs with
# position i of the other.
cols = ['so2', 'co', 'o3', 'no2']
param_keys = ['impute_so2', 'impute_co', 'impute_o3', 'impute_no2']

# Each update_config call rewrites the YAML file and reloads it, so this loop
# performs one write/reload cycle per column.
for col, param_key in zip(cols, param_keys):
    config = update_config(
        key = param_key,
        value = impute_values[col],
        params = config,
        path_config = PATH_CONFIG
    )

# Impute the missing values.
data_train = impute_missing_values(
    data = data_train,
    impute_values = impute_values
)

Params Updated! 
Key: impute_so2 
Value: 35

Params Updated! 
Key: impute_co 
Value: 11

Params Updated! 
Key: impute_o3 
Value: 29

Params Updated! 
Key: impute_no2 
Value: 18

Num of missing values before imputation :
stasiun      0
pm10         0
pm25         0
so2         76
co          13
o3          39
no2         14
category     0
dtype: int64

Num of missing values after imputation  :
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64


In [35]:
# Impute the missing values.
data_valid = impute_missing_values(
    data = data_valid,
    impute_values = impute_values
)

Num of missing values before imputation :
stasiun     0
pm10        0
pm25        0
so2         9
co          3
o3          4
no2         3
category    0
dtype: int64

Num of missing values after imputation  :
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64


In [36]:
# Impute the missing values.
data_test = impute_missing_values(
    data = data_test,
    impute_values = impute_values
)

Num of missing values before imputation :
stasiun      0
pm10         0
pm25         0
so2         12
co           0
o3           5
no2          2
category     0
dtype: int64

Num of missing values after imputation  :
stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64


## **5 - Encoding `stasiun`**
---

- Create the `fit_ohe_encoder()` and `transform_ohe_encoder()` function.

In [37]:
# Function to fit the encoder.
def fit_ohe_encoder(data, path_ohe):
    """
    Fit a one-hot encoder on a single categorical variable and save it.
    
    Parameters:
    ----------
    data : pd.Series or list-like
        One-dimensional categorical input data.

    path_ohe : str
        Where the fitted encoder is serialized with joblib.
    
    Returns:
    -------
    ohe_encoder : sklearn.preprocessing.OneHotEncoder
        Fitted OHE encoder object (dense integer output).
    """

    # Sklearn expects a 2D matrix: turn the 1D input into a single column.
    stations_2d = np.array(data).reshape(-1, 1)

    # Build and fit in one step; fit() hands back the encoder itself.
    ohe_encoder = OneHotEncoder(sparse_output=False, dtype=int).fit(stations_2d)
    
    # Persist the fitted encoder so it can be reloaded later.
    joblib.dump(ohe_encoder, path_ohe)
    
    return ohe_encoder

In [38]:
# Function to encode the data.
def transform_ohe_encoder(data, encoder):
    """
    Replace the `stasiun` column with its one-hot encoded columns.

    The new columns are placed in front of the remaining columns and are
    named after the encoder categories; all column names end up as str.
    
    Parameters:
    ----------
    data : pd.DataFrame
        Data to be transformed. Must contain a `stasiun` column.
        
    encoder : sklearn.preprocessing.OneHotEncoder
        The fitted encoder.
        
    Returns:
    -------
    data : pd.DataFrame
        The concatenated data with OHE columns; the input is not modified.
    """

    # Private copy so the caller's frame is not modified.
    frame = data.copy()

    # Sklearn expects a 2D matrix: turn the column into shape (n_rows, 1).
    column = "stasiun"
    stations_2d = np.array(frame[column]).reshape(-1, 1)

    # Encode and wrap the result, reusing the row index of the input so the
    # column-wise concat below lines up row by row.
    ohe_frame = pd.DataFrame(
        encoder.transform(stations_2d).tolist(),
        columns = list(encoder.categories_[0]),
        index = frame.index
    )

    # OHE columns first, then the original ones minus the raw column.
    combined = pd.concat([ohe_frame, frame], axis = 1).drop(columns = column)

    # Normalize every column name to a plain string.
    combined.columns = list(map(str, combined.columns.tolist()))
    
    return combined

In [39]:
# Fit the ohe_encoder.
# The encoder is fit on the station list stored in the config
# (config["range_stasiun"]), not on the stations observed in data_train.
PATH_ENCODER_STASIUN = "../models/ohe_stasiun.pkl"

ohe_stasiun = fit_ohe_encoder(
    data = config["range_stasiun"],
    path_ohe = PATH_ENCODER_STASIUN
)

# Update the configuration parameter.
# Record where the fitted encoder was saved.
config = update_config(
    key = "path_fitted_encoder_stasiun",
    value = PATH_ENCODER_STASIUN,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_fitted_encoder_stasiun 
Value: ../models/ohe_stasiun.pkl



Encode the `stasiun` column in train, valid, and test set.

In [40]:
# Encode the categorical column.
data_train = transform_ohe_encoder(
    data = data_train,
    encoder = ohe_stasiun
)

data_train.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
320,1,0,0,0,0,65.0,95.0,27.0,14.0,17.0,32.0,TIDAK BAIK
1685,1,0,0,0,0,55.0,72.0,30.0,13.0,22.0,40.0,TIDAK BAIK
1093,1,0,0,0,0,50.0,64.0,31.0,19.0,18.0,35.0,TIDAK BAIK
31,0,1,0,0,0,33.0,50.0,39.0,7.0,47.0,15.0,BAIK
1051,0,0,0,0,1,67.0,118.0,29.0,16.0,28.0,31.0,TIDAK BAIK


In [41]:
# Encode the categorical column.
data_valid = transform_ohe_encoder(
    data = data_valid,
    encoder = ohe_stasiun
)

data_valid.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
193,0,1,0,0,0,73.0,108.0,14.0,43.0,104.0,19.0,TIDAK BAIK
683,0,0,1,0,0,70.0,102.0,49.0,5.0,31.0,14.0,TIDAK BAIK
695,0,0,1,0,0,73.0,28.0,47.0,17.0,26.0,18.0,TIDAK BAIK
927,1,0,0,0,0,55.0,68.0,23.0,18.0,21.0,40.0,TIDAK BAIK
544,0,0,0,1,0,42.0,81.0,38.0,9.0,27.0,20.0,TIDAK BAIK


In [42]:
# Encode the categorical column.
data_test = transform_ohe_encoder(
    data = data_test,
    encoder = ohe_stasiun
)

data_test.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
724,0,0,0,1,0,71.0,140.0,40.0,12,24.0,26.0,TIDAK BAIK
1511,0,0,0,0,1,62.0,107.0,40.0,11,34.0,17.0,TIDAK BAIK
888,0,0,0,0,1,51.0,71.0,30.0,12,12.0,29.0,TIDAK BAIK
682,0,0,1,0,0,84.0,123.0,49.0,7,29.0,16.0,TIDAK BAIK
1021,0,0,0,1,0,63.0,109.0,43.0,20,27.0,19.0,TIDAK BAIK


## **6 - Scaling Data**
---

- Create the `fit_scaler()` and `transform_scaler()` function.

In [43]:
# Function to fit the scaler.
def fit_scaler(data, path_scaler, config):
    """
    Fit a StandardScaler on the feature columns and save it.
    
    Parameters:
    ----------
    data : pd.DataFrame
        Input data: the label column plus the features
        (all features must be in numeric form).

    path_scaler : str
        Where the fitted scaler is serialized with joblib.

    config : dict
        The loaded configuration file. `config["label"]` names the
        column excluded from fitting.
        
    Returns:
    -------
    scaler : sklearn.preprocessing.StandardScaler
        Fitted scaler object (storing the mean & std of all features)
    """

    # StandardScaler() only accepts numeric data, so the label column is left
    # out. Only the features are needed here; the label itself is never used.
    X = data.drop(columns = config["label"])

    # Create scaler object.
    scaler = StandardScaler()

    # Fit the scaler.
    scaler.fit(X)

    # Serialize the scaler.    
    joblib.dump(scaler, path_scaler)
    
    return scaler

In [44]:
# Function to scale the data.
def transform_scaler(data, scaler, config):
    """
    Scale the feature columns with a fitted scaler; keep the label as is.
    
    Parameters:
    ----------
    data : pd.DataFrame
        Input data: the label column plus the features
        (all features must be in numeric form).
        
    scaler : sklearn.preprocessing.StandardScaler
        Fitted scaler object (storing the mean & std of all features)

    config : dict
        The loaded configuration file. `config["label"]` names the
        column that is not scaled.
        
    Returns:
    -------
    data : pd.DataFrame
        The scaled features followed by the untouched label column.
    """

    # Private copy so the caller's frame is not modified.
    frame = data.copy()

    # Separate the label from the features: the scaler only takes numbers.
    target_name = config["label"]
    target = frame[target_name]
    features = frame.drop(columns = target_name)

    # The scaler returns a bare array; restore column names and row index.
    features_scaled = pd.DataFrame(
        scaler.transform(features),
        columns = features.columns,
        index = features.index
    )

    # Put the label back as the last column.
    return pd.concat([features_scaled, target], axis = 1)

In [45]:
# Fit the scaler.
# Fit on data_train only.
# NOTE: data_train already holds the one-hot station columns at this point,
# so those 0/1 columns are standardized together with the pollutant columns.
PATH_SCALER = "../models/scaler.pkl"

scaler = fit_scaler(
    data = data_train,
    path_scaler = PATH_SCALER,
    config = config
)

# Update the configuration parameter.
# Record where the fitted scaler was saved.
config = update_config(
    key = "path_fitted_scaler",
    value = PATH_SCALER,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_fitted_scaler 
Value: ../models/scaler.pkl



Scale the data in train, valid, and test set.

In [46]:
# Scale the data.
data_train = transform_scaler(
    data = data_train,
    scaler = scaler,
    config = config
)

print(f"Data shape : {data_train.shape}")
data_train.head()

Data shape : (1450, 12)


Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
320,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.854414,0.693588,-0.674131,0.460037,-1.053053,1.399446,TIDAK BAIK
1685,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.177419,-0.240702,-0.430186,0.260559,-0.703466,2.285848,TIDAK BAIK
1093,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,-0.161078,-0.565673,-0.34887,1.457426,-0.983136,1.731847,TIDAK BAIK
31,-0.50968,1.9787,-0.498922,-0.487033,-0.498922,-1.311969,-1.134372,0.301651,-0.936307,1.04447,-0.484159,BAIK
1051,-0.50968,-0.505382,-0.498922,-0.487033,2.004321,0.989813,1.627879,-0.511501,0.858993,-0.283961,1.288646,TIDAK BAIK


In [47]:
# Scale the data.
data_valid = transform_scaler(
    data = data_valid,
    scaler = scaler,
    config = config
)

print(f"Data shape : {data_valid.shape}")
data_valid.head()

Data shape : (181, 12)


Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
193,-0.50968,1.9787,-0.498922,-0.487033,-0.498922,1.39601,1.221666,-1.731229,6.244893,5.029765,-0.040958,TIDAK BAIK
683,-0.50968,-0.505382,2.004321,-0.487033,-0.498922,1.192912,0.977938,1.114804,-1.335263,-0.074209,-0.594959,TIDAK BAIK
695,-0.50968,-0.505382,2.004321,-0.487033,-0.498922,1.39601,-2.028042,0.952173,1.05847,-0.423796,-0.151758,TIDAK BAIK
927,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.177419,-0.403188,-0.999392,1.257948,-0.773383,2.285848,TIDAK BAIK
544,-0.50968,-0.505382,-0.498922,2.053248,-0.498922,-0.702674,0.12489,0.220336,-0.537352,-0.353879,0.069842,TIDAK BAIK


In [48]:
# Scale the data.
data_test = transform_scaler(
    data = data_test,
    scaler = scaler,
    config = config
)

print(f"Data shape : {data_test.shape}")
data_test.head()

Data shape : (182, 12)


Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
724,-0.50968,-0.505382,-0.498922,2.053248,-0.498922,1.260611,2.521549,0.382967,0.061081,-0.563631,0.734644,TIDAK BAIK
1511,-0.50968,-0.505382,-0.498922,-0.487033,2.004321,0.651316,1.181045,0.382967,-0.138396,0.135543,-0.262558,TIDAK BAIK
888,-0.50968,-0.505382,-0.498922,-0.487033,2.004321,-0.093379,-0.281324,-0.430186,0.061081,-1.402641,1.067045,TIDAK BAIK
682,-0.50968,-0.505382,2.004321,-0.487033,-0.498922,2.140705,1.830986,1.114804,-0.936307,-0.214044,-0.373359,TIDAK BAIK
1021,-0.50968,-0.505382,-0.498922,2.053248,-0.498922,0.719015,1.262287,0.626912,1.656904,-0.353879,-0.040958,TIDAK BAIK


## **7 - Label Encoding**
---

- Create the `fit_label_encoder()` and `transform_label_encoder()` function.

In [49]:
# Function to fit label encoder.
def fit_label_encoder(label, path_le):
    """
    Fit a label encoder on the categorical label and save it.

    Parameters:
    ----------
    label : pd.Series
        Categorical label.

    path_le : str
        Where the fitted label encoder is serialized with joblib.

    Returns:
    -------
    label_encoder : sklearn.preprocessing.LabelEncoder
        Fitted label encoder object.
    """

    # Build and fit in one step; fit() hands back the encoder itself.
    label_encoder = LabelEncoder().fit(label)

    # Persist the fitted encoder so it can be reloaded later.
    joblib.dump(label_encoder, path_le)

    return label_encoder

In [50]:
# Function to encode the label.
def transform_label_encoder(label, encoder):
    """
    Encode the categorical label with a fitted label encoder.

    Parameters:
    ----------
    label : pd.Series
        Categorical label.

    encoder : sklearn.preprocessing.LabelEncoder
        Fitted label encoder object.

    Returns:
    -------
    encoded_label : pd.Series
        The encoded label, carrying the index of the input and always
        named "category".
    """

    # Private copy so the caller's Series is not modified.
    source = label.copy()

    # The encoder returns a bare array of codes.
    codes = encoder.transform(source)

    # Re-attach the original index so the result aligns with its rows.
    return pd.Series(codes, index = source.index, name = "category")

In [51]:
# Fit the label_encoder.
# Fit on the label column of data_train only.
PATH_ENCODER_LABEL = "../models/label_encoder.pkl"

label = config["label"]
y_train = data_train[label]

label_encoder = fit_label_encoder(
    label = y_train,
    path_le = PATH_ENCODER_LABEL
)

# Update the configuration parameter.
# Record where the fitted label encoder was saved.
config = update_config(
    key = "path_fitted_encoder_label",
    value = PATH_ENCODER_LABEL,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_fitted_encoder_label 
Value: ../models/label_encoder.pkl



In [52]:
# Encode the label.
# NOTE: each assignment overwrites the "category" column in place, so this
# cell is not idempotent: running it a second time would pass the
# already-encoded values to an encoder that was fit on the string labels.
data_train["category"] = transform_label_encoder(
    label = data_train["category"],
    encoder = label_encoder
)

data_valid["category"] = transform_label_encoder(
    label = data_valid["category"],
    encoder = label_encoder
)

data_test["category"] = transform_label_encoder(
    label = data_test["category"],
    encoder = label_encoder
)

In [53]:
# Sanity check the train data.
data_train

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
320,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.854414,0.693588,-0.674131,0.460037,-1.053053,1.399446,1
1685,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.177419,-0.240702,-0.430186,0.260559,-0.703466,2.285848,1
1093,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,-0.161078,-0.565673,-0.348870,1.457426,-0.983136,1.731847,1
31,-0.509680,1.978700,-0.498922,-0.487033,-0.498922,-1.311969,-1.134372,0.301651,-0.936307,1.044470,-0.484159,0
1051,-0.509680,-0.505382,-0.498922,-0.487033,2.004321,0.989813,1.627879,-0.511501,0.858993,-0.283961,1.288646,1
...,...,...,...,...,...,...,...,...,...,...,...,...
182,-0.509680,1.978700,-0.498922,-0.487033,-0.498922,-1.718166,-1.093751,-1.649914,-0.936307,3.141994,-0.484159,1
1374,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,-0.025679,-0.362567,-1.243338,0.858993,-1.122971,1.177845,1
1495,-0.509680,-0.505382,-0.498922,-0.487033,2.004321,-1.176570,-0.525052,-0.999392,-0.138396,-0.703466,-0.262558,1
255,-0.509680,-0.505382,-0.498922,2.053248,-0.498922,2.479202,0.165511,-0.674131,7.042804,2.023315,-0.927360,1


## **8 - Label Balancing**
---

In [54]:
# Check the label distribution.
data_train["category"].value_counts(normalize=True)

category
1    0.895862
0    0.104138
Name: proportion, dtype: float64

- The label is highly imbalanced (about 90% vs. 10%), so we need to balance it.
- Note that: class `1 (TIDAK BAIK)` and class `0 (BAIK)`

In [55]:
# Function to balance the label.
def label_balancer(data, balancer_type, config, random_state=123):
    """
    Resample the data so that the label classes are balanced.

    Parameters:
    ----------
    data : pd.DataFrame
        The scaled data, label column included.

    balancer_type : str
        The balancer type, case-insensitive:
            - "rus" : RandomUnderSampler
            - "ros" : RandomOverSampler
            - "sm"  : SMOTE

    config : dict
        The loaded configuration file. `config["label"]` names the
        label column.

    random_state : int, default = 123
        For reproducibility.

    Returns:
    -------
    X_balanced : pd.DataFrame
        The features with balanced label.

    y_balanced : pd.Series
        The label with balanced label.

    Raises:
    ------
    RuntimeError
        If `balancer_type` is not one of the supported types.
    """

    # Private copy so the caller's frame is not modified.
    frame = data.copy()

    # Split input-output; imblearn follows the sklearn (X, y) convention.
    target_name = config["label"]
    y = frame[target_name]
    X = frame.drop(columns = target_name)

    # Guard clause: reject unknown balancer names up front.
    kind = str(balancer_type).lower()
    if kind not in ("rus", "ros", "sm"):
        raise RuntimeError("The balancer type is invalid.")

    # Look up the sampler class for the validated name.
    sampler_class = {"rus": RUS, "ros": ROS, "sm": SMOTE}[kind]
    balancer = sampler_class(random_state = random_state)

    # Fit resample the balancer.
    X_balanced, y_balanced = balancer.fit_resample(X, y)

    print(f"The label are balanced using {balancer.__class__.__name__}")

    # Check the label distribution.
    print(y_balanced.value_counts())

    return X_balanced, y_balanced

In [56]:
# Balance the train label with random undersampling.
X_rus, y_rus = label_balancer(data=data_train, balancer_type="rus", config=config)

The label are balanced using RandomUnderSampler
category
0    151
1    151
Name: count, dtype: int64


In [57]:
# Balance the train label with random oversampling.
X_ros, y_ros = label_balancer(data=data_train, balancer_type="ros", config=config)

The label are balanced using RandomOverSampler
category
1    1299
0    1299
Name: count, dtype: int64


In [58]:
# Balance the train label with SMOTE.
X_sm, y_sm = label_balancer(data=data_train, balancer_type="sm", config=config)

The label are balanced using SMOTE
category
1    1299
0    1299
Name: count, dtype: int64


## **9 - Data Serialization**
---

In [59]:
# Define data configuration.
# The train features and labels are keyed by the balancing strategy that
# produced them, so all three resampled variants are serialized together.
X_train = {
    "Undersampling": X_rus,
    "Oversampling": X_ros,
    "SMOTE": X_sm
}

y_train = {
    "Undersampling": y_rus,
    "Oversampling": y_ros,
    "SMOTE": y_sm
}

label = config["label"]

# Split input-output for the valid set.
y_valid = data_valid[label]
X_valid = data_valid.drop(columns = label)

# Split input-output for the test set.
# The label must come from data_test (not data_valid) so that y_test stays
# aligned row-by-row with X_test.
y_test = data_test[label]
X_test = data_test.drop(columns = label)

# Group every artifact by split; the inner keys become the file name stems
# when the data is serialized.
data_configuration = {
    "train": {
        "X_train": X_train,
        "y_train": y_train
    },
    "valid": {
        "X_valid": X_valid,
        "y_valid": y_valid
    },
    "test": {
        "X_test": X_test,
        "y_test": y_test
    }
}

In [60]:
# Serialize the preprocessed data.
PATH_PROCESSED_DATA = "../data/processed/"

for split_name, artifacts in data_configuration.items():
    # Collect the file path of every artifact dumped for this split.
    dumped_paths = []

    for artifact_name, artifact in artifacts.items():
        target_path = f"{PATH_PROCESSED_DATA}{artifact_name}_clean.pkl"
        dumped_paths.append(target_path)

        # Serialize the preprocessed data.
        joblib.dump(artifact, target_path)

    # Register the dumped paths under "path_clean_<split>" in the config file.
    config = update_config(
        key = f"path_clean_{split_name}",
        value = dumped_paths,
        params = config,
        path_config = PATH_CONFIG
    )

Params Updated! 
Key: path_clean_train 
Value: ['../data/processed/X_train_clean.pkl', '../data/processed/y_train_clean.pkl']

Params Updated! 
Key: path_clean_valid 
Value: ['../data/processed/X_valid_clean.pkl', '../data/processed/y_valid_clean.pkl']

Params Updated! 
Key: path_clean_test 
Value: ['../data/processed/X_test_clean.pkl', '../data/processed/y_test_clean.pkl']



In [61]:
# Check the configuration parameters after registering the serialized data paths.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'impute_co': 11,
 'impute_no2': 18,
 'impute_o3': 29,
 'impute_pm10': {'BAIK': 28, 'TIDAK BAIK': 55},
 'impute_pm25': {'BAIK': 39, 'TIDAK BAIK': 82},
 'impute_so2': 35,
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'path_clean_test': ['../data/processed/X_test_clean.pkl',
  '../data/processed/y_test_clean.pkl'],
 'path_clean_train': ['../data/processed/X_train_clean.pkl',
  '../data/processed/y_train_clean.pkl'],
 'path_clean_valid': ['../data/processed/X_valid_clean.pkl',
  '../data/processed/y_valid_clean.pkl'],
 'path_data_joined': '../data/interim/joined_dataset.pkl',
 'path_data_raw': '../data/raw/',
 'path_data_test': ['../data/interim/X_test.pkl',
  '../data/interim/y_test.pkl