# **MLProcess - Air Quality**
---
**3 - Data Preprocessing**

In [1]:
# Import the required libraries.
import yaml
import joblib
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

## **1 - Configuration File**
---

In [2]:
# Function to load configuration parameter.
def load_config(path_config):
    """
    Load the configuration file (config.yaml).

    Parameters:
    ----------
    path_config : str
        Configuration file location.

    Returns:
    -------
    params : dict
        The configuration parameters.

    Raises:
    ------
    RuntimeError
        If no file exists at `path_config`. The underlying
        FileNotFoundError is attached as the cause.
    """

    # Try to load config.yaml file.
    try:
        with open(path_config, 'r') as file:
            params = yaml.safe_load(file)
    except FileNotFoundError as err:
        # Chain the original error so the traceback keeps the OS-level detail.
        raise RuntimeError(f"Configuration file not found in {path_config}") from err

    return params

In [3]:
# Function to update configuration parameter.
def update_config(key, value, params, path_config):
    """
    Set a single configuration entry, write the result to disk, and reload it.

    Parameters:
    ----------
    key : str
        Name of the entry to set.

    value : any type supported in Python
        New value stored under `key`.

    params : dict
        Currently loaded configuration parameters (left unmodified).

    path_config : str
        Configuration file location; the file is overwritten.

    Returns:
    -------
    config : dict
        Configuration parameters as re-read from `path_config`.
    """

    # Shallow copy, so the caller's top-level dict is not changed.
    updated = params.copy()
    updated[key] = value

    # Overwrite the YAML file with the updated parameters.
    with open(path_config, 'w') as stream:
        yaml.dump(updated, stream)

    print(f"Params Updated! \nKey: {key} \nValue: {value}\n")

    # Return what is actually on disk rather than the in-memory copy.
    return load_config(path_config)

In [4]:
# Load the configuration file.
PATH_CONFIG = "../config/config.yaml"
config = load_config(PATH_CONFIG)

In [5]:
# Check the configuration parameters.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'impute_co': 11.0,
 'impute_no2': 18.0,
 'impute_o3': 29.0,
 'impute_pm10': {'BAIK': 28.359154929577464, 'TIDAK BAIK': 55.17809298660362},
 'impute_pm25': {'BAIK': 39.827586206896555, 'TIDAK BAIK': 82.38564668769716},
 'impute_so2': 35.306404657933044,
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'path_joined_data': '../data/interim/joined_dataset.pkl',
 'path_ohe_stasiun': '../models/ohe_stasiun.pkl',
 'path_raw_data': '../data/raw/',
 'path_test_set': ['../data/interim/X_test.pkl', '../data/interim/y_test.pkl'],
 'path_train_set': ['../data/interim/X_train.pkl',
  '../data/interim/y_train.pkl'],
 'path_valid_set': ['../data/interim/X_valid.pkl',
  '../data/interim/y_valid.pkl'],
 

## **2 - Load Data**
---

In [6]:
# Function to load data.
def load_data(config):
    """
    Read the train, valid and test sets and join each X with its y.

    Parameters:
    ----------
    config : dict
        The loaded configuration file. The keys "path_train_set",
        "path_valid_set" and "path_test_set" are read; each holds the
        X path at position 0 and the y path at position 1.

    Returns:
    -------
    data_train, data_valid, data_test : pd.DataFrame
        Each set with its features and label concatenated column-wise.
    """

    features = {}
    frames = {}

    for split in ("train", "valid", "test"):
        paths = config[f"path_{split}_set"]

        # Position 0 is X, position 1 is y.
        features[split] = joblib.load(paths[0])
        target = joblib.load(paths[1])

        # Concatenate the X and y of this set.
        frames[split] = pd.concat([features[split], target], axis=1)

    # Validate the proportion.
    num_all_data = sum(int(frame.shape[0]) for frame in frames.values())
    print(f"Data train proportion : {len(features['train']) / num_all_data}")
    print(f"Data valid proportion : {len(features['valid']) / num_all_data}")
    print(f"Data test proportion  : {len(features['test']) / num_all_data}")

    return frames["train"], frames["valid"], frames["test"]

In [7]:
# Load the data.
data_train, data_valid, data_test = load_data(config)

Data train proportion : 0.7997793712079426
Data valid proportion : 0.09983452840595698
Data test proportion  : 0.10038610038610038


## **3 - Join Categories**
---
`SEDANG` + `TIDAK SEHAT` => `TIDAK BAIK`

In [8]:
# Function to join categories.
def join_categories(set_data, config):
    """
    Merge the `SEDANG` and `TIDAK SEHAT` labels into a single `TIDAK BAIK` class.

    Parameters:
    ----------
    set_data : pd.DataFrame
        Data containing the label column named by config["label"].

    config : dict
        Loaded configuration parameters; only the "label" key is read.

    Returns:
    -------
    set_data : pd.DataFrame
        A copy of the input with the label values merged. The input
        itself is not modified.

    Raises:
    ------
    RuntimeError
        If the label column is not present in `set_data`.
    """
    # Use the configured label name for both the check and the update, so the
    # function does not silently depend on the column being called "category".
    label = config["label"]

    if label not in set_data.columns:
        raise RuntimeError("Label is not detected in the dataset.")

    set_data = set_data.copy()

    # Both non-BAIK classes map directly onto TIDAK BAIK.
    set_data[label] = set_data[label].replace(
        {"SEDANG": "TIDAK BAIK", "TIDAK SEHAT": "TIDAK BAIK"}
    )

    return set_data

In [9]:
# Update the configuration parameter.
config = update_config(
    key = "label_categories_new",
    value = ["BAIK", "TIDAK BAIK"],
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: label_categories_new 
Value: ['BAIK', 'TIDAK BAIK']



Join categories in train data.

In [10]:
data_train["category"].value_counts()

category
SEDANG         1044
TIDAK SEHAT     255
BAIK            151
Name: count, dtype: int64

In [11]:
data_train = join_categories(data_train, config)

In [12]:
data_train["category"].value_counts()

category
TIDAK BAIK    1299
BAIK           151
Name: count, dtype: int64

Join categories in valid data.

In [13]:
data_valid["category"].value_counts()

category
SEDANG         130
TIDAK SEHAT     32
BAIK            19
Name: count, dtype: int64

In [14]:
data_valid = join_categories(data_valid, config)

In [15]:
data_valid["category"].value_counts()

category
TIDAK BAIK    162
BAIK           19
Name: count, dtype: int64

Join categories in test data.

In [16]:
data_test["category"].value_counts()

category
SEDANG         131
TIDAK SEHAT     32
BAIK            19
Name: count, dtype: int64

In [17]:
data_test = join_categories(data_test, config)

In [18]:
data_test["category"].value_counts()

category
TIDAK BAIK    163
BAIK           19
Name: count, dtype: int64

## **4 - Handling Missing Value**
---
1. Convert all `-1` values into `NaN`.
2. Impute the missing values for all columns.

In [19]:
# Function to replace -1 with NaN.
def nan_replace(set_data):
    """
    Return a copy of the data in which every -1 value is replaced by NaN.

    Parameters:
    ----------
    set_data : pd.DataFrame
        Data in which -1 marks a missing value.

    Returns:
    -------
    pd.DataFrame
        New object with -1 replaced by NaN; the input is not modified.
    """
    # replace() without inplace returns a new object, so the input is untouched.
    return set_data.replace(to_replace=-1, value=np.nan)

In [20]:
data_train = nan_replace(data_train)
data_train.isnull().sum()

stasiun      0
pm10        39
pm25        66
so2         76
co          13
o3          39
no2         14
category     0
dtype: int64

In [21]:
data_valid = nan_replace(data_valid)
data_valid.isnull().sum()

stasiun     0
pm10        7
pm25        7
so2         9
co          3
o3          4
no2         3
category    0
dtype: int64

In [22]:
data_test = nan_replace(data_test)
data_test.isnull().sum()

stasiun      0
pm10         7
pm25        13
so2         12
co           0
o3           5
no2          2
category     0
dtype: int64

### **4.1. pm10 Imputation**

In [23]:
# Class-wise pm10 means, computed on the train set only.
impute_baik = float(data_train.loc[data_train['category'] == 'BAIK', 'pm10'].mean())
impute_tidak_baik = float(data_train.loc[data_train['category'] == 'TIDAK BAIK', 'pm10'].mean())

print(f"Mean pm10 class BAIK       : {impute_baik}")
print(f"Mean pm10 class TIDAK BAIK : {impute_tidak_baik}")

Mean pm10 class BAIK       : 28.359154929577464
Mean pm10 class TIDAK BAIK : 55.17809298660362


In [24]:
# Update the configuration parameter.
config = update_config(
    key = "impute_pm10",
    value = {"BAIK": impute_baik,
             "TIDAK BAIK": impute_tidak_baik},
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: impute_pm10 
Value: {'BAIK': 28.359154929577464, 'TIDAK BAIK': 55.17809298660362}



In [25]:
# Impute the missing pm10 values of the train set with the class means.
# The boolean mask is passed straight to .loc, so only the matching rows are
# written even if the index contains duplicate labels.
# The fill values are read from the config entry written in the previous cell
# rather than from `impute_baik` / `impute_tidak_baik`: those names are rebound
# for pm25 further down and would be stale if cells are run out of order.
data_train.loc[
    (data_train["category"] == "BAIK") & data_train["pm10"].isnull(), "pm10"
] = config["impute_pm10"]["BAIK"]

data_train.loc[
    (data_train["category"] == "TIDAK BAIK") & data_train["pm10"].isnull(), "pm10"
] = config["impute_pm10"]["TIDAK BAIK"]

print(f"Num of missing value pm10 class BAIK       : {data_train[data_train['category'] == 'BAIK']['pm10'].isnull().sum()}")
print(f"Num of missing value pm10 class TIDAK BAIK : {data_train[data_train['category'] == 'TIDAK BAIK']['pm10'].isnull().sum()}")

Num of missing value pm10 class BAIK       : 0
Num of missing value pm10 class TIDAK BAIK : 0


In [26]:
# Impute the missing pm10 values of the valid set with the train-set class means.
# The boolean mask is passed straight to .loc, so only the matching rows are
# written even if the index contains duplicate labels. The fill values come
# from the config because `impute_baik` / `impute_tidak_baik` are rebound for
# pm25 later and would be stale if cells are run out of order.
# NOTE(review): the fill value is chosen from the true label, which is not
# available at prediction time — confirm this is intended for evaluation data.
data_valid.loc[
    (data_valid["category"] == "BAIK") & data_valid["pm10"].isnull(), "pm10"
] = config["impute_pm10"]["BAIK"]

data_valid.loc[
    (data_valid["category"] == "TIDAK BAIK") & data_valid["pm10"].isnull(), "pm10"
] = config["impute_pm10"]["TIDAK BAIK"]

print(f"Num of missing value pm10 class BAIK       : {data_valid[data_valid['category'] == 'BAIK']['pm10'].isnull().sum()}")
print(f"Num of missing value pm10 class TIDAK BAIK : {data_valid[data_valid['category'] == 'TIDAK BAIK']['pm10'].isnull().sum()}")

Num of missing value pm10 class BAIK       : 0
Num of missing value pm10 class TIDAK BAIK : 0


In [27]:
# Impute the missing pm10 values of the test set with the train-set class means.
# The boolean mask is passed straight to .loc, so only the matching rows are
# written even if the index contains duplicate labels. The fill values come
# from the config because `impute_baik` / `impute_tidak_baik` are rebound for
# pm25 later and would be stale if cells are run out of order.
# NOTE(review): the fill value is chosen from the true label, which is not
# available at prediction time — confirm this is intended for evaluation data.
data_test.loc[
    (data_test["category"] == "BAIK") & data_test["pm10"].isnull(), "pm10"
] = config["impute_pm10"]["BAIK"]

data_test.loc[
    (data_test["category"] == "TIDAK BAIK") & data_test["pm10"].isnull(), "pm10"
] = config["impute_pm10"]["TIDAK BAIK"]

print(f"Num of missing value pm10 class BAIK       : {data_test[data_test['category'] == 'BAIK']['pm10'].isnull().sum()}")
print(f"Num of missing value pm10 class TIDAK BAIK : {data_test[data_test['category'] == 'TIDAK BAIK']['pm10'].isnull().sum()}")

Num of missing value pm10 class BAIK       : 0
Num of missing value pm10 class TIDAK BAIK : 0


### **4.2. pm25 Imputation**

In [28]:
# Class-wise pm25 means, computed on the train set only.
impute_baik = float(data_train.loc[data_train['category'] == 'BAIK', 'pm25'].mean())
impute_tidak_baik = float(data_train.loc[data_train['category'] == 'TIDAK BAIK', 'pm25'].mean())

print(f"Mean pm25 class BAIK       : {impute_baik}")
print(f"Mean pm25 class TIDAK BAIK : {impute_tidak_baik}")

Mean pm25 class BAIK       : 39.827586206896555
Mean pm25 class TIDAK BAIK : 82.38564668769716


In [29]:
# Update the configuration parameter.
config = update_config(
    key = "impute_pm25",
    value = {"BAIK": impute_baik,
             "TIDAK BAIK": impute_tidak_baik},
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: impute_pm25 
Value: {'BAIK': 39.827586206896555, 'TIDAK BAIK': 82.38564668769716}



In [30]:
# Impute the missing pm25 values of the train set with the class means.
# The boolean mask is passed straight to .loc, so only the matching rows are
# written even if the index contains duplicate labels.
# The fill values are read from the config entry written in the previous cell,
# so the cell does not depend on the reused `impute_baik` / `impute_tidak_baik`
# names still holding the pm25 means.
data_train.loc[
    (data_train["category"] == "BAIK") & data_train["pm25"].isnull(), "pm25"
] = config["impute_pm25"]["BAIK"]

data_train.loc[
    (data_train["category"] == "TIDAK BAIK") & data_train["pm25"].isnull(), "pm25"
] = config["impute_pm25"]["TIDAK BAIK"]

print(f"Num of missing value pm25 class BAIK       : {data_train[data_train['category'] == 'BAIK']['pm25'].isnull().sum()}")
print(f"Num of missing value pm25 class TIDAK BAIK : {data_train[data_train['category'] == 'TIDAK BAIK']['pm25'].isnull().sum()}")

Num of missing value pm25 class BAIK       : 0
Num of missing value pm25 class TIDAK BAIK : 0


In [31]:
# Impute the missing pm25 values of the valid set with the train-set class means.
# The boolean mask is passed straight to .loc, so only the matching rows are
# written even if the index contains duplicate labels. The fill values come
# from the config, so the cell does not depend on the reused
# `impute_baik` / `impute_tidak_baik` names still holding the pm25 means.
# NOTE(review): the fill value is chosen from the true label, which is not
# available at prediction time — confirm this is intended for evaluation data.
data_valid.loc[
    (data_valid["category"] == "BAIK") & data_valid["pm25"].isnull(), "pm25"
] = config["impute_pm25"]["BAIK"]

data_valid.loc[
    (data_valid["category"] == "TIDAK BAIK") & data_valid["pm25"].isnull(), "pm25"
] = config["impute_pm25"]["TIDAK BAIK"]

print(f"Num of missing value pm25 class BAIK       : {data_valid[data_valid['category'] == 'BAIK']['pm25'].isnull().sum()}")
print(f"Num of missing value pm25 class TIDAK BAIK : {data_valid[data_valid['category'] == 'TIDAK BAIK']['pm25'].isnull().sum()}")

Num of missing value pm25 class BAIK       : 0
Num of missing value pm25 class TIDAK BAIK : 0


In [32]:
# Impute the missing pm25 values of the test set with the train-set class means.
# The boolean mask is passed straight to .loc, so only the matching rows are
# written even if the index contains duplicate labels. The fill values come
# from the config, so the cell does not depend on the reused
# `impute_baik` / `impute_tidak_baik` names still holding the pm25 means.
# NOTE(review): the fill value is chosen from the true label, which is not
# available at prediction time — confirm this is intended for evaluation data.
data_test.loc[
    (data_test["category"] == "BAIK") & data_test["pm25"].isnull(), "pm25"
] = config["impute_pm25"]["BAIK"]

data_test.loc[
    (data_test["category"] == "TIDAK BAIK") & data_test["pm25"].isnull(), "pm25"
] = config["impute_pm25"]["TIDAK BAIK"]

print(f"Num of missing value pm25 class BAIK       : {data_test[data_test['category'] == 'BAIK']['pm25'].isnull().sum()}")
print(f"Num of missing value pm25 class TIDAK BAIK : {data_test[data_test['category'] == 'TIDAK BAIK']['pm25'].isnull().sum()}")

Num of missing value pm25 class BAIK       : 0
Num of missing value pm25 class TIDAK BAIK : 0


### **4.3. so2, co, o3, no2 Imputation**

In [33]:
# Train-set statistics used as fill values:
#   so2          -> mean
#   co, o3, no2  -> median
impute_so2 = float(data_train["so2"].mean())
impute_co, impute_o3, impute_no2 = (
    float(data_train[column].median()) for column in ("co", "o3", "no2")
)

# Keyed by column name so the dict can be passed directly to DataFrame.fillna.
impute_values = dict(so2=impute_so2, co=impute_co, o3=impute_o3, no2=impute_no2)

impute_values

{'so2': 35.306404657933044, 'co': 11.0, 'o3': 29.0, 'no2': 18.0}

In [34]:
# Store each fill value in the config file under its "impute_<column>" key.
cols = ['so2', 'co', 'o3', 'no2']
param_keys = ['impute_' + name for name in cols]

for col, param_key in zip(cols, param_keys):
    # update_config rewrites the file and returns the reloaded parameters.
    config = update_config(param_key, impute_values[col], config, PATH_CONFIG)

Params Updated! 
Key: impute_so2 
Value: 35.306404657933044

Params Updated! 
Key: impute_co 
Value: 11.0

Params Updated! 
Key: impute_o3 
Value: 29.0

Params Updated! 
Key: impute_no2 
Value: 18.0



In [35]:
# Impute the missing values.
data_train = data_train.fillna(value = impute_values)
data_train.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64

In [36]:
data_valid = data_valid.fillna(value = impute_values)
data_valid.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64

In [37]:
data_test = data_test.fillna(value = impute_values)
data_test.isnull().sum()

stasiun     0
pm10        0
pm25        0
so2         0
co          0
o3          0
no2         0
category    0
dtype: int64

In [38]:
data_train["stasiun"].value_counts()

stasiun
DKI1 (Bunderan HI)                  299
DKI2 (Kelapa Gading)                295
DKI5 (Kebon Jeruk) Jakarta Barat    289
DKI3 (Jagakarsa)                    289
DKI4 (Lubang Buaya)                 278
Name: count, dtype: int64

## **5 - Encoding Stasiun**
---

In [39]:
def fit_ohe_encoder(X_stasiun, ohe_path='../models/ohe_stasiun.pkl'):
    """
    Fit the OHE encoder and serialize it.

    Parameters:
    ----------
    X_stasiun : array-like
        Categorical values to learn the categories from. The input is
        converted to a NumPy array and reshaped into a single column.

    ohe_path : str, default '../models/ohe_stasiun.pkl'
        Where the fitted encoder is written with joblib.

    Returns:
    -------
    ohe_encoder : sklearn object
        Fitted OHE encoder object
    """

    ohe_encoder = OneHotEncoder(sparse_output=False)

    # The encoder expects 2-D input: one row per value, one column.
    ohe_encoder.fit(np.array(X_stasiun).reshape(-1, 1))

    # Serialize the ohe_encoder object.
    joblib.dump(ohe_encoder, ohe_path)

    return ohe_encoder

def transform_ohe_encoder(set_data, transformed_column, ohe_path):
    """
    Transform the categorical input data using OHE encoder

    Parameters:
    ----------
    set_data : pd.DataFrame
        Data to be transformed.

    transformed_column : str
        Name of the column to encode. It is replaced by the OHE columns.

    ohe_path : str
        The path to the ohe_encoder object.

    Returns:
    -------
    set_data : pd.DataFrame
        A new frame with the OHE columns first, followed by the remaining
        original columns. All column names are strings and the original
        index is kept.
    """

    # Load the ohe_encoder.
    ohe_encoder = joblib.load(ohe_path)

    # Transform the data (the encoder expects a single 2-D column).
    X_stasiun = np.array(set_data[transformed_column]).reshape(-1, 1)
    stasiun_features = ohe_encoder.transform(X_stasiun)

    # Convert to dataframe, reusing the original index so the concat below
    # aligns row by row.
    stasiun_features = pd.DataFrame(
        stasiun_features,
        columns = list(ohe_encoder.categories_[0]),
        index = set_data.index
    )

    # Concatenate the new features with the original set data.
    set_data = pd.concat([stasiun_features, set_data], axis=1)

    # Drop the column that was encoded (not a hard-coded name).
    set_data = set_data.drop(columns=transformed_column)

    # Convert columns type to string.
    set_data.columns = [str(col_name) for col_name in set_data.columns.tolist()]

    # Return the feature engineered data.
    return set_data

In [40]:
ohe_stasiun = fit_ohe_encoder(config['range_stasiun'])

In [41]:
# Update the configuration parameter.
PATH_OHE_STASIUN = "../models/ohe_stasiun.pkl"

config = update_config(
    key = "path_ohe_stasiun",
    value = PATH_OHE_STASIUN,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_ohe_stasiun 
Value: ../models/ohe_stasiun.pkl



Encode the `stasiun` column in train, valid, and test set.

In [42]:
data_train = transform_ohe_encoder(
    set_data = data_train,
    transformed_column = "stasiun",
    ohe_path = config['path_ohe_stasiun']
)

data_train.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
320,1.0,0.0,0.0,0.0,0.0,65.0,95.0,27.0,14.0,17.0,32.0,TIDAK BAIK
1685,1.0,0.0,0.0,0.0,0.0,55.0,72.0,30.0,13.0,22.0,40.0,TIDAK BAIK
1093,1.0,0.0,0.0,0.0,0.0,50.0,64.0,31.0,19.0,18.0,35.0,TIDAK BAIK
31,0.0,1.0,0.0,0.0,0.0,33.0,50.0,39.0,7.0,47.0,15.0,BAIK
1051,0.0,0.0,0.0,0.0,1.0,67.0,118.0,29.0,16.0,28.0,31.0,TIDAK BAIK


In [43]:
data_valid = transform_ohe_encoder(
    set_data = data_valid,
    transformed_column = "stasiun",
    ohe_path = config['path_ohe_stasiun']
)

data_valid.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
193,0.0,1.0,0.0,0.0,0.0,73.0,108.0,14.0,43.0,104.0,19.0,TIDAK BAIK
683,0.0,0.0,1.0,0.0,0.0,70.0,102.0,49.0,5.0,31.0,14.0,TIDAK BAIK
695,0.0,0.0,1.0,0.0,0.0,73.0,28.0,47.0,17.0,26.0,18.0,TIDAK BAIK
927,1.0,0.0,0.0,0.0,0.0,55.0,68.0,23.0,18.0,21.0,40.0,TIDAK BAIK
544,0.0,0.0,0.0,1.0,0.0,42.0,81.0,38.0,9.0,27.0,20.0,TIDAK BAIK


In [44]:
data_test = transform_ohe_encoder(
    set_data = data_test,
    transformed_column = "stasiun",
    ohe_path = config['path_ohe_stasiun']
)

data_test.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
724,0.0,0.0,0.0,1.0,0.0,71.0,140.0,40.0,12,24.0,26.0,TIDAK BAIK
1511,0.0,0.0,0.0,0.0,1.0,62.0,107.0,40.0,11,34.0,17.0,TIDAK BAIK
888,0.0,0.0,0.0,0.0,1.0,51.0,71.0,30.0,12,12.0,29.0,TIDAK BAIK
682,0.0,0.0,1.0,0.0,0.0,84.0,123.0,49.0,7,29.0,16.0,TIDAK BAIK
1021,0.0,0.0,0.0,1.0,0.0,63.0,109.0,43.0,20,27.0,19.0,TIDAK BAIK


## **6 - Scaling Data**
---

In [45]:
def fit_scaler(X_concat, scaler_path='../models/scaler.pkl'):
    """
    Fit the scaler and serialize it.

    Parameters:
    ----------
    X_concat : pd.DataFrame
        Input data (all features must be in numeric form)

    scaler_path : str, default '../models/scaler.pkl'
        Where the fitted scaler is written with joblib.

    Returns:
    -------
    scaler : sklearn object
        Fitted scaler object (storing the mean & std of all features)
    """

    scaler = StandardScaler()
    scaler.fit(X_concat)

    # Serialize the scaler object.
    joblib.dump(scaler, scaler_path)

    return scaler

def transform_scaler(X_concat, scaler):
    """
    Apply an already-fitted scaler and return the result as a DataFrame.

    Parameters:
    ----------
    X_concat : pd.DataFrame
        Input data (all features must be in numeric form)

    scaler : sklearn object
        Fitted scaler; only its `transform` method is used.

    Returns:
    -------
    X_concat_scaled : pd.DataFrame
        Scaled data with the same columns and index as the input.
    """

    # Work on a copy so the caller's frame is never handed to the scaler.
    features = X_concat.copy()

    scaled_values = scaler.transform(features)

    # Re-attach the column names and index of the input.
    return pd.DataFrame(scaled_values, columns=features.columns, index=features.index)

In [46]:
# Set the label aside; the scaler is fitted on the feature columns only.
target = data_train['category']

# Fit the scaler
scaler = fit_scaler(data_train.drop(columns='category'))

# Scale the features and re-attach the label in one step.
data_train = pd.concat(
    [
        transform_scaler(X_concat=data_train.drop(columns='category'), scaler=scaler),
        target,
    ],
    axis=1,
)

data_train.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
320,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.854186,0.693283,-0.675448,0.460037,-1.053053,1.399446,TIDAK BAIK
1685,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.177055,-0.242143,-0.431498,0.260559,-0.703466,2.285848,TIDAK BAIK
1093,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,-0.161511,-0.567508,-0.350182,1.457426,-0.983136,1.731847,TIDAK BAIK
31,-0.50968,1.9787,-0.498922,-0.487033,-0.498922,-1.312634,-1.136897,0.30035,-0.936307,1.04447,-0.484159,BAIK
1051,-0.50968,-0.505382,-0.498922,-0.487033,2.004321,0.989612,1.628708,-0.512815,0.858993,-0.283961,1.288646,TIDAK BAIK


In [47]:
# Set the label aside so only the feature columns are scaled.
target = data_valid['category']

# Scale with the scaler fitted on the train set, then re-attach the label.
data_valid = pd.concat(
    [
        transform_scaler(X_concat=data_valid.drop(columns='category'), scaler=scaler),
        target,
    ],
    axis=1,
)

data_valid.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
193,-0.50968,1.9787,-0.498922,-0.487033,-0.498922,1.395891,1.222002,-1.732562,6.244893,5.029765,-0.040958,TIDAK BAIK
683,-0.50968,-0.505382,2.004321,-0.487033,-0.498922,1.192752,0.977978,1.113515,-1.335263,-0.074209,-0.594959,TIDAK BAIK
695,-0.50968,-0.505382,2.004321,-0.487033,-0.498922,1.395891,-2.031652,0.950882,1.05847,-0.423796,-0.151758,TIDAK BAIK
927,1.962014,-0.505382,-0.498922,-0.487033,-0.498922,0.177055,-0.404825,-1.000713,1.257948,-0.773383,2.285848,TIDAK BAIK
544,-0.50968,-0.505382,-0.498922,2.053248,-0.498922,-0.703216,0.123894,0.219034,-0.537352,-0.353879,0.069842,TIDAK BAIK


In [48]:
# Set the label aside so only the feature columns are scaled.
target = data_test['category']

# Scale with the scaler fitted on the train set, then re-attach the label.
data_test = pd.concat(
    [
        transform_scaler(X_concat=data_test.drop(columns='category'), scaler=scaler),
        target,
    ],
    axis=1,
)

data_test.head()

Unnamed: 0,DKI1 (Bunderan HI),DKI2 (Kelapa Gading),DKI3 (Jagakarsa),DKI4 (Lubang Buaya),DKI5 (Kebon Jeruk) Jakarta Barat,pm10,pm25,so2,co,o3,no2,category
724,-0.50968,-0.505382,-0.498922,2.053248,-0.498922,1.260465,2.523463,0.381667,0.061081,-0.563631,0.734644,TIDAK BAIK
1511,-0.50968,-0.505382,-0.498922,-0.487033,2.004321,0.651047,1.181331,0.381667,-0.138396,0.135543,-0.262558,TIDAK BAIK
888,-0.50968,-0.505382,-0.498922,-0.487033,2.004321,-0.093798,-0.282813,-0.431498,0.061081,-1.402641,1.067045,TIDAK BAIK
682,-0.50968,-0.505382,2.004321,-0.487033,-0.498922,2.140736,1.832062,1.113515,-0.936307,-0.214044,-0.373359,TIDAK BAIK
1021,-0.50968,-0.505382,-0.498922,2.053248,-0.498922,0.71876,1.262672,0.625616,1.656904,-0.353879,-0.040958,TIDAK BAIK


## **7 - Balancing Label**
---

In [49]:
data_train["category"].value_counts()

category
TIDAK BAIK    1299
BAIK           151
Name: count, dtype: int64

In [50]:
# Undersampling: resample the train set with a fixed seed.
rus = RandomUnderSampler(random_state=123)

X_rus, y_rus = rus.fit_resample(
    data_train.drop(columns='category'),
    data_train['category'],
)

# Features and label side by side.
data_train_rus = pd.concat([X_rus, y_rus], axis=1)

In [51]:
# Oversampling: resample the train set with a fixed seed.
ros = RandomOverSampler(random_state=123)

X_ros, y_ros = ros.fit_resample(
    data_train.drop(columns='category'),
    data_train['category'],
)

# Features and label side by side.
data_train_ros = pd.concat([X_ros, y_ros], axis=1)

In [52]:
# SMOTE: resample the train set with a fixed seed.
smote = SMOTE(random_state=123)

X_sm, y_sm = smote.fit_resample(
    data_train.drop(columns='category'),
    data_train['category'],
)

# Features and label side by side.
data_train_sm = pd.concat([X_sm, y_sm], axis=1)

## **8 - Label Encoding**
---

In [53]:
def fit_le_encoder(y_categori, le_path='../models/le_encoder.pkl'):
    """
    Fit the LE encoder and serialize it.

    Parameters:
    ----------
    y_categori : array-like
        Label values to learn the classes from.

    le_path : str, default '../models/le_encoder.pkl'
        Where the fitted encoder is written with joblib.

    Returns:
    -------
    le_encoder : sklearn object
        Fitted LE encoder object
    """

    le_encoder = LabelEncoder()
    le_encoder.fit(y_categori)

    # Serialize the le_encoder object.
    joblib.dump(le_encoder, le_path)

    return le_encoder

def transform_le_encoder(y_categori, le_encoder):
    """
    Encode categorical labels with an already-fitted LE encoder.

    Parameters:
    ----------
    y_categori : pd.Series
        Categorical input label

    le_encoder : sklearn object
        Fitted LE encoder; only its `transform` method is used.

    Returns:
    -------
    y_categori_encoded : pd.Series
        Encoded labels. The Series is built from the encoder output alone,
        so it carries a fresh default index, not the index of the input.
    """

    # Encode a copy so the caller's object is never handed to the encoder.
    labels = y_categori.copy()
    encoded_values = le_encoder.transform(labels)

    return pd.Series(encoded_values)

In [54]:
# Fit the label encoder.
le_category = fit_le_encoder(config["label_categories_new"])

In [55]:
# Transform RUS data.
y_rus = transform_le_encoder(y_rus, le_category)

In [56]:
# Transform ROS data.
y_ros = transform_le_encoder(y_ros, le_category)

In [57]:
# Transform SMOTE data.
y_sm = transform_le_encoder(y_sm, le_category)

In [58]:
# Encode the valid-set label.
category_encoded = transform_le_encoder(data_valid["category"], le_category)

# Assign as a plain list: the encoded Series has a fresh default index, so
# assigning it directly would align on index labels instead of row position.
data_valid["category"] = category_encoded.tolist()

In [59]:
# Encode the test-set label.
category_encoded = transform_le_encoder(data_test["category"], le_category)

# Assign as a plain list: the encoded Series has a fresh default index, so
# assigning it directly would align on index labels instead of row position.
data_test["category"] = category_encoded.tolist()

## **9 - Data Serialization**
---

In [61]:
# Persist every processed split with joblib, one file per X / y.
# NOTE(review): assumes ../data/processed/ already exists — confirm.

# undersampling
joblib.dump(X_rus, "../data/processed/X_rus.pkl")
joblib.dump(y_rus, "../data/processed/y_rus.pkl")

# oversampling
joblib.dump(X_ros, "../data/processed/X_ros.pkl")
joblib.dump(y_ros, "../data/processed/y_ros.pkl")

# smote
joblib.dump(X_sm, "../data/processed/X_sm.pkl")
joblib.dump(y_sm, "../data/processed/y_sm.pkl")

# data valid (features and encoded label are stored separately)
joblib.dump(data_valid.drop(columns='category'), "../data/processed/X_valid_feng.pkl")
joblib.dump(data_valid['category'], "../data/processed/y_valid_feng.pkl")

# data test (features and encoded label are stored separately)
joblib.dump(data_test.drop(columns='category'), "../data/processed/X_test_feng.pkl")
# Last expression of the cell: its return value is what the notebook displays.
joblib.dump(data_test['category'], "../data/processed/y_test_feng.pkl")

['../data/processed/y_test_feng.pkl']