# **MLProcess - Air Quality**
---
**1 - Data Pipeline**

In [1]:
# Import the required libraries.
import os

# Need to be installed.
import yaml
import joblib
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split

## **1 - Configuration File**
---

- Create two functions: `load_config()` and `update_config()`.

In [2]:
# Function to load configuration parameter.
def load_config(path_config):
    """
    Load the configuration file (config.yaml).

    Parameters:
    ----------
    path_config : str
        Configuration file location.

    Returns:
    -------
    params : dict
        The configuration parameters, as parsed by `yaml.safe_load`.

    Raises:
    ------
    RuntimeError
        If no file exists at `path_config`. The underlying
        `FileNotFoundError` is attached as the cause.
    """

    # Try to load config.yaml file.
    try:
        with open(path_config, 'r') as file:
            params = yaml.safe_load(file)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback still shows the original OS error.
        raise RuntimeError(f"Configuration file not found in {path_config}") from err

    return params

In [3]:
# Function to update configuration parameter.
def update_config(key, value, params, path_config):
    """
    Update the configuration parameter values.

    Sets `key` to `value` on a copy of `params`, writes the result to
    `path_config` as YAML, then reloads the file with `load_config()`.

    Parameters:
    ----------
    key : str
        The key to be updated.

    value : any type supported in Python
        The updated value.

    params : dict
        Loaded configuration parameters.

    path_config : str
        Configuration file location.

    Returns:
    -------
    config : dict
        Updated configuration parameters, as reloaded from `path_config`.
    """

    # Shallow copy: the caller's dict does not gain or replace `key`.
    # Nested values (e.g. lists) are still shared with the caller.
    updated = params.copy()

    # Update the configuration parameters.
    updated[key] = value

    # Serialize BEFORE opening the file. Opening with 'w' truncates it, so a
    # value that cannot be dumped would otherwise leave an empty/partial config.
    serialized = yaml.dump(updated)

    with open(path_config, 'w') as file:
        file.write(serialized)

    print(f"Params Updated! \nKey: {key} \nValue: {value}\n")

    # Reload the updated configuration parameters.
    config = load_config(path_config)

    return config

In [4]:
# Load the configuration file.
PATH_CONFIG = "../config/config.yaml"
config = load_config(PATH_CONFIG)

In [5]:
# Check the configuration parameters.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'label': 'categori',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'range_co': [0, 100],
 'range_no2': [0, 100],
 'range_o3': [0, 140],
 'range_pm10': [0, 800],
 'range_pm25': [0, 400],
 'range_so2': [0, 500],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jakarta Barat']}

## **2 - Data Collection**
---

- Create the `load_raw_data()` function.
- It receives one argument: `path_data`.
- This function loads all raw CSV files and returns the joined dataframe.

In [6]:
# Function to load raw data.
def load_raw_data(path_data):
    """
    Load csv files and join into one dataframe.

    Only directory entries whose name ends with ".csv" (case-insensitive)
    are read, so stray files or folders such as `.ipynb_checkpoints` are
    ignored. Each file keeps its own row index, so the index of the joined
    dataframe repeats; call `reset_index(drop=True)` if a unique index is
    needed.

    Parameters:
    ----------
    path_data : str
        Raw dataset location (with or without a trailing separator).

    Returns:
    -------
    raw_dataset : pd.DataFrame
        Loaded and joined data. Empty dataframe if no csv file is found.
    """

    # Keep only the csv entries of the directory.
    csv_names = [
        name for name in os.listdir(path_data)
        if name.lower().endswith(".csv")
    ]

    # Read every file first and concatenate once at the end, instead of
    # re-copying the growing dataframe on every iteration.
    frames = []
    for name in tqdm(csv_names):
        frames.append(pd.read_csv(os.path.join(path_data, name)))

    # pd.concat raises on an empty list, so handle "no files" explicitly.
    if not frames:
        return pd.DataFrame()

    # Files are stacked with the later directory entries first.
    # NOTE: os.listdir order is not guaranteed, so row order may differ
    # between machines.
    raw_dataset = pd.concat(frames[::-1])

    return raw_dataset

In [7]:
# Load the raw dataset.
PATH_DATA_RAW = "../data/raw/"
raw_dataset = load_raw_data(PATH_DATA_RAW)

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 143.91it/s]


In [8]:
# Update the configuration parameter.
config = update_config(
    key = "path_data_raw",
    value = PATH_DATA_RAW,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_data_raw 
Value: ../data/raw/



In [9]:
# Check the raw dataset.
raw_dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-02-01,DKI1 (Bunderan HI),59,79,21,26,15,31,79,PM25,SEDANG
1,2021-02-02,DKI1 (Bunderan HI),30,46,21,14,16,24,46,PM25,BAIK
2,2021-02-03,DKI1 (Bunderan HI),23,33,19,11,14,19,33,PM25,BAIK
3,2021-02-04,DKI1 (Bunderan HI),36,53,23,14,12,24,53,PM25,SEDANG
4,2021-02-05,DKI1 (Bunderan HI),29,36,20,14,12,21,36,PM25,BAIK
...,...,...,...,...,...,...,...,...,...,...,...
145,2021-11-26,DKI5 (Kebon Jeruk) Jakarta Barat,32,59,16,7,15,20,59,PM25,SEDANG
146,2021-11-27,DKI5 (Kebon Jeruk) Jakarta Barat,22,36,13,7,18,22,36,PM25,BAIK
147,2021-11-28,DKI5 (Kebon Jeruk) Jakarta Barat,18,26,12,2,16,8,26,PM25,BAIK
148,2021-11-29,DKI5 (Kebon Jeruk) Jakarta Barat,15,34,13,3,13,9,34,PM25,BAIK


- We found that:
    1. The index only ranges from 0 to 149 because every CSV file keeps its own row index, while there are 1830 rows.
    2. The displayed dates only range from month 2 to 11, while there are 12 months.

In [10]:
# Try to reset the index to solve the first problem.
raw_dataset = raw_dataset.reset_index(drop=True)

In [11]:
# Check the updated index.
raw_dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-02-01,DKI1 (Bunderan HI),59,79,21,26,15,31,79,PM25,SEDANG
1,2021-02-02,DKI1 (Bunderan HI),30,46,21,14,16,24,46,PM25,BAIK
2,2021-02-03,DKI1 (Bunderan HI),23,33,19,11,14,19,33,PM25,BAIK
3,2021-02-04,DKI1 (Bunderan HI),36,53,23,14,12,24,53,PM25,SEDANG
4,2021-02-05,DKI1 (Bunderan HI),29,36,20,14,12,21,36,PM25,BAIK
...,...,...,...,...,...,...,...,...,...,...,...
1825,2021-11-26,DKI5 (Kebon Jeruk) Jakarta Barat,32,59,16,7,15,20,59,PM25,SEDANG
1826,2021-11-27,DKI5 (Kebon Jeruk) Jakarta Barat,22,36,13,7,18,22,36,PM25,BAIK
1827,2021-11-28,DKI5 (Kebon Jeruk) Jakarta Barat,18,26,12,2,16,8,26,PM25,BAIK
1828,2021-11-29,DKI5 (Kebon Jeruk) Jakarta Barat,15,34,13,3,13,9,34,PM25,BAIK


Now the index problem is fixed.

In [12]:
# Serialize the joined dataset.
PATH_DATA_JOINED = "../data/interim/joined_dataset.pkl"
joblib.dump(raw_dataset, PATH_DATA_JOINED)

['../data/interim/joined_dataset.pkl']

In [13]:
# Update the configuration parameter.
config = update_config(
    key = "path_data_joined",
    value = PATH_DATA_JOINED,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_data_joined 
Value: ../data/interim/joined_dataset.pkl



## **3 - Data Validation**
---

In [14]:
# Check the data type for each feature.
raw_dataset.dtypes

tanggal        str
stasiun        str
pm10        object
pm25        object
so2         object
co          object
o3          object
no2         object
max         object
critical       str
categori       str
dtype: object

- Several columns don't have the data type defined in the configuration.
- We need to handle those mismatched columns.

### 3.1. Handling Column `tanggal`

In [15]:
# Try to cast the column to datetime type.
raw_dataset["tanggal"] = pd.to_datetime(raw_dataset["tanggal"])

### 3.2. Handling Column `pm10`

In [16]:
# Try to cast the column to int type.
raw_dataset["pm10"] = raw_dataset["pm10"].astype(int)

ValueError: invalid literal for int() with base 10: '---'

- A `ValueError` occurs; it tells us that there is data that isn't an integer (`"---"`).
- We will replace those `"---"` with a value that doesn't exist in the column.
- Based on the data definition, we know that we can use `-1`.

In [17]:
# Ensure no single data that is -1.
raw_dataset.eq("-1").any() | raw_dataset.eq(-1).any()

tanggal     False
stasiun     False
pm10        False
pm25        False
so2         False
co          False
o3          False
no2         False
max         False
critical    False
categori    False
dtype: bool

In [18]:
# Replace the "---" with -1 and cast the column into int.
raw_dataset["pm10"] = raw_dataset["pm10"].replace("---", -1).astype(int)

### 3.3. Handling Column `pm25`

In [19]:
# Try to cast the column to int type.
raw_dataset["pm25"] = raw_dataset["pm25"].astype(int)

ValueError: invalid literal for int() with base 10: '---'

In [20]:
# Replace the "---" with -1 and cast the column into int.
raw_dataset["pm25"] = raw_dataset["pm25"].replace("---", -1).astype(int)

ValueError: cannot convert float NaN to integer

- There is a different `ValueError`.
- There are `NaN` values, thus we can't directly convert to int.
- We need to handle this problem first.

In [21]:
# Sanity check the missing values.
raw_dataset["pm25"].isna().sum()

np.int64(62)

- There are 62 `NaN` values. For now, we can replace it with `-1`.

In [22]:
# Replace the NaN values with -1.
raw_dataset["pm25"] = raw_dataset["pm25"].fillna(-1)

# Sanity check the missing values.
raw_dataset["pm25"].isna().sum()

np.int64(0)

In [23]:
# Replace the "---" with -1 and cast the column into int.
raw_dataset["pm25"] = raw_dataset["pm25"].replace("---", -1).astype(int)

### 3.4. Handling Column `so2`

In [24]:
# Try to cast the column to int type.
raw_dataset["so2"] = raw_dataset["so2"].astype(int)

ValueError: invalid literal for int() with base 10: '---'

In [25]:
# Replace the "---" with -1 and cast the column into int.
raw_dataset["so2"] = raw_dataset["so2"].replace("---", -1).astype(int)

### 3.5. Handling Column `co`

In [26]:
# Try to cast the column to int type.
raw_dataset["co"] = raw_dataset["co"].astype(int)

ValueError: invalid literal for int() with base 10: '---'

In [27]:
# Replace the "---" with -1 and cast the column into int.
raw_dataset["co"] = raw_dataset["co"].replace("---", -1).astype(int)

### 3.6. Handling Column `o3`

In [28]:
# Try to cast the column to int type.
raw_dataset["o3"] = raw_dataset["o3"].astype(int)

ValueError: invalid literal for int() with base 10: '---'

In [29]:
# Replace the "---" with -1 and cast the column into int.
raw_dataset["o3"] = raw_dataset["o3"].replace("---", -1).astype(int)

### 3.7. Handling Column `no2`

In [30]:
# Try to cast the column to int type.
raw_dataset["no2"] = raw_dataset["no2"].astype(int)

ValueError: invalid literal for int() with base 10: '---'

In [31]:
# Replace the "---" with -1 and cast the column into int.
raw_dataset["no2"] = raw_dataset["no2"].replace("---", -1).astype(int)

### 3.8. Handling Column `max`

In [32]:
# Try to cast the column to int type.
raw_dataset["max"] = raw_dataset["max"].astype(int)

ValueError: invalid literal for int() with base 10: 'PM25'

- Seems like the error is different.
- There is data with value `"PM25"` in the `max` column.
- We need to investigate the data.

In [33]:
# Check which data that cause error.
error_index = raw_dataset[raw_dataset["max"] == "PM25"].index
raw_dataset[raw_dataset["max"] == "PM25"]

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
762,2021-12-03,DKI1 (Bunderan HI),49,31,9,19,7,49,PM25,BAIK,


- Looks like there is a data entry error on row index 762.
    - The `"BAIK"` value must be in the `categori` column.
    - We need to investigate what the `max` and `critical` columns represent.
- Let's randomly sample 5 rows.

In [34]:
raw_dataset.sample(5)

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
627,2021-07-23,DKI1 (Bunderan HI),58,80,26,9,27,28,80,PM25,SEDANG
700,2021-07-03,DKI4 (Lubang Buaya),72,141,37,13,25,20,141,PM25,TIDAK SEHAT
1731,2021-11-22,DKI2 (Kelapa Gading),25,44,8,7,41,12,44,PM25,BAIK
1029,2021-04-25,DKI4 (Lubang Buaya),73,116,37,17,20,17,116,PM25,TIDAK SEHAT
1199,2021-10-11,DKI5 (Kebon Jeruk) Jakarta Barat,59,88,-1,9,39,11,88,PM25,SEDANG


- Looks like the `max` column represents the maximum value between any other int columns.
- And looks like the `critical` column represents the column name of `max` value.

- Thus, let's fix the error on the row index 762:
    - Replace the `max` column with `pm10` or `no2` value, let's take from `pm10`
    - Replace the `critical` column with `"PM10"`
    - Replace the `categori` column with `"BAIK"`

In [35]:
# Fix the error.
raw_dataset.loc[error_index, "max"] = raw_dataset.loc[error_index, "pm10"]
raw_dataset.loc[error_index, "critical"] = "PM10"
raw_dataset.loc[error_index, "categori"] = "BAIK"

In [36]:
# Sanity check the result.
raw_dataset.loc[error_index]

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
762,2021-12-03,DKI1 (Bunderan HI),49,31,9,19,7,49,49,PM10,BAIK


In [37]:
# Cast the column to int.
raw_dataset["max"] = raw_dataset["max"].astype(int)

### 3.9. Handling Column `critical`

In [38]:
# Check the unique value.
raw_dataset["critical"].value_counts()

critical
PM25    1631
PM10      65
O3        57
CO        34
SO2       26
Name: count, dtype: int64

Seems like no action needed.

### 3.10. Handling Column `categori`

In [39]:
# Check the unique value.
raw_dataset["categori"].value_counts()

categori
SEDANG            1305
TIDAK SEHAT        319
BAIK               189
TIDAK ADA DATA      17
Name: count, dtype: int64

- There are 17 `"TIDAK ADA DATA"` values, which indicate a missing label.
- Since we don't know which label should replace `"TIDAK ADA DATA"`, we drop those rows.

In [40]:
# Drop the "TIDAK ADA DATA" category.
missing_labels = raw_dataset[raw_dataset["categori"] == "TIDAK ADA DATA"]
raw_dataset = raw_dataset.drop(index = missing_labels.index)

In [41]:
# Sanity check the result.
raw_dataset["categori"].value_counts()

categori
SEDANG         1305
TIDAK SEHAT     319
BAIK            189
Name: count, dtype: int64

- Let's rename the `categori` column into the proper name, `category`.
- Don't forget to update the configuration file.

In [42]:
# Rename "categori" into "category".
raw_dataset = raw_dataset.rename(columns = {"categori": "category"})

In [43]:
# Update the configuration parameter.
config = update_config(
    key = "label",
    value = "category",
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: label 
Value: category



In [44]:
# Update the configuration parameter.
# Build a new list instead of editing config["columns_object"] in place, and
# match the old column name explicitly rather than assuming it is the last item.
col_object = [
    "category" if col == "categori" else col
    for col in config["columns_object"]
]

config = update_config(
    key = "columns_object",
    value = col_object,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: columns_object 
Value: ['stasiun', 'critical', 'category']



In [45]:
# Sanity check the data types.
raw_dataset.info()

<class 'pandas.DataFrame'>
Index: 1813 entries, 0 to 1829
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1813 non-null   datetime64[us]
 1   stasiun   1813 non-null   str           
 2   pm10      1813 non-null   int64         
 3   pm25      1813 non-null   int64         
 4   so2       1813 non-null   int64         
 5   co        1813 non-null   int64         
 6   o3        1813 non-null   int64         
 7   no2       1813 non-null   int64         
 8   max       1813 non-null   int64         
 9   critical  1813 non-null   str           
 10  category  1813 non-null   str           
dtypes: datetime64[us](1), int64(7), str(3)
memory usage: 170.0 KB


- All columns now match the data definition.
- Now serialize the validated data.

In [46]:
# Serialized the validated data.
PATH_DATA_VALIDATED = "../data/interim/validated_data.pkl"
joblib.dump(raw_dataset, PATH_DATA_VALIDATED)

['../data/interim/validated_data.pkl']

In [47]:
# Update the configuration parameter.
config = update_config(
    key = "path_data_validated",
    value = PATH_DATA_VALIDATED,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_data_validated 
Value: ../data/interim/validated_data.pkl



## **4 - Update the Range of Data in Configuration File**
---

In [48]:
# Store the observed [min, max] of every pollutant column in the configuration.
# Note: the -1 placeholder used for missing readings is part of the data at
# this point, so it is included when the minimum is computed.
cols = ["pm10", "pm25", "so2", "co", "o3", "no2"]
param_keys = [f"range_{name}" for name in cols]

for col, key in zip(cols, param_keys):
    pollutant_values = raw_dataset[col]
    # Cast to built-in int so the value is written to YAML as a plain number.
    observed_range = [int(pollutant_values.min()), int(pollutant_values.max())]

    config = update_config(
        key = key,
        value = observed_range,
        params = config,
        path_config = PATH_CONFIG
    )

Params Updated! 
Key: range_pm10 
Value: [-1, 179]

Params Updated! 
Key: range_pm25 
Value: [-1, 174]

Params Updated! 
Key: range_so2 
Value: [-1, 82]

Params Updated! 
Key: range_co 
Value: [-1, 47]

Params Updated! 
Key: range_o3 
Value: [-1, 151]

Params Updated! 
Key: range_no2 
Value: [-1, 65]



## **5 - Data Defense**
---

- Create the `data_defense()` function.
- It receives 2 arguments: `data` and `config`
    - `data` is the dataset to be checked
    - `config` is the configuration parameters
- It is a void function (no return value).
- If an `AssertionError` is raised, there is data that doesn't match the configuration.

In [49]:
# Function for data defense.
def data_defense(data, config):
    """
    Validate a dataset against the configuration: column data types first,
    then the allowed values/ranges of `stasiun` and the pollutant columns.

    Parameters:
    ----------
    data : pd.DataFrame
        The data to be checked.

    config : dict
        Loaded configuration parameters. Reads `columns_datetime`,
        `columns_object`, `columns_int`, `range_stasiun` and the
        `range_<pollutant>` entries.

    Returns:
    -------
    None, it's a void function.

    Raises:
    ------
    AssertionError
        On the first check that does not match the configuration.
    """

    # Column names per dtype family must match the configured lists exactly
    # (same names, same order).
    datetime_cols = data.select_dtypes("datetime").columns.to_list()
    assert datetime_cols == config["columns_datetime"], "an error occurs in datetime column(s)."

    text_cols = data.select_dtypes("str").columns.to_list()
    assert text_cols == config["columns_object"], "an error occurs in object column(s)."

    numeric_cols = data.select_dtypes("number").columns.to_list()
    assert numeric_cols == config["columns_int"], "an error occurs in int32 column(s)."

    # Every station present in the data must be a known station.
    known_stations = set(config['range_stasiun'])
    assert set(data['stasiun']) <= known_stations, "an error occurs in stasiun range."

    # Every pollutant value must lie inside its configured [low, high] range
    # (bounds inclusive).
    for pollutant in ("pm10", "pm25", "so2", "co", "o3", "no2"):
        bounds = config[f"range_{pollutant}"]
        within = data[pollutant].between(bounds[0], bounds[1])
        assert within.all(), f"an error occurs in {pollutant} range."

In [50]:
# Do data defense.
data_defense(raw_dataset, config)

Seems like our data are in good condition!

## **6 - Data Split**
---

In [51]:
# Function for Input-Output Split.
def split_input_output(data, config):
    """
    Split the input(X) and output (y).

    Parameters:
    ----------
    data : pd.DataFrame
        The processed dataset.

    config : dict
        Loaded configuration parameters. Reads `features` (list of input
        column names) and `label` (output column name).

    Returns:
    -------
    X : pd.DataFrame
        The input data.

    y : pd.Series
        The output data.
    """

    # Ensure raw data immutable.
    data = data.copy()

    # Look the feature list up once. Referencing config["features"] inside the
    # f-string below with the same quote type is a SyntaxError before Python 3.12.
    features = config["features"]

    # Split the X and y.
    X = data[features]
    y = data[config["label"]]

    print(f"Original data shape : {data.shape}")
    print(f"Selected Features   : {features}")
    print(f"X data shape        : {X.shape}")
    print(f"y data shape        : {y.shape}")

    return X, y

In [52]:
# Split input-output.
X, y = split_input_output(
    data = raw_dataset,
    config = config
)

Original data shape : (1813, 11)
Selected Features   : ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2']
X data shape        : (1813, 7)
y data shape        : (1813,)


In [53]:
# Sanity check the input (X).
X.head()

Unnamed: 0,stasiun,pm10,pm25,so2,co,o3,no2
0,DKI1 (Bunderan HI),59,79,21,26,15,31
1,DKI1 (Bunderan HI),30,46,21,14,16,24
2,DKI1 (Bunderan HI),23,33,19,11,14,19
3,DKI1 (Bunderan HI),36,53,23,14,12,24
4,DKI1 (Bunderan HI),29,36,20,14,12,21


In [54]:
# Sanity check the output (y).
y.head()

0    SEDANG
1      BAIK
2      BAIK
3    SEDANG
4      BAIK
Name: category, dtype: str

In [55]:
# Function for Train-Test Split.
def split_train_test(X, y, test_size, random_state=123):
    """
    Stratified split of inputs and outputs into two partitions, printing the
    shape of each resulting piece.

    Parameters:
    ----------
    X : pd.DataFrame
        The input data.

    y : pd.Series
        The output data. Also used as the stratification target.

    test_size : float
        The proportion of test set.

    random_state : int, default = 123
        For reproducibility

    Returns:
    -------
    X_train, X_test : pd.DataFrame
        The train and test input.

    y_train, y_test : pd.Series
        The train and test output.
    """

    partitions = train_test_split(
        X, y,
        test_size = test_size,
        random_state = random_state,
        stratify = y
    )
    X_train, X_test, y_train, y_test = partitions

    # Labels are padded so the colons line up in the printed report.
    shape_report = {
        "X_train shape": X_train.shape,
        "y_train shape": y_train.shape,
        "X_test shape ": X_test.shape,
        "y_test shape ": y_test.shape,
    }
    report_lines = [f"{label} : {shape}" for label, shape in shape_report.items()]

    # Trailing "\n" leaves a blank line after the report.
    print("\n".join(report_lines) + "\n")

    return X_train, X_test, y_train, y_test

In [56]:
# train:valid:test -> 80:10:10

# Train vs not-train.
X_train, X_not_train, y_train, y_not_train = split_train_test(
    X = X,
    y = y,
    test_size = 0.2
)

# Valid vs test.
X_valid, X_test, y_valid, y_test = split_train_test(
    X = X_not_train,
    y = y_not_train,
    test_size = 0.5,
)

X_train shape : (1450, 7)
y_train shape : (1450,)
X_test shape  : (363, 7)
y_test shape  : (363,)

X_train shape : (181, 7)
y_train shape : (181,)
X_test shape  : (182, 7)
y_test shape  : (182,)



## **7 - Data Serialization**
---

In [57]:
# Serialize the splitted data.
PATH_DATA_SPLITTED = "../data/interim/"

# One entry per split; each maps the output file stem to the object to store.
data_configuration = {
    "train": {"X_train": X_train, "y_train": y_train},
    "valid": {"X_valid": X_valid, "y_valid": y_valid},
    "test": {"X_test": X_test, "y_test": y_test},
}

for split_name, split_parts in data_configuration.items():
    saved_paths = []

    for part_name, part_data in split_parts.items():
        # Files are written as <PATH_DATA_SPLITTED><part_name>.pkl.
        target = f"{PATH_DATA_SPLITTED + part_name}.pkl"
        saved_paths.append(target)

        # Serialize the splitted data.
        joblib.dump(part_data, target)

    # Record where this split was stored, e.g. under "path_data_train".
    config = update_config(
        key = f"path_data_{split_name}",
        value = saved_paths,
        params = config,
        path_config = PATH_CONFIG
    )

Params Updated! 
Key: path_data_train 
Value: ['../data/interim/X_train.pkl', '../data/interim/y_train.pkl']

Params Updated! 
Key: path_data_valid 
Value: ['../data/interim/X_valid.pkl', '../data/interim/y_valid.pkl']

Params Updated! 
Key: path_data_test 
Value: ['../data/interim/X_test.pkl', '../data/interim/y_test.pkl']



In [58]:
# Check the configuration parameters.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'path_data_joined': '../data/interim/joined_dataset.pkl',
 'path_data_raw': '../data/raw/',
 'path_data_test': ['../data/interim/X_test.pkl',
  '../data/interim/y_test.pkl'],
 'path_data_train': ['../data/interim/X_train.pkl',
  '../data/interim/y_train.pkl'],
 'path_data_valid': ['../data/interim/X_valid.pkl',
  '../data/interim/y_valid.pkl'],
 'path_data_validated': '../data/interim/validated_data.pkl',
 'range_co': [-1, 47],
 'range_no2': [-1, 65],
 'range_o3': [-1, 151],
 'range_pm10': [-1, 179],
 'range_pm25': [-1, 174],
 'range_so2': [-1, 82],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jaka