### Import Libraries

In [1]:
import yaml
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split

### Load Config

In [2]:
# Load the project configuration into `config`.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default text encoding.
with open("../config/config.yaml", "r", encoding="utf-8") as file:
    # safe_load only builds plain Python objects (dict, list, str, numbers).
    config = yaml.safe_load(file)
config

{'data_raw': 'data/raw/smoke.csv',
 'data_final': 'data/processed/ori_new.pkl',
 'path_train': ['data/processed/x_train.pkl', 'data/processed/y_train.pkl'],
 'path_valid': ['data/processed/x_valid.pkl', 'data/processed/y_valid.pkl'],
 'path_test': ['data/processed/x_test.pkl', 'data/processed/y_test.pkl'],
 'path_train_feat': ['data/processed/x_train_feat.pkl',
  'data/processed/y_train_feat.pkl'],
 'path_valid_feat': ['data/processed/x_valid_feat.pkl',
  'data/processed/y_valid_feat.pkl'],
 'path_test_feat': ['data/processed/x_test_feat.pkl',
  'data/processed/y_test_feat.pkl'],
 'final_model_path': 'models/production_model.pkl',
 'training_log_path': 'log/training_log.json',
 'new_cols': ['utc',
  'temperature',
  'humidity',
  'tvoc',
  'eco2',
  'raw_h2',
  'raw_ethanol',
  'pressure',
  'pm10',
  'pm25',
  'nc05',
  'nc10',
  'nc25',
  'cnt',
  'fire_alarm'],
 'datetime_columns': ['utc'],
 'float_columns': ['temperature',
  'humidity',
  'pressure',
  'pm10',
  'pm25',
  'nc05',
 

### Load Data

In [3]:
# Read the raw dataset from the location declared in the config instead of a
# duplicated string literal, so the notebook and the config cannot drift apart.
# The config path omits the "../" prefix this notebook needs, so it is
# prepended here. index_col=0 uses the first CSV column as the row index.
ori = pd.read_csv("../" + config["data_raw"], index_col=0)
ori

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
0,1654733331,20.000,57.36,0,400,12306,18520,939.735,0.00,0.00,0.00,0.000,0.000,0,0
1,1654733332,20.015,56.67,0,400,12345,18651,939.744,0.00,0.00,0.00,0.000,0.000,1,0
2,1654733333,20.029,55.96,0,400,12374,18764,939.738,0.00,0.00,0.00,0.000,0.000,2,0
3,1654733334,20.044,55.28,0,400,12390,18849,939.736,0.00,0.00,0.00,0.000,0.000,3,0
4,1654733335,20.059,54.69,0,400,12403,18921,939.744,0.00,0.00,0.00,0.000,0.000,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,1655130047,18.438,15.79,625,400,13723,20569,936.670,0.63,0.65,4.32,0.673,0.015,5739,0
62626,1655130048,18.653,15.87,612,400,13731,20588,936.678,0.61,0.63,4.18,0.652,0.015,5740,0
62627,1655130049,18.867,15.84,627,400,13725,20582,936.687,0.57,0.60,3.95,0.617,0.014,5741,0
62628,1655130050,19.083,16.04,638,400,13712,20566,936.680,0.57,0.59,3.92,0.611,0.014,5742,0


In [4]:
# Work on an independent deep copy so later edits never touch the raw frame `ori`.
ori_new = ori.copy(deep=True)

In [5]:
# Rename the columns, positionally, to the names listed under config["new_cols"].
ori_new = ori_new.set_axis(config["new_cols"], axis="columns")
ori_new

Unnamed: 0,utc,temperature,humidity,tvoc,eco2,raw_h2,raw_ethanol,pressure,pm10,pm25,nc05,nc10,nc25,cnt,fire_alarm
0,1654733331,20.000,57.36,0,400,12306,18520,939.735,0.00,0.00,0.00,0.000,0.000,0,0
1,1654733332,20.015,56.67,0,400,12345,18651,939.744,0.00,0.00,0.00,0.000,0.000,1,0
2,1654733333,20.029,55.96,0,400,12374,18764,939.738,0.00,0.00,0.00,0.000,0.000,2,0
3,1654733334,20.044,55.28,0,400,12390,18849,939.736,0.00,0.00,0.00,0.000,0.000,3,0
4,1654733335,20.059,54.69,0,400,12403,18921,939.744,0.00,0.00,0.00,0.000,0.000,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,1655130047,18.438,15.79,625,400,13723,20569,936.670,0.63,0.65,4.32,0.673,0.015,5739,0
62626,1655130048,18.653,15.87,612,400,13731,20588,936.678,0.61,0.63,4.18,0.652,0.015,5740,0
62627,1655130049,18.867,15.84,627,400,13725,20582,936.687,0.57,0.60,3.95,0.617,0.014,5741,0
62628,1655130050,19.083,16.04,638,400,13712,20566,936.680,0.57,0.59,3.92,0.611,0.014,5742,0


### Convert and Check Data Types

In [6]:
# Convert every column listed under config["datetime_columns"] from integer
# epoch seconds (unit="s") to pandas datetimes. All listed columns are handled,
# not only the first one, so adding another datetime column to the config
# needs no change here.
ori_new = ori_new.assign(
    **{
        datetime_col: pd.to_datetime(ori_new[datetime_col], unit="s")
        for datetime_col in config["datetime_columns"]
    }
)
ori_new

Unnamed: 0,utc,temperature,humidity,tvoc,eco2,raw_h2,raw_ethanol,pressure,pm10,pm25,nc05,nc10,nc25,cnt,fire_alarm
0,2022-06-09 00:08:51,20.000,57.36,0,400,12306,18520,939.735,0.00,0.00,0.00,0.000,0.000,0,0
1,2022-06-09 00:08:52,20.015,56.67,0,400,12345,18651,939.744,0.00,0.00,0.00,0.000,0.000,1,0
2,2022-06-09 00:08:53,20.029,55.96,0,400,12374,18764,939.738,0.00,0.00,0.00,0.000,0.000,2,0
3,2022-06-09 00:08:54,20.044,55.28,0,400,12390,18849,939.736,0.00,0.00,0.00,0.000,0.000,3,0
4,2022-06-09 00:08:55,20.059,54.69,0,400,12403,18921,939.744,0.00,0.00,0.00,0.000,0.000,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,2022-06-13 14:20:47,18.438,15.79,625,400,13723,20569,936.670,0.63,0.65,4.32,0.673,0.015,5739,0
62626,2022-06-13 14:20:48,18.653,15.87,612,400,13731,20588,936.678,0.61,0.63,4.18,0.652,0.015,5740,0
62627,2022-06-13 14:20:49,18.867,15.84,627,400,13725,20582,936.687,0.57,0.60,3.95,0.617,0.014,5741,0
62628,2022-06-13 14:20:50,19.083,16.04,638,400,13712,20566,936.680,0.57,0.59,3.92,0.611,0.014,5742,0


In [7]:
# Show the dtype of every column after the datetime conversion.
ori_new.dtypes

utc            datetime64[ns]
temperature           float64
humidity              float64
tvoc                    int64
eco2                    int64
raw_h2                  int64
raw_ethanol             int64
pressure              float64
pm10                  float64
pm25                  float64
nc05                  float64
nc10                  float64
nc25                  float64
cnt                     int64
fire_alarm              int64
dtype: object

In [8]:
# Check that the columns of each dtype family match, in the same order, the
# column lists declared in the config.
assert (
    list(ori_new.select_dtypes(include="datetime").columns) == config["datetime_columns"]
), "an error occurs in datetime column(s)."
assert (
    list(ori_new.select_dtypes(include="int").columns) == config["int_columns"]
), "an error occurs in int column(s)."
assert (
    list(ori_new.select_dtypes(include="float").columns) == config["float_columns"]
), "an error occurs in float column(s)."

In [9]:
# Persist the cleaned frame at the location declared under config["data_final"]
# rather than a duplicated literal. The config path omits the "../" prefix this
# notebook needs, so it is prepended here.
joblib.dump(ori_new, "../" + config["data_final"])

['../data/processed/ori_new.pkl']

### Split Data

In [10]:
# (rows, columns) of the cleaned frame before it is split.
ori_new.shape

(62630, 15)

In [11]:
# Separate the predictor columns and the target named in the config.
# .copy() detaches both selections from ori_new.
x = ori_new.loc[:, config["predictors"]].copy()
y = ori_new.loc[:, config["target"]].copy()

In [12]:
# Stage 1: hold out config["test_size"] of the rows, stratified on the target.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x,
    y,
    test_size=config["test_size"],
    random_state=42,
    stratify=y,
)
# Stage 2: divide the held-out rows into the validation and test partitions.
# NOTE(review): config["valid_size"] is passed as test_size, so it sets the
# share of held-out rows that goes to the *test* partition; the validation
# partition receives the remainder. Confirm this is intended.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=config["valid_size"],
    random_state=42,
    stratify=y_holdout,
)

In [13]:
# Print the feature and target shapes of each partition, one line per split.
print(*(part.shape for part in (x_train, y_train)))
print(*(part.shape for part in (x_test, y_test)))
print(*(part.shape for part in (x_valid, y_valid)))

(43841, 8) (43841,)
(9395, 8) (9395,)
(9394, 8) (9394,)


In [14]:
# Persist each partition at the locations declared in the config instead of
# duplicated literals. Each config entry is an [x_path, y_path] pair that omits
# the "../" prefix this notebook needs, so it is prepended here.
# The write order is unchanged, so the cell still displays the y_valid path last.
joblib.dump(x_train, "../" + config["path_train"][0])
joblib.dump(y_train, "../" + config["path_train"][1])
joblib.dump(x_test, "../" + config["path_test"][0])
joblib.dump(y_test, "../" + config["path_test"][1])
joblib.dump(x_valid, "../" + config["path_valid"][0])
joblib.dump(y_valid, "../" + config["path_valid"][1])

['../data/processed/y_valid.pkl']