### Import Libraries

In [1]:
import yaml
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler


### Load Config

In [2]:
# Parse the shared YAML configuration once; later cells read keys such as
# config["target"] and config["predictors"] from it.
with open("../config/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Echo the parsed configuration so the reader can see which keys are available.
config

{'data_raw': 'data/raw/smoke.csv',
 'data_final': 'data/processed/ori_new.pkl',
 'path_train': ['data/processed/x_train.pkl', 'data/processed/y_train.pkl'],
 'path_valid': ['data/processed/x_valid.pkl', 'data/processed/y_valid.pkl'],
 'path_test': ['data/processed/x_test.pkl', 'data/processed/y_test.pkl'],
 'path_train_feat': ['data/processed/x_train_feat.pkl',
  'data/processed/y_train_feat.pkl'],
 'path_valid_feat': ['data/processed/x_valid_feat.pkl',
  'data/processed/y_valid_feat.pkl'],
 'path_test_feat': ['data/processed/x_test_feat.pkl',
  'data/processed/y_test_feat.pkl'],
 'final_model_path': 'models/production_model.pkl',
 'training_log_path': 'log/training_log.json',
 'new_cols': ['UTC',
  'Temperature',
  'Humidity',
  'TVOC',
  'eCO2',
  'Raw_H2',
  'Raw_Ethanol',
  'Pressure',
  'PM1.0',
  'PM2.5',
  'NC0.5',
  'NC1.0',
  'NC2.5',
  'CNT',
  'Fire_Alarm'],
 'datetime_columns': ['UTC'],
 'float_columns': ['Temperature',
  'Humidity',
  'Pressure',
  'PM1.0',
  'PM2.5',
  'N

### Load Data

In [3]:
# Read the previously saved train / test / validation splits from disk.
# NOTE: joblib.load unpickles its input, so only point it at files this
# project produced itself.
x_train, y_train = (
    joblib.load("../data/processed/x_train.pkl"),
    joblib.load("../data/processed/y_train.pkl"),
)
x_test, y_test = (
    joblib.load("../data/processed/x_test.pkl"),
    joblib.load("../data/processed/y_test.pkl"),
)
x_valid, y_valid = (
    joblib.load("../data/processed/x_valid.pkl"),
    joblib.load("../data/processed/y_valid.pkl"),
)


In [4]:
# Put each split's predictors and target side by side in one frame
# (the target ends up as the last column).
train, test, valid = (
    pd.concat([features, target], axis=1)
    for features, target in (
        (x_train, y_train),
        (x_test, y_test),
        (x_valid, y_valid),
    )
)

In [5]:
# Report (rows, columns) for train, test and validation, one per line.
print(train.shape, test.shape, valid.shape, sep="\n")

(43841, 9)
(9395, 9)
(9394, 9)


### Remove Outliers

In [6]:
def remove_iqr_outliers(frame, feature_cols, whisker=1.5):
    """Keep only the rows whose every feature lies inside its IQR fence.

    For each column in ``feature_cols`` the fence is
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]``, with the quartiles computed
    on the full ``frame`` (not on progressively filtered data). A row is kept
    only if it is inside the fence for *all* of those columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Data to filter. It is not modified.
    feature_cols : list-like of column labels
        Numeric columns to test for outliers.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fence.

    Returns
    -------
    pd.DataFrame
        The surviving rows of ``frame`` (all columns), in their original
        order and with their original index labels.
    """
    features = frame[feature_cols]
    q1 = features.quantile(0.25)
    q3 = features.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr

    # Compare every column against its own fence in one vectorised pass
    # instead of concatenating one filtered copy of the frame per column.
    inside = features.ge(lower_fence, axis="columns") & features.le(upper_fence, axis="columns")

    # Positional boolean mask: works even if the index has repeated labels,
    # which the previous "count how often each index label appears" approach
    # could not handle.
    return frame[inside.all(axis=1).to_numpy()]


# Every column except the last one (the target appended by the concat above)
# is treated as a feature.
# drop_duplicates() is kept so the result still collapses fully identical rows,
# exactly as the previous implementation did.
train_remoutl = remove_iqr_outliers(train, train.columns[:-1]).drop_duplicates()
train_remoutl

Unnamed: 0,Temperature,Humidity,TVOC,eCO2,Raw_H2,Raw_Ethanol,Pressure,PM1.0,Fire_Alarm
43502,27.120,47.37,1228,405,12909,19428,938.728,1.66,1
28848,22.250,47.86,167,400,13208,20047,939.664,2.19,1
46395,24.750,52.52,1277,400,12957,19410,938.714,1.77,1
18171,18.947,53.38,1164,400,12910,19437,938.699,1.55,1
5623,-9.231,54.37,62,400,13240,20193,939.750,2.30,1
...,...,...,...,...,...,...,...,...,...
29009,19.990,53.28,140,400,13218,20072,939.633,2.10,1
18622,20.340,47.66,1216,400,12919,19424,938.731,1.36,1
30840,22.080,49.39,0,400,13276,20268,939.683,2.51,1
33679,18.820,54.32,349,400,13078,19913,939.321,0.20,1


### Balancing Data

In [7]:
# Oversample the outlier-free training data with SMOTE (fixed seed for
# reproducibility), then glue the resampled predictors and target back
# together into one frame.
sm = SMOTE(random_state=112)
x_train_remoutl_sm, y_train_remoutl_sm = sm.fit_resample(
    train_remoutl.drop(columns=config["target"]),
    train_remoutl[config["target"]],
)
train_remoutl_sm = pd.concat(
    [x_train_remoutl_sm, y_train_remoutl_sm],
    axis=1,
)

In [8]:
# Display the resampled training frame (predictors plus target) produced by
# the SMOTE cell; pandas shows only the first and last rows of a long frame.
train_remoutl_sm

Unnamed: 0,Temperature,Humidity,TVOC,eCO2,Raw_H2,Raw_Ethanol,Pressure,PM1.0,Fire_Alarm
0,27.120000,47.370000,1228,405,12909,19428,938.728000,1.660000,1
1,22.250000,47.860000,167,400,13208,20047,939.664000,2.190000,1
2,24.750000,52.520000,1277,400,12957,19410,938.714000,1.770000,1
3,18.947000,53.380000,1164,400,12910,19437,938.699000,1.550000,1
4,-9.231000,54.370000,62,400,13240,20193,939.750000,2.300000,1
...,...,...,...,...,...,...,...,...,...
44155,27.099712,54.619749,19,400,13012,19914,939.741775,0.188559,0
44156,12.470613,50.994619,157,400,13147,20013,939.632066,0.910745,0
44157,20.480957,48.998851,127,400,13178,20043,939.658469,0.930431,0
44158,25.543534,51.703673,2,400,12812,19808,939.825000,0.212240,0


In [9]:
# Re-derive the final training inputs from the balanced frame: the predictor
# columns listed in the config, and the target column, each as an independent
# copy.
predictor_cols = config["predictors"]
target_col = config["target"]
x_train_remoutl_sm = train_remoutl_sm.loc[:, predictor_cols].copy()
y_train_remoutl_sm = train_remoutl_sm[target_col].copy()

In [10]:
# Persist the feature-engineered datasets.
# Training data: outliers removed and classes balanced with SMOTE.
joblib.dump(x_train_remoutl_sm, "../data/processed/x_train_feat.pkl")
joblib.dump(y_train_remoutl_sm, "../data/processed/y_train_feat.pkl")
# Test and validation data are written out exactly as they were loaded;
# no outlier removal or resampling is applied to them in this notebook.
joblib.dump(x_test, "../data/processed/x_test_feat.pkl")
joblib.dump(y_test, "../data/processed/y_test_feat.pkl")
joblib.dump(x_valid, "../data/processed/x_valid_feat.pkl")
# The last call's return value (the list of written file names) is what the
# cell displays as its output.
joblib.dump(y_valid, "../data/processed/y_valid_feat.pkl")

['../data/processed/y_valid_feat.pkl']

In [11]:
# Final sanity check: predictor and target shapes for each saved split,
# one split per line (train, test, validation).
print(
    f"{x_train_remoutl_sm.shape} {y_train_remoutl_sm.shape}",
    f"{x_test.shape} {y_test.shape}",
    f"{x_valid.shape} {y_valid.shape}",
    sep="\n",
)

(44160, 8) (44160,)
(9395, 8) (9395,)
(9394, 8) (9394,)
