### Import Library

In [1]:
import pandas as pd
import joblib

### Load Data

In [5]:
# Relative location of the raw smoke-detection CSV.
path = '../data/raw/smoke_detection_iot.csv'

# The first column of the file carries the row labels, so it is used as the index.
ori = pd.read_csv(filepath_or_buffer=path, index_col=0)

# Show the freshly loaded frame.
ori

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
0,1654733331,20.000,57.36,0,400,12306,18520,939.735,0.00,0.00,0.00,0.000,0.000,0,0
1,1654733332,20.015,56.67,0,400,12345,18651,939.744,0.00,0.00,0.00,0.000,0.000,1,0
2,1654733333,20.029,55.96,0,400,12374,18764,939.738,0.00,0.00,0.00,0.000,0.000,2,0
3,1654733334,20.044,55.28,0,400,12390,18849,939.736,0.00,0.00,0.00,0.000,0.000,3,0
4,1654733335,20.059,54.69,0,400,12403,18921,939.744,0.00,0.00,0.00,0.000,0.000,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,1655130047,18.438,15.79,625,400,13723,20569,936.670,0.63,0.65,4.32,0.673,0.015,5739,0
62626,1655130048,18.653,15.87,612,400,13731,20588,936.678,0.61,0.63,4.18,0.652,0.015,5740,0
62627,1655130049,18.867,15.84,627,400,13725,20582,936.687,0.57,0.60,3.95,0.617,0.014,5741,0
62628,1655130050,19.083,16.04,638,400,13712,20566,936.680,0.57,0.59,3.92,0.611,0.014,5742,0


In [6]:
# Persist the loaded frame as a joblib pickle; the call returns the list of
# written file names, which is what this cell displays.
joblib.dump(value=ori, filename='../data/processed/ori.pkl')

['../data/processed/ori.pkl']

### Data Definition

UTC :  
&nbsp;&nbsp;&nbsp;&nbsp;[integer]  
&nbsp;&nbsp;&nbsp;&nbsp;waktu pengambilan sample dalam detik UTC  
&nbsp;&nbsp;&nbsp;&nbsp;SECONDS SINCE JAN 01 1970  

Temperature[C]  :  
&nbsp;&nbsp;&nbsp;&nbsp;[float]  
&nbsp;&nbsp;&nbsp;&nbsp;suhu dalam derajat Celsius  

Humidity[%] :  
&nbsp;&nbsp;&nbsp;&nbsp;[float]  
&nbsp;&nbsp;&nbsp;&nbsp;kelembaban udara  

TVOC[ppb]    :  
&nbsp;&nbsp;&nbsp;&nbsp;[integer]  
&nbsp;&nbsp;&nbsp;&nbsp;Total Volatile Organic Compounds  

eCO2[ppm]   :  
&nbsp;&nbsp;&nbsp;&nbsp;[integer]  
&nbsp;&nbsp;&nbsp;&nbsp;CO2 equivalent concentration  

Raw H2  :  
&nbsp;&nbsp;&nbsp;&nbsp;[integer]  
&nbsp;&nbsp;&nbsp;&nbsp;raw molecular hydrogen  

Raw Ethanol :  
&nbsp;&nbsp;&nbsp;&nbsp;[integer]  
&nbsp;&nbsp;&nbsp;&nbsp;raw ethanol gas  

Pressure[hPa]   :  
&nbsp;&nbsp;&nbsp;&nbsp;[float]  
&nbsp;&nbsp;&nbsp;&nbsp;Tekanan udara  

PM1.0, PM2.5   :  
&nbsp;&nbsp;&nbsp;&nbsp;[float]  
&nbsp;&nbsp;&nbsp;&nbsp;konsentrasi particulate matter (PM); angka pada nama kolom adalah batas ukuran partikel dalam micron  

NC0.5, NC1.0, NC2.5  :  
&nbsp;&nbsp;&nbsp;&nbsp;[float]  
&nbsp;&nbsp;&nbsp;&nbsp;Number concentration of particulate matter (angka pada nama kolom adalah batas ukuran partikel dalam micron)  

CNT :  
&nbsp;&nbsp;&nbsp;&nbsp;[integer]  
&nbsp;&nbsp;&nbsp;&nbsp;Sample counter  

Fire Alarm  :  
&nbsp;&nbsp;&nbsp;&nbsp;[integer]  
&nbsp;&nbsp;&nbsp;&nbsp;Fire Alarm, 1 = fire, 0 = no fire  

> Kolom UTC dan CNT tidak banyak berguna untuk modelling, karena hanya berisi waktu pengambilan sample dan nomor urut sample.

### Data Validation

#### Tipe Data

In [7]:
# Inspect the dtype pandas inferred for every column.
ori.dtypes

UTC                 int64
Temperature[C]    float64
Humidity[%]       float64
TVOC[ppb]           int64
eCO2[ppm]           int64
Raw H2              int64
Raw Ethanol         int64
Pressure[hPa]     float64
PM1.0             float64
PM2.5             float64
NC0.5             float64
NC1.0             float64
NC2.5             float64
CNT                 int64
Fire Alarm          int64
dtype: object

In [8]:
# Convert the UTC column to datetime, reading the integers as seconds since the Unix epoch.
ori = ori.assign(UTC=pd.to_datetime(ori['UTC'], unit='s'))

# Confirm the resulting dtypes.
ori.dtypes

UTC               datetime64[ns]
Temperature[C]           float64
Humidity[%]              float64
TVOC[ppb]                  int64
eCO2[ppm]                  int64
Raw H2                     int64
Raw Ethanol                int64
Pressure[hPa]            float64
PM1.0                    float64
PM2.5                    float64
NC0.5                    float64
NC1.0                    float64
NC2.5                    float64
CNT                        int64
Fire Alarm                 int64
dtype: object

In [9]:
# Preview the first five rows to check the converted UTC values.
ori.head(n=5)

Unnamed: 0,UTC,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,CNT,Fire Alarm
0,2022-06-09 00:08:51,20.0,57.36,0,400,12306,18520,939.735,0.0,0.0,0.0,0.0,0.0,0,0
1,2022-06-09 00:08:52,20.015,56.67,0,400,12345,18651,939.744,0.0,0.0,0.0,0.0,0.0,1,0
2,2022-06-09 00:08:53,20.029,55.96,0,400,12374,18764,939.738,0.0,0.0,0.0,0.0,0.0,2,0
3,2022-06-09 00:08:54,20.044,55.28,0,400,12390,18849,939.736,0.0,0.0,0.0,0.0,0.0,3,0
4,2022-06-09 00:08:55,20.059,54.69,0,400,12403,18921,939.744,0.0,0.0,0.0,0.0,0.0,4,0


#### Range

In [10]:
# Summary statistics, transposed so each row describes one column.
ori.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature[C],62630.0,15.970424,14.359576,-22.01,10.99425,20.13,25.4095,59.93
Humidity[%],62630.0,48.539499,8.865367,10.74,47.53,50.15,53.24,75.2
TVOC[ppb],62630.0,1942.057528,7811.589055,0.0,130.0,981.0,1189.0,60000.0
eCO2[ppm],62630.0,670.021044,1905.885439,400.0,400.0,400.0,438.0,60000.0
Raw H2,62630.0,12942.453936,272.464305,10668.0,12830.0,12924.0,13109.0,13803.0
Raw Ethanol,62630.0,19754.257912,609.513156,15317.0,19435.0,19501.0,20078.0,21410.0
Pressure[hPa],62630.0,938.627649,1.331344,930.852,938.7,938.816,939.418,939.861
PM1.0,62630.0,100.594309,922.524245,0.0,1.28,1.81,2.09,14333.69
PM2.5,62630.0,184.46777,1976.305615,0.0,1.34,1.88,2.18,45432.26
NC0.5,62630.0,491.463608,4265.661251,0.0,8.82,12.45,14.42,61482.03


#### Dimensi Data

In [13]:
# (number of rows, number of columns) of the loaded frame.
ori.shape

(62630, 15)

#### Handling Columns Error

In [14]:
# The dtypes show that every column already has an appropriate type, so no fix is needed in this section.

#### Check Missing Value

In [15]:
# Count missing values per column, largest count first.
ori.isna().sum().sort_values(ascending=False)

UTC               0
Temperature[C]    0
Humidity[%]       0
TVOC[ppb]         0
eCO2[ppm]         0
Raw H2            0
Raw Ethanol       0
Pressure[hPa]     0
PM1.0             0
PM2.5             0
NC0.5             0
NC1.0             0
NC2.5             0
CNT               0
Fire Alarm        0
dtype: int64

#### Drop Kolom tidak diperlukan

In [17]:
# Drop the UTC and CNT columns.
unused_columns = ['UTC', 'CNT']
ori_new = ori.drop(columns=unused_columns)

# Show the reduced frame.
ori_new

Unnamed: 0,Temperature[C],Humidity[%],TVOC[ppb],eCO2[ppm],Raw H2,Raw Ethanol,Pressure[hPa],PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,Fire Alarm
0,20.000,57.36,0,400,12306,18520,939.735,0.00,0.00,0.00,0.000,0.000,0
1,20.015,56.67,0,400,12345,18651,939.744,0.00,0.00,0.00,0.000,0.000,0
2,20.029,55.96,0,400,12374,18764,939.738,0.00,0.00,0.00,0.000,0.000,0
3,20.044,55.28,0,400,12390,18849,939.736,0.00,0.00,0.00,0.000,0.000,0
4,20.059,54.69,0,400,12403,18921,939.744,0.00,0.00,0.00,0.000,0.000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,18.438,15.79,625,400,13723,20569,936.670,0.63,0.65,4.32,0.673,0.015,0
62626,18.653,15.87,612,400,13731,20588,936.678,0.61,0.63,4.18,0.652,0.015,0
62627,18.867,15.84,627,400,13725,20582,936.687,0.57,0.60,3.95,0.617,0.014,0
62628,19.083,16.04,638,400,13712,20566,936.680,0.57,0.59,3.92,0.611,0.014,0


In [31]:
# Rename the columns --> feature_names must be string, and may not contain [, ] or <
#
# The old names are mapped to the new ones explicitly instead of assigning a
# positional list to `.columns`, so a change in column order or selection
# upstream cannot silently attach the wrong label to a column.
column_renames = {
    'Temperature[C]': 'Temperature',
    'Humidity[%]': 'Humidity',
    'TVOC[ppb]': 'TVOC',
    'eCO2[ppm]': 'eCO2',
    'Raw H2': 'Raw_H2',
    'Raw Ethanol': 'Raw_Ethanol',
    'Pressure[hPa]': 'Pressure',
    'Fire Alarm': 'Fire_Alarm',
}
# Keys that are absent are ignored by `rename`, so re-running this cell on the
# already-renamed frame is a no-op.
ori_new = ori_new.rename(columns=column_renames)

# Expected final column names, in order.
new_columns_name = ['Temperature', 'Humidity', 'TVOC', 'eCO2', 'Raw_H2', 'Raw_Ethanol', 'Pressure', 'PM1.0', 'PM2.5', 'NC0.5', 'NC1.0',
       'NC2.5', 'Fire_Alarm']
# Fail loudly if the frame does not match the expected schema.
assert list(ori_new.columns) == new_columns_name, (
    f"unexpected columns after rename: {list(ori_new.columns)}"
)

In [32]:
# Display the frame to check the renamed columns.
ori_new

Unnamed: 0,Temperature,Humidity,TVOC,eCO2,Raw_H2,Raw_Ethanol,Pressure,PM1.0,PM2.5,NC0.5,NC1.0,NC2.5,Fire_Alarm
0,20.000,57.36,0,400,12306,18520,939.735,0.00,0.00,0.00,0.000,0.000,0
1,20.015,56.67,0,400,12345,18651,939.744,0.00,0.00,0.00,0.000,0.000,0
2,20.029,55.96,0,400,12374,18764,939.738,0.00,0.00,0.00,0.000,0.000,0
3,20.044,55.28,0,400,12390,18849,939.736,0.00,0.00,0.00,0.000,0.000,0
4,20.059,54.69,0,400,12403,18921,939.744,0.00,0.00,0.00,0.000,0.000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62625,18.438,15.79,625,400,13723,20569,936.670,0.63,0.65,4.32,0.673,0.015,0
62626,18.653,15.87,612,400,13731,20588,936.678,0.61,0.63,4.18,0.652,0.015,0
62627,18.867,15.84,627,400,13725,20582,936.687,0.57,0.60,3.95,0.617,0.014,0
62628,19.083,16.04,638,400,13712,20566,936.680,0.57,0.59,3.92,0.611,0.014,0


In [34]:
# Persist the cleaned frame as a joblib pickle; the call returns the list of
# written file names, which is what this cell displays.
joblib.dump(value=ori_new, filename='../data/processed/ori_new.pkl')

['../data/processed/ori_new.pkl']