# 0. Imports

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from pandas.errors import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

import datetime

import matplotlib.pyplot as plt

import torch
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor

from tqdm import tqdm

# Seed PyTorch's global random number generator.
# NOTE(review): no NumPy or `random` seed is set in the cells visible here.
torch.manual_seed(41)

<torch._C.Generator at 0x1699f0b9d50>

# 1. Data

The processed dataset from [Part 1](https://github.com/johnwslee/fine_dust_analysis) is used as the raw data for this study.

In [2]:
# Read the processed CSV with its first column as a date-parsed index, then move
# that index back into a regular column so the frame gets a plain integer index.
csv_path = "../data/seoul_fine_dust_weather_2008_2021_for_ml.csv"
df = pd.read_csv(csv_path, index_col=0, parse_dates=True)
df = df.reset_index()

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,date,temp(°C),precipitation(mm),wind_speed(m/s),wind_direction,humidity(%),local_P(hPa),cloud_cover,lowest_ceiling(100m),month,PM10_Counts,PM25_Counts,log_PM10_Counts,log_PM25_Counts
0,2008-01-01 10:00:00,-7.6,0.0,4.6,340.0,47.0,1010.7,2.0,12.0,1,30.0,11.0,3.433987,2.484907
1,2008-01-01 11:00:00,-6.9,0.0,4.1,290.0,42.0,1010.7,2.0,12.0,1,29.0,13.0,3.401197,2.639057
2,2008-01-01 12:00:00,-5.8,0.0,4.1,290.0,42.0,1010.7,2.0,12.0,1,29.0,12.0,3.401197,2.564949
3,2008-01-01 13:00:00,-5.0,0.0,4.1,290.0,36.0,1009.9,0.0,10.0,1,28.0,12.0,3.367296,2.564949
4,2008-01-01 14:00:00,-4.4,0.0,4.1,290.0,36.0,1009.9,0.0,10.0,1,27.0,13.0,3.332205,2.639057


The dataset above will be transformed for use in multi-scale time-series classification. The features are `wind_direction`, `humidity(%)`, `lowest_ceiling(100m)`, `temp(°C)`, `wind_speed(m/s)`, and `local_P(hPa)`; the binary target, `Air_is_bad?`, will be generated from `PM10_Counts`.

In [4]:
# Columns kept for modelling: the timestamp, six weather predictors, and the
# PM10 reading that the target is derived from.
weather_columns = (
    "wind_direction",
    "humidity(%)",
    "lowest_ceiling(100m)",
    "temp(°C)",
    "wind_speed(m/s)",
    "local_P(hPa)",
)
new_columns = ["date", *weather_columns, "PM10_Counts"]

In [5]:
# Restrict the frame to the selected columns, in the order listed in `new_columns`.
df = df.loc[:, new_columns]

In [6]:
# Display the frame after restricting it to `new_columns`.
df

Unnamed: 0,date,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa),PM10_Counts
0,2008-01-01 10:00:00,340.0,47.0,12.0,-7.6,4.6,1010.7,30.0
1,2008-01-01 11:00:00,290.0,42.0,12.0,-6.9,4.1,1010.7,29.0
2,2008-01-01 12:00:00,290.0,42.0,12.0,-5.8,4.1,1010.7,29.0
3,2008-01-01 13:00:00,290.0,36.0,10.0,-5.0,4.1,1009.9,28.0
4,2008-01-01 14:00:00,290.0,36.0,10.0,-4.4,4.1,1009.9,27.0
...,...,...,...,...,...,...,...,...
122728,2021-12-31 05:00:00,270.0,39.0,10.0,-7.7,3.5,1018.5,25.0
122729,2021-12-31 06:00:00,290.0,40.0,10.0,-8.0,3.2,1019.2,23.0
122730,2021-12-31 07:00:00,270.0,42.0,10.0,-8.6,2.7,1019.9,22.0
122731,2021-12-31 08:00:00,270.0,42.0,10.0,-8.8,2.9,1020.8,22.0


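Because the records for 2008-01-01 only start at 10:00, that day is incomplete; the dataset should therefore start from 2008-01-02 so that every day has the same number of hourly readings per feature. The same goes for 2021-12-31, which is also incomplete. Each day's features are paired with the following day's target, so the features range from 2008-01-02 to 2021-12-29, while the target ranges from 2008-01-03 to 2021-12-30.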

## 1.1. Train/Test Split

In [7]:
# Chronological split: rows dated before 2016-01-01 form the training set, rows
# on or after that date form the test set.
# `.copy()` makes each split an independent frame. The feature columns of both
# splits are overwritten in place when they are scaled, and writing into a bare
# slice of `df` is exactly the pattern SettingWithCopyWarning warns about.
split_date = pd.Timestamp(2016, 1, 1)
train_df = df.loc[df["date"] < split_date].copy()
test_df = df.loc[df["date"] >= split_date].copy()

## 1.2. Scaling on Feature Columns

In [8]:
# Names of the six columns at positions 1-6 of `df`, i.e. everything between the
# leading `date` column and the trailing `PM10_Counts` column.
features = df.columns[1:7].tolist()

In [9]:
# Standardizer for the feature columns; it is fitted on the training rows and
# then reused unchanged on the test rows.
scaler = StandardScaler()

In [10]:
# Fit the scaler on the training rows only and overwrite those columns with the
# standardized values.
train_scaled = scaler.fit_transform(train_df[features])
train_df[features] = train_scaled

In [11]:
# Apply the already-fitted scaler to the test rows (no refitting), so the test
# set is standardized with the training statistics.
test_scaled = scaler.transform(test_df[features])
test_df[features] = test_scaled

In [12]:
# Stitch the scaled train and test rows back into a single frame; this is only
# for convenience while building the per-day dataset below.
scaled_parts = [train_df, test_df]
df = pd.concat(scaled_parts, axis=0)

In [13]:
# Display the recombined frame with its standardized feature columns.
df

Unnamed: 0,date,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa),PM10_Counts
0,2008-01-01 10:00:00,1.379492,-0.665907,-0.223685,-1.837310,1.384209,0.614341,30.0
1,2008-01-01 11:00:00,0.924452,-0.916402,-0.223685,-1.774089,1.032243,0.614341,29.0
2,2008-01-01 12:00:00,0.924452,-0.916402,-0.223685,-1.674743,1.032243,0.614341,29.0
3,2008-01-01 13:00:00,0.924452,-1.216997,-0.438062,-1.602491,1.032243,0.513459,28.0
4,2008-01-01 14:00:00,0.924452,-1.216997,-0.438062,-1.548302,1.032243,0.513459,27.0
...,...,...,...,...,...,...,...,...
122728,2021-12-31 05:00:00,0.742436,-1.066699,-0.438062,-1.846341,0.609884,1.597937,25.0
122729,2021-12-31 06:00:00,0.924452,-1.016600,-0.438062,-1.873436,0.398704,1.686208,23.0
122730,2021-12-31 07:00:00,0.742436,-0.916402,-0.438062,-1.927625,0.046738,1.774479,22.0
122731,2021-12-31 08:00:00,0.742436,-0.916402,-0.438062,-1.945688,0.187525,1.887971,22.0


## 1.3. Target Preparation

In [14]:
# Number of rows whose PM10 reading is strictly greater than 45.
int((df["PM10_Counts"] > 45).sum())

46562

In [15]:
# Total number of rows, for comparison with the count above.
len(df)

122733

In [16]:
# Timestamp and PM10 reading only. The explicit `.copy()` makes this an
# independent frame, because a helper column is assigned to it afterwards and
# assigning into a column-slice of `df` triggers SettingWithCopyWarning.
pm10_count = df[["date", "PM10_Counts"]].copy()

In [17]:
# Display the timestamp / PM10 frame.
pm10_count

Unnamed: 0,date,PM10_Counts
0,2008-01-01 10:00:00,30.0
1,2008-01-01 11:00:00,29.0
2,2008-01-01 12:00:00,29.0
3,2008-01-01 13:00:00,28.0
4,2008-01-01 14:00:00,27.0
...,...,...
122728,2021-12-31 05:00:00,25.0
122729,2021-12-31 06:00:00,23.0
122730,2021-12-31 07:00:00,22.0
122731,2021-12-31 08:00:00,22.0


In [18]:
# Add the calendar day of each timestamp as a `date_new` column for daily grouping.
pm10_count = pm10_count.assign(date_new=pm10_count["date"].dt.date)

In [19]:
# Daily mean PM10 per calendar day.
# Only `PM10_Counts` is averaged: aggregating the whole frame would also average
# the datetime `date` column, whose treatment depends on the pandas version's
# `numeric_only` default, and that column is never used afterwards.
target = pm10_count.groupby("date_new")["PM10_Counts"].mean().reset_index()

In [20]:
# Convert the `date_new` values to pandas datetimes.
target["date_new"] = target["date_new"].pipe(pd.to_datetime)

In [21]:
# Binary label: 1 when the daily mean PM10 is at least the threshold, otherwise 0.
pm10_threshold = 45
target["Air_is_bad?"] = np.where(target["PM10_Counts"].ge(pm10_threshold), 1, 0)

In [22]:
# Keep only the day and its binary label.
target = target.loc[:, ["date_new", "Air_is_bad?"]]

In [23]:
# Keep target days from 2008-01-03 through 2021-12-30 (both inclusive) and
# renumber the rows from zero.
first_target_day = pd.Timestamp(2008, 1, 3)
last_target_day = pd.Timestamp(2021, 12, 30)
in_target_window = target["date_new"].between(first_target_day, last_target_day)
target = target[in_target_window].reset_index(drop=True)

In [24]:
# Display the daily target frame after restricting its date range.
target

Unnamed: 0,date_new,Air_is_bad?
0,2008-01-03,1
1,2008-01-04,1
2,2008-01-05,1
3,2008-01-06,1
4,2008-01-07,1
...,...,...
5106,2021-12-26,0
5107,2021-12-27,0
5108,2021-12-28,1
5109,2021-12-29,1


## 1.4. Feature Preparation

In [25]:
# Timestamp plus the six scaled weather columns. The explicit `.copy()` makes
# this an independent frame, because a helper column is assigned to it afterwards
# and assigning into a column-slice of `df` triggers SettingWithCopyWarning.
features = df[
    [
        "date",
        "wind_direction",
        "humidity(%)",
        "lowest_ceiling(100m)",
        "temp(°C)",
        "wind_speed(m/s)",
        "local_P(hPa)",
    ]
].copy()

In [26]:
# Add the calendar day of each timestamp as a `date_new` column for daily grouping.
features = features.assign(date_new=features["date"].dt.date)

In [27]:
# Display the feature frame with its new `date_new` column.
features

Unnamed: 0,date,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa),date_new
0,2008-01-01 10:00:00,1.379492,-0.665907,-0.223685,-1.837310,1.384209,0.614341,2008-01-01
1,2008-01-01 11:00:00,0.924452,-0.916402,-0.223685,-1.774089,1.032243,0.614341,2008-01-01
2,2008-01-01 12:00:00,0.924452,-0.916402,-0.223685,-1.674743,1.032243,0.614341,2008-01-01
3,2008-01-01 13:00:00,0.924452,-1.216997,-0.438062,-1.602491,1.032243,0.513459,2008-01-01
4,2008-01-01 14:00:00,0.924452,-1.216997,-0.438062,-1.548302,1.032243,0.513459,2008-01-01
...,...,...,...,...,...,...,...,...
122728,2021-12-31 05:00:00,0.742436,-1.066699,-0.438062,-1.846341,0.609884,1.597937,2021-12-31
122729,2021-12-31 06:00:00,0.924452,-1.016600,-0.438062,-1.873436,0.398704,1.686208,2021-12-31
122730,2021-12-31 07:00:00,0.742436,-0.916402,-0.438062,-1.927625,0.046738,1.774479,2021-12-31
122731,2021-12-31 08:00:00,0.742436,-0.916402,-0.438062,-1.945688,0.187525,1.887971,2021-12-31


In [28]:
# Collapse the rows of each calendar day into one row, where every weather
# column holds the list of that day's values in row order.
weather_cols = [
    "wind_direction",
    "humidity(%)",
    "lowest_ceiling(100m)",
    "temp(°C)",
    "wind_speed(m/s)",
    "local_P(hPa)",
]
daily_lists = features.groupby("date_new").agg(list)
features = daily_lists[weather_cols].reset_index()

In [29]:
# Convert the `date_new` values to pandas datetimes.
features = features.assign(date_new=pd.to_datetime(features["date_new"]))

In [30]:
# Preview the per-day rows; each weather cell now holds a list of values.
features.head()

Unnamed: 0,date_new,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa)
0,2008-01-01,"[1.3794924231559433, 0.924452475006894, 0.9244...","[-0.665906724059201, -0.9164020855164998, -0.9...","[-0.22368526895253332, -0.22368526895253332, -...","[-1.8373098159974528, -1.7740892697507045, -1....","[1.3842093884999658, 1.0322432575055385, 1.032...","[0.6143406738734757, 0.6143406738734757, 0.614..."
1,2008-01-02,"[0.924452475006894, 1.1974764438963235, 1.1974...","[-0.16491600114460359, -0.665906724059201, -0....","[-0.5452500390488106, -0.5452500390488106, -0....","[-1.7379632433239907, -1.62958516404385, -1.53...","[-0.5164077188699413, 0.11713131692002768, 0.1...","[0.9043753409882764, 1.1817998051850536, 1.181..."
2,2008-01-03,"[-1.7147792242575914, -1.2597392761085422, -1....","[-0.06471785656168408, -0.11481692885314383, -...","[-0.22368526895253332, -0.8668148091450879, -0...","[-1.3496084592368196, -1.1870413403166087, -1....","[-1.8538790166487649, -0.6571941712677122, -0....","[1.1187487905948783, 0.7782733118079257, 0.778..."
3,2008-01-04,"[0.924452475006894, -1.5327632449979716, 1.379...","[0.035480288021235415, -0.01461878427022433, -...","[-0.8668148091450879, -0.8668148091450879, -0....","[-1.2231673667433223, -1.2592933931700356, -1....","[0.046738090721142475, -0.5164077188699413, -1...","[1.1817998051850536, 1.3331222402014657, 1.333..."
4,2008-01-05,"[0.7424364957472743, -1.5327632449979716, -1.2...","[-0.6158076517677413, 0.035480288021235415, -0...","[-0.5452500390488106, -0.5452500390488106, -0....","[-1.1328523006765383, -1.12382079406986, -0.97...","[-0.9387670760632539, -0.6571941712677122, -1....","[0.9548161526604138, 0.9043753409882764, 0.904..."


In [31]:
# Keep feature days from 2008-01-02 through 2021-12-29 (both inclusive) and
# renumber the rows from zero.
first_feature_day = pd.Timestamp(2008, 1, 2)
last_feature_day = pd.Timestamp(2021, 12, 29)
in_feature_window = features["date_new"].between(first_feature_day, last_feature_day)
features = features[in_feature_window].reset_index(drop=True)

In [32]:
# Display the per-day feature frame after restricting its date range.
features

Unnamed: 0,date_new,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa)
0,2008-01-02,"[0.924452475006894, 1.1974764438963235, 1.1974...","[-0.16491600114460359, -0.665906724059201, -0....","[-0.5452500390488106, -0.5452500390488106, -0....","[-1.7379632433239907, -1.62958516404385, -1.53...","[-0.5164077188699413, 0.11713131692002768, 0.1...","[0.9043753409882764, 1.1817998051850536, 1.181..."
1,2008-01-03,"[-1.7147792242575914, -1.2597392761085422, -1....","[-0.06471785656168408, -0.11481692885314383, -...","[-0.22368526895253332, -0.8668148091450879, -0...","[-1.3496084592368196, -1.1870413403166087, -1....","[-1.8538790166487649, -0.6571941712677122, -0....","[1.1187487905948783, 0.7782733118079257, 0.778..."
2,2008-01-04,"[0.924452475006894, -1.5327632449979716, 1.379...","[0.035480288021235415, -0.01461878427022433, -...","[-0.8668148091450879, -0.8668148091450879, -0....","[-1.2231673667433223, -1.2592933931700356, -1....","[0.046738090721142475, -0.5164077188699413, -1...","[1.1817998051850536, 1.3331222402014657, 1.333..."
3,2008-01-05,"[0.7424364957472743, -1.5327632449979716, -1.2...","[-0.6158076517677413, 0.035480288021235415, -0...","[-0.5452500390488106, -0.5452500390488106, -0....","[-1.1328523006765383, -1.12382079406986, -0.97...","[-0.9387670760632539, -0.6571941712677122, -1....","[0.9548161526604138, 0.9043753409882764, 0.904..."
4,2008-01-06,"[-1.5327632449979716, -0.8957073175893028, -0....","[0.8871645169760511, 0.5364710109358329, 0.436...","[-0.4380617823500515, -0.974003065843847, -0.9...","[-1.0515687412164327, -0.8619071024761865, -0....","[-1.0091603022621394, -1.0795535284610247, -1....","[0.9548161526604138, 0.9043753409882764, 0.904..."
...,...,...,...,...,...,...,...
5106,2021-12-25,"[0.924452475006894, 0.924452475006894, 0.74243...","[-0.4655104348933621, -0.9164020855164998, -1....","[-0.11649701225377423, -0.11649701225377423, -...","[-1.8102152961774178, -2.252759119904659, -2.1...","[1.5953890670966224, 0.609883900312226, 0.6802...","[1.2700712256112905, 2.039293603611432, 2.0392..."
5107,2021-12-26,"[0.7424364957472743, 0.7424364957472743, 0.924...","[-0.7160057963506608, -0.8162039409335803, -0....","[-0.11649701225377423, -0.11649701225377423, -...","[-2.451452265251584, -2.388231719004835, -2.29...","[-0.3756212664721703, 0.6802771265111115, 0.18...","[2.3671588794803324, 2.518481314496759, 2.4680..."
5108,2021-12-27,"[1.1974764438963235, -1.2597392761085422, -0.8...","[-0.36531229031044254, -0.31521321801898283, -...","[-0.11649701225377423, -0.11649701225377423, -...","[-2.2437276132979806, -1.9185933754575586, -1....","[-0.23483481407439957, -0.5164077188699413, -0...","[2.1275650240376835, 1.963632386103219, 1.9258..."
5109,2021-12-28,"[-1.5327632449979716, -1.2597392761085422, -1....","[1.1376598784333498, 1.0374617338504304, 0.636...","[-0.3308735256512924, -0.3308735256512924, -0....","[-1.7199002301106339, -1.6657111904705635, -1....","[-0.30522804027328476, -0.3756212664721703, -0...","[1.7114283277425177, 1.7240385306605557, 1.686..."


## 1.5. Dataset Preparation

In [33]:
# Pair each day's features with the NEXT day's label.
# The join is made on the date (feature day = target day - 1) rather than on row
# position. Positional alignment silently attaches wrong labels to every later
# row as soon as one calendar day is missing from either frame; joining by date
# simply drops an unmatched day instead.
next_day_label = target[["date_new", "Air_is_bad?"]].assign(
    date_new=lambda labels: labels["date_new"] - pd.Timedelta(days=1)
)
data = features.merge(
    next_day_label,
    on="date_new",
    how="inner",
    validate="one_to_one",  # each day may appear only once on either side
)

In [34]:
# Display the combined features / label frame.
data

Unnamed: 0,date_new,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa),Air_is_bad?
0,2008-01-02,"[0.924452475006894, 1.1974764438963235, 1.1974...","[-0.16491600114460359, -0.665906724059201, -0....","[-0.5452500390488106, -0.5452500390488106, -0....","[-1.7379632433239907, -1.62958516404385, -1.53...","[-0.5164077188699413, 0.11713131692002768, 0.1...","[0.9043753409882764, 1.1817998051850536, 1.181...",1
1,2008-01-03,"[-1.7147792242575914, -1.2597392761085422, -1....","[-0.06471785656168408, -0.11481692885314383, -...","[-0.22368526895253332, -0.8668148091450879, -0...","[-1.3496084592368196, -1.1870413403166087, -1....","[-1.8538790166487649, -0.6571941712677122, -0....","[1.1187487905948783, 0.7782733118079257, 0.778...",1
2,2008-01-04,"[0.924452475006894, -1.5327632449979716, 1.379...","[0.035480288021235415, -0.01461878427022433, -...","[-0.8668148091450879, -0.8668148091450879, -0....","[-1.2231673667433223, -1.2592933931700356, -1....","[0.046738090721142475, -0.5164077188699413, -1...","[1.1817998051850536, 1.3331222402014657, 1.333...",1
3,2008-01-05,"[0.7424364957472743, -1.5327632449979716, -1.2...","[-0.6158076517677413, 0.035480288021235415, -0...","[-0.5452500390488106, -0.5452500390488106, -0....","[-1.1328523006765383, -1.12382079406986, -0.97...","[-0.9387670760632539, -0.6571941712677122, -1....","[0.9548161526604138, 0.9043753409882764, 0.904...",1
4,2008-01-06,"[-1.5327632449979716, -0.8957073175893028, -0....","[0.8871645169760511, 0.5364710109358329, 0.436...","[-0.4380617823500515, -0.974003065843847, -0.9...","[-1.0515687412164327, -0.8619071024761865, -0....","[-1.0091603022621394, -1.0795535284610247, -1....","[0.9548161526604138, 0.9043753409882764, 0.904...",1
...,...,...,...,...,...,...,...,...
5106,2021-12-25,"[0.924452475006894, 0.924452475006894, 0.74243...","[-0.4655104348933621, -0.9164020855164998, -1....","[-0.11649701225377423, -0.11649701225377423, -...","[-1.8102152961774178, -2.252759119904659, -2.1...","[1.5953890670966224, 0.609883900312226, 0.6802...","[1.2700712256112905, 2.039293603611432, 2.0392...",0
5107,2021-12-26,"[0.7424364957472743, 0.7424364957472743, 0.924...","[-0.7160057963506608, -0.8162039409335803, -0....","[-0.11649701225377423, -0.11649701225377423, -...","[-2.451452265251584, -2.388231719004835, -2.29...","[-0.3756212664721703, 0.6802771265111115, 0.18...","[2.3671588794803324, 2.518481314496759, 2.4680...",0
5108,2021-12-27,"[1.1974764438963235, -1.2597392761085422, -0.8...","[-0.36531229031044254, -0.31521321801898283, -...","[-0.11649701225377423, -0.11649701225377423, -...","[-2.2437276132979806, -1.9185933754575586, -1....","[-0.23483481407439957, -0.5164077188699413, -0...","[2.1275650240376835, 1.963632386103219, 1.9258...",1
5109,2021-12-28,"[-1.5327632449979716, -1.2597392761085422, -1....","[1.1376598784333498, 1.0374617338504304, 0.636...","[-0.3308735256512924, -0.3308735256512924, -0....","[-1.7199002301106339, -1.6657111904705635, -1....","[-0.30522804027328476, -0.3756212664721703, -0...","[1.7114283277425177, 1.7240385306605557, 1.686...",1


### 1.5.1. Preparation of X

In [35]:
# Report every (row, column) cell whose per-day list does not hold exactly 24
# values, so that days with an inconsistent size can be filtered out.

for row_idx in tqdm(range(len(data))):
    for col_idx in range(1, 7):
        n_values = len(data.iloc[row_idx, col_idx])
        if n_values != 24:
            print(row_idx, col_idx, n_values)

 85%|████████████████████████████████████████████████████████████████▎           | 4326/5111 [00:00<00:00, 6140.47it/s]

2791 1 23
2791 2 23
2791 3 23
2791 4 23
2791 5 23
2791 6 23
2801 1 23
2801 2 23
2801 3 23
2801 4 23
2801 5 23
2801 6 23


100%|████████████████████████████████████████████████████████████████████████████| 5111/5111 [00:00<00:00, 6113.68it/s]


In [36]:
# Drop every day that does not have exactly 24 values in each feature column.
# The rows are found programmatically instead of hard-coding the labels read off
# the printed check, so the cell stays correct if the data changes and is safe
# to re-run (a second run finds nothing to drop instead of raising KeyError).
hours_per_day = 24
values_per_day = data.iloc[:, 1:7].apply(lambda col: col.map(len))
incomplete_days = data.index[values_per_day.ne(hours_per_day).any(axis=1)]
data = data.drop(index=incomplete_days)

In [37]:
# One 2-D array per day (one row per feature column), then all days stacked along
# a new leading axis to form X.
feature_array = [
    np.stack([np.array(day_values) for day_values in data.iloc[day_idx, 1:7]], axis=0)
    for day_idx in tqdm(range(len(data)))
]

X = np.stack(feature_array, axis=0)

100%|████████████████████████████████████████████████████████████████████████████| 5109/5109 [00:01<00:00, 5098.81it/s]


In [38]:
# Check the stacked feature array's shape.
X.shape

(5109, 6, 24)

### 1.5.2. Preparation of y

In [39]:
# Labels as a standalone NumPy array (copied, so it does not share memory with `data`).
y = data["Air_is_bad?"].to_numpy(copy=True)

In [40]:
# Check the label array's shape.
y.shape

(5109,)

### 1.5.3. Train/Test Split for Deep Learning

In [41]:
# Number of rows dated before 2016-01-01, i.e. the size of the training set.
int((data["date_new"] < pd.Timestamp(2016, 1, 1)).sum())

2919

In [42]:
# Split X / y at the first day on or after 2016-01-01.
# The split position is derived from the dates instead of hard-coding a row count
# copied from an earlier output, so it stays correct if rows are added or dropped
# upstream. This relies on the rows of `data` being in date order and on X / y
# having been built from `data` row by row.
n_train = int((data["date_new"] < pd.Timestamp(2016, 1, 1)).sum())

X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# 2. DataLoader Preparation