# 0. Imports

In [1]:
import os
import warnings

import pandas as pd
import numpy as np

from pandas.errors import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)
import datetime

# 1. Data

The processed dataset from [Part 1](https://github.com/johnwslee/fine_dust_analysis) is used as the raw data for this study.

In [2]:
# Load the Part 1 output. The first CSV column is read as the index with date
# parsing enabled, then moved back into an ordinary column by `reset_index`.
raw_csv_path = "../data/raw/seoul_fine_dust_weather_2008_2021_for_ml.csv"
df = pd.read_csv(raw_csv_path, index_col=0, parse_dates=True)
df = df.reset_index()

In [3]:
# Peek at the first five rows to confirm the columns and the parsed timestamps.
df.head(n=5)

Unnamed: 0,date,temp(°C),precipitation(mm),wind_speed(m/s),wind_direction,humidity(%),local_P(hPa),cloud_cover,lowest_ceiling(100m),month,PM10_Counts,PM25_Counts,log_PM10_Counts,log_PM25_Counts
0,2008-01-01 10:00:00,-7.6,0.0,4.6,340.0,47.0,1010.7,2.0,12.0,1,30.0,11.0,3.433987,2.484907
1,2008-01-01 11:00:00,-6.9,0.0,4.1,290.0,42.0,1010.7,2.0,12.0,1,29.0,13.0,3.401197,2.639057
2,2008-01-01 12:00:00,-5.8,0.0,4.1,290.0,42.0,1010.7,2.0,12.0,1,29.0,12.0,3.401197,2.564949
3,2008-01-01 13:00:00,-5.0,0.0,4.1,290.0,36.0,1009.9,0.0,10.0,1,28.0,12.0,3.367296,2.564949
4,2008-01-01 14:00:00,-4.4,0.0,4.1,290.0,36.0,1009.9,0.0,10.0,1,27.0,13.0,3.332205,2.639057


The dataset above will be reshaped for multi-scale time-series classification. `wind_direction`, `humidity(%)`, `lowest_ceiling(100m)`, `temp(°C)`, `wind_speed(m/s)`, and `local_P(hPa)` will be used as features, whereas the binary target, `Air_is_bad?`, will be generated from the daily mean of `PM10_Counts`.

In [4]:
# Columns kept for this study: the timestamp, the six weather readings used as
# features, and the PM10 reading from which the target is derived.
weather_columns = (
    "wind_direction",
    "humidity(%)",
    "lowest_ceiling(100m)",
    "temp(°C)",
    "wind_speed(m/s)",
    "local_P(hPa)",
)
new_columns = ["date", *weather_columns, "PM10_Counts"]

In [5]:
# Keep only the columns listed in `new_columns`, in that order.
df = df.loc[:, new_columns]

In [6]:
# Preview the frame after restricting it to `new_columns`.
df

Unnamed: 0,date,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa),PM10_Counts
0,2008-01-01 10:00:00,340.0,47.0,12.0,-7.6,4.6,1010.7,30.0
1,2008-01-01 11:00:00,290.0,42.0,12.0,-6.9,4.1,1010.7,29.0
2,2008-01-01 12:00:00,290.0,42.0,12.0,-5.8,4.1,1010.7,29.0
3,2008-01-01 13:00:00,290.0,36.0,10.0,-5.0,4.1,1009.9,28.0
4,2008-01-01 14:00:00,290.0,36.0,10.0,-4.4,4.1,1009.9,27.0
...,...,...,...,...,...,...,...,...
122728,2021-12-31 05:00:00,270.0,39.0,10.0,-7.7,3.5,1018.5,25.0
122729,2021-12-31 06:00:00,290.0,40.0,10.0,-8.0,3.2,1019.2,23.0
122730,2021-12-31 07:00:00,270.0,42.0,10.0,-8.6,2.7,1019.9,22.0
122731,2021-12-31 08:00:00,270.0,42.0,10.0,-8.8,2.9,1020.8,22.0


Because the records for 2008-01-01 start at 10:00, that day is incomplete, so the features start from 2008-01-02 to give every feature the same amount of data per day. Each day's features are paired with the following day's target: the features range from 2008-01-02 to 2021-12-30, while the target ranges from 2008-01-03 to 2021-12-31.

# 2. Target Preparation

In [7]:
# Number of hourly rows whose PM10 reading is strictly above 45.
# NOTE(review): the daily label built later in this notebook uses `>= 45`;
# confirm which comparison is intended.
int((df["PM10_Counts"] > 45).sum())

46562

In [8]:
# Total number of hourly rows, for comparison with the count above.
len(df)

122733

In [9]:
# Timestamp + PM10 reading, used to derive the daily target.
# An explicit copy is taken because a `date_new` column is added to this frame
# in the next step; writing to a bare selection of `df` is what triggers
# pandas' SettingWithCopyWarning.
pm10_count = df[["date", "PM10_Counts"]].copy()

In [10]:
# Preview the timestamp / PM10 pairs that the target is built from.
pm10_count

Unnamed: 0,date,PM10_Counts
0,2008-01-01 10:00:00,30.0
1,2008-01-01 11:00:00,29.0
2,2008-01-01 12:00:00,29.0
3,2008-01-01 13:00:00,28.0
4,2008-01-01 14:00:00,27.0
...,...,...
122728,2021-12-31 05:00:00,25.0
122729,2021-12-31 06:00:00,23.0
122730,2021-12-31 07:00:00,22.0
122731,2021-12-31 08:00:00,22.0


In [11]:
# Add the calendar day of every hourly timestamp; it is the grouping key for
# the daily aggregation that follows.
pm10_count = pm10_count.assign(date_new=pm10_count["date"].dt.date)

In [12]:
# Daily mean PM10, one row per calendar day.
# Only `PM10_Counts` is averaged. Calling `.mean()` on the whole group would
# also try to average the `date` timestamps, whose handling differs between
# pandas versions and is never used downstream.
target = pm10_count.groupby("date_new")["PM10_Counts"].mean().reset_index()

In [13]:
# `date_new` holds plain `date` objects after the group-by; convert it to a
# datetime column so the `.dt` accessor can be used on it.
target = target.assign(date_new=pd.to_datetime(target["date_new"]))

In [14]:
# Binary label: 1 when the daily mean PM10 is at least 45, otherwise 0.
is_bad_day = target["PM10_Counts"] >= 45
target["Air_is_bad?"] = is_bad_day.astype(int)

In [15]:
# Drop the raw daily mean; only the day and its label are needed from here on.
target = target.loc[:, ["date_new", "Air_is_bad?"]]

In [16]:
# Targets start on 2008-01-03, one day after the first feature day.
# NOTE(review): the hourly preview above ends on the morning of 2021-12-31, so
# the label for that day appears to come from a partial day of readings;
# confirm that is acceptable.
first_target_day = pd.Timestamp(2008, 1, 3)
on_or_after_start = target["date_new"] >= first_target_day
target = target[on_or_after_start].reset_index(drop=True)

In [17]:
# Inspect the finished target: one row per day with its binary label.
target

Unnamed: 0,date_new,Air_is_bad?
0,2008-01-03,1
1,2008-01-04,1
2,2008-01-05,1
3,2008-01-06,1
4,2008-01-07,1
...,...,...
5107,2021-12-27,0
5108,2021-12-28,1
5109,2021-12-29,1
5110,2021-12-30,0


# 3. Feature Preparation

In [18]:
# Timestamp plus the six weather readings used as model inputs.
# An explicit copy is taken because a `date_new` column is added to this frame
# in the next step; writing to a bare selection of `df` is what triggers
# pandas' SettingWithCopyWarning.
features = df[[
    "date",
    "wind_direction",
    "humidity(%)",
    "lowest_ceiling(100m)",
    "temp(°C)",
    "wind_speed(m/s)",
    "local_P(hPa)"
]].copy()

In [19]:
# Add the calendar day of every hourly timestamp; it is the grouping key for
# the per-day aggregation that follows.
features = features.assign(date_new=features["date"].dt.date)

In [20]:
# Check that `date_new` now sits alongside the hourly readings.
features

Unnamed: 0,date,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa),date_new
0,2008-01-01 10:00:00,340.0,47.0,12.0,-7.6,4.6,1010.7,2008-01-01
1,2008-01-01 11:00:00,290.0,42.0,12.0,-6.9,4.1,1010.7,2008-01-01
2,2008-01-01 12:00:00,290.0,42.0,12.0,-5.8,4.1,1010.7,2008-01-01
3,2008-01-01 13:00:00,290.0,36.0,10.0,-5.0,4.1,1009.9,2008-01-01
4,2008-01-01 14:00:00,290.0,36.0,10.0,-4.4,4.1,1009.9,2008-01-01
...,...,...,...,...,...,...,...,...
122728,2021-12-31 05:00:00,270.0,39.0,10.0,-7.7,3.5,1018.5,2021-12-31
122729,2021-12-31 06:00:00,290.0,40.0,10.0,-8.0,3.2,1019.2,2021-12-31
122730,2021-12-31 07:00:00,270.0,42.0,10.0,-8.6,2.7,1019.9,2021-12-31
122731,2021-12-31 08:00:00,270.0,42.0,10.0,-8.8,2.9,1020.8,2021-12-31


In [21]:
# Collapse the hourly rows into one row per calendar day, where every weather
# column holds the list of that day's readings.
#
# * Rows are first sorted by timestamp (stable sort) so each list is in
#   chronological order even if the CSV rows were not.
# * The feature columns are selected before aggregating, so the unused `date`
#   column is not turned into lists only to be thrown away.
#
# NOTE(review): nothing here checks that every day contributes the same number
# of readings. The 122,733 rows shown earlier are more than 24 rows per day
# would give for this date range, so list lengths may differ between days.
# TODO confirm before feeding fixed-length models.
features = (
    features.sort_values("date", kind="stable")
    .groupby("date_new")[
        [
            "wind_direction",
            "humidity(%)",
            "lowest_ceiling(100m)",
            "temp(°C)",
            "wind_speed(m/s)",
            "local_P(hPa)",
        ]
    ]
    .agg(list)
    .reset_index()
)

In [22]:
# `date_new` holds plain `date` objects after the group-by; convert it to a
# datetime column so the `.dt` accessor can be used on it.
features = features.assign(date_new=pd.to_datetime(features["date_new"]))

In [23]:
# Look at the first five day-level rows; each cell is a list of hourly values.
features.head(n=5)

Unnamed: 0,date_new,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa)
0,2008-01-01,"[340.0, 290.0, 290.0, 290.0, 290.0, 290.0, 290...","[47.0, 42.0, 42.0, 36.0, 36.0, 34.0, 35.0, 37....","[12.0, 12.0, 12.0, 10.0, 10.0, 10.0, 10.0, 10....","[-7.6, -6.9, -5.8, -5.0, -4.4, -3.5, -3.4, -3....","[4.6, 4.1, 4.1, 4.1, 4.1, 3.6, 3.8, 3.6, 3.3, ...","[1010.7, 1010.7, 1010.7, 1009.9, 1009.9, 1009...."
1,2008-01-02,"[290.0, 320.0, 320.0, 290.0, 290.0, 290.0, 290...","[57.0, 47.0, 42.0, 40.0, 30.0, 31.0, 30.0, 34....","[9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...","[-6.5, -5.3, -4.3, -2.7, -1.2, -0.6, 0.2, 0.4,...","[1.9, 2.8, 2.8, 2.4, 2.7, 3.6, 2.8, 4.3, 4.1, ...","[1013.0, 1015.2, 1015.2, 1015.2, 1014.0, 1014...."
2,2008-01-03,"[0.0, 50.0, 50.0, 0.0, 250.0, 270.0, 270.0, 27...","[59.0, 58.0, 57.0, 53.0, 46.0, 33.0, 30.0, 37....","[12.0, 6.0, 6.0, 12.0, 6.0, 6.0, 9.0, 9.0, 9.0...","[-2.2, -0.4, 1.0, 3.2, 6.0, 6.4, 6.4, 5.9, 5.0...","[0.0, 1.7, 2.1, 0.3, 3.7, 5.6, 5.1, 3.9, 3.7, ...","[1014.7, 1012.0, 1012.0, 1012.0, 1011.2, 1011...."
3,2008-01-04,"[290.0, 20.0, 340.0, 250.0, 290.0, 200.0, 290....","[61.0, 60.0, 54.0, 44.0, 40.0, 32.0, 37.0, 38....","[6.0, 6.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...","[-0.8, -1.2, 0.3, 1.7, 2.5, 3.5, 3.2, 2.9, 2.5...","[2.7, 1.9, 0.8, 2.4, 2.8, 3.5, 1.3, 1.3, 2.4, ...","[1015.2, 1016.4, 1016.4, 1016.4, 1014.0, 1014...."
4,2008-01-05,"[270.0, 20.0, 50.0, 230.0, 270.0, 250.0, 290.0...","[48.0, 61.0, 53.0, 46.0, 50.0, 48.0, 48.0, 51....","[9.0, 9.0, 9.0, 12.0, 6.0, 6.0, 10.0, 6.0, 5.0...","[0.2, 0.3, 2.0, 4.4, 4.5, 5.4, 5.6, 5.5, 4.5, ...","[1.3, 1.7, 0.8, 1.5, 2.7, 3.4, 3.2, 3.1, 3.1, ...","[1013.4, 1013.0, 1013.0, 1013.0, 1011.9, 1011...."


In [24]:
# Keep feature days from 2008-01-02 through 2021-12-30 (both ends inclusive).
in_feature_window = features["date_new"].between(
    pd.Timestamp(2008, 1, 2), pd.Timestamp(2021, 12, 30)
)
features = features[in_feature_window].reset_index(drop=True)

In [25]:
# Preview the day-level features after the date-range filter.
features

Unnamed: 0,date_new,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa)
0,2008-01-02,"[290.0, 320.0, 320.0, 290.0, 290.0, 290.0, 290...","[57.0, 47.0, 42.0, 40.0, 30.0, 31.0, 30.0, 34....","[9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...","[-6.5, -5.3, -4.3, -2.7, -1.2, -0.6, 0.2, 0.4,...","[1.9, 2.8, 2.8, 2.4, 2.7, 3.6, 2.8, 4.3, 4.1, ...","[1013.0, 1015.2, 1015.2, 1015.2, 1014.0, 1014...."
1,2008-01-03,"[0.0, 50.0, 50.0, 0.0, 250.0, 270.0, 270.0, 27...","[59.0, 58.0, 57.0, 53.0, 46.0, 33.0, 30.0, 37....","[12.0, 6.0, 6.0, 12.0, 6.0, 6.0, 9.0, 9.0, 9.0...","[-2.2, -0.4, 1.0, 3.2, 6.0, 6.4, 6.4, 5.9, 5.0...","[0.0, 1.7, 2.1, 0.3, 3.7, 5.6, 5.1, 3.9, 3.7, ...","[1014.7, 1012.0, 1012.0, 1012.0, 1011.2, 1011...."
2,2008-01-04,"[290.0, 20.0, 340.0, 250.0, 290.0, 200.0, 290....","[61.0, 60.0, 54.0, 44.0, 40.0, 32.0, 37.0, 38....","[6.0, 6.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...","[-0.8, -1.2, 0.3, 1.7, 2.5, 3.5, 3.2, 2.9, 2.5...","[2.7, 1.9, 0.8, 2.4, 2.8, 3.5, 1.3, 1.3, 2.4, ...","[1015.2, 1016.4, 1016.4, 1016.4, 1014.0, 1014...."
3,2008-01-05,"[270.0, 20.0, 50.0, 230.0, 270.0, 250.0, 290.0...","[48.0, 61.0, 53.0, 46.0, 50.0, 48.0, 48.0, 51....","[9.0, 9.0, 9.0, 12.0, 6.0, 6.0, 10.0, 6.0, 5.0...","[0.2, 0.3, 2.0, 4.4, 4.5, 5.4, 5.6, 5.5, 4.5, ...","[1.3, 1.7, 0.8, 1.5, 2.7, 3.4, 3.2, 3.1, 3.1, ...","[1013.4, 1013.0, 1013.0, 1013.0, 1011.9, 1011...."
4,2008-01-06,"[20.0, 90.0, 90.0, 20.0, 50.0, 250.0, 200.0, 2...","[78.0, 71.0, 69.0, 65.0, 64.0, 67.0, 68.0, 69....","[10.0, 5.0, 5.0, 12.0, 5.0, 6.0, 12.0, 6.0, 5....","[1.1, 3.2, 3.9, 5.1, 5.9, 7.1, 7.3, 7.3, 6.5, ...","[1.2, 1.1, 0.9, 1.3, 1.2, 2.7, 2.0, 1.0, 1.8, ...","[1013.4, 1013.0, 1013.0, 1013.0, 1011.2, 1011...."
...,...,...,...,...,...,...,...
5107,2021-12-26,"[270.0, 270.0, 290.0, 320.0, 270.0, 290.0, 290...","[46.0, 44.0, 43.0, 39.0, 35.0, 32.0, 31.0, 33....","[13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13....","[-14.4, -13.7, -12.7, -11.5, -9.5, -8.9, -8.1,...","[2.1, 3.6, 2.9, 3.3, 2.6, 4.4, 4.0, 3.0, 2.6, ...","[1024.6, 1025.8, 1025.4, 1024.0, 1022.8, 1022...."
5108,2021-12-27,"[320.0, 50.0, 90.0, 200.0, 180.0, 230.0, 180.0...","[53.0, 54.0, 44.0, 40.0, 45.0, 48.0, 60.0, 65....","[13.0, 13.0, 13.0, 13.0, 9.0, 8.0, 7.0, 7.0, 7...","[-12.1, -8.5, -5.9, -4.8, -4.4, -4.2, -4.2, -4...","[2.3, 1.9, 1.6, 1.3, 2.4, 1.7, 0.6, 1.7, 0.7, ...","[1022.7, 1021.4, 1021.1, 1020.1, 1019.1, 1018...."
5109,2021-12-28,"[20.0, 50.0, 70.0, 70.0, 50.0, 50.0, 50.0, 50....","[83.0, 81.0, 73.0, 64.0, 62.0, 59.0, 58.0, 58....","[11.0, 11.0, 12.0, 12.0, 13.0, 11.0, 13.0, 13....","[-6.3, -5.7, -4.0, -2.5, -2.0, -1.5, -1.2, -0....","[2.2, 2.1, 2.3, 2.1, 2.8, 2.1, 2.6, 2.6, 2.5, ...","[1019.4, 1019.5, 1019.2, 1018.2, 1017.3, 1016...."
5110,2021-12-29,"[20.0, 70.0, 50.0, 50.0, 180.0, 270.0, 250.0, ...","[78.0, 81.0, 78.0, 76.0, 72.0, 59.0, 54.0, 58....","[13.0, 10.0, 10.0, 15.0, 15.0, 11.0, 10.0, 10....","[-3.2, 0.2, 1.5, 2.3, 4.9, 5.1, 5.5, 5.4, 4.9,...","[1.6, 1.2, 1.6, 1.1, 2.1, 4.9, 4.5, 4.7, 3.7, ...","[1013.6, 1009.3, 1009.0, 1007.8, 1006.8, 1006...."


# 4. Dataset Preparation

In [26]:
# Pair each feature day with the label of the FOLLOWING day.
#
# The label dates are shifted back by one day and matched to the feature dates
# with a merge. Matching on the date, rather than gluing the two frames
# together by row position, means a day missing from either side can no longer
# shift every later label onto the wrong features. Feature days without a
# next-day label are dropped (`how="inner"`), and `validate` raises if a date
# appears more than once on either side.
next_day_label = target.assign(
    date_new=target["date_new"] - pd.Timedelta(days=1)
)
data = features.merge(
    next_day_label[["date_new", "Air_is_bad?"]],
    on="date_new",
    how="inner",
    validate="one_to_one",
)

In [27]:
# Preview the combined features + label table before it is saved.
data

Unnamed: 0,date_new,wind_direction,humidity(%),lowest_ceiling(100m),temp(°C),wind_speed(m/s),local_P(hPa),Air_is_bad?
0,2008-01-02,"[290.0, 320.0, 320.0, 290.0, 290.0, 290.0, 290...","[57.0, 47.0, 42.0, 40.0, 30.0, 31.0, 30.0, 34....","[9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...","[-6.5, -5.3, -4.3, -2.7, -1.2, -0.6, 0.2, 0.4,...","[1.9, 2.8, 2.8, 2.4, 2.7, 3.6, 2.8, 4.3, 4.1, ...","[1013.0, 1015.2, 1015.2, 1015.2, 1014.0, 1014....",1
1,2008-01-03,"[0.0, 50.0, 50.0, 0.0, 250.0, 270.0, 270.0, 27...","[59.0, 58.0, 57.0, 53.0, 46.0, 33.0, 30.0, 37....","[12.0, 6.0, 6.0, 12.0, 6.0, 6.0, 9.0, 9.0, 9.0...","[-2.2, -0.4, 1.0, 3.2, 6.0, 6.4, 6.4, 5.9, 5.0...","[0.0, 1.7, 2.1, 0.3, 3.7, 5.6, 5.1, 3.9, 3.7, ...","[1014.7, 1012.0, 1012.0, 1012.0, 1011.2, 1011....",1
2,2008-01-04,"[290.0, 20.0, 340.0, 250.0, 290.0, 200.0, 290....","[61.0, 60.0, 54.0, 44.0, 40.0, 32.0, 37.0, 38....","[6.0, 6.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...","[-0.8, -1.2, 0.3, 1.7, 2.5, 3.5, 3.2, 2.9, 2.5...","[2.7, 1.9, 0.8, 2.4, 2.8, 3.5, 1.3, 1.3, 2.4, ...","[1015.2, 1016.4, 1016.4, 1016.4, 1014.0, 1014....",1
3,2008-01-05,"[270.0, 20.0, 50.0, 230.0, 270.0, 250.0, 290.0...","[48.0, 61.0, 53.0, 46.0, 50.0, 48.0, 48.0, 51....","[9.0, 9.0, 9.0, 12.0, 6.0, 6.0, 10.0, 6.0, 5.0...","[0.2, 0.3, 2.0, 4.4, 4.5, 5.4, 5.6, 5.5, 4.5, ...","[1.3, 1.7, 0.8, 1.5, 2.7, 3.4, 3.2, 3.1, 3.1, ...","[1013.4, 1013.0, 1013.0, 1013.0, 1011.9, 1011....",1
4,2008-01-06,"[20.0, 90.0, 90.0, 20.0, 50.0, 250.0, 200.0, 2...","[78.0, 71.0, 69.0, 65.0, 64.0, 67.0, 68.0, 69....","[10.0, 5.0, 5.0, 12.0, 5.0, 6.0, 12.0, 6.0, 5....","[1.1, 3.2, 3.9, 5.1, 5.9, 7.1, 7.3, 7.3, 6.5, ...","[1.2, 1.1, 0.9, 1.3, 1.2, 2.7, 2.0, 1.0, 1.8, ...","[1013.4, 1013.0, 1013.0, 1013.0, 1011.2, 1011....",1
...,...,...,...,...,...,...,...,...
5107,2021-12-26,"[270.0, 270.0, 290.0, 320.0, 270.0, 290.0, 290...","[46.0, 44.0, 43.0, 39.0, 35.0, 32.0, 31.0, 33....","[13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13....","[-14.4, -13.7, -12.7, -11.5, -9.5, -8.9, -8.1,...","[2.1, 3.6, 2.9, 3.3, 2.6, 4.4, 4.0, 3.0, 2.6, ...","[1024.6, 1025.8, 1025.4, 1024.0, 1022.8, 1022....",0
5108,2021-12-27,"[320.0, 50.0, 90.0, 200.0, 180.0, 230.0, 180.0...","[53.0, 54.0, 44.0, 40.0, 45.0, 48.0, 60.0, 65....","[13.0, 13.0, 13.0, 13.0, 9.0, 8.0, 7.0, 7.0, 7...","[-12.1, -8.5, -5.9, -4.8, -4.4, -4.2, -4.2, -4...","[2.3, 1.9, 1.6, 1.3, 2.4, 1.7, 0.6, 1.7, 0.7, ...","[1022.7, 1021.4, 1021.1, 1020.1, 1019.1, 1018....",1
5109,2021-12-28,"[20.0, 50.0, 70.0, 70.0, 50.0, 50.0, 50.0, 50....","[83.0, 81.0, 73.0, 64.0, 62.0, 59.0, 58.0, 58....","[11.0, 11.0, 12.0, 12.0, 13.0, 11.0, 13.0, 13....","[-6.3, -5.7, -4.0, -2.5, -2.0, -1.5, -1.2, -0....","[2.2, 2.1, 2.3, 2.1, 2.8, 2.1, 2.6, 2.6, 2.5, ...","[1019.4, 1019.5, 1019.2, 1018.2, 1017.3, 1016....",1
5110,2021-12-29,"[20.0, 70.0, 50.0, 50.0, 180.0, 270.0, 250.0, ...","[78.0, 81.0, 78.0, 76.0, 72.0, 59.0, 54.0, 58....","[13.0, 10.0, 10.0, 15.0, 15.0, 11.0, 10.0, 10....","[-3.2, 0.2, 1.5, 2.3, 4.9, 5.1, 5.5, 5.4, 4.9,...","[1.6, 1.2, 1.6, 1.1, 2.1, 4.9, 4.5, 4.7, 3.7, ...","[1013.6, 1009.3, 1009.0, 1007.8, 1006.8, 1006....",0


In [28]:
# Create the output folder if needed; `to_csv` does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs("../data/interim", exist_ok=True)
# Note: the list-valued feature cells are written as text such as
# "[290.0, 320.0, ...]", so whoever reads this file has to parse them back
# into lists.
data.to_csv("../data/interim/fine_dust_dataset.csv", index=False)