# 0. Imports

In [1]:
import os

import pandas as pd

# 1. Fine-Dust Data of Seoul

The raw data were downloaded from the [Public Data Portal](https://www.data.go.kr/data/15089266/fileData.do). The files contain hourly fine-dust measurements (PM10 and PM2.5) for Seoul, Korea, covering 2008 through 2021.

In [7]:
os.listdir("../data/raw/fine_dust")

['서울시 대기질 자료 제공_2008-2011.csv',
 '서울시 대기질 자료 제공_2012-2015.csv',
 '서울시 대기질 자료 제공_2016-2019.csv',
 '서울시 대기질 자료 제공_2020-2021.csv']

In [20]:
# Load every raw fine-dust CSV and stack them into one DataFrame.
# The directory is listed once and sorted: os.listdir() returns entries in
# arbitrary order, so re-listing it on each iteration (as index-based access
# would) is wasteful and gives no guarantee about which file an index refers to.
fine_dust_dir = "../data/raw/fine_dust"
fine_dust_frames = [
    # The source files are encoded in cp949 (Korean Windows code page).
    pd.read_csv(f"{fine_dust_dir}/{file_name}", encoding="cp949")
    for file_name in sorted(os.listdir(fine_dust_dir))
]
# A single concat avoids re-copying the accumulated rows for every file.
# Note: pd.concat raises ValueError if the directory contained no files.
fine_dust_df = pd.concat(fine_dust_frames)

In [21]:
fine_dust_df.head(3)

Unnamed: 0,일시,구분,미세먼지(PM10),초미세먼지(PM25)
0,2011-12-31 23:00,평균,89.0,61.0
1,2011-12-31 23:00,강남구,91.0,58.0
2,2011-12-31 23:00,강동구,89.0,59.0


In [22]:
# Translate the Korean column headers into English names.
korean_to_english = {
    "일시": "Date",
    "구분": "Data_Type",
    "미세먼지(PM10)": "PM10_Counts",
    "초미세먼지(PM25)": "PM25_Counts",
}
fine_dust_df = fine_dust_df.rename(columns=korean_to_english)

In [23]:
fine_dust_df.head(3)

Unnamed: 0,Date,Data_Type,PM10_Counts,PM25_Counts
0,2011-12-31 23:00,평균,89.0,61.0
1,2011-12-31 23:00,강남구,91.0,58.0
2,2011-12-31 23:00,강동구,89.0,59.0


In [24]:
# Keep only the city-wide rows ('평균' means Average), order them by date,
# and discard the now-constant Data_Type column.
is_city_average = fine_dust_df["Data_Type"] == "평균"
seoul_average = fine_dust_df.loc[is_city_average].sort_values(by="Date")
fine_dust_df = seoul_average.drop(columns=["Data_Type"]).reset_index(drop=True)

In [25]:
# Parse the Date strings into pandas datetime values.
fine_dust_df = fine_dust_df.assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

In [26]:
fine_dust_df.head()

Unnamed: 0,Date,PM10_Counts,PM25_Counts
0,2008-01-01 10:00:00,30.0,11.0
1,2008-01-01 11:00:00,29.0,13.0
2,2008-01-01 12:00:00,29.0,12.0
3,2008-01-01 13:00:00,28.0,12.0
4,2008-01-01 14:00:00,27.0,13.0


In [27]:
fine_dust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122733 entries, 0 to 122732
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Date         122733 non-null  datetime64[ns]
 1   PM10_Counts  122733 non-null  float64       
 2   PM25_Counts  122733 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 2.8 MB


# 2. Weather Data of Seoul

In [None]:
# Original data source: https://data.kma.go.kr/data/grnd/selectAsosRltmList.do?pgmNo=36&tabNo=1
# NOTE(review): this path is relative to the current directory, whereas the
# fine-dust files are read from "../data/raw/fine_dust" — confirm which working
# directory the notebook is meant to run from.

os.listdir("data/raw_weather")

In [None]:
# Load every raw weather CSV and stack them into one DataFrame.
# The directory is listed once and sorted: os.listdir() returns entries in
# arbitrary order, so re-listing it on each iteration (as index-based access
# would) is wasteful and gives no guarantee about which file an index refers to.
weather_dir = "data/raw_weather"
weather_frames = [
    # The source files are encoded in cp949 (Korean Windows code page).
    pd.read_csv(f"{weather_dir}/{file_name}", encoding="cp949")
    for file_name in sorted(os.listdir(weather_dir))
]
# A single concat avoids re-copying the accumulated rows for every file.
# Note: pd.concat raises ValueError if the directory contained no files.
weather_df = pd.concat(weather_frames)

In [None]:
# Parse the "일시" (date/time) strings into pandas datetime values.
observation_times = pd.to_datetime(weather_df["일시"])
weather_df["일시"] = observation_times

In [None]:
# Order the observations chronologically, then renumber the rows from zero.
chronological = weather_df.sort_values(by="일시")
weather_df = chronological.reset_index(drop=True)

In [None]:
weather_df

In [None]:
weather_df.info()

In [None]:
weather_df.nunique()

In [None]:
weather_df.columns

In [None]:
# Columns to keep from the raw weather table (English gloss of each header).
new_cols = [
    "일시",  # date/time
    "기온(°C)",  # temperature (°C)
    "강수량(mm)",  # precipitation (mm)
    "풍속(m/s)",  # wind speed (m/s)
    "풍향(16방위)",  # wind direction (16 compass points)
    "습도(%)",  # humidity (%)
    "증기압(hPa)",  # vapor pressure (hPa)
    "이슬점온도(°C)",  # dew-point temperature (°C)
    "현지기압(hPa)",  # local pressure (hPa)
    "전운량(10분위)",  # total cloud amount (tenths)
    "중하층운량(10분위)",  # mid/low-level cloud amount (tenths)
    "최저운고(100m )",  # lowest cloud height (100 m); the stray space is in the raw header
]

In [None]:
# Keep only the selected columns and fix the one header that contains a stray
# space. The rename is chained rather than done with inplace=True: renaming in
# place on the result of a column selection can trigger pandas'
# SettingWithCopyWarning and makes the cell harder to re-run safely.
weather_df = weather_df[new_cols].rename(
    columns={"최저운고(100m )": "최저운고(100m)"}  # removes the white space
)

In [None]:
weather_df

In [None]:
weather_df.info()

# 3. Merging the Data

In [None]:
# Left-join the weather observations onto the fine-dust rows by timestamp.
# fine_dust_df's timestamp column was renamed from "일시" to "Date" earlier in
# the notebook, so merging on "일시" would raise a KeyError. The weather key is
# renamed to "Date" for the join so both frames share a single key column.
combined_df = fine_dust_df.merge(
    weather_df.rename(columns={"일시": "Date"}),
    on="Date",
    how="left",  # keep every fine-dust row even when no weather row matches
)

In [None]:
combined_df

In [None]:
combined_df.info()

In [None]:
# Write the merged table to CSV; index=False leaves the row index out of the file.
combined_df.to_csv("data/processed/seoul_fine_dust_weather_2008_2021.csv", index=False)