# 미세먼지, 습도, 일조시간, 기압

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd 
import numpy as np
pd.set_option("max_rows", 500)
pd.set_option("max_columns", 500)
pd.set_option('float_format', '{:f}'.format)

import os 
import datetime
from tqdm import tqdm 
from functools import reduce

## 0. 데이터 로드

### 0-1. 미세먼지 데이터

In [6]:
def load_dust(year) : 
    
    # folder path 설정 
    file_path = os.path.join(os.path.abspath(".."), 'data\\' + year)
    file_list = os.listdir(file_path)
    
    # 파일 불러오기
    data_file = sorted([file for file in file_list if file.startswith(year)])
    
    # csv 파일들을 DataFrame으로 불러와서 concat
    df = pd.DataFrame()
    for d in data_file:
        data = pd.read_excel(os.path.join(file_path, d), engine='openpyxl')
        df = pd.concat([df, data], axis=0).reset_index(drop=True)
    df = df[~df.지역.isna()] # 지역값이 없는 경우 제거
    df = df[~df.PM10.isna()]
    df = df[~df.PM25.isna()]
    
    # 지역 리스트
    region = {'서울':108, '부산':159, '대구':143, '인천':112, '광주':156, '대전':133, '울산':152, '수원':118, 
              '춘천':101, '강릉':105, '청주':131, '천안':232, '전주':146, '여수':168, '안동':136, '창원':155}
    
    # 시간별 데이터를 일별 데이터로 변환
    # 지역별 평균값
    data = pd.DataFrame()
    for r, code in region.items(): # r:지명, code:지역코드
        sample = df.copy()
        sample['date'] = sample['측정일시'].apply(lambda x : str(x)[:8])
        sample['region'] = sample['지역'].apply(lambda x : r if r in x else None)
        sample = sample[sample.region == r]
        sample['aws_id'] = code
        sample = sample.groupby(['date', 'region', 'aws_id']).mean().reset_index()[['date', 'region', 'aws_id', 'PM10', 'PM25']]
        data = pd.concat([data, sample], axis=0).reset_index(drop=True)

    return data

In [7]:
dust2018 = load_dust('2018')
dust2019 = load_dust('2019')
dust2020 = load_dust('2020')

In [8]:
dust = pd.concat([dust2018, dust2019, dust2020], axis=0).reset_index(drop=True)
dust.shape

(17241, 5)

In [9]:
dust.columns = ['date', 'region', 'aws_id', 'PM10', 'PM2.5']

In [10]:
dust.head()

Unnamed: 0,date,region,aws_id,PM10,PM2.5
0,20180101,서울,108,42.307692,21.470696
1,20180102,서울,108,40.470588,22.711397
2,20180103,서울,108,35.242478,19.042478
3,20180104,서울,108,46.385027,25.381462
4,20180105,서울,108,57.841918,37.159858


### 0-2. 날씨 데이터

In [11]:
def load_file(weather) : 
    
    '''
    weather : 추출하고 싶은 날씨 데이터 변수 
    '''
    
    # folder path 설정 
    file_path = os.path.join(os.path.abspath(".."), 'data')
    file_list = os.listdir(file_path)
    
    # startswith('i') : i로 시작하는 파일 모두 불러오기 / endswith('csv') : csv 파일 모두 불러오기 
    data_file = sorted([file for file in file_list if file.startswith(weather)])
 
    # csv 파일들을 DataFrame으로 불러와서 concat
    df = pd.DataFrame()
    for d in data_file:
        data = pd.read_csv(os.path.join(file_path, d))
        df = pd.concat([df, data], axis=0).reset_index(drop=True)
    
    # index column 삭제 ..ㅎㅎ 
    df = df.drop('Unnamed: 0', axis=1)
    
    return df

In [12]:
humid = load_file('humid')
sun = load_file('sun')
press = load_file('press')

humid.shape, sun.shape, press.shape

((420864, 4), (17441, 3), (17536, 4))

## 1. 미세먼지

In [13]:
tqdm.pandas()
dust['date'] = dust['date'].progress_apply(lambda x : pd.to_datetime(x))

100%|██████████████████████████████████████████████████████████████████████████| 17241/17241 [00:02<00:00, 7093.60it/s]


### 결측치

In [14]:
dust.isnull().sum()

date      0
region    0
aws_id    0
PM10      0
PM2.5     0
dtype: int64

## 2. 습도  
* 일별 데이터  
  * `date` : 날짜  
  * `aws_id` : 관측지점  
  * `hm_max` : 시간당 최대 습도

In [17]:
tqdm.pandas()
humid['date'] = humid['tm'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|████████████████████████████████████████████████████████████████████████| 420864/420864 [01:01<00:00, 6834.69it/s]


In [18]:
humid2 = humid.groupby(['date', 'aws_id']).max().reset_index()[['date', 'aws_id', 'hm_max']]
humid2.head()

Unnamed: 0,date,aws_id,hm_max
0,2018-01-01,105,25.4
1,2018-01-01,108,57.1
2,2018-01-01,112,67.2
3,2018-01-01,119,84.7
4,2018-01-01,131,71.4


### 결측치

In [19]:
''' 해당 지역, 해당 년월의 평균치로 결측치를 대체하는 함수'''

def missing_value(data, aws_id, year, month, var):
    data['year'] = data['date'].apply(lambda x : x.year)
    data['month'] = data['date'].apply(lambda x : x.month)
    sample = data[(data.aws_id==aws_id)&(data.year==year)&(data.month==month)]
    alt = sample[var].mean() # 평균
    # 대체
    data[(data.aws_id==aws_id)&(data.year==year)&(data.month==month)] = data[(data.aws_id==aws_id)&(data.year==year)&(data.month==month)].fillna(alt) 
    data = data.drop(['year', 'month'], axis=1) # 월 변수 제거
    return data

In [20]:
humid2.isnull().sum()

date      0
aws_id    0
hm_max    9
dtype: int64

In [21]:
humid2[humid2.hm_max.isna()] # 232 : 충남(천안)

Unnamed: 0,date,aws_id,hm_max
7087,2019-03-19,232,
7103,2019-03-20,232,
7119,2019-03-21,232,
7231,2019-03-28,232,
7247,2019-03-29,232,
7263,2019-03-30,232,
7279,2019-03-31,232,
7295,2019-04-01,232,
7511,2019-04-15,143,


In [22]:
humid2 = missing_value(humid2, 232, 2019, 3, 'hm_max')
humid2 = missing_value(humid2, 232, 2019, 4, 'hm_max')
humid2 = missing_value(humid2, 143, 2019, 4, 'hm_max')

In [23]:
humid2.isnull().sum()

date      0
aws_id    0
hm_max    0
dtype: int64

## 3. 일조시간  
* 일별 데이터  
  * `date` : 날짜  
  * `stn_id` : 지역  
  * `sum_ss_hr` : 합계 일조시간

In [24]:
tqdm.pandas()
sun['date'] = sun['tma'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████████████████████████████████████████████████████████████████████| 17441/17441 [00:02<00:00, 7515.69it/s]


In [25]:
sun2 = sun.groupby(['date', 'stn_id']).max().reset_index()[['date', 'stn_id', 'sum_ss_hr']]
sun2.columns = ['date', 'aws_id', 'sum_ss_hr']

In [26]:
sun2.head()

Unnamed: 0,date,aws_id,sum_ss_hr
0,2018-01-01,105,57.9
1,2018-01-01,108,51.6
2,2018-01-01,112,53.8
3,2018-01-01,119,52.7
4,2018-01-01,131,54.6


### 결측치

In [27]:
sun2.isnull().sum() # 결측치 없음

date         0
aws_id       0
sum_ss_hr    0
dtype: int64

## 4. 기압  
* 일별 데이터  
  * `date` : 날짜  
  * `stn_id` : 지역  
  * `max_pa` : 최고 현지기압

In [28]:
tqdm.pandas()
press['date'] = press['tma'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████████████████████████████████████████████████████████████████████| 17536/17536 [00:02<00:00, 6193.16it/s]


In [29]:
press2 = press.groupby(['date', 'stn_id']).max().reset_index()[['date', 'stn_id', 'max_pa']]
press2.columns = ['date', 'aws_id', 'max_pa']

In [30]:
press2.head()

Unnamed: 0,date,aws_id,max_pa
0,2018-01-01,105,1023.0
1,2018-01-01,108,1018.1
2,2018-01-01,112,1020.3
3,2018-01-01,119,1025.1
4,2018-01-01,131,1022.0


### 결측치

In [31]:
press2.isnull().sum()

date      0
aws_id    0
max_pa    4
dtype: int64

In [32]:
press2[press2.max_pa.isna()] # 143 : 대구 / 105 : 강릉

Unnamed: 0,date,aws_id,max_pa
7511,2019-04-15,143,
11648,2019-12-30,105,
11664,2019-12-31,105,
11680,2020-01-01,105,


In [33]:
press2 = missing_value(press2, 143, 2019, 4, 'max_pa')
press2 = missing_value(press2, 105, 2019, 12, 'max_pa')
press2 = missing_value(press2, 105, 2020, 1, 'max_pa')

In [34]:
press2.isnull().sum()

date      0
aws_id    0
max_pa    0
dtype: int64

# 데이터 병합

In [35]:
dust.shape, humid2.shape, sun2.shape, press2.shape # total = 17536

((17241, 5), (17536, 3), (17441, 3), (17536, 3))

In [57]:
dust

Unnamed: 0,date,region,aws_id,PM10,PM2.5
0,2018-01-01,서울,108,42.307692,21.470696
1,2018-01-02,서울,108,40.470588,22.711397
2,2018-01-03,서울,108,35.242478,19.042478
3,2018-01-04,서울,108,46.385027,25.381462
4,2018-01-05,서울,108,57.841918,37.159858
...,...,...,...,...,...
17236,2020-12-27,창원,155,41.212766,28.989362
17237,2020-12-28,창원,155,28.828897,20.368821
17238,2020-12-29,창원,155,38.098901,28.183150
17239,2020-12-30,창원,155,13.068702,7.732824


In [72]:
weather = reduce(lambda left, right : pd.merge(left, right, how='left', on=['date', 'aws_id']), [humid2, dust, sun2, press2])
weather = weather[['date', 'region', 'aws_id', 'PM10', 'PM2.5', 'hm_max', 'sum_ss_hr', 'max_pa']]

In [73]:
print(weather.shape)
weather.head()

(17536, 8)


Unnamed: 0,date,region,aws_id,PM10,PM2.5,hm_max,sum_ss_hr,max_pa
0,2018-01-01,강릉,105,20.066667,13.4,25.4,57.9,1023.0
1,2018-01-01,서울,108,42.307692,21.470696,57.1,51.6,1018.1
2,2018-01-01,인천,112,37.518681,18.641758,67.2,53.8,1020.3
3,2018-01-01,,119,,,84.7,52.7,1025.1
4,2018-01-01,청주,131,51.34965,32.706294,71.4,54.6,1022.0
