In [118]:
import numpy as np
import pandas as pd
import dataload
import datetime as dt
import requests
from tqdm import tqdm, tqdm_notebook

## 데이터 로드 및 병합

In [119]:
weather1819 = pd.read_csv('../data/weather1819.csv', encoding='cp949')
weather2020 = pd.read_csv('../data/weather2020.csv', encoding='cp949')

weather1819.columns = ['date', 'aws_id', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws']
weather2020.columns = ['date', 'aws_id', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws']

weather1819_2 = pd.read_csv('../data/weather1819_2.csv', encoding='cp949').iloc[:,1:]
weather2020_2 = pd.read_csv('../data/weather2020_2.csv', encoding='cp949').iloc[:,1:]

In [120]:
weather1819.head()

Unnamed: 0,date,aws_id,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3


In [121]:
weather1819_2.head()

Unnamed: 0,date,aws_id,region,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,105,강릉,20.066667,13.4,25.4,57.9,1023.0
1,2018-01-01,108,서울,42.307692,21.470696,57.1,51.6,1018.1
2,2018-01-01,112,인천,37.518681,18.641758,67.2,53.8,1020.3
3,2018-01-01,119,수원,42.782895,21.375,84.7,52.7,1025.1
4,2018-01-01,131,청주,51.34965,32.706294,71.4,54.6,1022.0


In [125]:
weather = weather1819.merge(weather1819_2, on=['date', 'aws_id'])

In [126]:
weather.head()

Unnamed: 0,date,aws_id,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,region,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7,강릉,20.066667,13.4,25.4,57.9,1023.0
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6,인천,37.518681,18.641758,67.2,53.8,1020.3
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0,수원,42.782895,21.375,84.7,52.7,1025.1
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2,안동,39.75,28.583333,56.4,58.3,1010.6
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3,울산,38.548193,15.433735,42.8,58.3,1017.1


In [127]:
weather = weather1819.merge(weather1819_2, on=['date', 'aws_id'])
weather.columns = ['날짜', '지점번호', '평균기온', '최고기온', '최저기온', '일별강수량', '1시간최대강수량', '평균풍속', 
                      '지역', 'PM10', 'PM25', '1시간최대습도', '일조시간합', '최고현지기압']

## 날짜 관련 파생변수

In [128]:
weather['날짜'] = weather['날짜'].apply(lambda x : pd.to_datetime(x))
weather['연']  = weather['날짜'].dt.year
weather['월']  = weather['날짜'].dt.month
weather['일']  = weather['날짜'].dt.day
weather["분기"] = weather['날짜'].dt.quarter
weather['요일']  = weather['날짜'].dt.weekday

In [129]:
def getHoliday(year):
    
    '''
    공공데이터포털 공휴일 정보 OPEN API  
    '''
    
    url = f'http://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?solYear={year}&ServiceKey={key}&_type=json&numOfRows=20'
    response = requests.get(url)
    holidays = response.json()['response']['body']['items']['item']
    holidays = pd.DataFrame(holidays)
    holidays['locdate'] = holidays['locdate'].astype(str).apply(lambda x : '-'.join([x[:4], x[4:6], x[6:]]))
    
    return holidays

key = '8H1yac%2Bb0yetY2Waad%2BQIRU43O17onRUd7iR0k2p6%2B4i0yNYn3ym0cXgt3ZqWrR9uFw%2BDu%2B5quHbsXxjpsw5ng%3D%3D'

# 공휴일 
holidays2018 = getHoliday(2018)
holidays2019 = getHoliday(2019)

holidays = pd.concat([holidays2018, holidays2019], axis=0, ignore_index = True)
holidays["locdate"] = holidays['locdate'].apply(lambda x: pd.to_datetime(str(x), format='%Y-%m-%d'))
holidays = holidays.drop(['dateKind', 'seq'], axis=1)
holidays.columns = ['공휴일명', '공휴일여부', '날짜']

In [130]:
holidays.head()

Unnamed: 0,공휴일명,공휴일여부,날짜
0,1월1일,Y,2018-01-01
1,설날,Y,2018-02-15
2,설날,Y,2018-02-16
3,설날,Y,2018-02-17
4,삼일절,Y,2018-03-01


In [131]:
weather = weather.merge(holidays[['날짜', '공휴일명', '공휴일여부']], on='날짜', how='left')
weather['공휴일여부'] = weather['공휴일여부'].apply(lambda x : 1 if x == 'Y' else 0)

# 주말 : 월 0 화 1 수 2 목 3 금 4 토 5 일 6 
weather['주말여부'] = weather['요일'].apply(lambda x : 1 if x in [5,6] else 0)

In [132]:
# 계절 : 봄 0 여름 1 가을 2 겨울 3 
weather["계절"] = weather["월"].apply(lambda x : 0 if x in [3,4,5] 
                                                     else 1 if x in [6,7,8] 
                                                     else 2 if x in [9,10,11]
                                                     else 3)

In [133]:
weather.head()

Unnamed: 0,날짜,지점번호,평균기온,최고기온,최저기온,일별강수량,1시간최대강수량,평균풍속,지역,PM10,...,최고현지기압,연,월,일,분기,요일,공휴일명,공휴일여부,주말여부,계절
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7,강릉,20.066667,...,1023.0,2018,1,1,1,0,1월1일,1,0,3
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6,인천,37.518681,...,1020.3,2018,1,1,1,0,1월1일,1,0,3
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0,수원,42.782895,...,1025.1,2018,1,1,1,0,1월1일,1,0,3
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2,안동,39.75,...,1010.6,2018,1,1,1,0,1월1일,1,0,3
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3,울산,38.548193,...,1017.1,2018,1,1,1,0,1월1일,1,0,3


## 날씨 관련 파생변수

In [134]:
# 체감온도 : 외부에 있는 사람이나 동물이 바람과 한기에 노출된 피부로 부터 열을 빼앗길 때 느끼는 추운 정도를 나타내는 지수
weather['체감온도'] = 13.12 + 0.6215*weather['평균기온'] - 11.37 * weather['평균풍속'] * 0.16 + 0.3965 * weather['평균풍속'] * 0.16 * weather['평균기온']

# 열지수 : 기온과 습도에 따라 사람이 실제로 느끼는 더위를 지수화한 것
H = weather['1시간최대습도']; T = weather['평균기온']
RH = H / sum(H) * 100 # percentage
weather['열지수'] = -42.379 + 2.04901523*T + 10.14333127*RH - .22475541*T*RH - .00683783*T*T - .05481717*RH*RH + .00122874*T*T*RH + .00085282*T*RH*RH - .00000199*T*T*RH*RH

# 폭염여부 : 일 최고기온이 33℃ 이상인 날
weather['폭염여부'] = weather['최고기온'].apply(lambda x : 1 if x>=33 else 0)

# 강수여부 : 일강수량이 0.1mm 이상인 날
weather['강수여부'] = weather['일별강수량'].apply(lambda x : 1 if x>=0.1 else 0)

In [135]:
weather.head()

Unnamed: 0,날짜,지점번호,평균기온,최고기온,최저기온,일별강수량,1시간최대강수량,평균풍속,지역,PM10,...,분기,요일,공휴일명,공휴일여부,주말여부,계절,체감온도,열지수,폭염여부,강수여부
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7,강릉,20.066667,...,1,0,1월1일,1,0,3,7.502056,-39.701524,0,0
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6,인천,37.518681,...,1,0,1월1일,1,0,3,9.992379,-42.924922,0,0
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0,수원,42.782895,...,1,0,1월1일,1,0,3,10.136402,-45.791893,0,0
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2,안동,39.75,...,1,0,1월1일,1,0,3,8.356692,-44.375704,0,0
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3,울산,38.548193,...,1,0,1월1일,1,0,3,8.861429,-38.064334,0,0


In [137]:
weather['폭염여부'].value_counts()

0    10979
1      701
Name: 폭염여부, dtype: int64

In [138]:
weather['강수여부'].value_counts()

0    8766
1    2914
Name: 강수여부, dtype: int64

In [136]:
weather.describe()

Unnamed: 0,지점번호,평균기온,최고기온,최저기온,일별강수량,1시간최대강수량,평균풍속,PM10,PM25,1시간최대습도,...,일,분기,요일,공휴일여부,주말여부,계절,체감온도,열지수,폭염여부,강수여부
count,11680.0,11680.0,11680.0,11680.0,11680.0,11680.0,11680.0,11485.0,11485.0,11680.0,...,11680.0,11680.0,11680.0,11680.0,11680.0,11680.0,11680.0,11680.0,11680.0,11680.0
mean,146.1875,14.130681,19.052217,9.923553,3.44012,1.099666,2.128851,41.009934,22.934385,84.65008,...,15.720548,2.509589,2.993151,0.046575,0.284932,1.490411,19.893058,-15.380639,0.060017,0.249486
std,30.846131,9.76445,9.75696,10.172436,13.404039,3.914129,1.189058,22.4991,14.881685,14.601819,...,8.796624,1.116815,2.001786,0.210737,0.451401,1.116815,7.625643,18.277233,0.237529,0.432734
min,105.0,-14.8,-11.9,-20.6,0.0,0.0,0.2,3.666667,2.0,19.2,...,1.0,1.0,0.0,0.0,0.0,0.0,-13.67518,-74.143962,0.0,0.0
25%,128.0,6.2,11.3,1.7,0.0,0.0,1.3,25.5,12.776915,76.8,...,8.0,2.0,1.0,0.0,0.0,0.0,13.999218,-29.865485,0.0,0.0
50%,144.5,14.9,20.2,10.2,0.0,0.0,1.8,35.914894,19.482046,89.3,...,16.0,3.0,3.0,0.0,0.0,1.0,20.659607,-13.316635,0.0,0.0
75%,156.75,22.3,27.0,18.7,0.0,0.1,2.6,50.931579,28.985507,96.5,...,23.0,4.0,5.0,0.0,1.0,2.0,26.215981,-0.040189,0.0,0.0
max,232.0,33.9,39.6,30.3,310.0,79.0,17.9,201.386364,148.096491,99.9,...,31.0,4.0,6.0,1.0,1.0,3.0,35.116815,19.258391,1.0,1.0


In [141]:
weather.to_csv('../data/weather_final.csv', encoding='cp949', index=None)