In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd 
import numpy as np
# Use fully-qualified option names. Abbreviated names such as "max_rows" are
# treated as regex patterns and raise OptionError as soon as more than one
# option matches (e.g. "display.max_rows" and "styler.render.max_rows" in
# newer pandas releases).
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
# Render floats in fixed-point notation instead of scientific notation.
pd.set_option("display.float_format", "{:f}".format)

import os 
import datetime
from tqdm import tqdm 

In [63]:
def load_file(weather):
    """Load every file in ./weather whose name starts with `weather` into one DataFrame.

    Parameters
    ----------
    weather : str
        File-name prefix selecting the weather variable to load.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, concatenated in sorted file-name order with
        a fresh 0..n-1 index. The 'Unnamed: 0' column (the name pandas gives to
        an unnamed first CSV column) is removed when present.

    Raises
    ------
    FileNotFoundError
        If the folder does not exist or no file in it starts with `weather`.
    """
    # The data folder is resolved relative to the current working directory.
    file_path = os.path.join(os.getcwd(), 'weather')

    # Sort the names so the concatenation order is deterministic.
    data_files = sorted(
        name for name in os.listdir(file_path) if name.startswith(weather)
    )
    if not data_files:
        # Fail with an explicit message instead of an unrelated KeyError later on.
        raise FileNotFoundError(
            "no file starting with {!r} found in {}".format(weather, file_path)
        )

    # Read everything first and concatenate once; concatenating inside the loop
    # re-copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(os.path.join(file_path, name)) for name in data_files]
    df = pd.concat(frames, axis=0, ignore_index=True)

    # errors='ignore' keeps this working for files saved without an index column.
    return df.drop(columns='Unnamed: 0', errors='ignore')

In [64]:
# Load one DataFrame per weather variable (file-name prefix), then display
# the four shapes in the same order.
temp, wind, rain, forecast = (
    load_file(prefix) for prefix in ('temp', 'wind', 'rain', 'forecast')
)

tuple(frame.shape for frame in (temp, wind, rain, forecast))

((11680, 5), (11680, 3), (280320, 4), (170820, 5))

# 1. 기온
* 일별 데이터 : 시각, 지점번호, 평균기온, 최고기온, 최저기온

In [75]:
# Observation date: the first 10 characters of `tma` ('YYYY-MM-DD'), parsed in a
# single vectorized call instead of one pd.to_datetime call per row.
temp['date'] = pd.to_datetime(temp['tma'].str[:10])

In [76]:
# avg_ta : mean temperature
# max_ta : maximum temperature
# min_ta : minimum temperature

# Show the first and last five rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
pd.concat([temp.head(), temp.tail()])

Unnamed: 0,tma,stn_id,avg_ta,max_ta,min_ta,date
0,2018-01-01 00:00:00.0,105,1.3,5.7,-2.1,2018-01-01
1,2018-01-01 00:00:00.0,112,-0.3,2.7,-2.7,2018-01-01
2,2018-01-01 00:00:00.0,119,-1.7,4.7,-6.9,2018-01-01
3,2018-01-01 00:00:00.0,136,-1.0,4.7,-6.5,2018-01-01
4,2018-01-01 00:00:00.0,152,2.1,6.2,-0.4,2018-01-01
11675,2019-12-31 00:00:00.0,112,-7.3,-5.3,-9.9,2019-12-31
11676,2019-12-31 00:00:00.0,131,-5.7,-1.4,-8.0,2019-12-31
11677,2019-12-31 00:00:00.0,143,-2.5,2.6,-5.1,2019-12-31
11678,2019-12-31 00:00:00.0,155,-1.7,4.8,-4.0,2019-12-31
11679,2019-12-31 00:00:00.0,184,2.9,6.4,1.8,2019-12-31


In [58]:
# Number of distinct stations in the temperature data, followed by their ids.
print("관측 지역 수 : ", temp["stn_id"].nunique())
print(temp["stn_id"].unique())

관측 지역 수 :  16
[108 131 133 136 146 155 168 184 105 112 143 152 156 159 232 119]


# 2. 강수량
* 관측시간, AWS번호, RN_DAY (누적 강수량, 마지막 23시 데이터 가져오기), RN_HR1 (1시간 강수량, 일별로 최대값 가져오기)

In [77]:
# Observation date: the first 10 characters of `tm`, parsed in a single
# vectorized call instead of one pd.to_datetime call per row.
rain['date'] = pd.to_datetime(rain['tm'].str[:10])

In [71]:
# Number of distinct AWS stations in the precipitation data, followed by their ids.
print("관측 지역 수 : ", rain["aws_id"].nunique())
print(rain["aws_id"].unique())

관측 지역 수 :  16
[133 119 136 155 105 108 156 184 159 112 232 131 143 146 152 168]


In [117]:
# Collapse the rows to one row per (date, station):
#   rn_day -> last value of the day after sorting by observation time `tm`
#   rn_hr1 -> largest value of the day
# Named aggregation produces flat column names directly. The previous version
# flattened a MultiIndex through the undefined name `a`, which raises NameError
# on a fresh kernel.
# NOTE: this overwrites `rain` and drops `tm`, so the cell cannot be re-run
# without reloading `rain` first.
rain = (
    rain
    .sort_values('tm')
    .groupby(['date', 'aws_id'])
    .agg(
        rn_day=('rn_day', 'last'),
        rn_hr1=('rn_hr1', 'max'),
    )
    .reset_index()
)

In [118]:
# rn_day : daily precipitation
# rn_hr1 : largest 1-hour precipitation of the day

print(rain.shape)
# Show the first and last five rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
pd.concat([rain.head(), rain.tail()])

(11680, 4)


Unnamed: 0,date,aws_id,rn_day,rn_hr1
0,2018-01-01,105,0.0,0.0
1,2018-01-01,108,0.0,0.0
2,2018-01-01,112,0.0,0.0
3,2018-01-01,119,0.0,0.0
4,2018-01-01,131,0.0,0.0
11675,2019-12-31,156,0.0,0.0
11676,2019-12-31,159,0.0,0.0
11677,2019-12-31,168,0.0,0.0
11678,2019-12-31,184,0.0,0.0
11679,2019-12-31,232,0.0,0.0


In [119]:
# First few rows whose daily precipitation value is not zero.
rain.loc[rain["rn_day"].ne(0)].head()

Unnamed: 0,date,aws_id,rn_day,rn_hr1
76,2018-01-05,159,0.8,0.4
78,2018-01-05,184,4.2,1.0
107,2018-01-07,156,2.5,1.5
109,2018-01-07,168,0.9,0.9
110,2018-01-07,184,3.2,1.4


# 3. 풍속 
* 관측시간, 지점번호, 평균풍속

In [80]:
# Observation date: the first 10 characters of `tma` ('YYYY-MM-DD'), parsed in a
# single vectorized call instead of one pd.to_datetime call per row.
wind['date'] = pd.to_datetime(wind['tma'].str[:10])

In [81]:
# avg_ws : mean wind speed

# Show the first and last five rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
pd.concat([wind.head(), wind.tail()])

Unnamed: 0,tma,stn_id,avg_ws,date
0,2018-01-01 00:00:00.0,112,1.6,2018-01-01
1,2018-01-01 00:00:00.0,155,1.8,2018-01-01
2,2018-01-01 00:00:00.0,168,4.5,2018-01-01
3,2018-01-01 00:00:00.0,184,3.5,2018-01-01
4,2018-01-02 00:00:00.0,119,1.6,2018-01-02
11675,2019-12-31 00:00:00.0,133,2.5,2019-12-31
11676,2019-12-31 00:00:00.0,155,2.5,2019-12-31
11677,2019-12-31 00:00:00.0,156,3.0,2019-12-31
11678,2019-12-31 00:00:00.0,168,8.7,2019-12-31
11679,2019-12-31 00:00:00.0,184,5.4,2019-12-31


In [73]:
# Number of distinct stations in the wind data, followed by their ids.
print("관측 지역 수 : ", wind["stn_id"].nunique())
print(wind["stn_id"].unique())

관측 지역 수 :  16
[112 155 168 184 119 136 156 105 131 133 143 146 159 232 108 152]


# 4. 예보 
* 예측시점, 예측일시, 지점번호, 기상예측카테고리, 강수예측카테고리 

In [181]:
# Target date of each forecast: the first 10 characters of the valid time
# `tm_ef` ('YYYY-MM-DD'), parsed in a single vectorized call instead of one
# pd.to_datetime call per row.
# NOTE(review): the original comment said only 2018-2019 valid times are used,
# but no filtering happens here; presumably the input files are already
# restricted to that range. Confirm.
forecast['date'] = pd.to_datetime(forecast['tm_ef'].str[:10])

In [182]:
# wf_sky_cd : sky forecast category
#             (WB01: clear, WB02: partly cloudy, WB03: mostly cloudy, WB04: overcast)
# wf_pre_cd : precipitation forecast category
#             (WB00: none, WB09: rain, WB11: rain or snow, WB12: snow, WB13: snow or rain)

# Show the first and last five rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
pd.concat([forecast.head(), forecast.tail()])

Unnamed: 0,tm_fc,tm_ef,stn_id,wf_sky_cd,wf_pre_cd,date
0,2017-12-22 06:00:00.0,2018-01-01 00:00:00.0,156,WB03,WB00,2018-01-01
1,2017-12-22 18:00:00.0,2018-01-01 00:00:00.0,184,WB03,WB00,2018-01-01
2,2017-12-24 06:00:00.0,2018-01-02 00:00:00.0,131,WB03,WB00,2018-01-02
3,2017-12-24 06:00:00.0,2018-01-01 00:00:00.0,133,WB03,WB00,2018-01-01
4,2017-12-24 06:00:00.0,2018-01-02 00:00:00.0,146,WB03,WB00,2018-01-02
170815,2019-12-27 18:00:00.0,2019-12-30 00:00:00.0,146,WB04,WB09,2019-12-30
170816,2019-12-27 18:00:00.0,2019-12-31 00:00:00.0,146,WB03,WB00,2019-12-31
170817,2019-12-28 06:00:00.0,2019-12-31 12:00:00.0,105,WB01,WB00,2019-12-31
170818,2019-12-28 06:00:00.0,2019-12-31 12:00:00.0,146,WB01,WB00,2019-12-31
170819,2019-12-28 18:00:00.0,2019-12-31 00:00:00.0,159,WB01,WB00,2019-12-31


In [183]:
# Pitfall: the forecast data covers only 8 stations.
print("관측 지역 수 : ", forecast["stn_id"].nunique())
print(forecast["stn_id"].unique())

관측 지역 수 :  8
[156 184 131 133 146 105 159 143]


In [184]:
# Goal: take the day (12:00) and night (0:00) forecasts issued at the most
# recent issue time (tm_fc). Here: every forecast for one station and one
# target date, ordered by issue time and then valid time.
forecast.loc[
    (forecast["stn_id"] == 156) & (forecast["date"] == '2018-01-03')
].sort_values(by=['tm_fc', 'tm_ef'])

Unnamed: 0,tm_fc,tm_ef,stn_id,wf_sky_cd,wf_pre_cd,date
30359,2017-12-24 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
54805,2017-12-24 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
42611,2017-12-25 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
79393,2017-12-25 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
30368,2017-12-26 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
36525,2017-12-26 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
79409,2017-12-27 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
18314,2017-12-27 06:00:00.0,2018-01-03 12:00:00.0,156,WB03,WB00,2018-01-03
12138,2017-12-27 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
24334,2017-12-27 18:00:00.0,2018-01-03 12:00:00.0,156,WB03,WB00,2018-01-03


In [185]:
# Example: same station and date as above, keeping only the last row
# (in tm_fc, tm_ef order) for each valid time.
forecast.loc[
    (forecast["stn_id"] == 156) & (forecast["date"] == '2018-01-03')
].sort_values(by=['tm_fc', 'tm_ef']).drop_duplicates(subset='tm_ef', keep='last')

Unnamed: 0,tm_fc,tm_ef,stn_id,wf_sky_cd,wf_pre_cd,date
12201,2017-12-31 18:00:00.0,2018-01-03 00:00:00.0,156,WB02,WB00,2018-01-03
60968,2017-12-31 18:00:00.0,2018-01-03 12:00:00.0,156,WB03,WB00,2018-01-03


In [186]:
# Output columns (one row per target date and station):
#   wf_sky_day   / wf_sky_night  : sky forecast category for day / night
#   wf_rain_day  / wf_rain_night : precipitation forecast category for day / night
#
# Step 1: for every (tm_ef, stn_id) keep only the most recently issued
#         forecast (largest tm_fc).
# Step 2: re-sort by valid time so that, inside each (date, stn_id) group, the
#         row order is fixed by tm_ef. The notebook defines night as the 0:00
#         and day as the 12:00 valid time, so the earliest tm_ef of a date is
#         taken as night and the latest as day.
#         Previously the rows stayed ordered by tm_fc, so which valid time was
#         'first' or 'last' depended on when each forecast happened to be
#         issued; with equal tm_fc the 0:00 row was labelled "day".
#         When a date has a single valid time, day and night both receive it,
#         as before.
# NOTE(review): assumes each date has at most the 00:00 and 12:00 valid times,
# as in the rows displayed above. Confirm against the full data.
# NOTE: this overwrites `forecast`, so the cell cannot be re-run without
# reloading `forecast` first.
forecast = (
    forecast
    .sort_values(['tm_fc', 'tm_ef'])
    .drop_duplicates(['tm_ef', 'stn_id'], keep='last')
    .sort_values('tm_ef')
    .groupby(['date', 'stn_id'])
    .agg(
        wf_sky_day=('wf_sky_cd', 'last'),
        wf_sky_night=('wf_sky_cd', 'first'),
        wf_rain_day=('wf_pre_cd', 'last'),
        wf_rain_night=('wf_pre_cd', 'first'),
    )
    .reset_index()
)

In [188]:
def sky_condition(x):
    """Translate a sky-forecast code into its Korean label.

    WB01 -> '맑음' (clear), WB02 -> '구름조금' (partly cloudy),
    WB03 -> '구름많음' (mostly cloudy), WB04 -> '흐림' (overcast).

    Any other value (missing value, unknown code, already-translated label) is
    returned unchanged. The previous catch-all branch labelled all of those as
    '흐림', which hid missing data and turned every label into '흐림' when the
    mapping was applied a second time.
    """
    labels = {
        'WB01': '맑음',
        'WB02': '구름조금',
        'WB03': '구름많음',
        'WB04': '흐림',
    }
    return labels.get(x, x)
    
def rain_condition(x):
    """Translate a precipitation-forecast code into its Korean label.

    WB00 -> '강수없음' (no precipitation), WB09 -> '비' (rain),
    WB11 -> '비 또는 눈' (rain or snow), WB12 -> '눈' (snow),
    WB13 -> '눈 또는 비' (snow or rain).

    Any other value (missing value, unknown code, already-translated label) is
    returned unchanged. The previous catch-all branch labelled all of those as
    '눈 또는 비', which hid missing data and corrupted the labels when the
    mapping was applied a second time.
    """
    labels = {
        'WB00': '강수없음',
        'WB09': '비',
        'WB11': '비 또는 눈',
        'WB12': '눈',
        'WB13': '눈 또는 비',
    }
    return labels.get(x, x)

In [191]:
# Replace the forecast codes with their labels, column by column:
# sky columns go through sky_condition, precipitation columns through rain_condition.
for _col in ("wf_sky_day", "wf_sky_night"):
    forecast[_col] = forecast[_col].apply(sky_condition)
for _col in ("wf_rain_day", "wf_rain_night"):
    forecast[_col] = forecast[_col].apply(rain_condition)

In [192]:
# Preview the first five rows of the labelled forecast table.
forecast.head(n=5)

Unnamed: 0,date,stn_id,wf_sky_day,wf_sky_night,wf_rain_day,wf_rain_night
0,2018-01-01,105,구름조금,구름조금,강수없음,강수없음
1,2018-01-01,131,구름조금,구름조금,강수없음,강수없음
2,2018-01-01,133,구름많음,구름많음,강수없음,강수없음
3,2018-01-01,143,구름조금,구름조금,강수없음,강수없음
4,2018-01-01,146,구름많음,구름많음,강수없음,강수없음
