In [254]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd 
import numpy as np
pd.set_option("max_rows", 500)
pd.set_option("max_columns", 500)
pd.set_option('float_format', '{:f}'.format)

import os 
import datetime
from tqdm import tqdm 
from functools import reduce

In [63]:
def load_file(weather) : 
    
    '''
    weather : 추출하고 싶은 날씨 데이터 변수 
    '''
    
    # folder path 설정 
    file_path = os.path.join(os.getcwd(), 'weather')
    file_list = os.listdir(file_path)
    
    # startswith('i') : i로 시작하는 파일 모두 불러오기 / endswith('csv') : csv 파일 모두 불러오기 
    data_file = sorted([file for file in file_list if file.startswith(weather)])
 
    # csv 파일들을 DataFrame으로 불러와서 concat
    df = pd.DataFrame()
    for d in data_file:
        data = pd.read_csv(os.path.join(file_path, d))
        df = pd.concat([df, data], axis=0).reset_index(drop=True)
    
    # index column 삭제 ..ㅎㅎ 
    df = df.drop('Unnamed: 0', axis=1)
    
    return df

In [194]:
temp = load_file('temp')
wind = load_file('wind')
rain = load_file('rain')
forecast = load_file('forecast')

temp.shape, wind.shape, rain.shape, forecast.shape

((17536, 5), (17536, 3), (420864, 4), (256464, 5))

# 1. 기온
* 일별 데이터 : 시각, 지점번호, 평균기온, 최고기온, 최저기온

In [195]:
tqdm.pandas()
temp['date'] = temp['tma'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████| 17536/17536 [00:01<00:00, 14506.81it/s]


In [196]:
'''
avg_ta : 평균기온 
max_ta : 최고기온 
min_ta : 최저기온 
'''

temp.head().append(temp.tail())

Unnamed: 0,tma,stn_id,avg_ta,max_ta,min_ta,date
0,2018-01-01 00:00:00.0,105,1.3,5.7,-2.1,2018-01-01
1,2018-01-01 00:00:00.0,112,-0.3,2.7,-2.7,2018-01-01
2,2018-01-01 00:00:00.0,119,-1.7,4.7,-6.9,2018-01-01
3,2018-01-01 00:00:00.0,136,-1.0,4.7,-6.5,2018-01-01
4,2018-01-01 00:00:00.0,152,2.1,6.2,-0.4,2018-01-01
17531,2020-12-31 00:00:00.0,152,-3.8,1.5,-8.9,2020-12-31
17532,2020-12-31 00:00:00.0,155,-4.1,2.2,-9.8,2020-12-31
17533,2020-12-31 00:00:00.0,159,-2.9,2.9,-8.0,2020-12-31
17534,2020-12-31 00:00:00.0,168,-2.1,2.8,-7.0,2020-12-31
17535,2020-12-31 00:00:00.0,184,2.9,4.6,-0.3,2020-12-31


In [197]:
print("관측 지역 수 : ", temp.stn_id.nunique())
print(temp.stn_id.unique())

관측 지역 수 :  16
[105 112 119 136 152 184 232 131 143 155 168 159 108 156 146 133]


In [206]:
temp.isnull().sum()

tma       0
stn_id    0
avg_ta    2
max_ta    0
min_ta    1
date      0
dtype: int64

In [207]:
temp[temp["avg_ta"].isna()]

Unnamed: 0,tma,stn_id,avg_ta,max_ta,min_ta,date
8449,2019-11-25 00:00:00.0,232,,6.8,0.5,2019-11-25
8458,2019-11-26 00:00:00.0,232,,13.4,,2019-11-26


In [233]:
# 실제 기상청에도 자료가 없어서, avg_ta = (max+min)/2 로 결측값 채움 
temp.loc[8449, 'avg_ta'] = (temp.loc[8449]['max_ta'] + temp.loc[8449]['min_ta']) / 2 

temp.loc[8458, 'min_ta'] = 8.2
temp.loc[8458, 'avg_ta'] = (temp.loc[8458]['max_ta'] + temp.loc[8458]['min_ta']) / 2 

In [234]:
temp[temp["avg_ta"].isna()]

Unnamed: 0,tma,stn_id,avg_ta,max_ta,min_ta,date


In [238]:
temp.isnull().sum()

tma       0
stn_id    0
avg_ta    0
max_ta    0
min_ta    0
date      0
dtype: int64

# 2. 강수량
* 관측시간, AWS번호, RN_DAY (누적 강수량, 마지막 23시 데이터 가져오기), RN_HR1 (1시간 강수량, 일별로 최대값 가져오기)

In [198]:
rain['date'] = rain['tm'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████| 420864/420864 [00:27<00:00, 15085.17it/s]


In [199]:
print("관측 지역 수 : ", rain.aws_id.nunique())
print(rain.aws_id.unique())

관측 지역 수 :  16
[133 119 136 155 105 108 156 184 159 112 232 131 143 146 152 168]


In [202]:
rain = rain.sort_values('tm').groupby(['date', 'aws_id']).agg({
        'rn_day' : [('rn_day', 'last')], 
        'rn_hr1' : [('rn_hr1', np.max)]
}).reset_index()
rain.columns = rain.columns.get_level_values(level=0)

In [203]:
'''
rn_day : 일별 강수량 
rn_hr1 : 일별 1시간 최다 강수량 
'''

print(rain.shape)
rain.head().append(rain.tail())

(17536, 4)


Unnamed: 0,date,aws_id,rn_day,rn_hr1
0,2018-01-01,105,0.0,0.0
1,2018-01-01,108,0.0,0.0
2,2018-01-01,112,0.0,0.0
3,2018-01-01,119,0.0,0.0
4,2018-01-01,131,0.0,0.0
17531,2020-12-31,156,0.6,0.3
17532,2020-12-31,159,0.0,0.0
17533,2020-12-31,168,0.0,0.0
17534,2020-12-31,184,3.8,2.4
17535,2020-12-31,232,0.0,0.0


In [205]:
rain[rain["rn_day"] != 0].head().append(rain[rain["rn_day"] != 0].tail())

Unnamed: 0,date,aws_id,rn_day,rn_hr1
76,2018-01-05,159,0.8,0.4
78,2018-01-05,184,4.2,1.0
107,2018-01-07,156,2.5,1.5
109,2018-01-07,168,0.9,0.9
110,2018-01-07,184,3.2,1.4
17518,2020-12-30,184,4.3,1.9
17528,2020-12-31,146,0.7,0.5
17529,2020-12-31,152,,
17531,2020-12-31,156,0.6,0.3
17534,2020-12-31,184,3.8,2.4


In [208]:
# 결측값 확인 
rain.isnull().sum()

date       0
aws_id     0
rn_day    16
rn_hr1    17
dtype: int64

In [210]:
rain[rain["rn_hr1"].isna()]

Unnamed: 0,date,aws_id,rn_day,rn_hr1
520,2018-02-02,146,,
728,2018-02-15,146,,
5465,2018-12-08,152,,
7511,2019-04-15,143,,
8323,2019-06-05,119,,
8329,2019-06-05,152,,
8578,2019-06-21,112,,
8579,2019-06-21,119,,
8738,2019-07-01,112,,
8739,2019-07-01,119,,


In [211]:
# 결측값 0 으로 채우기
rain = rain.fillna(0)

In [212]:
rain.isnull().sum()

date      0
aws_id    0
rn_day    0
rn_hr1    0
dtype: int64

# 3. 풍속 
* 관측시간, 지점번호, 평균풍속

In [213]:
wind['date'] = wind['tma'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████| 17536/17536 [00:01<00:00, 15200.28it/s]


In [214]:
'''
avg_ws : 평균풍속 
'''
wind.head().append(wind.tail())

Unnamed: 0,tma,stn_id,avg_ws,date
0,2018-01-01 00:00:00.0,112,1.6,2018-01-01
1,2018-01-01 00:00:00.0,155,1.8,2018-01-01
2,2018-01-01 00:00:00.0,168,4.5,2018-01-01
3,2018-01-01 00:00:00.0,184,3.5,2018-01-01
4,2018-01-02 00:00:00.0,119,1.6,2018-01-02
17531,2020-12-30 00:00:00.0,232,3.3,2020-12-30
17532,2020-12-31 00:00:00.0,112,3.0,2020-12-31
17533,2020-12-31 00:00:00.0,152,3.3,2020-12-31
17534,2020-12-31 00:00:00.0,155,2.0,2020-12-31
17535,2020-12-31 00:00:00.0,159,4.1,2020-12-31


In [215]:
print("관측 지역 수 : ", wind.stn_id.nunique())
print(wind.stn_id.unique())

관측 지역 수 :  16
[112 155 168 184 119 136 156 105 131 133 143 146 159 232 108 152]


In [216]:
wind.isnull().sum()

tma        0
stn_id     0
avg_ws    11
date       0
dtype: int64

In [217]:
# 0으로 채우는 것이 맞을까 ..? -> 평균값 
wind[wind.avg_ws.isna()]

Unnamed: 0,tma,stn_id,avg_ws,date
3793,2018-07-30 00:00:00.0,112,,2018-07-30
7697,2019-04-16 00:00:00.0,143,,2019-04-16
7794,2019-05-11 00:00:00.0,133,,2019-05-11
7926,2019-06-12 00:00:00.0,152,,2019-06-12
9565,2019-07-30 00:00:00.0,131,,2019-07-30
10020,2019-11-16 00:00:00.0,232,,2019-11-16
10023,2019-11-17 00:00:00.0,232,,2019-11-17
10038,2019-11-22 00:00:00.0,133,,2019-11-22
11525,2019-11-25 00:00:00.0,232,,2019-11-25
11529,2019-11-26 00:00:00.0,232,,2019-11-26


In [281]:
''' 해당 지역, 해당 년월의 평균치로 결측치를 대체하는 함수'''

def missing_value(data, aws_id, year, month, var):
    data['year'] = data['date'].apply(lambda x : x.year)
    data['month'] = data['date'].apply(lambda x : x.month)
    sample = data[(data.stn_id==aws_id)&(data.year==year)&(data.month==month)]
    alt = sample[var].mean() # 평균
    # 대체
    data[(data.stn_id==aws_id)&(data.year==year)&(data.month==month)] = data[(data.stn_id==aws_id)&(data.year==year)&(data.month==month)].fillna(alt) 
    data = data.drop(['year', 'month'], axis=1) # 월 변수 제거
    return data

In [284]:
wind = missing_value(wind, 112, 2018, 7, 'avg_ws')
wind = missing_value(wind, 143, 2019, 4, 'avg_ws')
wind = missing_value(wind, 133, 2019, 5, 'avg_ws')
wind = missing_value(wind, 152, 2019, 6, 'avg_ws')
wind = missing_value(wind, 131, 2019, 7, 'avg_ws')
wind = missing_value(wind, 232, 2019, 11, 'avg_ws')
wind = missing_value(wind, 133, 2019, 11, 'avg_ws')
wind = missing_value(wind, 159, 2020, 3, 'avg_ws')

In [285]:
wind.isnull().sum()

tma       0
stn_id    0
avg_ws    0
date      0
dtype: int64

In [287]:
wind.loc[3793]

tma       2018-07-30 00:00:00.0
stn_id                      112
avg_ws                 2.563333
date        2018-07-30 00:00:00
Name: 3793, dtype: object

# 4. 예보 
* 예측시점, 예측일시, 지점번호, 기상예측카테고리, 강수예측카테고리 

In [218]:
# 예측 일시 2018~2019 데이터만 사용 
forecast['date'] = forecast['tm_ef'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████| 256464/256464 [00:16<00:00, 15206.53it/s]


In [219]:
'''
wf_sky_cd : 기상예측카테고리 (WB01 : 맑음, WB02 : 구름조금, WB03 : 구름많음, WB04 : 흐림) 
wf_pre_cd : 강수예측카테고리 (WB00 : 강수없음, WB09 : 비, WB11 : 비 또는 눈, WB12 : 눈, WB13 : 눈 또는 비)
'''
forecast.head().append(forecast.tail())

Unnamed: 0,tm_fc,tm_ef,stn_id,wf_sky_cd,wf_pre_cd,date
0,2017-12-22 06:00:00.0,2018-01-01 00:00:00.0,156,WB03,WB00,2018-01-01
1,2017-12-22 18:00:00.0,2018-01-01 00:00:00.0,184,WB03,WB00,2018-01-01
2,2017-12-24 06:00:00.0,2018-01-02 00:00:00.0,131,WB03,WB00,2018-01-02
3,2017-12-24 06:00:00.0,2018-01-01 00:00:00.0,133,WB03,WB00,2018-01-01
4,2017-12-24 06:00:00.0,2018-01-02 00:00:00.0,146,WB03,WB00,2018-01-02
256459,2020-12-27 18:00:00.0,2020-12-30 12:00:00.0,143,WB01,WB00,2020-12-30
256460,2020-12-28 06:00:00.0,2020-12-31 12:00:00.0,133,WB04,WB12,2020-12-31
256461,2020-12-28 06:00:00.0,2020-12-31 00:00:00.0,146,WB04,WB12,2020-12-31
256462,2020-12-28 18:00:00.0,2020-12-31 00:00:00.0,105,WB01,WB00,2020-12-31
256463,2020-12-28 18:00:00.0,2020-12-31 00:00:00.0,156,WB04,WB12,2020-12-31


In [220]:
# 함정 : 관측 지역이 8개 ..
# 광주 제주 청주 대전 전주 강릉 부산 대구 
print("관측 지역 수 : ", forecast.stn_id.nunique())
print(forecast.stn_id.unique())

관측 지역 수 :  8
[156 184 131 133 146 105 159 143]


In [184]:
# 가장 최근의 시점 (tm_fc) 에서 예측한 낮 (12:00) & 밤 (0:00) 예보 가져오기 
forecast[(forecast["stn_id"]==156) & (forecast["date"]=='2018-01-03')].sort_values(['tm_fc', 'tm_ef'])

Unnamed: 0,tm_fc,tm_ef,stn_id,wf_sky_cd,wf_pre_cd,date
30359,2017-12-24 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
54805,2017-12-24 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
42611,2017-12-25 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
79393,2017-12-25 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
30368,2017-12-26 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
36525,2017-12-26 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
79409,2017-12-27 06:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
18314,2017-12-27 06:00:00.0,2018-01-03 12:00:00.0,156,WB03,WB00,2018-01-03
12138,2017-12-27 18:00:00.0,2018-01-03 00:00:00.0,156,WB03,WB00,2018-01-03
24334,2017-12-27 18:00:00.0,2018-01-03 12:00:00.0,156,WB03,WB00,2018-01-03


In [185]:
# 예시! 
forecast[(forecast["stn_id"]==156) & (forecast["date"]=='2018-01-03')].sort_values(['tm_fc', 'tm_ef']).drop_duplicates('tm_ef', keep='last')

Unnamed: 0,tm_fc,tm_ef,stn_id,wf_sky_cd,wf_pre_cd,date
12201,2017-12-31 18:00:00.0,2018-01-03 00:00:00.0,156,WB02,WB00,2018-01-03
60968,2017-12-31 18:00:00.0,2018-01-03 12:00:00.0,156,WB03,WB00,2018-01-03


In [221]:
'''
wf_sky_day : 낮 기상예측카테고리
wf_sky_night : 밤 기상예측카테고리

wf_rain_day : 낮 강수예측카테고리
wf_rain_night : 밤 강수예측카테고리
'''

forecast = forecast.sort_values(['tm_fc', 'tm_ef']).drop_duplicates(['tm_ef', 'stn_id'], keep='last')
forecast = forecast.groupby(['date', 'stn_id']).agg({
        'wf_sky_cd' : [('wf_sky_day', 'first'), ('wf_sky_night', 'last')], 
        'wf_pre_cd' : [('wf_rain_day', 'first'), ('wf_rain_night', 'last')]
}).reset_index()

# column rename 
forecast.columns = forecast.columns.get_level_values(level=1)
column_list = list(forecast.columns)
column_list[0], column_list[1] = 'date', 'stn_id'

forecast.columns = column_list

In [188]:
def sky_condition(x) : 
    '''
    WB01 : 맑음, WB02 : 구름조금, WB03 : 구름많음, WB04 : 흐림
    '''
    if x == 'WB01' : return '맑음'
    elif x == 'WB02' : return '구름조금'
    elif x == 'WB03' : return '구름많음'
    else : return '흐림'
    
def rain_condition(x) : 
    '''
    WB00 : 강수없음, WB09 : 비, WB11 : 비 또는 눈, WB12 : 눈, WB13 : 눈 또는 비
    '''
    if x == 'WB00' : return '강수없음'
    elif x == 'WB09' : return '비'
    elif x == 'WB11' : return '비 또는 눈'
    elif x == 'WB12' : return '눈'
    else : return '눈 또는 비'    

In [222]:
forecast["wf_sky_day"] = forecast["wf_sky_day"].apply(lambda x : sky_condition(x))
forecast["wf_sky_night"] = forecast["wf_sky_night"].apply(lambda x : sky_condition(x))
forecast["wf_rain_day"] = forecast["wf_rain_day"].apply(lambda x : rain_condition(x))
forecast["wf_rain_night"] = forecast["wf_rain_night"].apply(lambda x : rain_condition(x))

In [223]:
forecast.head().append(forecast.tail())

Unnamed: 0,date,stn_id,wf_sky_day,wf_sky_night,wf_rain_day,wf_rain_night
0,2018-01-01,105,구름조금,구름조금,강수없음,강수없음
1,2018-01-01,131,구름조금,구름조금,강수없음,강수없음
2,2018-01-01,133,구름많음,구름많음,강수없음,강수없음
3,2018-01-01,143,구름조금,구름조금,강수없음,강수없음
4,2018-01-01,146,구름많음,구름많음,강수없음,강수없음
8763,2020-12-31,143,맑음,맑음,강수없음,강수없음
8764,2020-12-31,146,흐림,흐림,눈,눈
8765,2020-12-31,156,흐림,흐림,눈,눈
8766,2020-12-31,159,맑음,맑음,강수없음,강수없음
8767,2020-12-31,184,흐림,흐림,눈,눈


In [224]:
forecast.isnull().sum()

date             0
stn_id           0
wf_sky_day       0
wf_sky_night     0
wf_rain_day      0
wf_rain_night    0
dtype: int64

# DataFrame 합치기 

In [247]:
def data_split(df, col) : 
    df1 = df.loc[df[col].between('2018-01-01', '2019-12-31')].reset_index(drop=True)
    df2 = df.loc[df[col].between('2020-01-01', '2020-12-31')].reset_index(drop=True)
    print(df1.shape, df2.shape, "Good!" if df1.shape[0]+df2.shape[0]==df.shape[0] else "TRY AGAIN")
    return df1, df2 

In [288]:
temp1, temp2 = data_split(temp, 'date') 
rain1, rain2 = data_split(rain, 'date') 
wind1, wind2 = data_split(wind, 'date') 
forecast1, forecast2 = data_split(forecast, 'date') 

(11680, 6) (5856, 6) Good!
(11680, 4) (5856, 4) Good!
(11680, 4) (5856, 4) Good!
(5840, 6) (2928, 6) Good!


In [289]:
rain1 = rain1.rename(columns = {'aws_id' : 'stn_id'})
rain2 = rain2.rename(columns = {'aws_id' : 'stn_id'})

In [290]:
weather1 = reduce(lambda left, right: pd.merge(left, right, on=['date', 'stn_id']), [temp1.drop("tma", axis=1), rain1, wind1.drop("tma", axis=1)])
weather2 = reduce(lambda left, right: pd.merge(left, right, on=['date', 'stn_id']), [temp2.drop("tma", axis=1), rain2, wind2.drop("tma", axis=1)])

In [291]:
weather1 = weather1.reindex(columns=['date', 'stn_id', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws'])
weather2 = weather2.reindex(columns=['date', 'stn_id', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws'])

In [292]:
print(weather1.shape)
weather1.head().append(weather1.tail())

(11680, 8)


Unnamed: 0,date,stn_id,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3
11675,2019-12-31,112,-7.3,-5.3,-9.9,0.0,0.0,5.0
11676,2019-12-31,131,-5.7,-1.4,-8.0,0.0,0.0,2.2
11677,2019-12-31,143,-2.5,2.6,-5.1,0.0,0.0,5.2
11678,2019-12-31,155,-1.7,4.8,-4.0,0.0,0.0,2.5
11679,2019-12-31,184,2.9,6.4,1.8,0.0,0.0,5.4


In [293]:
print(weather2.shape)
weather2.head().append(weather2.tail())

(5856, 8)


Unnamed: 0,date,stn_id,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws
0,2020-01-01,131,-2.1,1.5,-7.0,0.0,0.0,0.6
1,2020-01-01,133,-0.6,5.1,-7.7,0.0,0.0,0.9
2,2020-01-01,143,0.2,4.6,-4.9,0.0,0.0,3.0
3,2020-01-01,146,-0.5,4.3,-6.4,0.0,0.0,1.2
4,2020-01-01,152,1.3,6.4,-3.9,0.0,0.0,3.0
5851,2020-12-31,152,-3.8,1.5,-8.9,0.0,0.0,3.3
5852,2020-12-31,155,-4.1,2.2,-9.8,0.0,0.0,2.0
5853,2020-12-31,159,-2.9,2.9,-8.0,0.0,0.0,4.1
5854,2020-12-31,168,-2.1,2.8,-7.0,0.0,0.0,4.7
5855,2020-12-31,184,2.9,4.6,-0.3,3.8,2.4,6.1


In [294]:
weather1.isnull().sum()

date      0
stn_id    0
avg_ta    0
max_ta    0
min_ta    0
rn_day    0
rn_hr1    0
avg_ws    0
dtype: int64

In [295]:
weather2.isnull().sum()

date      0
stn_id    0
avg_ta    0
max_ta    0
min_ta    0
rn_day    0
rn_hr1    0
avg_ws    0
dtype: int64

In [296]:
weather1.to_csv('weather1819.csv', index=False)
weather2.to_csv('weather2020.csv', index=False)