# Import Library

In [1]:
# Mount Google Drive at /gdrive so the project modules and data stored under
# /gdrive/MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
import datetime as dt
import pandas as pd
import os
import xml.etree.ElementTree as ET

In [3]:
%cd /gdrive/MyDrive/Colab_Weather_API/lib
import new_weather as nw
import weather_dict as wd
from importlib import reload

/gdrive/MyDrive/Colab_Weather_API/lib


In [4]:
%cd /gdrive/MyDrive/Colab_Weather_API/lib
reload(nw)

/gdrive/MyDrive/Colab_Weather_API/lib


<module 'new_weather' from '/gdrive/MyDrive/Colab_Weather_API/lib/new_weather.py'>

In [5]:
%cd /gdrive/MyDrive/Colab_Weather_API/lib
reload(wd)

/gdrive/MyDrive/Colab_Weather_API/lib


<module 'weather_dict' from '/gdrive/MyDrive/Colab_Weather_API/lib/weather_dict.py'>

# LOG

In [6]:
# Current wall-clock time in a fixed UTC+9 zone (presumably KST), formatted
# for the header of the log entries written below.
tz = dt.timezone(dt.timedelta(hours=9))
dt_now = dt.datetime.now(tz)
str_now = f"{dt_now:%Y-%m-%d %H:%M:%S}"
# Header template for a new log entry (uncomment to print it):
# print(f"# Written at:\t{str_now}")
# print("# Author:\t\tjongphago")
# print("# Topic:\t\tfunction 'change_args' Error")

In [7]:
# 작성시간: 2021-06-28 17:14:19
# 작성자:   jongphago
# 주제:     함수'change_args' Error
"""
문제점:
    - arg_dict를 생성할때 YYYYmmdd 형식으로 저장하여 시간 정보를 무시한다.
    - 함수 change_args를 호출할때 변경해야 하는 시간 값은 정확히 999hrs후 
    값인데, 시간정보가 없어서 특정 시간만큼 추가되는 현상이 발생한다.
    
해결책:
    Alt1:
        - make_arg_dict 함수를 작성할때 날짜관련 key값이 '년-월-일-시-분' 
        데이터를 모두 갖도록 작성한다
        - 장점: 정확한 시간(ex:999시간 후) 단위로 컨트롤이 가능
        - 단점: 많은 함수를 수정해야 한다. 미리 고생.
    Alt2:
        - 일단위로 K시간을 입력하면 %24 만큼의 시간만 입력 되도록 한다.
        - 장점: 수정할 함수가 적음
        - 단점: 정확한 컨트롤이 어려움. 앞으로 고생.
""" 
pass

# serviceKey 생성

In [8]:
# Directory that holds the private API credentials (a separate Drive folder
# from the project code under Colab_Weather_API).
lib_path = "/gdrive/MyDrive/Colab_PRIVATE/weather_api"
# Load the service key through the project helper.
# NOTE(review): presumably reads the key from a file inside `lib_path` — confirm in new_weather.py.
service_key = nw.get_service_key(lib_path)

# Info Dictionary

In [9]:
# Request-parameter reference table: one row per API parameter.
call_info = pd.DataFrame(wd.call_dict).T
call_info

Unnamed: 0,항목명,항목크기,항목구분,샘플데이터,항목설명
serviceKey,인증키,100,1,인증키,공공데이터포털에서 발급받은 인증키(URL_Encode)
numOfRows,한_페이지_결과_수,4,0,10,한 페이지 결과 수(Default: 10)
pageNo,페이지_번호,4,0,1,페이지 번호(Default: 1)
dataType,응답자료형식,4,0,XML,요청자료형식(XML/JSON)(Default: XML)
dataCd,자료_코드,4,1,ASOS,자료 분류 코드
dateCd,날짜_코드,3,1,HR,날짜 분류 코드
startDt,시작일,8,1,20100101,조회 기간 시작일
startHh,시작시,2,1,1,조회 기간 시작시
endDt,종료일,8,1,20100601,조회 기간 종료일((전일(D-1) 까지 제공))
endHh,종료시,2,1,1,조회 기간 종료시


## Error dict

In [10]:
# Result-code reference table: one row per API result code.
error_info = pd.DataFrame(wd.error_dict).T
error_info

Unnamed: 0,msg,discription
0,NORMAL_SERVICE,정상
1,APPLICATION_ERROR,어플리케이션 에러
2,DB_ERROR,데이터베이스 에러
3,NODATA_ERROR,데이터없음 에러
4,HTTP_ERROR,HTTP 에러
5,SERVICETIME_OUT,서비스 연결실패 에러
10,INVALID_REQUEST_PARAMETER_ERROR,잘못된 요청 파라메터 에러
11,NO_MANDATORY_REQUEST_PARAMETERS_ERROR,필수요청 파라메터가 없음
12,NO_OPENAPI_SERVICE_ERROR,해당 오픈API서비스가 없거나 폐기됨
20,SERVICE_ACCESS_DENIED_ERROR,서비스 접근거부


## 컬럼명 해석기

In [11]:
# Response-field reference table: one row per field returned by the ASOS API.
asos_columns_info = pd.DataFrame(wd.asos_dict).T
asos_columns_info.head()

Unnamed: 0,항목명,항목크기,샘플데이터,설명
numOfRows,한 페이지 결과 수,4,1,한 페이지당 표출 데이터 수
pageNo,페이지 번호,4,1,페이지 수
totalCount,데이터 총 개수,10,1,데이터 총 개수
resultCode,응답메시지 코드,2,0,응답 메시지코드
resultMsg,응답메시지 내용,100,NORMAL SERVICE,응답 메시지 설명


In [12]:
asos_columns_info[['항목명', '샘플데이터']].transpose()

Unnamed: 0,numOfRows,pageNo,totalCount,resultCode,resultMsg,dataType,tm,rnum,stnId,stnNm,ta,taQcflg,rn,rnQcflg,ws,wsQcflg,wd,wdQcflg,hm,hmQcflg,pv,td,pa,paQcflg,ps,psQcflg,ss,ssQcflg,icsr,dsnw,hr3Fhsc,dc10Tca,dc10LmcsCa,clfmAbbrCd,lcsCh,vs,gndSttCd,dmstMtphNo,ts,tsQcflg,m005Te,m01Te,m02Te,m03Te
항목명,한 페이지 결과 수,페이지 번호,데이터 총 개수,응답메시지 코드,응답메시지 내용,데이터 타입,시간,목록 순서,지점 번호,서울,기온,기온 품질검사,강수량,강수량 품질검사,풍속,풍속 품질검사,풍향,풍향 품질검사,습도,습도 품질검사,증기압,이슬점온도,현지기압,현지기압 품질검사,해면기압,해면기압 품질검사,일조,일조 품질검사,일사,적설,3시간신적설,전운량,중하층운량,운형,최저운고,시정,지면상태,현상번호,지면온도,지면온도 품질검사,5cm 지중온도,10cm 지중온도,20cm 지중온도,30cm 지중온도
샘플데이터,1,1,1,0,NORMAL SERVICE,XML,2010-01-01 10,1,108,서울,23.8,0,10.5,0,1,0,110,0,36,0,1.1,-21.4,1012.4,0,1023.6,0,1,0,0.73,2.2,0.2,0,0,scas,8,2300,17,1904,-3.4,0,-4.9,-2.4,-1,0.4


In [13]:
wd.translate(wd.asos_dict, 'ts')

'지면온도'

## 관측소 정보

In [14]:
# Station reference table: one row per station id, including its coordinates.
station_info = pd.DataFrame(wd.station_dict).T
station_info

Unnamed: 0,stnNm,manager,경도,위도
90,속초,강원지방기상청,128.565,38.2509
93,북춘천,춘천기상대,127.755,37.9475
95,철원,강원지방기상청,127.304,38.1479
98,동두천,수도권기상청,127.061,37.9019
99,파주,수도권기상청,126.766,37.8859
...,...,...,...,...
285,합천,울산기상대,128.17,35.5651
288,밀양,울산기상대,128.744,35.4915
289,산청,창원기상대,127.879,35.413
294,거제,부산지방기상청,128.605,34.8882


In [15]:
station_info.index

Int64Index([ 90,  93,  95,  98,  99, 100, 101, 102, 104, 105, 106, 108, 112,
            114, 115, 119, 121, 127, 129, 130, 131, 133, 135, 136, 137, 138,
            140, 143, 146, 152, 155, 156, 159, 162, 165, 168, 169, 170, 172,
            174, 177, 184, 185, 188, 189, 192, 201, 202, 203, 211, 212, 216,
            217, 221, 226, 232, 235, 236, 238, 239, 243, 244, 245, 247, 248,
            251, 252, 253, 254, 255, 257, 258, 259, 260, 261, 262, 263, 264,
            266, 268, 271, 272, 273, 276, 277, 278, 279, 281, 283, 284, 285,
            288, 289, 294, 295],
           dtype='int64')

# weather_args 생성

In [16]:
# Station id used for the main analysis (also used later to look up the
# station's coordinates in wd.station_dict).
site_index = 108
# Request arguments for this station between 2021-01-01 and 2021-06-01.
arg_dict = nw.make_arg_dict(site_index, '20210101', '20210601')
# NOTE(review): presumably the number of rows the request will ask for — confirm in new_weather.py.
arg_dict['_numOfRows']

3648

In [17]:
incheon_arg_dict = nw.make_arg_dict(112, '20210101', '20210601')
incheon_arg_dict['_numOfRows']

3648

In [18]:
# NOTE(review): same station id (108) and date range as `arg_dict`, so this
# appears to build an identical request — confirm whether the duplicate
# (and the second API call made with it) is actually needed.
seoul_arg_dict = nw.make_arg_dict(108, '20210101', '20210601')
seoul_arg_dict['_numOfRows']

3648

# Call API

In [19]:
master_df = nw.call(service_key, arg_dict)

In [20]:
incheon = nw.call(service_key, incheon_arg_dict)

In [21]:
seoul = nw.call(service_key, seoul_arg_dict)

In [22]:
# master_df.shape
master_df.head()

Unnamed: 0,tm,rnum,stnId,stnNm,ta,taQcflg,rn,rnQcflg,ws,wsQcflg,wd,wdQcflg,hm,hmQcflg,pv,td,pa,paQcflg,ps,psQcflg,ss,ssQcflg,icsr,dsnw,hr3Fhsc,dc10Tca,dc10LmcsCa,clfmAbbrCd,lcsCh,vs,gndSttCd,dmstMtphNo,ts,tsQcflg,m005Te,m01Te,m02Te,m03Te
0,2021-01-01 00:00,1,108,서울,-8.3,,,9.0,1.8,,250,,66,,2.2,-13.5,1016.4,,1027.7,,,9,,,,0,0,,,2000,,,-6.8,,-0.9,-0.7,0.3,1.6
1,2021-01-01 01:00,2,108,서울,-8.7,,,,2.4,,270,,68,,2.2,-13.5,1016.4,,1027.7,,,9,,,,0,0,,,2000,,,-6.9,,-1.0,-0.8,0.3,1.6
2,2021-01-01 02:00,3,108,서울,-9.1,,,,1.6,,270,,69,,2.1,-13.7,1016.2,,1027.5,,,9,,,,0,0,,,2000,,,-7.1,,-1.1,-0.8,0.3,1.6
3,2021-01-01 03:00,4,108,서울,-9.3,,,,1.1,,250,,70,,2.1,-13.7,1016.8,,1028.1,,,9,,,,0,0,,,2000,,,-7.3,,-1.2,-0.9,0.3,1.6
4,2021-01-01 04:00,5,108,서울,-9.3,,,,0.3,,0,,71,,2.2,-13.5,1016.2,,1027.5,,,9,,,,0,0,,,2000,,,-7.5,,-1.3,-1.0,0.2,1.5


## Flag 보유 컬럼

In [23]:
# Names of the quality-control flag columns, i.e. those ending in 'Qcflg'.
flag_columns_list = [name for name in master_df.columns if name.endswith('Qcflg')]

In [24]:
# Measurement column that each QC flag column annotates. Strip the 'Qcflg'
# suffix rather than taking the first two characters, so an element name of
# any length maps to the right column.
flag_list = [column[:-len('Qcflg')] for column in flag_columns_list]

In [25]:
# master_df[flag_list]

# 전처리

## 운형

In [26]:
# wd.cloud_name(3)
# wd.cloud_label('Ac')

In [27]:
# Frequency of each cloud-form abbreviation, laid out as one wide row.
master_df['clfmAbbrCd'].value_counts().to_frame().T

Unnamed: 0,Ci,Sc,StNs,ScAs,ScCi,AcCi,Ac,St,As,ScAc,Cs,AcCs,CbStNs,ScAcCi,ScCs,CuSc,Cc,StAs,CsCi
clfmAbbrCd,529,492,200,189,88,84,71,64,53,34,19,10,8,5,3,2,2,1,1


## Multi label one hot encoding

In [28]:
# cloud form index
# Encode the cloud-form abbreviation into an index vector via wd.converter.
# The column is only created when absent, so re-running the cell is harmless.
# NOTE(review): 10 is presumably the length of the encoded vector — confirm in weather_dict.py.
if 'clfmIdx' in master_df.columns:
    print("컬럼 'clfmIdx'이 존재합니다.")
else:
    cloud_converter_args = (wd.cloud_dict, 10, False)
    master_df['clfmIdx'] = master_df['clfmAbbrCd'].apply(wd.converter, args=cloud_converter_args)
    print("컬럼 'clfmIdx'을 추가하였습니다.")

컬럼 'clfmIdx'을 추가하였습니다.


In [29]:
# Encoded cloud-form vectors for the rows that actually have a cloud-form code.
master_df.loc[master_df['clfmAbbrCd'].notna(), 'clfmIdx']

6       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
7       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
8       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
9       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
10      [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
                     ...              
3639    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3640    [0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
3641    [1, 0, 0, 1, 0, 0, 0, 0, 0, 0]
3642    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3645    [1, 0, 0, 1, 0, 0, 0, 0, 0, 0]
Name: clfmIdx, Length: 1855, dtype: object

In [30]:
# master_df.drop(columns='clfmIdx', inplace=True)

# 현상번호

In [31]:
# Frequency of each phenomenon-number code, laid out as one wide row.
master_df['dmstMtphNo'].value_counts().to_frame().T

Unnamed: 0,19,1901,42,01,40,05,4219,1902,4019,1905,190201,4001,421901,1916,4201,11,5,401901,1904,16,191601,190401,1105,190601,421902,0201,02,19160201,191602,4204,190605
dmstMtphNo,238,197,156,139,126,64,24,15,15,13,6,5,5,4,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1


In [32]:
# Number of rows that carry a phenomenon-number code.
master_df['dmstMtphNo'].notna().sum()

1034

In [33]:
# phenomenon index
# Encode the phenomenon-number code into an index vector via wd.converter.
# The column is only created when absent, so re-running the cell is harmless.
# NOTE(review): 67 is presumably the length of the encoded vector — confirm in weather_dict.py.
if 'dmstMtphNoIdx' in master_df.columns:
    print("컬럼 'dmstMtphNoIdx'이 존재합니다.")
else:
    phenomenon_converter_args = (wd.phNo_dict, 67, False)
    master_df['dmstMtphNoIdx'] = master_df['dmstMtphNo'].apply(wd.converter, args=phenomenon_converter_args)
    print("컬럼 'dmstMtphNoIdx'을 추가하였습니다.")

컬럼 'dmstMtphNoIdx'을 추가하였습니다.


In [34]:
# master_df.drop(columns='dmstMtphNoIdx', inplace=True)

In [35]:
master_df

Unnamed: 0,tm,rnum,stnId,stnNm,ta,taQcflg,rn,rnQcflg,ws,wsQcflg,wd,wdQcflg,hm,hmQcflg,pv,td,pa,paQcflg,ps,psQcflg,ss,ssQcflg,icsr,dsnw,hr3Fhsc,dc10Tca,dc10LmcsCa,clfmAbbrCd,lcsCh,vs,gndSttCd,dmstMtphNo,ts,tsQcflg,m005Te,m01Te,m02Te,m03Te,clfmIdx,dmstMtphNoIdx
0,2021-01-01 00:00,1,108,서울,-8.3,,,9,1.8,,250,,66,,2.2,-13.5,1016.4,,1027.7,,,9,,,,0,0,,,2000,,,-6.8,,-0.9,-0.7,0.3,1.6,,
1,2021-01-01 01:00,2,108,서울,-8.7,,,,2.4,,270,,68,,2.2,-13.5,1016.4,,1027.7,,,9,,,,0,0,,,2000,,,-6.9,,-1.0,-0.8,0.3,1.6,,
2,2021-01-01 02:00,3,108,서울,-9.1,,,,1.6,,270,,69,,2.1,-13.7,1016.2,,1027.5,,,9,,,,0,0,,,2000,,,-7.1,,-1.1,-0.8,0.3,1.6,,
3,2021-01-01 03:00,4,108,서울,-9.3,,,,1.1,,250,,70,,2.1,-13.7,1016.8,,1028.1,,,9,,,,0,0,,,2000,,,-7.3,,-1.2,-0.9,0.3,1.6,,
4,2021-01-01 04:00,5,108,서울,-9.3,,,,0.3,,0,,71,,2.2,-13.5,1016.2,,1027.5,,,9,,,,0,0,,,2000,,,-7.5,,-1.3,-1.0,0.2,1.5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3643,2021-06-01 19:00,647,108,서울,22.4,,,9,1.4,,250,,67,,18.1,15.9,1003.5,,1013.3,,0.0,,0.18,,,10,1,,37,2000,,,21.8,,22.8,21.8,20.8,19.9,,
3644,2021-06-01 20:00,648,108,서울,21.4,,,9,3.4,,320,,75,,19.0,16.7,1003.8,,1013.7,,0.0,,0.03,,,10,4,,40,1873,,,20.6,,22.5,21.7,20.8,20.0,,
3645,2021-06-01 21:00,649,108,서울,20.9,,,9,1.6,,320,,76,,18.8,16.5,1003.9,,1013.8,,,9,,,,9,2,AcCi,49,1603,,,20.0,,22.1,21.4,20.8,20.1,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0]",
3646,2021-06-01 22:00,650,108,서울,20.2,,,9,1.0,,50,,77,,18.2,16.0,1003.9,,1013.9,,,9,,,,9,6,,61,1214,,,19.3,,21.8,21.2,20.8,20.2,,


# 결측치 처리

## 결측치 확인

In [36]:
# Fraction of missing values per column, shown as a single wide row.
(master_df.isna().sum() / len(master_df)).to_frame('null_ratio').T

Unnamed: 0,tm,rnum,stnId,stnNm,ta,taQcflg,rn,rnQcflg,ws,wsQcflg,wd,wdQcflg,hm,hmQcflg,pv,td,pa,paQcflg,ps,psQcflg,ss,ssQcflg,icsr,dsnw,hr3Fhsc,dc10Tca,dc10LmcsCa,clfmAbbrCd,lcsCh,vs,gndSttCd,dmstMtphNo,ts,tsQcflg,m005Te,m01Te,m02Te,m03Te,clfmIdx,dmstMtphNoIdx
null_ratio,0.0,0.0,0.0,0.0,0.0,1.0,0.902686,0.786458,0.000548,0.999452,0.000548,0.999452,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.457511,0.542489,0.457511,0.910362,0.993421,0.0,0.0,0.491502,0.564419,0.0,1.0,0.716557,0.0,0.998355,0.0,0.0,0.0,0.0,0.491502,0.716557


In [37]:
wd.flag_dict

{1: '오류', 9: '결측', 'null': '정상'}

In [38]:
master_df.columns

Index(['tm', 'rnum', 'stnId', 'stnNm', 'ta', 'taQcflg', 'rn', 'rnQcflg', 'ws',
       'wsQcflg', 'wd', 'wdQcflg', 'hm', 'hmQcflg', 'pv', 'td', 'pa',
       'paQcflg', 'ps', 'psQcflg', 'ss', 'ssQcflg', 'icsr', 'dsnw', 'hr3Fhsc',
       'dc10Tca', 'dc10LmcsCa', 'clfmAbbrCd', 'lcsCh', 'vs', 'gndSttCd',
       'dmstMtphNo', 'ts', 'tsQcflg', 'm005Te', 'm01Te', 'm02Te', 'm03Te',
       'clfmIdx', 'dmstMtphNoIdx'],
      dtype='object')

## 강수량: rn

In [39]:
master_df['rnQcflg'].value_counts()

9    779
Name: rnQcflg, dtype: int64

In [40]:
def zero_null(values):
    """Fill a missing measurement with zero when the whole row is empty.

    Meant for ``DataFrame.apply(..., axis=1)`` over a slice whose first column
    is the measurement and whose remaining column(s) hold its QC flag.

    Args:
        values: pandas Series holding one row; position 0 is the measurement.

    Returns:
        ``0.`` when every entry of the row is NaN (no value and no flag);
        otherwise the measurement unchanged, which may itself be NaN when a
        flag is present.
    """
    if values.isna().all():
        return 0.
    # The row is labelled by column name, so `values[0]` would rely on the
    # deprecated integer-as-position fallback; `.iloc` is explicitly positional.
    return values.iloc[0]

In [41]:
master_df['rn'].isna().sum()

3293

In [42]:
# Row-wise fill of the precipitation column: a missing 'rn' whose QC flag is
# also missing becomes 0.0, while rows that carry a flag keep their original
# 'rn' value (still NaN when nothing was recorded).
master_df['rn'] = master_df[['rn', 'rnQcflg']].apply(zero_null, axis=1)

In [43]:
master_df['rn'].isnull().sum()

778

## 일사량: icsr

### gps

In [44]:
%cd /gdrive/MyDrive/Colab_Weather_API

/gdrive/MyDrive/Colab_Weather_API


In [45]:
'''1회만 실행하여 station_dict를 직접 수정한다.'''
# One-off helper, kept disabled: run once and paste the resulting dict into
# wd.station_dict by hand. It reads the station metadata CSV and keeps the
# first latitude/longitude pair found for each station id.
# NOTE(review): `error_bad_lines` was removed in pandas 2.0; use
# on_bad_lines='skip' if this is ever re-enabled.
# location_df = pd.read_csv('./data/META_location_20210704203232.csv', encoding='CP949', error_bad_lines=False, index_col=['지점'])
# station_index = wd.station_dict.keys()
# _station_df = location_df[['위도', '경도']].loc[list(station_index)]
# station_df = _station_df.groupby(level=0).first()
# station_df.to_dict('index')

'1회만 실행하여 station_dict를 직접 수정한다.'

### sunrise/sunset

In [46]:
!pip install astral



In [47]:
from astral import LocationInfo
import datetime
from astral.sun import sun

In [48]:
# Location record for the analysed station; coordinates come from the station table.
station_coords = wd.station_dict[site_index]
city = LocationInfo(
    name="Seoul",
    region="Korea",
    timezone="Asia/Seoul",
    latitude=station_coords['위도'],
    longitude=station_coords['경도'],
)
location_summary = "\n".join([
    f"Information for {city.name}/{city.region}",
    f"Timezone: {city.timezone}",
    f"Latitude: {city.latitude:.02f}; Longitude: {city.longitude:.02f}",
    "",  # keeps the trailing newline of the original message
])
print(location_summary)

Information for Seoul/Korea
Timezone: Asia/Seoul
Latitude: 37.57; Longitude: 126.97



In [49]:
# Sun events at the station for a sample date, expressed in the station's timezone.
sample_date = datetime.date(2009, 4, 22)
s = sun(city.observer, date=sample_date, tzinfo=city.timezone)
# Only sunrise and sunset are reported; the "dawn", "noon" and "dusk" entries
# are intentionally left out of the printout.
print(f'Sunrise: {s["sunrise"]}\nSunset:  {s["sunset"]}\n')

Sunrise: 2009-04-22 05:48:32.186254+09:00
Sunset:  2009-04-22 19:13:25.947829+09:00



In [50]:
master_df['tm']

0       2021-01-01 00:00
1       2021-01-01 01:00
2       2021-01-01 02:00
3       2021-01-01 03:00
4       2021-01-01 04:00
              ...       
3643    2021-06-01 19:00
3644    2021-06-01 20:00
3645    2021-06-01 21:00
3646    2021-06-01 22:00
3647    2021-06-01 23:00
Name: tm, Length: 3648, dtype: object