In [2]:
# Data handling.
import pandas as pd
import os 

# Plotting.
import matplotlib.pyplot as plt
# Render figure text with the 'Malgun Gothic' font family -- presumably so Korean
# (Hangul) labels display correctly; the font must be installed locally (TODO confirm).
plt.rcParams['font.family'] ='Malgun Gothic'
# Draw minus signs with an ASCII hyphen rather than the Unicode minus glyph,
# which some fonts do not provide.
plt.rcParams['axes.unicode_minus'] =False

import seaborn as sns
import numpy as np

# NOTE(review): presumably Korean holiday/calendar utilities; not used in the
# cells visible here -- confirm it is needed further down.
from pytimekr import pytimekr

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning).
warnings.filterwarnings(action='ignore')

In [3]:
# Read the preprocessed training table and convert the `tm` column to datetime
# in a single chain, then preview the first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# saved index from an earlier `to_csv`; consider dropping it before modelling.
database = (
    pd.read_csv('./data/train_preprocessed.csv')
    .assign(tm=lambda frame: pd.to_datetime(frame['tm']))
)
database.head()

Unnamed: 0,tm,year,season,month,day,hh24,weekday,week_name,sin_time,cos_time,num,stn,nph_ta,nph_hm,CDH,THI,nph_ws_10m,nph_rn_60m,nph_ta_chi,elec
0,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,13615,140,-8.5,74.5,-34.5,1.538975,0.9,0.0,-5.8,99.63
1,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,18235,565,-8.8,25.6,-34.8,7.943264,1.9,0.0,-4.6,103.49
2,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,18234,565,-8.8,25.6,-34.8,7.943264,1.9,0.0,-4.6,104.43
3,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,18233,512,-6.3,31.3,-32.3,9.196031,2.5,0.0,-7.8,106.29
4,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,11272,941,2.4,68.0,-23.6,19.3536,1.2,0.0,-5.3,102.45


### 그룹화 

격자 종류가 너무 많다. 온도와 습도, 풍속을 측정한 값이 같으면 같은 지역이라고 할 수 있다. 

온도와 습도, 풍속이 같은 값끼리 그룹화를 해보자 


**분석결과** 

AWS 지점번호(stn)가 같은 격자 번호(num)들은 기상 데이터가 서로 같다.

## TrainSet과 ValidationSet 나누기 

Testset에는 elec가 없다. 모델 성능 평가를 위해 trainset과 validationset을 나누자

In [4]:
# Inspect the time span covered by the training table.
# The training data runs from 2020 up to 2023-01-01 00:00:00.
# The test data is said to cover one year starting 2023-01-02 (not verifiable
# from this notebook), so a full year of `elec` has to be predicted.
# Therefore the final year of the training data is held out as the validation set.
tm_col = database['tm']
tm_col.min(), tm_col.max()

(Timestamp('2020-01-01 01:00:00'), Timestamp('2023-01-01 00:00:00'))

In [8]:
# Training split: every row up to and including 2022-01-01 00:00:00.
in_train_period = database['tm'] <= '2022-01-01 00:00:00'
trainset = database.loc[in_train_period]

# Show the first and last timestamp of the split as a sanity check.
trainset['tm'].min(), trainset['tm'].max()

(Timestamp('2020-01-01 01:00:00'), Timestamp('2022-01-01 00:00:00'))

In [9]:
# Validation split: every row strictly after 2022-01-01 00:00:00.
in_val_period = database['tm'] > '2022-01-01 00:00:00'
valset = database.loc[in_val_period]

# Show the first and last timestamp of the split as a sanity check.
valset['tm'].min(), valset['tm'].max()

(Timestamp('2022-01-01 01:00:00'), Timestamp('2023-01-01 00:00:00'))

### 먼저 같은 AWS 지점번호를 갖는 데이터로 모델을 훈련시켜보고 평가하자

In [10]:
# Count how many distinct grid numbers (`num`) each AWS station (`stn`) covers
# in the training split, then rank the stations from most grids to fewest.
# Result: `stn_nums` is a list of [stn, grid_count] pairs.

# NOTE(review): `most` is never read or updated in this cell; it is kept only in
# case a later cell relies on it -- remove it if nothing does.
most = 0

# A single grouped pass replaces re-filtering the whole frame once per station.
# sort=False keeps stations in order of first appearance (the order `unique()`
# yields); dropna=False counts a missing `num` as a value, as `len(unique())` does.
# Rows whose `stn` is missing are not grouped and therefore get no entry.
grids_per_station = trainset.groupby('stn', sort=False)['num'].nunique(dropna=False)

stn_nums = [[stn, nums_num] for stn, nums_num in grids_per_station.items()]

# list.sort is stable, so stations tied on count keep their first-appearance order.
stn_nums.sort(key=lambda x: x[1], reverse=True)

print(stn_nums)

[[846, 6], [541, 5], [133, 5], [565, 4], [937, 4], [899, 4], [572, 4], [152, 4], [617, 4], [493, 4], [550, 4], [119, 4], [511, 4], [146, 4], [138, 4], [827, 4], [279, 4], [941, 3], [114, 3], [940, 3], [445, 3], [371, 3], [255, 3], [253, 3], [783, 3], [546, 3], [908, 3], [162, 3], [450, 3], [428, 3], [532, 3], [168, 3], [788, 3], [404, 3], [636, 3], [143, 3], [140, 2], [512, 2], [434, 2], [904, 2], [590, 2], [942, 2], [376, 2], [649, 2], [112, 2], [939, 2], [551, 2], [127, 2], [545, 2], [257, 2], [673, 2], [543, 2], [313, 2], [589, 2], [427, 2], [710, 2], [506, 2], [101, 2], [184, 2], [353, 2], [165, 2], [774, 2], [599, 2], [433, 2], [156, 2], [192, 2], [950, 2], [822, 2], [860, 2], [693, 2], [824, 2], [642, 2], [496, 2], [327, 2], [702, 2], [864, 2], [131, 2], [129, 2], [840, 2], [845, 2], [459, 1], [364, 1], [377, 1], [548, 1], [533, 1], [430, 1], [438, 1], [938, 1], [509, 1], [492, 1], [974, 1], [247, 1], [471, 1], [288, 1], [245, 1], [516, 1], [898, 1], [358, 1], [432, 1], [472, 1],

In [13]:
# Take the rows of AWS station 846 from both splits as a first, small
# experiment for training and evaluating a model.
is_train_846 = trainset['stn'] == 846
is_val_846 = valset['stn'] == 846
train_temp = trainset.loc[is_train_846]
val_temp = valset.loc[is_val_846]

# Next step: drop the columns that are not needed, then fit an ML model.
train_temp.head()

Unnamed: 0,tm,year,season,month,day,hh24,weekday,week_name,sin_time,cos_time,num,stn,nph_ta,nph_hm,CDH,THI,nph_ws_10m,nph_rn_60m,nph_ta_chi,elec
232,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,13349,846,-0.1,28.6,-26.1,21.814004,4.7,0.0,-9.3,99.08
245,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,13348,846,-0.1,28.6,-26.1,21.814004,4.7,0.0,-9.3,101.51
247,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,13498,846,-0.1,28.6,-26.1,21.814004,4.7,0.0,-9.3,101.6
255,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,13199,846,-0.1,28.6,-26.1,21.814004,4.7,0.0,-9.3,98.77
258,2020-01-01 01:00:00,2020,3,1,1,1,2,0,0.258819,0.965926,13200,846,-0.1,28.6,-26.1,21.814004,4.7,0.0,-9.3,101.39
