In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout
from keras.layers import LSTM

# 데이터 로드

In [7]:
curr_dir = os.getcwd()

forders_tr = os.listdir(curr_dir+"\\data")

data = pd.read_csv(os.path.join(curr_dir+"\\data\\")+forders_tr[0])
data = data.drop(data.columns[0], axis=1)
data

submission = pd.read_csv('2-1_검증데이터셋.csv')
submission1 = submission.copy()          

# 전처리

In [121]:
soil = pd.read_csv('OBS_AAOS_TIM_20220802210224.csv', encoding='cp949')       # 외부 데이터 로드
stn = list(set(soil['지점'].values))

soil_df1 = []      # 일시 포함된 데이터 프레임
for i in range(len(stn)):
    df = soil[soil['지점']==stn[i]]
    df = df.reset_index()
    df = df.drop('index', axis=1)
    soil_df1.append(df)

    
soil_df = []        # 일시가 제외된 데이터 프레임 (추합을 위해)
for i in range(len(stn)):
    df = soil[soil['지점']==stn[i]]
    df = df.reset_index()
    df = df.drop('index', axis=1)
    df = df.drop(['지점', '지점명', '일시'], axis=1)   
    soil_df.append(df) 
  
day = pd.date_range('2021-11-26', periods=95*24+20*24, freq='1H')    # 21.11.26~22.03.20
day
day1 = pd.DataFrame(day)
day1.columns = ['일자']
day1
day1.to_csv('day_real.csv', index=False)


for i in range(len(stn)):
    df = soil[soil['지점']==stn[i]]
    df.to_csv(os.path.join(str(stn[i])+'.csv'), index=False)   # 관측소마다.csv 파일로 저장

In [251]:
# 관측소마다 일시(일자) 통합을 위해 'day_real'파일과 위에서 저장된 모든 관측소에 대해 엑셀 VLOOKUP 함수 이용하여 
# 중간에 비어 있는 일시를 채워주는 작업 후, 각 파일을 _1.csv 파일로 저장

soil_df2 = []
for i in range(len(stn)):
    df = pd.read_csv(os.path.join(str(stn[i])+'_1.csv'), encoding='cp949')
    df = df.drop('일자', axis=1)
    df = df.fillna(0)
    soil_df2.append(df)



soil_df3 = soil_df2[0].copy()
for i in range(len(soil_df[0].columns)):
    soil_df2[0][soil_df2[0].columns[i]] = 0

# 각 관측소마다 평균 내기 (추합)
for i in range(1, len(soil_df2)):
    soil_df3 = soil_df3 + soil_df2[i]
soil_df3 = soil_df3/len(soil_df2)
soil_df3['일자'] = pd.date_range('2021-11-26', periods=95*24+20*24, freq='1H')


# 제공 데이터와의 통합을 위해 제공 데이터에 없는 일시를 외부 데이터에서 제외시킴
d1 = soil_df3[soil_df3['일자'] == '2021-12-22 0:00'].index[0]
for i in range(d1, d1+24*3):
    soil_df3 = soil_df3.drop(i,axis=0)

d2 = soil_df3[soil_df3['일자'] == '2022-03-01 0:00'].index[0]
for i in range(d2, d2+24*3):
    soil_df3 = soil_df3.drop(i,axis=0)
    
d3 = soil_df3[soil_df3['일자'] == '2022-03-14 0:00'].index[0]
for i in range(d3, d3+24):
    soil_df3 = soil_df3.drop(i,axis=0)
    
d4 = soil_df3[soil_df3['일자'] == '2022-03-16 0:00'].index[0]
for i in range(d4, d4+24*2):
    soil_df3 = soil_df3.drop(i,axis=0)

soil_df3 = soil_df3.reset_index()
soil_df3 = soil_df3.drop('index', axis=1)

# 제공 데이터와 외부 데이터 통합 (시간 데이터 제외 모든 열)
col = ['smart_farm.insolation', 'smart_farm.out_tmperature',\
       'smart_farm.out_humidity', 'smart_farm.wind_speed',\
       'smart_farm.wind_direction', 'smart_farm.in_tmperature',\
       'smart_farm.in_humidity', 'smart_farm.shield_light_h',\
       'smart_farm.shield_tmperature_h', 'smart_farm.shield_energy_h',\
       'smart_farm.shield_energy_v', 'smart_farm.exhaust_fan',\
       'smart_farm.ceiling', 'smart_farm.floating_fan',\
       'smart_farm.fan_coil_b_site',\
       'smart_farm.ventilation_temperature_control',\
       'smart_farm.heating_temperature_set_up']          

data_ = pd.DataFrame()
for k in tqdm(range(len(col))):
    h = []
    for i in range(int(len(data)/60)):      # 분 -> 시간 단위 바꾸기 위해
        h.append(np.mean(data[col[k]][60*i:60*(i+1)]))
    data_[col[k]] = h
    
h1 = []
for i in range(int(len(data)/60)):
    h1.append(np.sum(data['smart_farm.heat_supply'][60*i:60*(i+1)]))
data_['smart_farm.heat_supply'] = h1



real_data = pd.concat([soil_df3, data_], axis=1)
real_data1 = pd.concat([real_data[real_data.columns[4]], real_data[real_data.columns[:4]], real_data[real_data.columns[5:]]], axis=1)
real_data1 = real_data1.reset_index()
real_data1 = real_data1.drop('index', axis=1)
real_data1 = real_data1.dropna(subset = ['일자'])
real_data1.to_csv('real_data_통합.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:05<00:00,  2.86it/s]


In [8]:
# 위에서 저장한 'real_data_통합' 파일과 'day_real' 파일 이용하여
# 엑셀 VLOOKUP 함수 이용하여 21.11.26부터 22.03.20까지 비어 일자 채운 후, 다시 'real_data_결측' 파일로 저장

pre = pd.read_csv('real_data_결측.csv', encoding='cp949')
pre1 = pre.interpolate(method ='linear')      
pre1 = pre1.dropna(subset = ['일자'])
pre1.to_csv('real_data_최종.csv', index=False)  # 최종 전처리 파일(결측치 제거한)

real1 = pd.read_csv('real_data_최종.csv', encoding='utf-8')
real1

# 자른 이유? 20개를 추출하기 위해 일자 슬라이싱이 필요함.
training = real1[:-504]
testing = real1[-504:]
testing = testing.reset_index()
testing = testing.drop('index', axis=1)

# 모델 1

In [13]:
X = training.drop(['10CM 정시 토양수분(%)', '20CM 정시 토양수분(%)', '30CM 정시 토양수분(%)',\
       '50CM 정시 토양수분(%)','일자','smart_farm.heat_supply'], axis=1)
y = training[['smart_farm.heat_supply']].values

training_data, test_data, y_tr, y_te = train_test_split(X, y, test_size=0.3, shuffle=False)  

validation = testing.drop(['10CM 정시 토양수분(%)', '20CM 정시 토양수분(%)', '30CM 정시 토양수분(%)',\
       '50CM 정시 토양수분(%)','일자','smart_farm.heat_supply'], axis=1)


x_train = []
y_train = []

for i in range(24, training_data.shape[0]):
  x_train.append(training_data[i-24:i])
  y_train.append(training_data.iloc[i,-1])
    
    
x_test = []
y_test = []

for i in range(24, test_data.shape[0]):
  x_test.append(test_data[i-24:i])
  y_test.append(test_data.iloc[i,-1])
x_train, y_train = np.array(x_train), np.array(y_train)
x_test, y_test = np.array(x_test), np.array(y_test)


x_validation = []
for i in range(24, validation.shape[0]):
  x_validation.append(validation[i-24:i])
x_validation = np.array(x_validation)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print(x_validation.shape)

model = Sequential()
model.add(LSTM(16, 
               input_shape=(x_train.shape[1], x_train.shape[2]), 
               activation='tanh', 
               return_sequences=False)
          )

model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# model_path = 'model'
# filename = os.path.join(model_path, 'tmp_checkpoint.h5')
# checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

history = model.fit(x_train, y_train, 
                                    epochs=10, #200
                                    batch_size=16,
                                    validation_data=(x_test, y_test))
#                                     callbacks=[early_stop, checkpoint]

(1555, 24, 17)
(1555,)
(653, 24, 17)
(653,)
(480, 24, 17)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
p = model.predict(x_validation)  
p = p.flatten()

h1 = []
for i in range(int(len(p)/24)): 
    h1.append(np.sum(p[24*i:24*(i+1)]))    
    
h1.insert(0,0)     # 한 번만 클릭!
h11 = h1[4:14] + h1[18:21]

submission1['heat_supply_day1'] = h11
submission1 



Unnamed: 0,yy,mm,dd,heat_supply_day1,heat_supply_day2
0,2022,3,4,297.337341,
1,2022,3,5,310.437469,
2,2022,3,6,308.444641,
3,2022,3,7,274.734497,
4,2022,3,8,302.048218,
5,2022,3,9,304.179565,
6,2022,3,10,311.537537,
7,2022,3,11,311.503357,
8,2022,3,12,311.522675,
9,2022,3,13,311.519531,


# 모델 2(절감 모델)

In [15]:
X = training.drop(['일자','smart_farm.heat_supply'], axis=1)
y = training[['smart_farm.heat_supply']].values

training_data, test_data, y_tr, y_te = train_test_split(X, y, test_size=0.3, shuffle=False)   # shuffle은 안 됨. 시간적순서라

validation = testing.drop(['일자','smart_farm.heat_supply'], axis=1)


x_train = []
y_train = []

for i in range(24, training_data.shape[0]):
  x_train.append(training_data[i-24:i])
  y_train.append(training_data.iloc[i,-1])
    
x_test = []
y_test = []

for i in range(24, test_data.shape[0]): 
  x_test.append(test_data[i-24:i])
  y_test.append(test_data.iloc[i,-1])
x_train, y_train = np.array(x_train), np.array(y_train)
x_test, y_test = np.array(x_test), np.array(y_test)



x_validation = []
for i in range(24, validation.shape[0]):
  x_validation.append(validation[i-24:i])
x_validation = np.array(x_validation)


print(x_train.shape) 
print(y_train.shape) 
print(x_test.shape)
print(y_test.shape)
print(x_validation.shape)


model = Sequential()
model.add(LSTM(16, 
               input_shape=(x_train.shape[1], x_train.shape[2]), 
               activation='tanh', 
               return_sequences=False)
          )

model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
# early_stop = EarlyStopping(monitor='val_loss', patience=5)

# model_path = 'model'
# filename = os.path.join(model_path, 'tmp_checkpoint.h5')
# checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

history = model.fit(x_train, y_train, 
                                    epochs=10, #200
                                    batch_size=16,
                                    validation_data=(x_test, y_test))
#                                     callbacks=[early_stop, checkpoint]

(1555, 24, 21)
(1555,)
(653, 24, 21)
(653,)
(480, 24, 21)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
p2 = regression_GRU.predict(x_validation)  
p2 = p2.flatten()

h2 = []
for i in range(int(len(p2)/24)): 
    h2.append(np.sum(p2[24*i:24*(i+1)]))
h2.insert(0,0)     # 한 번만 클릭!
h22 = h2[4:14] + h2[18:21]
h22
submission1['heat_supply_day2'] = h22
submission1   



Unnamed: 0,yy,mm,dd,heat_supply_day1,heat_supply_day2
0,2022,3,4,297.337341,289.212708
1,2022,3,5,310.437469,289.050354
2,2022,3,6,308.444641,297.866333
3,2022,3,7,274.734497,289.450256
4,2022,3,8,302.048218,288.00412
5,2022,3,9,304.179565,296.704285
6,2022,3,10,311.537537,287.675873
7,2022,3,11,311.503357,286.65152
8,2022,3,12,311.522675,311.615356
9,2022,3,13,311.519531,300.276306


In [19]:
submission1.to_csv('220224_최종코드틀_LSTM.csv', index=False) 