## 예측 함수 정의

### 전처리된 데이터 프레임 들고오기
- 데이터는 input과 target으로 잘 분리되어있다.
- input의 컬럼 : "일시", "지점", "평균 풍속(m/s)", "평균 상대습도", "평균 기온(°C)", "평균 수온(°C)", "일강수량(mm)", "합계 일조시간(hr)"
- target의 컬럼 : "일시"(인덱스), "지점_x", "target_x"

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import LSTM, GRU, Dense, Dropout, Flatten, Embedding
from keras import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import Recall, Precision
from keras.utils import set_random_seed, to_categorical, plot_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import load_model

In [11]:
# 사용하는 함수 모음
def get_dens(v):
    """ # 밀도 상한 1000으로 두기"""
    if v >= 1000:
        density = 1000
    else:
        density = v
    return density

# Oversampling을 위해 데이터 복제

# # Oversampling 1. 앞 쪽 데이터 증식
def oversample(df, TRAIN_SPLIT):
    """ Oversampling을 위해 데이터 복제"""
    df['target_oversample'] = df['target']
    for i in range(2,TRAIN_SPLIT):
        if df['target_oversample'].iloc[i-2] == 0 and df['target_oversample'].iloc[i-1] == 0 and df['target_oversample'].iloc[i] != 0:
            df['target_oversample'].iloc[i-2] = (df['target_oversample'].iloc[i])*1/3
            df['target_oversample'].iloc[i-1] = (df['target_oversample'].iloc[i])*2/3
            
        elif df['target_oversample'].iloc[i-1] == 0 and df['target_oversample'].iloc[i] != 0:
            df['target_oversample'].iloc[i-1] = (df['target_oversample'].iloc[i])*1/2
    # # Oversampling 2. 뒤 쪽 데이터 증식
    for i in range(0,TRAIN_SPLIT):
        if df['target_oversample'].iloc[i+2] == 0 and df['target_oversample'].iloc[i+1] == 0 and df['target_oversample'].iloc[i] != 0:
            df['target_oversample'].iloc[i+2] = (df['target_oversample'].iloc[i])*1/3
            df['target_oversample'].iloc[i+1] = (df['target_oversample'].iloc[i])*2/3
            i = i+2
    return df

# step 은 보통 1, start_index, end_index 기준으로 데이터를 쪼갠다. -> train, validation
# history_size로 30, target_size로 7로 우선 생각
def multivariate_data(input, target, start_index, end_index, history_size=30, target_size=7, step=1):
    """ dataset 기간별로 쪼개는 함수"""
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(input) - target_size

    for i in range(start_index, end_index):
        indices = range(i - history_size, i, step)
        data.append(input[indices])  # 사용할 데이터 수 (날짜 이전 값)
        labels.append(target[i:i+target_size]) # 예측할 데이터 수 (날짜 이후 값)
    return np.array(data), np.array(labels)

In [28]:
def make_predict(input_addr, target_addr):
    # 데이터 프레임 불러오기
    input_data = pd.read_csv(input_addr, index_col=0)
    target_data = pd.read_csv(target_addr, index_col=0)

    # 사용하는 상수 모음
    BUFFER_SIZE = 10000; TRAIN_SPLIT = len(target_data)-8; BATCH_SIZE = 16

    input_data.drop(columns=["지점", "평균 상대습도(%)", "일시"], inplace=True)
    target_data.drop(columns=["지점_x"], inplace=True)
    input_data.reset_index(inplace=True)
    input_data.drop(columns=["index"], inplace=True)
    target_data.reset_index(inplace=True)
    target_data.drop(columns=["일시"], inplace=True)
    target_data.rename(columns={"target_x": "target"}, inplace=True)
    
    # get_dens(v) 함수 적용
    target_data["target"] = target_data["target"].apply(lambda v : get_dens(v))
    target_data = oversample(target_data, TRAIN_SPLIT)

    # 표준화
    input_data1 = np.load("input_data1.npy")
    st = StandardScaler()
    st.fit(input_data1)
    input_data1 = st.transform(input_data1)
    input_data = st.transform(input_data)
    input_data = input_data.round(2)

    pred_data, pred_target = multivariate_data(input_data, target_data["target_oversample"], 30, TRAIN_SPLIT)

    # 타겟 표준화 0~1000 -> 0~1
    pred_target = pred_target/1000.0
    pred_data_multi = tf.data.Dataset.from_tensor_slices((pred_data, pred_target))

    # 모듈 로딩
    model = load_model("best_model2.h5")
    for x, y in pred_data_multi.take(1):
        print(model.predict(x).shape)
    y_np_array = np.array(y)
    real_value = pd.DataFrame(y_np_array)
    predict_value = pd.DataFrame(model.predict(x))
    result = pd.concat([real_value,predict_value], axis=1)
    result.columns = ['real_1','real_2','real_3','real_4','real_5', 'real_6', 'real','predict_1','predict_2','predict_3', \
        'predict_4', 'predict_5', 'predict_6', 'predict']

    for col in result.columns:    
        for i in range(0,len(result[col])):
            if int(result[col][i]) < 0:
                result[col][i] = 0
            if int(result[col][i]) > 1:
                result[col][i] = 1


    # 적조발생 수치 계산
    red_tied_rate = 0.1  #적조 발생 밀도 기준치
    TP = result.real_1[(result['real_1'] >= red_tied_rate) & (result['predict_1'] >= red_tied_rate)].count()
    TN = result.real_1[(result['real_1'] < red_tied_rate) & (result['predict_1'] < red_tied_rate)].count()
    FN = result.real_1[(result['real_1'] >= red_tied_rate) & (result['predict_1'] < red_tied_rate)].count()
    FP = result.real_1[(result['real_1'] < red_tied_rate) & (result['predict_1'] >= red_tied_rate)].count()
    Pr = result.real_1[(result['real_1'] >= red_tied_rate)].count()
    Nr = result.real_1[(result['real_1'] < red_tied_rate)].count()
    Pp = result.real_1[(result['predict_1'] >= red_tied_rate)].count()
    Np = result.real_1[(result['real_1'] < red_tied_rate)].count()
    All = Pr + Nr

    Precision = TP /(TP+FP)
    Recall = TP /(TP+FN)
    F1_Score = (2*Precision*Recall) / (Precision + Recall)
    print("Precision = {}".format(Precision))
    print("Recall = {}".format(Recall))
    print("F1_Score = {}".format(F1_Score))

    print("Accuracy = {}".format((TP+TN)/ All))
    print("Error Rate = {}".format((FP+FN)/ All))
    print("Sensitivity = {}".format(TP /Pr))
    print("Specificity = {}".format(TN /Nr))
    print("Precision = {}".format(Precision))
    print("Recall = {}".format(Recall))
    print("F1_Score = {}".format(F1_Score))

    # 결과 그래프
    result[['real_1','predict_1']].plot()

make_predict("example_input", "example_target")



ValueError: in user code:

    File "c:\ProgramData\Anaconda3\envs\EV_PY39\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "c:\ProgramData\Anaconda3\envs\EV_PY39\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\ProgramData\Anaconda3\envs\EV_PY39\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "c:\ProgramData\Anaconda3\envs\EV_PY39\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "c:\ProgramData\Anaconda3\envs\EV_PY39\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\ProgramData\Anaconda3\envs\EV_PY39\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_9" is incompatible with the layer: expected shape=(None, 30, 5), found shape=(None, 5)


In [23]:
pd.read_csv("example_target", index_col=0)

Unnamed: 0_level_0,지점_x,target_x
일시,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,거제,0.0
2017-01-02,거제,0.0
2017-01-03,거제,0.0
2017-01-04,거제,0.0
2017-01-05,거제,0.0
...,...,...
2022-09-26,거제,0.0
2022-09-27,거제,0.0
2022-09-28,거제,0.0
2022-09-29,거제,0.0
