In [None]:
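# data : DataFrame holding the feature and label columns
# features : names of the feature columns
# labels : name of the label column (passed as a one-element list)
# sequence_length : number of consecutive rows in each LSTM input window
# train_size : number of leading rows used as training data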
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential

def make_dataset(data, label, window_size=20):
    """Build sliding-window samples and their next-step targets.

    Window ``k`` holds rows ``k .. k + window_size - 1`` of ``data``; its
    target is row ``k + window_size`` of ``label``.

    Args:
        data: pandas object indexed positionally via ``.iloc`` (features).
        label: pandas object indexed positionally via ``.iloc`` (targets).
        window_size: number of consecutive rows per window.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays. Both are empty when
        ``data`` has no more than ``window_size`` rows.
    """
    n_windows = len(data) - window_size
    windows = [
        np.array(data.iloc[start:start + window_size])
        for start in range(n_windows)
    ]
    targets = [
        np.array(label.iloc[start + window_size])
        for start in range(n_windows)
    ]
    return np.array(windows), np.array(targets)

def _plot_predictions(actual, predicted, tick_labels, tick_step=4):
    """Plot actual and predicted label values of the test period against row position."""
    plt.rc('font', family='Malgun Gothic')
    plt.figure(figsize=(12, 9))
    plt.plot(actual, label='actual')
    plt.plot(predicted, label='prediction')
    plt.tick_params(color='white', colors='white')
    plt.xlabel(xlabel="날짜", color="white", size=15)
    plt.ylabel(ylabel="가격", color="white", size=15)
    # Label every `tick_step`-th position with the matching index value.
    plt.xticks(range(0, len(predicted), tick_step),
               list(tick_labels[::tick_step]), rotation=45)
    plt.legend()
    plt.show()


def LSTM_model(data,features,labels,sequence_length,train_size,epoch_,batch_,layer_,activation_,optimizer_,plot = True):
    """Train a one-layer LSTM regressor and evaluate it on the tail of ``data``.

    The first ``train_size`` rows form the training portion (20% of its
    windows are held out for validation); the remaining rows form the test
    portion. Both portions are cut into windows of ``sequence_length`` rows
    with ``make_dataset``.

    Args:
        data: DataFrame containing the ``features`` and ``labels`` columns.
        features: list of feature column names.
        labels: one-element list with the label column name.
        sequence_length: number of rows per input window.
        train_size: number of leading rows used for training.
        epoch_: number of training epochs.
        batch_: training batch size.
        layer_: number of units of the LSTM layer.
        activation_: activation of the LSTM layer.
        optimizer_: optimizer passed to ``model.compile``.
        plot: when true, show actual vs. predicted values of the test period.

    Returns:
        Tuple ``(predict_price, rmse, r2)``: the prediction for the step that
        follows the last row of ``data`` (array of shape ``(1,)``, original
        label scale), and RMSE / R^2 on the test period in the original scale.

    Raises:
        ValueError: if the training or the test portion is too short to
            yield at least one window.
    """
    # Min-max scaling.
    # NOTE(review): both scalers are fitted on the whole frame, test rows
    # included, so test-period statistics influence training - confirm this
    # is intended before relying on the reported metrics.
    feature_scaler = MinMaxScaler()
    label_scaler = MinMaxScaler()
    scaled_features = pd.DataFrame(
        feature_scaler.fit_transform(data[features]), columns=features)
    scaled_labels = pd.DataFrame(
        label_scaler.fit_transform(data[labels].values.reshape(-1, 1)),
        columns=labels)

    # Chronological train / test split, then windowing.
    train_feature, train_label = make_dataset(
        scaled_features[:train_size], scaled_labels[:train_size], sequence_length)
    test_feature, test_label = make_dataset(
        scaled_features[train_size:], scaled_labels[train_size:], sequence_length)

    # Fail fast: an empty test set would otherwise only surface after
    # training, and `iloc[-0:]` below would silently select the whole frame.
    if len(train_feature) == 0 or len(test_feature) == 0:
        raise ValueError(
            f"not enough rows for sequence_length={sequence_length} and "
            f"train_size={train_size}: data has {len(data)} rows"
        )

    # LSTM
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_feature, train_label, test_size=0.2, random_state=56)
    model = Sequential()
    model.add(LSTM(layer_,
                   input_shape=(train_feature.shape[1], train_feature.shape[2]),
                   activation=activation_,
                   return_sequences=False))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer=optimizer_,
                  metrics=["mean_squared_error"])
    model.fit(x_train, y_train,
              epochs=epoch_,
              batch_size=batch_,
              validation_data=(x_valid, y_valid))
    pred_test = model.predict(test_feature)

    # Back to the original label scale; the test labels are the last
    # len(test_label) rows of the frame.
    n_test = len(test_label)
    rescaled_actual = data[labels].iloc[-n_test:]
    rescaled_pred = label_scaler.inverse_transform(np.array(pred_test).reshape(-1, 1))

    if plot:
        # Plot both series positionally so they stay aligned whatever the
        # index type of `data` is.
        _plot_predictions(rescaled_actual.to_numpy(), rescaled_pred,
                          data.index[-n_test:])

    # Predict the step after the last row from the most recent window.
    last_window = np.array(scaled_features.iloc[-sequence_length:])[np.newaxis]
    pred_price = model.predict(last_window)
    rescaled_pred_price = label_scaler.inverse_transform(np.array(pred_price).reshape(-1, 1))
    predict_price = rescaled_pred_price[0]

    print("_____parameters______")
    print(f"sequence length : {sequence_length}, train size : {train_size}, epoch = {epoch_}, batch size = {batch_}")
    print(f"activation : {activation_}, optimzer : {optimizer_}")
    print('______________________')

    rmse = np.sqrt(mean_squared_error(rescaled_actual, rescaled_pred))
    r2 = r2_score(rescaled_actual, rescaled_pred)
    return predict_price, rmse, r2
    
# Column names of the grid-search result tables. The misspelled
# "activation_fucntion" is kept unchanged because it is written to the CSV header.
RESULT_COLUMNS = ["optimizer","activation_fucntion","sequence_length","train_size","epochs","batch_size","RMSE","R square"]

# All features
ALL_FEATURES = ["나스닥","코스피","코스닥","기관합계매수량","기타법인매수량",
                "개인매수량","외국인합계매수량","등락률","거래량","전일비",
                "score1","score2","score3","변동금리","vix"]

# Selected features (all features without "거래량" and "전일비")
SELECTED_FEATURES = ["나스닥","코스피","코스닥","기관합계매수량","기타법인매수량",
                     "개인매수량","외국인합계매수량","등락률",
                     "score1","score2","score3","변동금리","vix"]


def run_grid_search(data, features, labels, optimizers, activations, epochs, batch_size,
                    sequence_lengths=range(5, 25, 3), train_sizes=range(100, 180, 10),
                    units=32, seed=56):
    """Evaluate ``LSTM_model`` for every hyper-parameter combination.

    Combinations are visited in the order optimizer -> activation ->
    sequence length -> train size, and the TensorFlow seed is reset to
    ``seed`` before each run.

    Args:
        data: DataFrame passed to ``LSTM_model``.
        features: feature column names.
        labels: one-element list with the label column name.
        optimizers: optimizers to try.
        activations: LSTM activations to try.
        epochs: epochs per run.
        batch_size: batch size per run.
        sequence_lengths: window lengths to try.
        train_sizes: training-row counts to try.
        units: LSTM units per run.
        seed: TensorFlow seed set before each run.

    Returns:
        DataFrame with one row per combination and columns ``RESULT_COLUMNS``.
    """
    rows = []
    for opt, act, seq_len, n_train in itertools.product(
            optimizers, activations, sequence_lengths, train_sizes):
        tf.random.set_seed(seed)
        _, rmse, r2 = LSTM_model(data=data, features=features, labels=labels,
                                 sequence_length=seq_len, train_size=n_train,
                                 epoch_=epochs, batch_=batch_size, layer_=units,
                                 activation_=act, optimizer_=opt, plot=False)
        rows.append([opt, act, seq_len, n_train, epochs, batch_size, rmse, r2])
    return pd.DataFrame(rows, columns=RESULT_COLUMNS)


# Same batch size and epoch count for every run.
b = 16
e = 50
optimizers = ["rmsprop","adam"]
activation_function = ["relu","tanh"]
df_labels = ["종가"]

# --- Kakao ---
df = pd.read_csv(r"C:\sh\study\파이널프로젝트\data\kakao_1_year.csv")
df = df.set_index("Unnamed: 0")
df_drop_row = df.dropna(axis=0)
date = df_drop_row.index

df_features = ALL_FEATURES
param_result_1 = run_grid_search(df_drop_row, df_features, df_labels,
                                 optimizers, activation_function, e, b)
df_features = SELECTED_FEATURES
param_result_2 = run_grid_search(df_drop_row, df_features, df_labels,
                                 optimizers, activation_function, e, b)

param_result_1.to_csv("best_params_LSTM_1.csv")
param_result_2.to_csv("best_params_LSTM_2.csv")


# --- Naver ---
df = pd.read_csv(r"C:\sh\study\파이널프로젝트\data\naver_1_year.csv")
df = df.set_index("Unnamed: 0")
df_drop_row = df.dropna(axis=0)
date = df_drop_row.index

df_features = ALL_FEATURES
param_result_3 = run_grid_search(df_drop_row, df_features, df_labels,
                                 optimizers, activation_function, e, b)
df_features = SELECTED_FEATURES
param_result_4 = run_grid_search(df_drop_row, df_features, df_labels,
                                 optimizers, activation_function, e, b)

param_result_3.to_csv("best_params_LSTM_3.csv")
param_result_4.to_csv("best_params_LSTM_4.csv")