In [1]:
# -*- coding: utf-8 -*-
"""working.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1llsccnklCzs7EZSELP6MVysncixaZnQR

## Notebook settings
"""
# region Import
# Data download
# Import basic
import csv
import math
import os
import warnings
# Init google drive
# from google.colab import drive
from datetime import datetime
from timeit import default_timer as timer

import numpy as np
import pandas as pd
# Plottool
import plotly.graph_objs as go
# IPython
from IPython.display import display
# Hyperopt bayesian optimization
from hyperopt import hp, Trials, tpe, fmin, STATUS_OK, partial
# Keras
from keras import Sequential
from keras.activations import softmax
from keras.callbacks import EarlyStopping, ModelCheckpoint  
from keras.initializers import Ones
from keras.layers import LSTM, Dropout, Input
from keras.models import Model
import keras.backend as K
# SKLearn
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# endregion

Using TensorFlow backend.


In [2]:
current_timestamp = datetime.now().strftime('%d%m%Y_%H%M%S')

# region File mount and config
# drive.mount('/content/gdrive', force_remount=True)
root_dir = ""

time_dir = os.path.join(root_dir, "result")
time_dir = os.path.join(time_dir, current_timestamp)

data_dir = root_dir + 'data'
model_dir = os.path.join(time_dir, 'model')
plot_dir = os.path.join(time_dir, 'plot')
result_dir = os.path.join(time_dir, 'result')
# Create folder if not exists

if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)
    
if not os.path.exists(result_dir):
    os.makedirs(result_dir)
    
pd.options.display.max_columns = 12
pd.options.display.max_rows = 24

# disable warnings in Anaconda
warnings.filterwarnings('ignore')

# endregion

In [3]:
# region Data Loading
stock_name = '000001.SS'  # SSE Composite Index
# df_org = yf.download(stock_name, start="1991-01-01", end="2016-12-31", interval="1wk")
df_org = pd.read_csv(f'{data_dir}/{stock_name}.csv', parse_dates=['Date'])
df_org = df_org.sort_values('Date')
# df_org.to_csv(f'{base_dir}/{stock_name}.csv')
df_org.reset_index(inplace=True)
df_org = df_org[['Date', 'Close', 'Open', 'High', 'Low', 'Adj Close', 'Volume']]

# endregion

In [4]:
# region Data ploting
def plot_ohlc(df):
    trace = go.Ohlc(x=df['Date'],
                    open=df['Open'],
                    high=df['High'],
                    low=df['Low'],
                    close=df['Close'],
                    increasing=dict(line=dict(color='#58FA58')),
                    decreasing=dict(line=dict(color='#FA5858')))

    layout = {
        'title': f'{stock_name} Historical Price',
        'xaxis': {'title': 'Date',
                  'rangeslider': {'visible': False}},
        'yaxis': {'title': f'Price'}
    }

    data = [trace]

    fig = go.Figure(data=data, layout=layout)
    fig.write_html(os.path.join(plot_dir, '%s_ohlc.html' % (stock_name)), auto_open=False)


plot_ohlc(df_org)
# endregion

In [5]:
# region Create csv result file
# File to save first results
result_save_fname = os.path.join(result_dir, 'result_%s-%s.csv' % (stock_name, current_timestamp))
of_connection = open(result_save_fname, 'w')
writer = csv.writer(of_connection)
# Write the headers to the file
writer.writerow(['stock_name', 'year', 'loss', 'params', 'iteration', 'windows_size', 'train_time'])
of_connection.close()

# Create file to save bayer best
bayer_save_fname = os.path.join(result_dir, 'bayer_best_%s-%s.csv' % (stock_name, current_timestamp))
of_connection = open(bayer_save_fname, 'w')
writer = csv.writer(of_connection)
# Write the headers to the file
writer.writerow(['stock_name', 'year', 'params', 'model_save_location'])
of_connection.close()
# endregion

In [6]:
# region Sample data

df_org.sample(10)

# endregion

Unnamed: 0,Date,Close,Open,High,Low,Adj Close,Volume
210,1995-01-09,597.840027,626.0,626.0,597.840027,597.840027,0
1283,2015-09-07,3200.233887,3149.379883,3256.74292,3011.116943,3200.233887,1425100
1187,2013-11-04,2106.126953,2156.086914,2166.170898,2103.51001,2106.126953,433400
596,2002-06-03,1529.506958,1510.246948,1540.824951,1455.305054,1529.506958,0
753,2005-06-06,1108.286011,1010.380981,1146.416992,998.228027,1108.286011,136000
277,1996-04-22,707.609985,613.969971,707.609985,613.969971,707.609985,0
189,1994-08-15,713.849976,665.880005,748.559998,665.880005,713.849976,0
1309,2016-03-14,2955.149902,2830.083984,2971.551025,2819.794922,2955.149902,1058300
298,1996-09-16,805.539978,778.289978,805.539978,760.76001,805.539978,0
867,2007-08-20,5107.667969,4773.832031,5125.358887,4758.396973,5107.667969,520000


In [7]:
# region Const
# Declare const
input_col = ['Close', 'Open', 'High', 'Low', 'Adj Close', 'Volume']
output_col = ['Close']
time_col = ['Date']

# Input dimension
input_dim = len(input_col)
# Output dimension
output_dim = len(output_col)

# Number of session to prediction as one time
prediction_size = 1
# For each time model is train, the first is display
sample_display_test_size = 5
# Max bayer iteration
bayer_max_evals = 100


# endregion

In [8]:
# region Declare model
# declare model
def softMaxAxis1(x):
    return softmax(x, axis=1)


def get_model(input_dim, window_size, output_dim, lstm_layer_count=5, drop_rate=0.2):
    model = Sequential()
    model.add(LSTM(units=100, input_shape=(window_size, input_dim), return_sequences=True, kernel_initializer=Ones()))
    model.add(Dropout(rate=0.2))

    for i in range(lstm_layer_count - 2):
        model.add(LSTM(units=100, return_sequences=True))
        model.add(Dropout(rate=drop_rate))
    
    model.add(LSTM(output_dim, activation=softMaxAxis1))
    # TODO: custom loss function
    model.compile(loss='MAPE', optimizer='adam', metrics=['accuracy'])
    
    return model


# endregion

In [9]:
# region Error metric
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def root_mean_square_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)

    return np.mean((y_true - y_pred) / y_true)


def relative_root_mean_square_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    res = (y_true - y_pred) / y_true
    res = np.power(res, 2)
    res = np.mean(res)
    res = math.sqrt(res)

    return res


# endregion

In [10]:
# region Data preprocessing
# reprocessing data
def next_window(df, i, windows_size, prediction_size, input_col, output_col, time_col):
    '''Generates the next data window from the given index location i'''
    window = df[i: i + windows_size + prediction_size]
    x = window[input_col][:-prediction_size]
    y = window[output_col][-prediction_size:]
    y_time = window[time_col][-prediction_size:]
    return x, y, y_time

def smooting_data(df, window_size):
    return df.ewm(span=window_size).mean()

def preprocessing_data(df, windows_size, prediction_size, input_col, output_col, time_col):
    '''
    Create x, y train data windows
    Warning: batch method, not generative, make sure you have enough memory to
    load data, otherwise use generate_training_window() method.
    '''


    data_x = []
    data_y = []
    data_y_time = []
    for i in range(len(df) - windows_size - prediction_size):
        x, y, y_time = next_window(df, i, windows_size, prediction_size, input_col, output_col, time_col)
        data_x.append(x.values)
        data_y.append(y.values)
        data_y_time.append(y_time)

    time = pd.concat(data_y_time)

    return np.array(data_x), np.array(data_y), time.values


def split_train_test_data(X, y):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

    return X_train, y_train, X_valid, y_valid


# endregion

In [11]:
# region Model train
# Trainning model
def train_model(model, X_train, y_train, X_valid, y_valid, stock_name, year, window_size):
    model_save_fname = os.path.join(model_dir, '%s-%s-w%d.h5' % (stock_name, year, window_size))
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=100),
        ModelCheckpoint(filepath=model_save_fname, monitor='val_loss', save_best_only=True)
    ]
    history = model.fit(
        X_train,
        y_train,
        epochs=1000,
        batch_size=10000,
        validation_data=(X_valid, y_valid),
        verbose=1,
        callbacks=callbacks,
        shuffle=False)
    model.save(model_save_fname)
    
    return history


# endregion

In [12]:
# region Test model
def test_model(model, test_data, window_size, prediction_size, input_col, output_col, time_col):
    X, y, time = preprocessing_data(test_data, window_size, prediction_size, input_col, output_col, time_col)
    
    y_pred = model.predict(X)
    
    y_pred = np.repeat(y_pred, input_dim, axis=1)
    y_pred = scaler.inverse_transform(y_pred)[:, [0]]
    y_pred = pd.Series(y_pred.flatten())

    df_test_result = pd.DataFrame(time, columns=['Date'])
    df_test_result['Prediction'] = y_pred
    df_test_result.set_index('Date', inplace=True)

    return df_test_result


def plot_test_result(test_result, stock_name, year, window_size):
    # Plotly
    trace0 = go.Scatter(
        x=test_result.index,
        y=test_result['Close'],
        name='Thực tế',
        line=dict(
            color=('#5042f4'),
            width=2)
    )

    trace1 = go.Scatter(
        x=test_result.index,
        y=test_result['Prediction'],
        name='Dự đoán',
        line=dict(
            color=('#005b4e'),
            width=2,
            dash='dot'
        )  # dash options include 'dash', 'dot', and 'dashdot'
    )

    data = [trace0, trace1]

    # Edit the layout
    layout = dict(title='Biểu đồ dự đoán',
                  xaxis=dict(title='Date'),
                  yaxis=dict(title='Price'),
                  paper_bgcolor='#FFF9F5',
                  plot_bgcolor='#FFF9F5'
                  )

    fig = go.Figure(data=data, layout=layout)
    fig.write_html(os.path.join(plot_dir, '%s_%s_w%d.html' % (stock_name, year, window_size)), auto_open=False)

# endregion

In [None]:
# region Bayers
def objective(params, df):
    # Keep track of evals
    global ITERATION

    ITERATION += 1

    # Make sure windows_size is int
    windows_size = int(params['windows_size'])
    print(f'Window size is {windows_size}')

    model = get_model(input_dim, windows_size, output_dim)

    start = timer()

    # Handle data
    df.describe()
    # TODO: smoothing ddata
    df[input_col] = smooting_data(df[input_col], windows_size)

    X, y, time = preprocessing_data(df, windows_size, prediction_size, input_col, output_col, time_col)

    # Reshape data
    y = y.reshape((y.shape[0], y.shape[1]))

    X_train, y_train, X_valid, y_valid = split_train_test_data(X, y)

    # Perform n_train
    history = train_model(model, X_train, y_train, X_valid, y_valid, stock_name, year, windows_size)

    run_time = timer() - start

    # Test generated loss
    test_result = test_model(model, df, windows_size, prediction_size, input_col, output_col, time_col)
    test_result = test_result.join(df_org.set_index('Date'))

    mae = mean_absolute_error(test_result['Close'], test_result['Prediction'])
    mse = mean_squared_error(test_result['Close'], test_result['Prediction'])
    mape = mean_absolute_percentage_error(test_result['Close'], test_result['Prediction'])
    rrmse = relative_root_mean_square_error(test_result['Close'], test_result['Prediction'])

    #print(f'{stock_name} prediction for {prediction_size} day ahead')
    #print(f'MAE = {mae}')
    #print(f'MSE = {mse}')
    #print(f'MAPE = {mape}')
    #print(f'RRMSE = {rrmse}')

    plot_test_result(test_result, stock_name, year, windows_size)
    loss = mape

    # write row
    of_connection = open(result_save_fname, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([stock_name, year, loss, params, ITERATION, windows_size, run_time])
    of_connection.close()

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION, 'test_result': test_result,
            'train_time': run_time, 'status': STATUS_OK}

start_year = df_org['Date'].values[:1][0]
start_year = pd.to_datetime(start_year).year

end_year = df_org['Date'].values[-1:][0]
end_year = pd.to_datetime(end_year).year

windows_size_best = []
# Global variable
global ITERATION

for year in range(start_year, end_year + 1):
    df = df_org[df_org['Date'].dt.year == year]

    # Data too small, skip
    if df.shape[0] < 10:
        continue

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_cols = scaler.fit_transform(df[input_col])
    df[input_col] = scaled_cols

    # Hyperparameter grid
    param_grid = {
        'windows_size': hp.choice('windows_size', np.arange(1, 8, dtype=int))
    }

    bayes_trials = Trials()

    # Create the algorithm
    bayes_algo = tpe.suggest

    ITERATION = 0

    fmin_objective = partial(objective, df=df)
    bayes_best = fmin(fn=fmin_objective, space=param_grid,
                      algo=bayes_algo, trials=bayes_trials,
                      max_evals=bayer_max_evals)

    best_model_fname = os.path.join(model_dir, '%s-%s-w%d.h5' % (stock_name, year, bayes_best['window_size']))
    of_connection = open(bayer_save_fname, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([stock_name, year, bayes_best, best_model_fname])
    of_connection.close()

    windows_size_best.append([year, bayes_best])
# endregion


Window size is 3                                     



Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 38 samples, validate on 10 samples          
Epoch 1/1000                                         





 - 11s 290ms/step - loss: 581.0742 - acc: 0.0000e+00 - val_loss: 27.3235 - val_acc: 0.0000e+00

Epoch 2/1000                                         
 - 0s 2ms/step - loss: 573.1251 - acc: 0.0000e+00 - val_loss: 30.0864 - val_acc: 0.0000e+00

Epoch 3/1000                                         
 - 0s 2ms/step - loss: 565.3142 - acc: 0.0000e+00 - val_loss: 33.4165 - val_acc: 0.0000e+00

Epoch 4/1000                                         
 - 0s 2ms/step - loss: 555.6868 - acc: 0.0000e+00 - val_loss: 37.5962 - val_acc: 0.0000e+00

Epoch 5/1000                                         
 - 0s 2ms/step - loss: 54


Epoch 24/1000                                        
 - 0s 3ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 25/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 26/1000                                        
 - 0s 3ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 27/1000                                        
 - 0s 3ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 28/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 29/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 30/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_l

 - 0s 3ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 65/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 66/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 67/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 68/1000                                        
 - 0s 3ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 69/1000                                        
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 70/1000                                        
 - 0s 1ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 71/1000     

 - 0s 1ms/step - loss: 495.9534 - acc: 0.0000e+00 - val_loss: 28.0470 - val_acc: 0.0000e+00

Epoch 3/1000                                                                    
 - 0s 1ms/step - loss: 491.5462 - acc: 0.0000e+00 - val_loss: 29.7825 - val_acc: 0.0000e+00

Epoch 4/1000                                                                    
 - 0s 992us/step - loss: 486.8152 - acc: 0.0000e+00 - val_loss: 31.7845 - val_acc: 0.0000e+00

Epoch 5/1000                                                                    
 - 0s 865us/step - loss: 481.2051 - acc: 0.0000e+00 - val_loss: 34.2095 - val_acc: 0.0000e+00

Epoch 6/1000                                                                    
 - 0s 1ms/step - loss: 475.3090 - acc: 0.0000e+00 - val_loss: 37.2247 - val_acc: 0.0000e+00

Epoch 7/1000                                                                    
 - 0s 1ms/step - loss: 468.4413 - acc: 0.0000e+00 - val_loss: 41.0096 - val_acc: 0.0000e+00

Epoch 8/1000                     

 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 35/1000                                                                   
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 36/1000                                                                   
 - 0s 1ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 37/1000                                                                   
 - 0s 1ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 38/1000                                                                   
 - 0s 954us/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 39/1000                                                                   
 - 0s 1ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 40/1000                

 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 67/1000                                                                   
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 68/1000                                                                   
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 69/1000                                                                   
 - 0s 3ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 70/1000                                                                   
 - 0s 1ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 71/1000                                                                   
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 72/1000                  

 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 99/1000                                                                   
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 100/1000                                                                  
 - 0s 2ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Epoch 101/1000                                                                  
 - 0s 1ms/step - loss: 100.0000 - acc: 0.0000e+00 - val_loss: 100.0000 - val_acc: 0.0000e+00

Window size is 3                                                                
Train on 38 samples, validate on 10 samples                                     
Epoch 1/1000                                                                    
  2%|▏         | 2/100 [01:32<1:03:10, 38.67s/it, best loss: 29.133024312122892]