In [1]:
# -*- coding: utf-8 -*-
"""working.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1llsccnklCzs7EZSELP6MVysncixaZnQR

## Notebook settings
"""
# region Import
# Data download
# Import basic
import csv
import math
import os
import warnings
# Init google drive
# from google.colab import drive
from datetime import datetime
from timeit import default_timer as timer

import numpy as np
import pandas as pd
# Plottool
import plotly.graph_objs as go
# IPython
from IPython.display import display
# Hyperopt bayesian optimization
from hyperopt import hp, Trials, tpe, fmin, STATUS_OK, partial
# Keras
from keras import Sequential
from keras import optimizers
from keras.activations import softmax
from keras.callbacks import EarlyStopping, ModelCheckpoint  
from keras.initializers import random_normal, Ones 
from keras.layers import LSTM, Dropout, Input
from keras.models import Model
import keras.backend as K
# SKLearn
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# endregion

Using TensorFlow backend.


In [2]:
try:
  from google.colab import drive
  IN_COLAB = True
except:
  IN_COLAB = False

current_timestamp = datetime.now().strftime('%d%m%Y_%H%M%S')
if not IN_COLAB:
    
    # region File mount and config
    # drive.mount('/content/gdrive', force_remount=True)
    root_dir = ""
    
    time_dir = os.path.join(root_dir, "result")
    time_dir = os.path.join(time_dir, current_timestamp)
    
    data_dir = root_dir + 'data'
    model_dir = os.path.join(time_dir, 'model')
    plot_dir = os.path.join(time_dir, 'plot')
    result_dir = os.path.join(time_dir, 'result')
    # Create folder if not exists
    
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
        
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
else:
    # region File mount and config
    drive.mount('/content/gdrive', force_remount=True)
    root_dir = "/content/gdrive/My Drive/stock"
    
    time_dir = os.path.join(root_dir, "result")
    
    data_dir = os.path.join(root_dir, "data")
    model_dir = os.path.join(time_dir, 'model')
    plot_dir = os.path.join(time_dir, 'plot')
    result_dir = os.path.join(time_dir, 'result')
    
pd.options.display.max_columns = 12
pd.options.display.max_rows = 24

# disable warnings in Anaconda
warnings.filterwarnings('ignore')

# endregion

Mounted at /content/gdrive


In [0]:
# region Data Loading
stock_name = '000001.SS'  # SSE Composite Index
# df_org = yf.download(stock_name, start="1991-01-01", end="2016-12-31", interval="1wk")
df_org = pd.read_csv(f'{data_dir}/{stock_name}.csv', parse_dates=['Date'])
df_org = df_org.sort_values('Date')
# df_org.to_csv(f'{base_dir}/{stock_name}.csv')
df_org.reset_index(inplace=True)
df_org = df_org[['Date', 'Close', 'Open', 'High', 'Low', 'Adj Close', 'Volume']]

# endregion

In [0]:
# region Data ploting
def plot_ohlc(df):
    trace = go.Ohlc(x=df['Date'],
                    open=df['Open'],
                    high=df['High'],
                    low=df['Low'],
                    close=df['Close'],
                    increasing=dict(line=dict(color='#58FA58')),
                    decreasing=dict(line=dict(color='#FA5858')))

    layout = {
        'title': f'{stock_name} Historical Price',
        'xaxis': {'title': 'Date',
                  'rangeslider': {'visible': False}},
        'yaxis': {'title': f'Price'}
    }

    data = [trace]

    fig = go.Figure(data=data, layout=layout)
    fig.write_html(os.path.join(plot_dir, '%s_ohlc.html' % (stock_name)), auto_open=False)


plot_ohlc(df_org)
# endregion

In [0]:
# region Create csv result file
# File to save first results
result_save_fname = os.path.join(result_dir, 'result_%s-%s.csv' % (stock_name, current_timestamp))
of_connection = open(result_save_fname, 'w')
writer = csv.writer(of_connection)
# Write the headers to the file
writer.writerow(['stock_name', 'year', 'loss', 'params', 'iteration', 'windows_size', 'train_time'])
of_connection.close()

# Create file to save bayer best
bayer_save_fname = os.path.join(result_dir, 'bayer_best_%s-%s.csv' % (stock_name, current_timestamp))
of_connection = open(bayer_save_fname, 'w')
writer = csv.writer(of_connection)
# Write the headers to the file
writer.writerow(['stock_name', 'year', 'params', 'model_save_location'])
of_connection.close()
# endregion

In [6]:
# region Sample data

df_org.sample(10)

# endregion

Unnamed: 0,Date,Close,Open,High,Low,Adj Close,Volume
773,2005-10-25,1092.817017,1140.172974,1140.172974,1067.406982,1092.817017,86200
1007,2010-04-27,2870.610107,2962.14502,2962.14502,2820.948975,2870.610107,370200
742,2005-03-22,1200.113037,1230.682007,1231.676025,1185.456055,1200.113037,58000
1184,2013-09-17,2221.043945,2230.387939,2230.927979,2172.042969,2221.043945,402800
682,2004-01-27,1623.880005,1600.430054,1650.291016,1585.958008,1623.880005,77000
623,2002-12-10,1408.515015,1400.557983,1409.942993,1367.312012,1408.515015,0
764,2005-08-23,1154.427979,1158.828979,1174.536987,1139.239014,1154.427979,117200
117,1993-03-30,947.869995,958.049988,958.049988,925.909973,947.869995,0
136,1993-08-10,1023.869995,862.080017,1023.869995,862.080017,1023.869995,0
1020,2010-07-27,2672.516113,2581.590088,2675.761963,2564.156006,2672.516113,562600


In [0]:
# region Const
# Declare const
input_col = ['Close', 'Open', 'High', 'Low', 'Adj Close', 'Volume']
output_col = ['Close']
time_col = ['Date']

# Input dimension
input_dim = len(input_col)
# Output dimension
output_dim = len(output_col)

# Number of session to prediction as one time
prediction_size = 1
# For each time model is train, the first is display
sample_display_test_size = 5
# Max bayer iteration
bayer_max_evals = 100


# endregion

In [0]:
# region Declare model
# declare model
def softMaxAxis1(x):
    return softmax(x, axis=1)


def get_model(input_dim, window_size, output_dim, lstm_layer_count=5, drop_rate=0.2):
    model = Sequential()
    model.add(LSTM(units=100, input_shape=(window_size, input_dim), return_sequences=True))
    model.add(Dropout(rate=drop_rate))

    for i in range(lstm_layer_count - 2):
        model.add(LSTM(units=100, return_sequences=True))
        model.add(Dropout(rate=drop_rate))
    
    model.add(LSTM(output_dim, activation=softMaxAxis1))
    opt = optimizers.Adam(lr=0.05, beta_1=0.99, beta_2=0.999)
    model.compile(loss='MAE', optimizer=opt)
    
    return model


# endregion

In [0]:
# region Error metric
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def root_mean_square_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)

    return np.mean((y_true - y_pred) / y_true)


def relative_root_mean_square_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    res = (y_true - y_pred) / y_true
    res = np.power(res, 2)
    res = np.mean(res)
    res = math.sqrt(res)

    return res


# endregion

In [0]:
# region Data preprocessing
# reprocessing data
def next_window(df, i, windows_size, prediction_size, input_col, output_col, time_col):
    '''Generates the next data window from the given index location i'''
    window = df[i: i + windows_size + prediction_size]
    x = window[input_col][:-prediction_size]
    y = window[output_col][-prediction_size:]
    y_time = window[time_col][-prediction_size:]
    return x, y, y_time

def smooting_data(df, window_size):
    return df.ewm(span=window_size).mean()

def preprocessing_data(df, windows_size, prediction_size, input_col, output_col, time_col):
    '''
    Create x, y train data windows
    Warning: batch method, not generative, make sure you have enough memory to
    load data, otherwise use generate_training_window() method.
    '''


    data_x = []
    data_y = []
    data_y_time = []
    for i in range(len(df) - windows_size - prediction_size):
        x, y, y_time = next_window(df, i, windows_size, prediction_size, input_col, output_col, time_col)
        data_x.append(x.values)
        data_y.append(y.values)
        data_y_time.append(y_time)

    time = pd.concat(data_y_time)

    return np.array(data_x), np.array(data_y), time.values


def split_train_test_data(X, y):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

    return X_train, y_train, X_valid, y_valid


# endregion

In [0]:
# region Model train
# Trainning model
def train_model(model, X_train, y_train, X_valid, y_valid, stock_name, year, window_size):
    if not IN_COLAB:
        model_save_fname = os.path.join(model_dir, '%s-%s-w%d.h5' % (stock_name, year, window_size))
    else:
        model_save_fname = os.path.join(model_dir, '%s-%s-w%d-%s.h5' % (stock_name, year, window_size, datetime.now().strftime('%d%m%Y_%H%M%S')))
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=100),
        ModelCheckpoint(filepath=model_save_fname, monitor='val_loss', save_best_only=True)
    ]
    history = model.fit(
        X_train,
        y_train,
        epochs=1000,
        batch_size=10000,
        validation_data=(X_valid, y_valid),
        verbose=1,
        callbacks=callbacks,
        shuffle=False)
    model.save(model_save_fname)
    
    return history


# endregion

In [0]:
# region Test model
def test_model(model, test_data, window_size, prediction_size, input_col, output_col, time_col):
    X, y, time = preprocessing_data(test_data, window_size, prediction_size, input_col, output_col, time_col)
    
    y_pred = model.predict(X)
    
    y_pred = np.repeat(y_pred, input_dim, axis=1)
    y_pred = scaler.inverse_transform(y_pred)[:, [0]]
    y_pred = pd.Series(y_pred.flatten())

    df_test_result = pd.DataFrame(time, columns=['Date'])
    df_test_result['Prediction'] = y_pred
    df_test_result.set_index('Date', inplace=True)

    return df_test_result


def plot_test_result(test_result, stock_name, year, window_size):
    # Plotly
    trace0 = go.Scatter(
        x=test_result.index,
        y=test_result['Close'],
        name='Thực tế',
        line=dict(
            color=('#5042f4'),
            width=2)
    )

    trace1 = go.Scatter(
        x=test_result.index,
        y=test_result['Prediction'],
        name='Dự đoán',
        line=dict(
            color=('#005b4e'),
            width=2,
            dash='dot'
        )  # dash options include 'dash', 'dot', and 'dashdot'
    )

    data = [trace0, trace1]

    # Edit the layout
    layout = dict(title='Biểu đồ dự đoán',
                  xaxis=dict(title='Date'),
                  yaxis=dict(title='Price'),
                  paper_bgcolor='#FFF9F5',
                  plot_bgcolor='#FFF9F5'
                  )

    fig = go.Figure(data=data, layout=layout)
    if not IN_COLAB:
        fig.write_html(os.path.join(plot_dir, '%s_%s_w%d.html' % (stock_name, year, window_size)), auto_open=False)
    else:
        fig.write_html(os.path.join(plot_dir, '%s_%s_w%d_%s.html' % (stock_name, year, window_size, datetime.now().strftime('%d%m%Y_%H%M%S'))), auto_open=False)
        
# endregion

In [0]:
# region Bayers
def objective(params, df):
    # Keep track of evals
    global ITERATION

    ITERATION += 1

    # Make sure windows_size is int
    windows_size = int(params['windows_size'])
    print(f'Window size is {windows_size}')

    model = get_model(input_dim, windows_size, output_dim)

    start = timer()

    # Handle data
    df.describe()
    # TODO: smoothing ddata
    df[input_col] = smooting_data(df[input_col], windows_size)

    X, y, time = preprocessing_data(df, windows_size, prediction_size, input_col, output_col, time_col)

    # Reshape data
    y = y.reshape((y.shape[0], y.shape[1]))

    X_train, y_train, X_valid, y_valid = split_train_test_data(X, y)

    # Perform n_train
    history = train_model(model, X_train, y_train, X_valid, y_valid, stock_name, year, windows_size)

    run_time = timer() - start

    # Test generated loss
    test_result = test_model(model, df, windows_size, prediction_size, input_col, output_col, time_col)
    test_result = test_result.join(df_org.set_index('Date'))
    plot_test_result(test_result, stock_name, year, windows_size)

    score = model.evaluate(X, y, 10000, 1)
    print(f'Window size {windows_size} score = {score}')
    #mae = mean_absolute_error(test_result['Close'], test_result['Prediction'])
    #mse = mean_squared_error(test_result['Close'], test_result['Prediction'])
    #mape = mean_absolute_percentage_error(test_result['Close'], test_result['Prediction'])
    #rrmse = relative_root_mean_square_error(test_result['Close'], test_result['Prediction'])

    #print(f'{stock_name} prediction for {prediction_size} day ahead')
    #print(f'MAE = {mae}')
    #print(f'MSE = {mse}')
    #print(f'MAPE = {mape}')
    #print(f'RRMSE = {rrmse}')
    #loss = mape
    loss = score
    # write row
    of_connection = open(result_save_fname, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([stock_name, year, loss, params, ITERATION, windows_size, run_time])
    of_connection.close()

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION, 'test_result': test_result,
            'train_time': run_time, 'status': STATUS_OK}

start_year = df_org['Date'].values[:1][0]
start_year = pd.to_datetime(start_year).year

end_year = df_org['Date'].values[-1:][0]
end_year = pd.to_datetime(end_year).year

windows_size_best = []
# Global variable
global ITERATION

for year in range(start_year, end_year + 1):
    df = df_org[df_org['Date'].dt.year == year]

    # Data too small, skip
    if df.shape[0] < 10:
        continue

    scaler = MinMaxScaler(feature_range=(0, 2))
    scaled_cols = scaler.fit_transform(df[input_col])
    df[input_col] = scaled_cols

    # Hyperparameter grid
    param_grid = {
        'windows_size': hp.choice('windows_size', np.arange(1, 8, dtype=int))
    }

    bayes_trials = Trials()

    # Create the algorithm
    bayes_algo = tpe.suggest

    ITERATION = 0

    fmin_objective = partial(objective, df=df)
    bayes_best = fmin(fn=fmin_objective, space=param_grid,
                      algo=bayes_algo, trials=bayes_trials,
                      max_evals=bayer_max_evals)

    of_connection = open(bayer_save_fname, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([stock_name, year, bayes_best, best_model_fname])
    of_connection.close()

    windows_size_best.append([year, bayes_best])
# endregion


Window size is 7




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 36 samples, validate on 9 samples
Epoch 1/1000





 - 8s 228ms/step - loss: 0.2805 - val_loss: 0.8803

Epoch 2/1000
 - 0s 2ms/step - loss: 0.2727 - val_loss: 0.9343

Epoch 3/1000
 - 0s 3ms/step - loss: 0.2575 - val_loss: 1.0437

Epoch 4/1000
 - 0s 3ms/step - loss: 0.2343 - val_loss: 1.2447

Epoch 5/1000
 - 0s 4ms/step - loss: 0.2112 - val_loss: 1.2957

Epoch 6/1000
 - 0s 4ms/step - loss: 0.2407 - val_loss: 1.1468

Epoch 7/1000
 - 0s 4ms/step - loss: 0.1970 - val_loss: 1.0348

Epoch 8/1000
 - 0s 4ms/step - loss: 0.1949 - val_loss: 0.9656

Epoch 9/1000
 - 0s 4ms/step - loss: 0.2007 - val_loss: 0.9175

Epoch 10/1000
 - 0s 4ms/step - loss: 0.2074 - val_loss: 0.8794

Epoch 11/1000
 - 0s 3ms/step - loss: 0.2030 - val_loss: 0.8447

Epoch 12/1000
 - 0s 3