In [1]:
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM

import math
import numpy as np
import pandas as pd
import sklearn.preprocessing as prep
import matplotlib.pyplot as plt2

import os
from datetime import date, timedelta, datetime

Using TensorFlow backend.


In [2]:
# Number of output classes per time step; to_classes() emits 0 (up), 1 (down) or 2 (flat).
num_classes = 3
# Length of the model input window, counted in rows of the price table.
_window_ = 30 # days

# Scaler instance fitted by scale_data(); scale_toward()/scale_backward() only
# handle the MinMaxScaler case.
_scaler_ = prep.MinMaxScaler() # StandardScaler()

# NOTE(review): build_model() compiles with tbrain_loss, so _loss_ is not read
# by the code visible in this notebook — confirm whether it is still needed.
_loss_ = "categorical_crossentropy"
# Optimizer name handed to model.compile() in build_model().
_optimizer_ = "adam" # rmsprop
# Verbosity flag forwarded to model.fit().
_verbose_ = 0

# Training hyper-parameters forwarded to model.fit().
_batch_size_ = 512
_epochs_ = 300

# Fraction of the generated windows kept for training in preprocess_data();
# at 1.0 every window is used for training.
_train_test_split_ = 1.0
# Fraction of the training data Keras holds out for validation during fit().
_validation_split_ = 0.0

# Snapshot date (YYYYMMDD); used to build the input CSV path and the model output directory.
_date_ = '20180608'

In [3]:
def tbrain_loss(y_true, y_pred):
    """Squared-error loss with a repeating five-step weight pattern.

    The weights 0.10, 0.15, 0.20, 0.25, 0.30 are stacked six times into a
    (30, 1) column and multiplied into the element-wise squared error before
    averaging over the last axis.

    NOTE(review): the fixed length of 30 presumably matches ``_window_`` so
    that the column broadcasts against (batch, 30, classes) tensors — confirm
    if the window size changes.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of model outputs, same shape as ``y_true``.

    Returns:
        Tensor holding the weighted squared error averaged over the last axis.
    """
    five_step_pattern = np.array([[0.10], [0.15], [0.20], [0.25], [0.30]])

    # Six copies stacked along the first axis -> shape (30, 1).
    stacked = np.tile(five_step_pattern, (6, 1))
    weight_tensor = tf.convert_to_tensor(stacked, np.float32)

    weighted_error = K.square(y_pred - y_true) * weight_tensor

    return K.mean(weighted_error, axis=-1)

In [4]:
def build_model(shape):
    """Create and compile the two-layer LSTM sequence classifier.

    Args:
        shape: input shape of one sample, passed as ``input_shape`` to the
            first LSTM layer; ``shape[0]`` is reused for the second layer.

    Returns:
        A compiled ``Sequential`` model: LSTM(20) -> LSTM(10) -> Dense with
        ``num_classes`` softmax outputs. Both LSTM layers return full
        sequences, so the Dense layer is applied at every time step. The model
        is compiled with ``tbrain_loss``, the ``_optimizer_`` setting and an
        accuracy metric.
    """
    stack = [
        LSTM(20, input_shape=shape, return_sequences=True),
        LSTM(10, input_shape=(shape[0], 20), return_sequences=True),
        Dense(num_classes, activation="softmax"),
    ]
    network = Sequential(stack)

    network.compile(loss=tbrain_loss, optimizer=_optimizer_, metrics=['accuracy'])

    return network

In [5]:
def scale_data(X):
    """Fit the shared ``_scaler_`` on ``X`` and return the scaled array.

    The 3-D input is flattened to 2-D (one row per sample) because the scaler
    works on 2-D data, then restored to its original shape afterwards.

    Args:
        X: 3-D array; its three dimensions are unpacked from ``X.shape``.

    Returns:
        Array with the same shape as ``X`` holding the scaled values.
    """
    n_samples, n_steps, n_features = X.shape
    flat = X.reshape((n_samples, n_steps * n_features))

    # fit() hands back the fitted scaler, which then transforms the same data.
    flat_scaled = _scaler_.fit(flat).transform(flat)

    return flat_scaled.reshape((n_samples, n_steps, n_features))

def scale_backward(d, y):
    """Map min-max scaled value(s) ``y`` back to the original range of ``d``.

    Args:
        d: non-empty sequence of reference values whose minimum and maximum
            define the original range.
        y: scaled value or array to convert back.

    Returns:
        ``y * (max(d) - min(d)) + min(d)``.

    Raises:
        NotImplementedError: if the configured ``_scaler_`` is not a
            ``MinMaxScaler``; previously this case silently returned ``None``.
    """
    # Use the public class path: the private ``sklearn.preprocessing.data``
    # module is not available in newer scikit-learn releases.
    if not isinstance(_scaler_, prep.MinMaxScaler):
        raise NotImplementedError(
            "scale_backward only supports MinMaxScaler, got {}".format(type(_scaler_).__name__)
        )

    lowest, highest = min(d), max(d)
    return y * (highest - lowest) + lowest

def scale_toward(d, x):
    """Min-max scale value(s) ``x`` using the range of the reference data ``d``.

    Args:
        d: non-empty sequence of reference values whose minimum and maximum
            define the range.
        x: value or array to scale.

    Returns:
        ``(x - min(d)) / (max(d) - min(d))``. When ``d`` is constant (zero
        range) the divisor is treated as 1, so the result is ``x - min(d)``
        instead of a division by zero.

    Raises:
        NotImplementedError: if the configured ``_scaler_`` is not a
            ``MinMaxScaler``; previously this case silently returned ``None``.
    """
    # Use the public class path: the private ``sklearn.preprocessing.data``
    # module is not available in newer scikit-learn releases.
    if not isinstance(_scaler_, prep.MinMaxScaler):
        raise NotImplementedError(
            "scale_toward only supports MinMaxScaler, got {}".format(type(_scaler_).__name__)
        )

    lowest, highest = min(d), max(d)
    span = highest - lowest
    if span == 0:
        # Constant reference data: avoid dividing by zero.
        span = 1
    return (x - lowest) / span

In [6]:
def to_classes(i, j):
    """Classify the move from price ``i`` to price ``j``.

    Both prices are rounded to two decimals before they are compared.

    Args:
        i: earlier price.
        j: later price.

    Returns:
        0 if the price went up, 1 if it went down, 2 if it is unchanged.
    """
    later = float('{:.2f}'.format(j))
    earlier = float('{:.2f}'.format(i))
    change = later - earlier

    if change > 0:
        return 0  # up
    if change < 0:
        return 1  # down
    return 2  # flat

def _sliding_windows(matrix, seq_len):
    """Stack each run of ``seq_len`` consecutive rows of ``matrix`` into one array."""
    # NOTE(review): the range stops one short of the window that ends on the
    # very last row; this mirrors the original loop — confirm it is intended.
    return np.array([matrix[start : start + seq_len] for start in range(len(matrix) - seq_len)])


def preprocess_data(data, _column_):
    """Turn a per-ticker price table into windowed training inputs and labels.

    Every window spans ``_window_ + 5`` consecutive rows. Inputs are the first
    ``_window_`` rows of each scaled window; labels are taken from the same
    window shifted forward by 5 rows, read from column ``_column_`` of a copy
    of the table in which column 3 has been replaced by ``to_classes`` labels
    (the first row gets class 0 because it has no predecessor).

    NOTE(review): only column 3 carries class labels, so any other
    ``_column_`` presumably yields raw values — confirm against callers.

    Args:
        data: pandas DataFrame of numeric features, one row per day, in
            chronological order. It is not modified.
        _column_: index of the column to read labels from.

    Returns:
        ``[X_train, y_train, X_test, y_test]`` where ``X_train`` has shape
        (n, _window_, n_features), ``y_train`` has shape (n, _window_, 1) and
        ``X_test`` / ``y_test`` are always empty lists: rows beyond the
        ``_train_test_split_`` fraction are dropped, not returned.

    Raises:
        ValueError: if ``data`` has too few rows to form a single window.
    """
    feature_len = len(data.columns)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old
    # and new versions. np.array() takes a private copy so nothing written
    # below can leak back into the caller's DataFrame.
    raw = np.array(data.values)

    predicted_days = 5
    seq_len = _window_ + predicted_days

    if len(raw) <= seq_len:
        raise ValueError(
            "need more than {} rows to build a training window, got {}".format(seq_len, len(raw))
        )

    scaled_result = scale_data(_sliding_windows(raw, seq_len))

    # Label every day by the direction of its close relative to the previous day.
    close_price = raw[:, 3]
    classes = [0] + [to_classes(i, j) for i, j in zip(close_price[:-1], close_price[1:])]
    labelled = raw.copy()  # keep the raw prices intact
    labelled[:, 3] = np.array(classes)
    result2 = _sliding_windows(labelled, seq_len)

    row = int(round(_train_test_split_ * scaled_result.shape[0]))
    train = scaled_result[:row]
    train2 = result2[:row]

    X_test, y_test = [], []

    X_train = train[:, :-predicted_days, :]
    y_train = train2[:, predicted_days:, _column_]

    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], feature_len))
    y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)

    return [X_train, y_train, X_test, y_test]

In [7]:
def plot_result(pred, y):
    """Plot predictions (red) and ground truth (blue) on a new figure.

    Args:
        pred: sequence of predicted values.
        y: sequence of ground-truth values.
    """
    plt2.figure()
    for series, colour, caption in ((pred, 'red', 'Prediction'), (y, 'blue', 'Ground Truth')):
        plt2.plot(series, color=colour, label=caption)
    plt2.legend()
    plt2.show()
    
    
def plot_postprocessing( input ):
    """Decode per-step class scores into one chronological series of signals.

    The class with the highest score is mapped to a signal: class 0 -> 1,
    class 1 -> -1, class 2 -> 0. The first window contributes its last five
    steps; every later window contributes only its final step.

    The previous version iterated over the individual scores of the final
    step and took ``argmax`` of each scalar (always 0), so it appended three
    ``1`` values per window instead of one decoded signal.

    Args:
        input: numpy array of shape (windows, steps, classes) holding class
            scores, with at least one window.

    Returns:
        The decoded signals: a plain list when ``input`` has a single window
        (as before), otherwise a 1-D numpy array.
    """
    signals = (1, -1, 0)

    output = [signals[np.argmax(step)] for step in input[0][-5:].tolist()]

    # One decoded signal per later window, taken from its final step.
    later = [signals[np.argmax(input[k][-1])] for k in range(1, input.shape[0])]
    if later:
        output = np.append(output, later)

    return output

In [8]:
# ↓↓↓ __main__

In [9]:
def show_msg(now, last):
    """Describe the move from ``last`` to ``now`` as a direction string.

    Both prices are rounded to two decimals before they are compared.

    Args:
        now: current price.
        last: previous price.

    Returns:
        "1" if the price went up, "-1" if it went down, "0" if unchanged.
    """
    current = float('{:.2f}'.format(now))
    previous = float('{:.2f}'.format(last))
    change = current - previous

    if change > 0:
        return "1"  # up
    if change < 0:
        return "-1" # down
    return "0"  # flat

def format_n_days(price):
    """Format up to five prices as a tab-separated line of direction/price pairs.

    The line starts with the global ``_code_`` followed by a tab. Each price
    then adds ``<direction>\\t<price with two decimals>\\t``, where the
    direction comes from ``show_msg``. The first price is compared with the
    global ``last``, each later price with the one before it.

    Args:
        price: sequence of prices; only the first five entries are used.

    Returns:
        The formatted string, ending with a trailing tab.
    """
    output = str(_code_) + '\t'
    shown = min(5, len(price))
    for day in range(shown):
        reference = last if day == 0 else price[day - 1]
        output += '{}\t{:.2f}\t'.format(show_msg(price[day], reference), price[day])
    return output

def TBrain_score(predict_str, real_str):
    """Score a formatted prediction line against the formatted actual line.

    Both arguments use the tab-separated layout
    ``<code>\\t<direction>\\t<price>\\t...\\t``; the leading code and the
    empty field after the trailing tab are discarded.

    Per day, the score is made of:
      * price part: ((actual - |predicted - actual|) / actual) * 0.5
      * direction part: 0.5 if the predicted direction equals the actual one,
        otherwise 0.0
    The daily parts are weighted 0.10, 0.15, 0.20, 0.25, 0.30 and summed.

    Args:
        predict_str: formatted prediction line.
        real_str: formatted line of actual values.

    Returns:
        The weighted total score.
    """
    predicted_fields = predict_str.split('\t')[1:-1]
    actual_fields = real_str.split('\t')[1:-1]

    day_weights = [0.10, 0.15, 0.20, 0.25, 0.30]

    # Odd positions hold prices.
    price_scores = []
    for actual, guess in zip(actual_fields[1::2], predicted_fields[1::2]):
        price_scores.append(
            ( ( float(actual) - abs(float(guess) - float(actual)) ) / float(actual) ) * 0.5
        )

    # Even positions hold directions; a correct direction earns 0.5.
    direction_scores = []
    for actual, guess in zip(actual_fields[0::2], predicted_fields[0::2]):
        direction_scores.append(0.5 if float(guess) == float(actual) else 0.0)

    daily_totals = [
        p*w + q*w for (p, q, w) in zip(price_scores, direction_scores, day_weights)
    ]
    return sum(daily_totals)

In [10]:
from ipywidgets import IntProgress
from IPython.display import display

# Train one model per ticker code, save it, predict the direction of the next
# five days and, when the following week's CSV is present, print the
# prediction next to the actual directions.
base_dir = '/home/ddl/Desktop/Notebooks/TBrain/'
df = pd.read_csv(base_dir + _date_ + '/18ETF.csv', thousands=',')

# Order rows by the ticker code ('代碼') compared as text, via a throwaway column.
df['sort'] = df['代碼'].astype(str)
df = df.sort_values(by='sort', ascending=True).drop(columns=['sort'])

codes = df.代碼.unique()

# Size the progress bar from the data instead of hard-coding the ticker count.
bar = IntProgress(min=0, max=len(codes))
display(bar)

# model.save() does not create missing directories.
model_dir = "Model/Binary-{}".format(_date_)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

for _code_ in codes:

    rows = df[df['代碼'] == _code_]

    # Chronological feature table without the code, date ('日期') and name ('中文簡稱') columns.
    data = rows.sort_values(by='日期', ascending=True) \
               .drop(columns=['代碼', '日期', '中文簡稱'])

    # Snapshot the raw values before preprocessing so that the prediction
    # inputs below never depend on what preprocess_data does to its argument.
    dataArr = np.array(data)

    # close price
    X_train, y_train, X_test, y_test = preprocess_data(data, 3)

    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model_close = build_model(X_train.shape[1:])
    model_close.fit(X_train, y_train, batch_size=_batch_size_, epochs=_epochs_, validation_split=_validation_split_, verbose=_verbose_)
    model_close.save("Model/Binary-{}/{}_model_close.h5".format(_date_,_code_))

    bar.value += 1

    # `last` is read as a global by format_n_days().
    last = dataArr[-1][3]
    # NOTE(review): this name shadows `date` imported from datetime; it is
    # kept because other cells may rely on it.
    date = int(rows['日期'].max())

    # predict 5 days
    # Scale a float copy of the latest window column by column, using each
    # column's full-history range; dataArr itself is left untouched.
    X = dataArr[-_window_:].astype(float)
    for column in range(X.shape[1]):
        X[:, column] = scale_toward(list(dataArr[:, column]), X[:, column])
    X = np.expand_dims(X, axis=0)

    predict = model_close.predict(X)
    # Map the arg-max class of the last five steps: class 0 -> 1, 1 -> -1, 2 -> 0.
    predict = [[1, -1, 0][np.argmax(i)] for i in predict[0][-5:].tolist()]

    # print real
    # The actual prices are looked up in the CSV snapshot dated seven days later.
    real_date = (datetime.strptime(_date_, '%Y%m%d')  + timedelta(days=7)).strftime('%Y%m%d')
    real_path = base_dir + real_date + '/18ETF.csv'
    if os.path.exists(real_path):
        df2 = pd.read_csv(real_path, thousands=',')
        df2 = df2[df2['代碼'] == _code_]
        # Values of the '收盤價(元)' column for rows dated after the training data.
        df2 = list( df2[df2['日期'] > date]['收盤價(元)'] )
        # Keep only the direction fields from the formatted line.
        real = format_n_days(df2).split('\t')[1::2][0:-1]

        print( str(_code_) + '\t' + '\t'.join([str(i) for i in predict]) )
        print( str(_code_) + '\t' + '\t'.join(real) + ' (real)' )

IntProgress(value=0, max=18)