## 0.Import libraries

In [32]:
import pandas as pd
import os, sys
import numpy as np
from dataclasses import dataclass
import random, math, operator
from pprint import pprint
import pickle, gc, itertools
from pathlib import Path
import time, datetime
from collections import *
import matplotlib.pyplot as plt
import subprocess
import gc

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.utils import *
from keras.layers import LSTM
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.models import load_model

import warnings

## 1.Setup configuration

In [33]:
from requests import get
#colab_filename = get('http://172.28.0.2:9000/api/sessions').json()[0]['name']
colab_filename = 'FxBERT_upstream.ipynb'
result_filename = colab_filename.replace('.ipynb', '.csv')
result_filename

'FxBERT_upstream.csv'

In [34]:
@dataclass
class Config:
    BATCH_SIZE = 128
    EMBED_DIM = 128
    NUM_HEAD = 8  # used in bert model
    FF_DIM = 1024  # used in bert model
    NUM_LAYERS = 12
    EPOCH = 3 # 10000
    SEQLENGTH = 100
    CURRENCY ='USD_JPY_M15'
    MASK_PERCT = 0.15
    maxlen = 100
    patience = 30
    seqlen = 10
    isTrain = True

config = Config()

In [35]:
from google.colab import drive
drive.mount('/content/drive')
base_folder = '/content/drive/MyDrive/TradingBERT/'

gd_rawdata_folder = base_folder + '1.rawData/'
gd_ref_folder = base_folder + '2.refData/'
gd_processedData_folder = base_folder + '3.processedData/'
gd_mdl_folder = base_folder + '4.model/downstream/'+config.CURRENCY+'/'
gd_mdl_upstream_folder = base_folder + '4.model/upstream/'
gd_result_folder = base_folder + '5.result/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
Path(gd_processedData_folder).mkdir(parents=True, exist_ok=True)
Path(gd_ref_folder).mkdir(parents=True, exist_ok=True)
Path(gd_mdl_folder).mkdir(parents=True, exist_ok=True)
Path(gd_result_folder).mkdir(parents=True, exist_ok=True)
# Path(gd_rawdata_folder).mkdir(parents=True, exist_ok=True)

## 2.FxBERT

### 2.1 Read tfrecords - upstream

In [37]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
def read_dataset_Fintech(channel, channel_name, maxlen):

    def _parse_seq_function(example_proto):

        seq_feature_description = {
            'isNext': tf.io.FixedLenFeature([], tf.int64),
            'input_ids': tf.io.FixedLenFeature([], tf.string),
            'segment_ids': tf.io.FixedLenFeature([], tf.string),
            'input_ids_ori': tf.io.FixedLenFeature([], tf.string),
            'sample_weight': tf.io.FixedLenFeature([], tf.string),
            'segment_ids_3num': tf.io.FixedLenFeature([], tf.string),
        }

        features = tf.io.parse_single_example(example_proto, seq_feature_description)

        # upstream
        x_input_ids = tf.io.decode_raw(features['input_ids'], tf.int64)
        x_input_ids = tf.reshape(x_input_ids, [maxlen,6])
        x_input_ids_candle, x_input_ids_pos, x_input_ids_die, x_input_ids_cle, x_input_ids_ule, x_input_ids_lle = \
                        tf.split(x_input_ids, 6, axis=1)
        x_input_ids_candle, x_input_ids_pos, x_input_ids_die, x_input_ids_cle, x_input_ids_ule, x_input_ids_lle = \
                        tf.squeeze(x_input_ids_candle, axis=1), tf.squeeze(x_input_ids_pos, axis=1), \
                        tf.squeeze(x_input_ids_die, axis=1), tf.squeeze(x_input_ids_cle, axis=1), \
                        tf.squeeze(x_input_ids_ule, axis=1), tf.squeeze(x_input_ids_lle, axis=1)

        x_segment_ids = tf.io.decode_raw(features['segment_ids_3num'], tf.int64)
        x_segment_ids = tf.reshape(x_segment_ids, [maxlen,])

        out_input_ids = tf.io.decode_raw(features['input_ids_ori'], tf.int64)
        out_input_ids = tf.reshape(out_input_ids, [maxlen,6])
        out_input_ids_candle, out_input_ids_pos, out_input_ids_die, out_input_ids_cle, out_input_ids_ule, out_input_ids_lle = \
                        tf.split(out_input_ids, 6, axis=1)
        out_input_ids_candle, out_input_ids_pos, out_input_ids_die, out_input_ids_cle, out_input_ids_ule, out_input_ids_lle = \
                        tf.squeeze(out_input_ids_candle, axis=1), tf.squeeze(out_input_ids_pos, axis=1), \
                        tf.squeeze(out_input_ids_die, axis=1), tf.squeeze(out_input_ids_cle, axis=1), \
                        tf.squeeze(out_input_ids_ule, axis=1), tf.squeeze(out_input_ids_lle, axis=1)

        label_isNext = features['isNext']

        s_weight = tf.io.decode_raw(features['sample_weight'], tf.int64)
        s_weight = tf.reshape(s_weight, [maxlen,])

        inputs = {}
        inputs['input_ids_candle'] = x_input_ids_candle
        inputs['input_ids_pos'] = x_input_ids_pos
        inputs['input_ids_die'] = x_input_ids_die
        inputs['input_ids_cle'] = x_input_ids_cle
        inputs['input_ids_ule'] = x_input_ids_ule
        inputs['input_ids_lle'] = x_input_ids_lle
        inputs['segment_ids'] = x_segment_ids

        targets = {}
        targets['out_input_ids_candle'] = out_input_ids_candle
        targets['out_input_ids_pos'] = out_input_ids_pos
        targets['out_input_ids_die'] = out_input_ids_die
        targets['out_input_ids_cle'] = out_input_ids_cle
        targets['out_input_ids_ule'] = out_input_ids_ule
        targets['out_input_ids_lle'] = out_input_ids_lle
        targets['label_isNext'] = label_isNext

        sample_weights = {}
        sample_weights['out_input_ids_candle'] = s_weight
        sample_weights['out_input_ids_pos'] = s_weight
        sample_weights['out_input_ids_die'] = s_weight
        sample_weights['out_input_ids_cle'] = s_weight
        sample_weights['out_input_ids_ule'] = s_weight
        sample_weights['out_input_ids_lle'] = s_weight
        sample_weights['label_isNext'] = tf.ones_like(label_isNext)

        return inputs, targets, sample_weights

    filenames = [os.path.join(channel, c) for c in channel_name]
    print(filenames)
    dataset = tf.data.TFRecordDataset(filenames)

    dataset = dataset.map(_parse_seq_function, num_parallel_calls=AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=4000000)
    dataset = dataset.prefetch(AUTOTUNE)

    dataset = dataset.batch(config.BATCH_SIZE, drop_remainder=False)

    return dataset

In [38]:
upstream_tfrecordfile = [gd_processedData_folder+'FxBERT_upstream_AUD_USD_M15_sample.tfrecords']
ds_train = read_dataset_Fintech('', upstream_tfrecordfile, config.maxlen)

total_batches = sum(1 for _ in ds_train)
upstream_valid = ds_train.skip(total_batches - 1).take(1)
upstream_train = ds_train.take(total_batches - 1)

['/content/drive/MyDrive/MyResearch_FT/14.BERT/11.github/TradingBERT/3.processedData/FxBERT_upstream_AUD_USD_M15_sample.tfrecords']


### 2.2 basic functions
code ref for BERT_module: https://keras.io/examples/nlp/masked_language_modeling/

In [39]:
def bert_module(query, key, value, i, pad_mask=None):
    # Multi headed self-attention
    attention_output = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name="encoder_{}/multiheadattention".format(i),
    )(query, key, value, attention_mask=pad_mask)
    attention_output = layers.Dropout(0.1, name="encoder_{}/att_dropout".format(i))(
        attention_output
    )
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/att_layernormalization".format(i)
    )(query + attention_output)

    # Feed-forward layer
    ffn = keras.Sequential(
        [
            layers.Dense(config.FF_DIM, activation="relu"),
            layers.Dense(config.EMBED_DIM),
        ],
        name="encoder_{}/ffn".format(i),
    )
    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(0.1, name="encoder_{}/ffn_dropout".format(i))(
        ffn_output
    )
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/ffn_layernormalization".format(i)
    )(attention_output + ffn_output)

    return sequence_output

def get_pos_encoding_matrix(max_len, d_emb):
    pos_enc = np.array(
        [
            [pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
            if pos != 0
            else np.zeros(d_emb)
            for pos in range(max_len)
        ]
    )
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc

def get_attn_pad_mask(seq_q, seq_k):
   batch_size, len_q = seq_q.get_shape()
   batch_size, len_k = seq_k.get_shape()
   # eq(zero) is PAD token
   pad_attn_mask = tf.equal(seq_k, 0)
   pad_attn_mask = tf.expand_dims(pad_attn_mask, 1)  # batch_size x 1 x len_k(=len_q), one is masking
   pad_attn_mask = tf.repeat(pad_attn_mask, repeats=[len_q], axis=1)
   return pad_attn_mask  # batch_size x len_q x len_k

### 2.3 create FxBERT model

In [40]:
candlestick_vocab_size=39083
pos_vocab_size=6109
direction_vocab_size=7
candlepiece_vocab_size=138
segment_size=3

In [41]:
def create_FxBERT(maxlen):
    input_wtoken_candle = layers.Input((maxlen,), dtype=tf.int64, name='input_ids_candle')
    input_wtoken_pos = layers.Input((maxlen,), dtype=tf.int64, name='input_ids_pos')
    input_wtoken_die = layers.Input((maxlen,), dtype=tf.int64, name='input_ids_die')
    input_wtoken_cle = layers.Input((maxlen,), dtype=tf.int64, name='input_ids_cle')
    input_wtoken_ule = layers.Input((maxlen,), dtype=tf.int64, name='input_ids_ule')
    input_wtoken_lle = layers.Input((maxlen,), dtype=tf.int64, name='input_ids_lle')

    input_segment = layers.Input((maxlen,), dtype=tf.int64, name='segment_ids')

    wtoken_candle_embeddings = layers.Embedding(candlestick_vocab_size, config.EMBED_DIM, name='wtoken_candle_embedding')(input_wtoken_candle)
    wtoken_pos_embeddings = layers.Embedding(pos_vocab_size, config.EMBED_DIM, name='wtoken_embedding')(input_wtoken_pos)
    wtoken_die_embeddings = layers.Embedding(direction_vocab_size, config.EMBED_DIM, name='direction_embedding')(input_wtoken_die)
    wtoken_cle_embeddings = layers.Embedding(candlepiece_vocab_size, config.EMBED_DIM, name='candlelen_embedding')(input_wtoken_cle)
    wtoken_ule_embeddings = layers.Embedding(candlepiece_vocab_size, config.EMBED_DIM, name='upperlen_embedding')(input_wtoken_ule)
    wtoken_lle_embeddings = layers.Embedding(candlepiece_vocab_size, config.EMBED_DIM, name='lowerlen_embedding')(input_wtoken_lle)

    segment_embeddings = layers.Embedding(segment_size, config.EMBED_DIM, name='segment_embedding')(input_segment)

    mask = layers.Embedding(candlestick_vocab_size, config.EMBED_DIM, mask_zero=True).compute_mask(input_wtoken_candle)
    attn_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")

    position_embeddings = layers.Embedding(
        input_dim=maxlen,
        output_dim=config.EMBED_DIM,
        weights=[get_pos_encoding_matrix(maxlen, config.EMBED_DIM)],
        name="position_embedding")(tf.range(start=0, limit=maxlen, delta=1))

    embeddings = wtoken_candle_embeddings + wtoken_pos_embeddings + position_embeddings + segment_embeddings + \
                    wtoken_die_embeddings + wtoken_cle_embeddings + wtoken_ule_embeddings + wtoken_lle_embeddings

    encoder_input = embeddings
    for i in range(config.NUM_LAYERS):
        encoder_output = bert_module(encoder_input, encoder_input, encoder_input, i, attn_mask)

    output_wtoken_candle = layers.Dense(candlestick_vocab_size, name='out_input_ids_candle', activation='softmax')(encoder_output)
    output_wtoken_pos = layers.Dense(pos_vocab_size, name='out_input_ids_pos', activation='softmax')(encoder_output)
    output_wtoken_die = layers.Dense(direction_vocab_size, name='out_input_ids_die', activation='softmax')(encoder_output)
    output_wtoken_cle = layers.Dense(candlepiece_vocab_size, name='out_input_ids_cle', activation='softmax')(encoder_output)
    output_wtoken_ule = layers.Dense(candlepiece_vocab_size, name='out_input_ids_ule', activation='softmax')(encoder_output)
    output_wtoken_lle = layers.Dense(candlepiece_vocab_size, name='out_input_ids_lle', activation='softmax')(encoder_output)

    flat_encoder_output = layers.Flatten(name="flat_encoder_output")(encoder_output)
    output_label_isNext = layers.Dense(2, name='label_isNext', activation='softmax')(flat_encoder_output)

    model = tf.keras.Model(
                            inputs=[input_wtoken_candle, input_wtoken_pos, input_wtoken_die, input_wtoken_cle,
                                    input_wtoken_ule, input_wtoken_lle, input_segment],
                            outputs=[output_wtoken_candle, output_wtoken_pos, output_wtoken_die, output_wtoken_cle,
                                     output_wtoken_ule, output_wtoken_lle, output_label_isNext],
                            name='FxBERT')

    return model

### 2.4 Train - upstream

#### 2.4.1 filenames

In [42]:
savepath_FxBERT = gd_mdl_upstream_folder+colab_filename.replace('.ipynb', '_'+config.CURRENCY+'.h5')
savepath_FxBERT_weights = savepath_FxBERT.replace('.h5', '_weights.h5')
savepath_FxBERT_checkpoint = savepath_FxBERT.replace('.h5', '_cp.h5')

In [43]:
def learning_rate_schedule(epoch, lr, current_lr=None):
    warmup_epochs = 20
    peak_lr = 1e-4
    total_epochs = config.EPOCH

    if current_lr:  # If resuming training, set warmup_lr to current_lr
        warmup_lr = current_lr
    else:
        warmup_lr = 1e-6

    if epoch < warmup_epochs:
        return warmup_lr + (peak_lr - warmup_lr) * (epoch / warmup_epochs)
    else:
        decay_factor = max(0, (total_epochs - epoch) / (total_epochs - warmup_epochs))  # Ensure non-negative
        return peak_lr * decay_factor

#### 2.4.2 training

In [44]:
if config.isTrain:
    print("FxBERT training upstream for", config.CURRENCY)

    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(savepath_FxBERT_checkpoint, monitor="val_loss", verbose=0, mode="min",
                                                            save_best_only=True, save_weights_only=False, save_freq="epoch" )
    early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=config.patience, verbose=1, mode="min", restore_best_weights=True)

    FxBERT = create_FxBERT(config.maxlen)
    opt = keras.optimizers.Adam()
    lr_callback = LearningRateScheduler(learning_rate_schedule, verbose=1)

    FxBERT.compile(
        optimizer=opt,
        loss={
                "out_input_ids_candle": keras.losses.SparseCategoricalCrossentropy(),
                "out_input_ids_pos": keras.losses.SparseCategoricalCrossentropy(),
                "out_input_ids_die": keras.losses.SparseCategoricalCrossentropy(),
                "out_input_ids_cle": keras.losses.SparseCategoricalCrossentropy(),
                "out_input_ids_ule": keras.losses.SparseCategoricalCrossentropy(),
                "out_input_ids_lle": keras.losses.SparseCategoricalCrossentropy(),
                "label_isNext": keras.losses.SparseCategoricalCrossentropy(),
            },
        weighted_metrics={
                "out_input_ids_candle": 'accuracy',
                "out_input_ids_pos": 'accuracy',
                "out_input_ids_die": 'accuracy',
                "out_input_ids_cle": 'accuracy',
                "out_input_ids_ule": 'accuracy',
                "out_input_ids_lle": 'accuracy',
                "label_isNext": 'accuracy',
            },
        )

    FxBERT.fit(
        upstream_train,
        validation_data=upstream_valid,
        epochs=config.EPOCH,
        callbacks=[lr_callback, checkpoint_cb, early_stop],
        verbose=2)

    FxBERT.save(savepath_FxBERT)
    FxBERT.save_weights(savepath_FxBERT_weights)

FxBERT training upstream for USD_JPY_M15

Epoch 1: LearningRateScheduler setting learning rate to 1e-06.
Epoch 1/3


  saving_api.save_model(


576/576 - 130s - loss: 1.8373 - out_input_ids_candle_loss: 0.3033 - out_input_ids_pos_loss: 0.2506 - out_input_ids_die_loss: 0.0609 - out_input_ids_cle_loss: 0.1573 - out_input_ids_ule_loss: 0.1598 - out_input_ids_lle_loss: 0.1797 - label_isNext_loss: 0.7258 - out_input_ids_candle_accuracy: 1.8898e-05 - out_input_ids_pos_accuracy: 1.2284e-04 - out_input_ids_die_accuracy: 0.2541 - out_input_ids_cle_accuracy: 5.2442e-04 - out_input_ids_ule_accuracy: 1.7953e-04 - out_input_ids_lle_accuracy: 5.3859e-04 - label_isNext_accuracy: 0.5003 - val_loss: 1.8087 - val_out_input_ids_candle_loss: 0.3052 - val_out_input_ids_pos_loss: 0.2526 - val_out_input_ids_die_loss: 0.0574 - val_out_input_ids_cle_loss: 0.1575 - val_out_input_ids_ule_loss: 0.1605 - val_out_input_ids_lle_loss: 0.1795 - val_label_isNext_loss: 0.6959 - val_out_input_ids_candle_accuracy: 0.0000e+00 - val_out_input_ids_pos_accuracy: 0.0000e+00 - val_out_input_ids_die_accuracy: 0.3172 - val_out_input_ids_cle_accuracy: 0.0000e+00 - val_out