In [19]:
import os, sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
sys.path.append('/tf/crypto_prediction_ml_dl/script')
from trino_operations import trino_operator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Bidirectional
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from keras import optimizers

from keras.losses import mean_squared_error
from sklearn import metrics

import tensorflow
print("Num GPUs Available: ", len(tensorflow.config.list_physical_devices('GPU')))

import copy

# %matplotlib notebook
# plt.rcParams['figure.figsize'] = [10, 10]


warnings.filterwarnings("ignore")
trino_ope = trino_operator.Operator()



def create_dataframe_from_query(query,column_list):
    res = trino_ope.run_query(query)

    indicators_query_result = {}
    for idx, row_data in enumerate(res,1):
        indicators_query_result[int(idx)] = row_data

    indicators_raw_df = pd.DataFrame.from_dict(
        indicators_query_result, orient="index", columns=column_list
    )

    return indicators_raw_df



def calculate_gain_percent(df, col_name: str, past_data_points: list):
    """
        Calculate gain(%) comparing the current data and the data from P points before.
    """
    df_with_gain = copy.deepcopy(df)
    for data_point in past_data_points:
        new_col_name = f'p{data_point}_gain_percent'
        past_gain_df = df[col_name].shift(data_point)
        df_with_gain[new_col_name] = (df[col_name] / past_gain_df - 1.0) * 100.0
    return df_with_gain


def calculate_gain_bool(df, col_name: str, past_data_points: list):
    """
        Calculate gain(%) comparing the current data and the data from P points before.
    """
    df_with_gain = copy.deepcopy(df)
    for data_point in past_data_points:
        new_col_name = f'p{data_point}_gain_bool'
        past_gain_df = df[col_name].shift(data_point)
        df_with_gain[new_col_name] = np.heaviside((df[col_name] / past_gain_df - 1.0), 1.0)
    return df_with_gain


Num GPUs Available:  1


In [2]:
########################
# Create BTC_USDT featrue dataframe
########################
query = f"""
    with 
    ohlcv_minute_data as
    (
        select 
            open,
            high,
            low,
            close,
            amount as total_volume,
            quantity as total_quantity,
            tradeCount as total_trade_count,
            date_trunc('minute',ts_create_utc) as ts
        from 
            hive.crypto_raw.candles_minute 
        where 
            id = 'BTC_USDT'
    ),
    buy_tacker_market_data as
    (
        select
            date_trunc('minute',ts_create_utc) as ts,
            count(trade_id) as buy_trade_count,
            sum(amount) as sum_buy_taker_amount,
            sum(quantity)  as sum_buy_taker_quantity,
            avg(price) as avg_buy_trade_price
        from 
            hive.crypto_raw.market_trade
        where
            id = 'BTC_USDT'
            and takerSide = 'buy'
        group by
            1
    ), 
    sell_tacker_market_data as
    (
        select
            date_trunc('minute',ts_create_utc) as ts,
            count(trade_id) as sell_trade_count,
            sum(amount) as sum_sell_taker_amount,
            sum(quantity) as sum_sell_taker_quantity,
            avg(price) as avg_sell_trade_price
        from 
            hive.crypto_raw.market_trade
        where
            id = 'BTC_USDT'
            and takerSide = 'sell'
        group by
            1
    )
    select
        a.open as open,
        a.high as high,
        a.low as low,
        a.close as close,
        a.total_volume as total_volume,
        a.total_quantity as total_quantity,
        a.total_trade_count as total_trade_count,
        b.buy_trade_count as buy_trade_count,
        b.sum_buy_taker_amount as sum_buy_taker_amount,
        b.sum_buy_taker_quantity as sum_buy_taker_quantity,
        b.avg_buy_trade_price as avg_buy_trade_price,
        c.sell_trade_count as sell_trade_count,
        c.sum_sell_taker_amount as sum_sell_taker_amount,
        c.sum_sell_taker_quantity as sum_sell_taker_quantity,
        c.avg_sell_trade_price as avg_sell_trade_price,
        a.ts as ts
    from
        ohlcv_minute_data as a
    left join
        buy_tacker_market_data as b
    on a.ts = b.ts
    left join
        sell_tacker_market_data as c
    on 
        a.ts = c.ts
    where 
        b.buy_trade_count is not null
        and c.sell_trade_count is not null
    order by
        a.ts
"""

column_list = [
    'open',
    'high',
    'low',
    'close',
    'total_volume',
    'total_quantity',
    'total_trade_count',
    'buy_trade_count',
    'sum_buy_taker_amount',
    'sum_buy_taker_quantity',
    'avg_buy_trade_price',
    'sell_trade_count',
    'sum_sell_taker_amount',
    'sum_sell_taker_quantity',
    'avg_sell_trade_price',
    'ts'
]
btc_raw_df = create_dataframe_from_query(query,column_list)
btc_raw_df.tail()

Unnamed: 0,open,high,low,close,total_volume,total_quantity,total_trade_count,buy_trade_count,sum_buy_taker_amount,sum_buy_taker_quantity,avg_buy_trade_price,sell_trade_count,sum_sell_taker_amount,sum_sell_taker_quantity,avg_sell_trade_price,ts
84855,34679.89,34690.18,34678.37,34689.94,68922.0,1.987109,53,30,35230.82,1.015705,34685.645,23,33691.184,0.971404,34683.227,2023-11-04 00:56:00
84856,34683.09,34694.5,34680.32,34694.5,55045.92,1.58693,48,22,24462.814,0.705217,34687.832,26,30583.105,0.881713,34686.098,2023-11-04 00:57:00
84857,34692.95,34695.91,34685.34,34690.24,94609.47,2.727226,75,42,52213.31,1.505105,34690.734,33,42396.164,1.222121,34690.445,2023-11-04 00:58:00
84858,34695.54,34700.26,34682.67,34696.45,41751.65,1.203386,33,14,16575.506,0.477744,34694.1,19,25176.15,0.725642,34694.57,2023-11-04 00:59:00
84859,34696.25,34698.63,34680.34,34685.49,54621.1,1.574398,44,1,1835.5253,0.052907,34693.43,5,6360.48,0.183323,34695.67,2023-11-04 01:00:00


In [24]:
past_data_points = [1,3,5,10,30,60]
col_name = "close"
btc_raw_df_with_gain_df = calculate_gain_bool(btc_raw_df, col_name, past_data_points)
btc_raw_df_with_gain_df = calculate_gain_percent(btc_raw_df_with_gain_df, col_name, past_data_points)
btc_raw_df_with_gain_df[['close','p1_gain_bool','p1_gain_percent']][-10:]

Unnamed: 0,close,p1_gain_bool,p1_gain_percent
84850,34660.6,0.0,-0.005337
84851,34666.31,1.0,0.016474
84852,34679.7,1.0,0.038625
84853,34683.03,1.0,0.009602
84854,34680.89,0.0,-0.00617
84855,34689.94,1.0,0.026095
84856,34694.5,1.0,0.013145
84857,34690.24,0.0,-0.012279
84858,34696.45,1.0,0.017901
84859,34685.49,0.0,-0.031588


In [25]:
####################################
# Create Dataset with target value
# Predict future gain (wethere increasing or decreasing)
####################################
predicting_points = 1
btc_df_with_target = btc_raw_df_with_gain_df
target_column_to_predict = 'p1_gain_bool'
target = btc_raw_df_with_gain_df[target_column_to_predict].shift(-predicting_points)
btc_df_with_target['target'] = target
btc_df_with_target[['close','target']].tail(10)

Unnamed: 0,close,target
84850,34660.6,1.0
84851,34666.31,1.0
84852,34679.7,1.0
84853,34683.03,0.0
84854,34680.89,1.0
84855,34689.94,1.0
84856,34694.5,0.0
84857,34690.24,1.0
84858,34696.45,0.0
84859,34685.49,


In [26]:
btc_df_with_target.describe()

Unnamed: 0,open,high,low,close,total_volume,total_quantity,total_trade_count,buy_trade_count,sum_buy_taker_amount,sum_buy_taker_quantity,...,p10_gain_bool,p30_gain_bool,p60_gain_bool,p1_gain_percent,p3_gain_percent,p5_gain_percent,p10_gain_percent,p30_gain_percent,p60_gain_percent,target
count,84859.0,84859.0,84859.0,84859.0,84859.0,84859.0,84859.0,84859.0,84859.0,84859.0,...,84849.0,84829.0,84799.0,84858.0,84856.0,84854.0,84849.0,84829.0,84799.0,84858.0
mean,28368.074573,28375.138801,28361.175035,28368.587945,30781.805962,1.082564,27.080616,13.338361,15480.373766,0.544019,...,0.506677,0.509885,0.514181,0.000361,0.001088,0.001814,0.003623,0.01084,0.021657,0.508944
min,24947.39,25004.33,24923.65,24947.41,5.4,0.000173,2.0,1.0,0.026938,1e-06,...,0.0,0.0,0.0,-1.50061,-2.794074,-2.762742,-3.943787,-5.113861,-4.144511,0.0
25%,26486.875,26491.79,26482.105,26487.475,15685.855,0.558441,14.0,7.0,7277.7425,0.256861,...,0.0,0.0,0.0,-0.01671,-0.031083,-0.040416,-0.057074,-0.094633,-0.129959,0.0
50%,27101.73,27108.16,27096.62,27102.34,23917.29,0.838438,22.0,10.0,11915.555,0.421353,...,1.0,1.0,1.0,5.9e-05,0.000368,0.000541,0.001204,0.003083,0.006334,1.0
75%,28582.51,28592.85,28572.955,28586.13,37753.83,1.357889,33.0,16.0,19212.807,0.685738,...,1.0,1.0,1.0,0.017551,0.033554,0.044228,0.0635,0.108288,0.154796,1.0
max,35897.82,35915.21,35889.01,35898.13,601811.3,22.802782,887.0,613.0,475487.72,16.123856,...,1.0,1.0,1.0,2.450037,5.412309,6.056635,6.507225,7.308152,8.046886,1.0
std,2960.906744,2963.740215,2958.205506,2961.094374,25801.065913,0.865062,21.588618,11.997325,14133.302433,0.474477,...,0.499958,0.499905,0.499802,0.052191,0.094313,0.122644,0.171544,0.285992,0.390133,0.499923


In [27]:
# NaN value might be included in the dataset.
# Count the number of NaN for each column.
labels = []
values = []
print('# of records:',btc_df_with_target.shape[0])
print('<column name>: # of NaN records in the column')
for col in btc_df_with_target.columns:
    labels.append(col)
    values.append(btc_df_with_target[col].isnull().sum())
    print("{}: {}".format(col, values[-1]))

# of records: 84859
<column name>: # of NaN records in the column
open: 0
high: 0
low: 0
close: 0
total_volume: 0
total_quantity: 0
total_trade_count: 0
buy_trade_count: 0
sum_buy_taker_amount: 0
sum_buy_taker_quantity: 0
avg_buy_trade_price: 0
sell_trade_count: 0
sum_sell_taker_amount: 0
sum_sell_taker_quantity: 0
avg_sell_trade_price: 0
ts: 0
p1_gain_bool: 1
p3_gain_bool: 3
p5_gain_bool: 5
p10_gain_bool: 10
p30_gain_bool: 30
p60_gain_bool: 60
p1_gain_percent: 1
p3_gain_percent: 3
p5_gain_percent: 5
p10_gain_percent: 10
p30_gain_percent: 30
p60_gain_percent: 60
target: 1


In [28]:
btc_df_with_target = btc_df_with_target.dropna()

# NaN value might be included in the dataset.
# Count the number of NaN for each column.
labels = []
values = []
print('# of records:',btc_df_with_target.shape[0])
print('<column name>: # of NaN records in the column')
for col in btc_df_with_target.columns:
    labels.append(col)
    values.append(btc_df_with_target[col].isnull().sum())
    print("{}: {}".format(col, values[-1]))

# of records: 84798
<column name>: # of NaN records in the column
open: 0
high: 0
low: 0
close: 0
total_volume: 0
total_quantity: 0
total_trade_count: 0
buy_trade_count: 0
sum_buy_taker_amount: 0
sum_buy_taker_quantity: 0
avg_buy_trade_price: 0
sell_trade_count: 0
sum_sell_taker_amount: 0
sum_sell_taker_quantity: 0
avg_sell_trade_price: 0
ts: 0
p1_gain_bool: 0
p3_gain_bool: 0
p5_gain_bool: 0
p10_gain_bool: 0
p30_gain_bool: 0
p60_gain_bool: 0
p1_gain_percent: 0
p3_gain_percent: 0
p5_gain_percent: 0
p10_gain_percent: 0
p30_gain_percent: 0
p60_gain_percent: 0
target: 0


In [29]:
# Drop timestamp column
btc_ts = btc_df_with_target['ts']
dataset_raw_df = btc_df_with_target.drop(['ts'], axis=1)
dataset_raw_df.tail()

Unnamed: 0,open,high,low,close,total_volume,total_quantity,total_trade_count,buy_trade_count,sum_buy_taker_amount,sum_buy_taker_quantity,...,p10_gain_bool,p30_gain_bool,p60_gain_bool,p1_gain_percent,p3_gain_percent,p5_gain_percent,p10_gain_percent,p30_gain_percent,p60_gain_percent,target
84854,34674.69,34682.17,34668.37,34680.89,66058.22,1.904938,50,18,23420.5,0.675369,...,0.0,1.0,0.0,-0.00617,0.042058,0.053199,-0.00839,0.005536,-0.096041,1.0
84855,34679.89,34690.18,34678.37,34689.94,68922.0,1.987109,53,30,35230.82,1.015705,...,1.0,1.0,0.0,0.026095,0.029527,0.084649,0.012397,0.031229,-0.070058,1.0
84856,34683.09,34694.5,34680.32,34694.5,55045.92,1.58693,48,22,24462.814,0.705217,...,1.0,1.0,0.0,0.013145,0.033071,0.081318,0.08051,0.047725,-0.043965,0.0
84857,34692.95,34695.91,34685.34,34690.24,94609.47,2.727226,75,42,52213.31,1.505105,...,1.0,1.0,0.0,-0.012279,0.02696,0.030392,0.077459,0.02347,-0.075181,1.0
84858,34695.54,34700.26,34682.67,34696.45,41751.65,1.203386,33,14,16575.506,0.477744,...,1.0,1.0,0.0,0.017901,0.018766,0.038693,0.10915,0.040741,-0.057552,0.0


In [30]:
# Scale data (MinMaxSclaer, Normalization)
from sklearn.preprocessing import MinMaxScaler
sc_features = MinMaxScaler(feature_range = (0,1))
feature_columns = [
    'open',
    'high',
    'low',
    'close',
    'total_volume',
    'total_quantity',
    'total_trade_count',
    'buy_trade_count',
    'sum_buy_taker_amount',
    'sum_buy_taker_quantity',
    'avg_buy_trade_price',
    'sell_trade_count',
    'sum_sell_taker_amount',
    'sum_sell_taker_quantity',
    'avg_sell_trade_price'
]
feature_columns = [
    'p1_gain_percent'
]
dataset_feature_scaled = sc_features.fit_transform(dataset_raw_df[feature_columns])
print(dataset_feature_scaled.shape)
dataset_feature_scaled

(84798, 1)


array([[0.37349386],
       [0.37767109],
       [0.37884814],
       ...,
       [0.38316629],
       [0.37673098],
       [0.38437021]])

In [31]:
dataset_feature_scaled = sc_features.fit_transform(dataset_raw_df[feature_columns])
print(dataset_feature_scaled.shape)
dataset_feature_scaled

(84798, 1)


array([[0.37349386],
       [0.37767109],
       [0.37884814],
       ...,
       [0.38316629],
       [0.37673098],
       [0.38437021]])

In [32]:
sc_target = MinMaxScaler(feature_range = (0,1))
dataset_target_scaled = sc_target.fit_transform(dataset_raw_df['target'].to_frame())
print(dataset_target_scaled.shape)
dataset_target_scaled

(84798, 1)


array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [1.],
       [0.]])

In [36]:
# Add past close price data to the dataset.
train_size = 0.8
test_size = 0.2

train_index_to = int(len(dataset_feature_scaled)*train_size)
test_index_to = len(dataset_feature_scaled)

past_N = 30
X_train = []
y_train = []
for i in range(past_N,train_index_to):
    X_train.append(np.concatenate((dataset_feature_scaled[i],dataset_feature_scaled[i-past_N+1:i].flatten()), axis = 0))
    y_train.append(dataset_target_scaled[i])
X_train,y_train = np.array(X_train),np.array(y_train)
print('X_train shape',X_train.shape)
print('y_train shape',y_train.shape)

X_test = []
y_test = []
for i in range(train_index_to,test_index_to):
    X_test.append(np.concatenate((dataset_feature_scaled[i], dataset_feature_scaled[i-past_N+1:i].flatten()), axis = 0))
    y_test.append(dataset_target_scaled[i])
X_test,y_test = np.array(X_test),np.array(y_test)
print('X_test shape',X_test.shape)
print('y_test shape',y_test.shape)

X_train shape (67808, 30)
y_train shape (67808, 1)
X_test shape (16960, 30)
y_test shape (16960, 1)


In [37]:
X_train[-10:]

array([[0.38169311, 0.38164841, 0.38665345, 0.38248458, 0.37737114,
        0.37606924, 0.37535858, 0.37607647, 0.3670107 , 0.38126883,
        0.35856168, 0.38704464, 0.37815447, 0.37782421, 0.37925482,
        0.3808888 , 0.38358093, 0.37867085, 0.37688467, 0.37970353,
        0.37972892, 0.37063627, 0.37415612, 0.38022866, 0.38310879,
        0.38469224, 0.38133787, 0.37614703, 0.38847739, 0.37919556],
       [0.38342845, 0.38665345, 0.38248458, 0.37737114, 0.37606924,
        0.37535858, 0.37607647, 0.3670107 , 0.38126883, 0.35856168,
        0.38704464, 0.37815447, 0.37782421, 0.37925482, 0.3808888 ,
        0.38358093, 0.37867085, 0.37688467, 0.37970353, 0.37972892,
        0.37063627, 0.37415612, 0.38022866, 0.38310879, 0.38469224,
        0.38133787, 0.37614703, 0.38847739, 0.37919556, 0.38169311],
       [0.37997442, 0.38248458, 0.37737114, 0.37606924, 0.37535858,
        0.37607647, 0.3670107 , 0.38126883, 0.35856168, 0.38704464,
        0.37815447, 0.37782421, 0.37925482, 0.

In [38]:
y_train[-10:]

array([[1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.]])

In [40]:
# Create a Sequential model
regressor = Sequential()

"""
tf.keras.layers.LSTM(128, return_sequences=True, activation='relu'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, activation='relu')),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(2, activation='sigmoid')
"""

regressor.add(LSTM(units = 16, dropout=0.2, recurrent_dropout=0.2, return_sequences = True,input_shape = (X_train.shape[1],1)))
regressor.add(Dense(1, activation='sigmoid'))

adam = optimizers.Adam(learning_rate=0.001)
regressor.compile(loss='binary_crossentropy', optimizer=adam, metrics=['binary_accuracy'])
regressor.summary()

# second fit
callback = EarlyStopping(monitor='val_loss', patience=5)
history = regressor.fit(X_train, y_train, 
                        epochs = 500, 
                        batch_size = 64,
                        validation_split=0.3, 
                        shuffle=True, 
                        callbacks=[callback])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, None, 100)         3100      
                                                                 
 dropout (Dropout)           (None, None, 100)         0         
                                                                 
 dense_2 (Dense)             (None, None, 100)         10100     
                                                                 
 dense_3 (Dense)             (None, None, 1)           101       
                                                                 
Total params: 13301 (51.96 KB)
Trainable params: 13301 (51.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500


In [None]:
loss_history = history.history['loss']
val_loss_history = history.history['val_loss']
plt.plot(loss_history, color = 'red', label = 'train loss')
plt.plot(val_loss_history, color = 'blue', label = 'validation loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Predict test data and revert the price to the original scale
predicted_price = regressor.predict(X_test)
predicted_price = sc_target.inverse_transform(predicted_price)[:-predicting_points]
print('predicted_price shape:',predicted_price.shape)
# predicted_price

real_price = sc_target.inverse_transform(y_test)[:-predicting_points]
print('real_price shape:',real_price.shape)
# real_price

plt.plot(real_price, color = 'red', label = 'Real Price')
plt.plot(predicted_price, color = 'blue', label = 'Predicted Price')
plt.title('BTC_USDT Price Prediction')
plt.xlabel('Time')
plt.ylabel('BTC Price in US$')
plt.legend()
plt.show()

In [None]:

plt.plot(real_price[2000:2100], color = 'red', label = 'Real Price')
plt.plot(predicted_price[2000:2100], color = 'blue', label = 'Predicted Price')
plt.title('BTC_USDT Price Prediction')
plt.xlabel('Time')
plt.ylabel('BTC Price in US$')
plt.legend()
plt.show()

In [None]:
mse = metrics.mean_squared_error(real_price.reshape(real_price.shape[0]),
                   predicted_price.reshape(predicted_price.shape[0]))
print('mse:',mse)

In [None]:
predicted_price