In [2]:
import os,sys
sys.path.append('/tf/crypto_prediction_ml_dl/script')
sys.path.append('/tf/crypto_prediction_ml_dl/notebook/TFT_models')
from trino_operations import trino_operator
import pandas as pd
import numpy as np
from datetime import datetime
from pprint import pprint
import copy
!echo "pwd: `pwd`"

pwd: /tf/crypto_prediction_ml_dl/notebook/TFT_models/models/minute_models


In [3]:
# Create a new folder for new crypto dataset
path = "./data/crypto_minute"
isExist = os.path.exists(path)
if not isExist:
   os.makedirs(path)
   print("The new directory is created!")
    
# The dataset is gonna be saved as out_file.
out_file = os.path.join(path, 'crypto_minute.csv')

In [4]:
# Load crypto data from Hive mart tables via Trino
trino_ope = trino_operator.Operator()

def create_dataframe_from_query(query,column_list):
    res = trino_ope.run_query(query)

    indicators_query_result = {}
    for idx, row_data in enumerate(res,1):
        indicators_query_result[int(idx)] = row_data

    indicators_raw_df = pd.DataFrame.from_dict(
        indicators_query_result, orient="index", columns=column_list
    )

    return indicators_raw_df

##################################
# Extract dataset from hive tables
##################################
query = f"""

    with 
    ohlcv_minute_data as
    (
        select
            id,
            open,
            high,
            low,
            close,
            amount as total_volume,
            quantity as total_quantity,
            tradeCount as total_trade_count,
            date_trunc('minute',ts_create_utc) as ts
        from 
            hive.crypto_raw.candles_minute 
        where 
            id = 'BTC_USDT'
    ),
    buy_tacker_market_data as
    (
        select
            date_trunc('minute',ts_create_utc) as ts,
            count(trade_id) as buy_trade_count,
            sum(amount) as sum_buy_taker_amount,
            sum(quantity)  as sum_buy_taker_quantity,
            avg(price) as avg_buy_trade_price
        from 
            hive.crypto_raw.market_trade
        where
            id = 'BTC_USDT'
            and takerSide = 'buy'
        group by
            1
    ), 
    sell_tacker_market_data as
    (
        select
            date_trunc('minute',ts_create_utc) as ts,
            count(trade_id) as sell_trade_count,
            sum(amount) as sum_sell_taker_amount,
            sum(quantity) as sum_sell_taker_quantity,
            avg(price) as avg_sell_trade_price
        from 
            hive.crypto_raw.market_trade
        where
            id = 'BTC_USDT'
            and takerSide = 'sell'
        group by
            1
    )
    select
        row_number() over(order by a.ts) as row_id,
        a.id as symbol_id,
        a.open as open,
        a.high as high,
        a.low as low,
        a.close as close,
        a.total_volume as total_volume,
        a.total_quantity as total_quantity,
        a.total_trade_count as total_trade_count,
        b.buy_trade_count as buy_trade_count,
        b.sum_buy_taker_amount as sum_buy_taker_amount,
        b.sum_buy_taker_quantity as sum_buy_taker_quantity,
        b.avg_buy_trade_price as avg_buy_trade_price,
        c.sell_trade_count as sell_trade_count,
        c.sum_sell_taker_amount as sum_sell_taker_amount,
        c.sum_sell_taker_quantity as sum_sell_taker_quantity,
        c.avg_sell_trade_price as avg_sell_trade_price,
        a.ts as ts
    from
        ohlcv_minute_data as a
    left join
        buy_tacker_market_data as b
    on a.ts = b.ts
    left join
        sell_tacker_market_data as c
    on 
        a.ts = c.ts
    where 
        b.buy_trade_count is not null
        and c.sell_trade_count is not null
    order by
        a.ts
"""

column_list = [
    'row_id',
    'symbol_id',
    'open',
    'high',
    'low',
    'close',
    'total_volume',
    'total_quantity',
    'total_trade_count',
    'buy_trade_count',
    'sum_buy_taker_amount',
    'sum_buy_taker_quantity',
    'avg_buy_trade_price',
    'sell_trade_count',
    'sum_sell_taker_amount',
    'sum_sell_taker_quantity',
    'avg_sell_trade_price',
    'ts'
]
btc_raw_df = create_dataframe_from_query(query,column_list)
btc_raw_df.tail()


Unnamed: 0,row_id,symbol_id,open,high,low,close,total_volume,total_quantity,total_trade_count,buy_trade_count,sum_buy_taker_amount,sum_buy_taker_quantity,avg_buy_trade_price,sell_trade_count,sum_sell_taker_amount,sum_sell_taker_quantity,avg_sell_trade_price,ts
110284,110284,BTC_USDT,37244.63,37263.38,37240.24,37263.36,97561.86,2.619227,94,47,53090.176,1.425303,37247.996,47,44471.684,1.193924,37250.215,2023-11-23 00:56:00
110285,110285,BTC_USDT,37263.42,37276.2,37250.26,37255.62,109905.84,2.94909,89,41,43894.125,1.177821,37265.742,48,66011.73,1.771269,37268.49,2023-11-23 00:57:00
110286,110286,BTC_USDT,37255.81,37265.89,37242.2,37260.31,106369.65,2.855064,88,39,47718.348,1.280762,37257.54,49,58651.31,1.574302,37254.16,2023-11-23 00:58:00
110287,110287,BTC_USDT,37259.96,37265.89,37237.6,37252.99,84385.72,2.265181,71,37,47563.605,1.276728,37254.996,34,36822.117,0.988453,37251.59,2023-11-23 00:59:00
110288,110288,BTC_USDT,37251.29,37252.94,37240.32,37251.63,102361.88,2.7481,86,8,6707.4346,0.180089,37246.4,7,11374.063,0.305359,37248.543,2023-11-23 01:00:00


In [5]:

##################
# Create Dataset #
##################
# Create target value for prediction
ts_column = btc_raw_df["ts"]
btc_raw_df.drop(["ts"], axis=1)

predicting_days = 1 # predicting after N day price

##################
# Create Dataset #
##################
# Create target value for prediction
btc_df_with_target = btc_raw_df
target = btc_raw_df['close'].shift(-predicting_days)
btc_df_with_target['target'] = target

# Replace NaN value to forward or backward values.
btc_df_with_target = btc_df_with_target.fillna(method='ffill')
btc_df_with_target = btc_df_with_target.fillna(method='bfill')

# Get the columns with NaN values
nan_columns = btc_df_with_target.columns[btc_df_with_target.isna().any()].tolist()

# Drop columns containing NaN values even after filling the value.
btc_df_with_target = btc_df_with_target.dropna(axis=1)

# NULL(NaN) check: False means no NULL data, True means contains NULL data
if btc_df_with_target.isnull().any().any():
    print("ERROR: NULL is in the dataset")

btc_df_with_target.to_csv(out_file)
btc_df_with_target

Unnamed: 0,row_id,symbol_id,open,high,low,close,total_volume,total_quantity,total_trade_count,buy_trade_count,sum_buy_taker_amount,sum_buy_taker_quantity,avg_buy_trade_price,sell_trade_count,sum_sell_taker_amount,sum_sell_taker_quantity,avg_sell_trade_price,ts,target
1,1,BTC_USDT,25824.44,25831.18,25819.53,25825.24,95769.52,3.708438,72,41,53700.4500,2.079428,25825.186,31,42069.074,1.629010,25824.785,2023-09-02 12:11:00,25825.60
2,2,BTC_USDT,25825.69,25831.18,25824.56,25825.60,84266.64,3.262884,61,24,28694.2970,1.111076,25825.957,37,55572.350,2.151808,25825.865,2023-09-02 12:12:00,25825.53
3,3,BTC_USDT,25825.46,25826.37,25820.69,25825.53,75424.89,2.920544,56,29,36727.9600,1.422152,25825.590,27,38696.934,1.498392,25825.506,2023-09-02 12:13:00,25826.63
4,4,BTC_USDT,25825.41,25830.08,25824.86,25826.63,64327.65,2.490819,51,24,27203.9940,1.053362,25826.016,27,37123.660,1.437457,25825.879,2023-09-02 12:14:00,25826.26
5,5,BTC_USDT,25826.25,25831.17,25825.55,25826.26,66004.63,2.555701,44,24,33439.0300,1.294760,25826.838,20,32565.605,1.260941,25826.398,2023-09-02 12:15:00,25825.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110284,110284,BTC_USDT,37244.63,37263.38,37240.24,37263.36,97561.86,2.619227,94,47,53090.1760,1.425303,37247.996,47,44471.684,1.193924,37250.215,2023-11-23 00:56:00,37255.62
110285,110285,BTC_USDT,37263.42,37276.20,37250.26,37255.62,109905.84,2.949090,89,41,43894.1250,1.177821,37265.742,48,66011.730,1.771269,37268.490,2023-11-23 00:57:00,37260.31
110286,110286,BTC_USDT,37255.81,37265.89,37242.20,37260.31,106369.65,2.855064,88,39,47718.3480,1.280762,37257.540,49,58651.310,1.574302,37254.160,2023-11-23 00:58:00,37252.99
110287,110287,BTC_USDT,37259.96,37265.89,37237.60,37252.99,84385.72,2.265181,71,37,47563.6050,1.276728,37254.996,34,36822.117,0.988453,37251.590,2023-11-23 00:59:00,37251.63


In [6]:
btc_df_with_target[['ts','close','target']].tail(20)

Unnamed: 0,ts,close,target
110269,2023-11-23 00:41:00,37253.0,37253.77
110270,2023-11-23 00:42:00,37253.77,37250.61
110271,2023-11-23 00:43:00,37250.61,37251.53
110272,2023-11-23 00:44:00,37251.53,37240.99
110273,2023-11-23 00:45:00,37240.99,37254.64
110274,2023-11-23 00:46:00,37254.64,37272.13
110275,2023-11-23 00:47:00,37272.13,37301.05
110276,2023-11-23 00:48:00,37301.05,37290.86
110277,2023-11-23 00:49:00,37290.86,37302.85
110278,2023-11-23 00:50:00,37302.85,37263.94
