In [7]:
import os,sys
sys.path.append('/tf/crypto_prediction_ml_dl/script')
sys.path.append('/tf/crypto_prediction_ml_dl/notebook/TFT_models')
from trino_operations import trino_operator
import pandas as pd
import numpy as np
from datetime import datetime
from pprint import pprint
import copy
!echo "pwd: `pwd`"

pwd: /tf/crypto_prediction_ml_dl/notebook/TFT_models/day_models


In [8]:
# Make sure the output folder for the crypto dataset exists.
path = "./data/crypto_day"

# Remember whether the folder was already there so the message below is only
# printed on first creation; exist_ok=True keeps the cell idempotent and avoids
# the check-then-create race of calling os.path.exists() before os.makedirs().
directory_already_present = os.path.isdir(path)
os.makedirs(path, exist_ok=True)
if not directory_already_present:
    print("The new directory is created!")

# The dataset is written to out_file.
out_file = os.path.join(path, 'crypto_day.csv')

In [12]:
# Load crypto data from Hive mart tables via Trino.
# Module-level operator object: create_dataframe_from_hive_mart_table() calls
# its run_query() method, so this line must run before that function is used.
trino_ope = trino_operator.Operator()

def create_dataframe_from_hive_mart_table(
    target_schema, target_table, target_columns, target_symbol, column_prefix
):
    """Fetch the rows of one symbol from a Hive mart table as a DataFrame.

    The query selects ``row_number() over(order by dt)`` as ``row_id`` followed
    by ``target_columns`` from ``hive.<target_schema>.<target_table>``, keeps
    the rows whose ``id`` equals ``target_symbol`` and orders them by ``dt``.
    It is executed through the module-level ``trino_ope`` object.

    Args:
        target_schema: Schema name placed after ``hive.`` in the query.
        target_table: Table name inside ``target_schema``.
        target_columns: Comma-separated column list as a single string; it is
            inserted verbatim into the select list and also split on commas
            to derive the DataFrame column names.
        target_symbol: Value compared against the ``id`` column.
        column_prefix: String prepended to every column name, including
            ``row_id``.

    Returns:
        A DataFrame with one row per query result row, indexed 1..N in result
        order, with columns ``<prefix>row_id`` followed by the prefixed
        ``target_columns``.

    Note:
        All arguments are interpolated into the SQL text as-is, so only pass
        trusted values.
    """
    sql = f"""
    select
        row_number() over(order by dt) as row_id,
        {target_columns}
    from 
        hive.{target_schema}.{target_table}
    where
        id = '{target_symbol}'
    order by dt
    """

    # Key every result row by its 1-based position; the keys become the index.
    rows_by_position = dict(enumerate(trino_ope.run_query(sql), 1))

    # row_id is produced by the query itself, so it leads the column names.
    requested_names = target_columns.strip().split(",")
    prefixed_names = [
        column_prefix + name.strip() for name in ["row_id", *requested_names]
    ]

    return pd.DataFrame.from_dict(
        rows_by_position, orient="index", columns=prefixed_names
    )

######################################
# Extract the dataset from Hive tables
######################################

# Crypto: where the rows come from and how the resulting columns are named.
target_schema = "crypto_mart"
target_table = "crypto_indicator_day"
target_symbol = "BTC_USDT"
column_prefix = "btc_"

# Active selection: identifier, date and the OHLCV columns only.
#
# Disabled wider selection, kept for reference. In addition to the columns
# below it listed:
#   macd, macd_single, rsi,
#   bollinger_bands_sma, bollinger_bands_lower_band, bollinger_bands_upper_band,
#   obv, obv_sma,
#   ichimoku_chikou_span, ichimoku_kijun_sen, ichimoku_tenkan_sen,
#   ichimoku_senkou_span_a, ichimoku_senkou_span_b,
#   stoch_oscillator, stoch_signal, stoch_percent_j,
#   aroon_up, aroon_down, aroon_oscillator,
#   sma5, sma10, sma30, ema5, ema10, ema30
target_columns = """
    id,
    dt,
    low,
    high,
    open,
    close,
    volume
    """

# Run the query; every returned column is prefixed with column_prefix.
btc_raw_df = create_dataframe_from_hive_mart_table(
    target_schema=target_schema,
    target_table=target_table,
    target_columns=target_columns,
    target_symbol=target_symbol,
    column_prefix=column_prefix,
)

############################
# Validate date continuity #
############################
# The data is daily, so every calendar day between the first and the last row
# must appear, in order, in the "btc_dt" column.
if btc_raw_df.empty:
    # Fail with a clear message instead of an IndexError from iloc[0] below.
    raise ValueError("btc_raw_df is empty: the query returned no rows.")

dt_column = btc_raw_df["btc_dt"]

# Compare on the string form of each value, exactly as str() renders it.
observed_dates = dt_column.map(str).str.strip().tolist()

first_dt = str(dt_column.iloc[0])
last_dt = str(dt_column.iloc[-1])
desired_date_range = pd.date_range(start=first_dt, end=last_dt, freq='D')
expected_dates = desired_date_range.strftime("%Y-%m-%d").tolist()

# Position-wise comparison against the gap-free calendar: a missing or
# out-of-order day shifts the rows and makes the two lists differ.
all_dates_exist = observed_dates[: len(expected_dates)] == expected_dates
if all_dates_exist:
    print("All dates exist in the DataFrame's index.")
else:
    print("Not all dates exist in the DataFrame's index.")

# NOTE: "btc_dt" is intentionally left in btc_raw_df. The previous
# `btc_raw_df.drop(["btc_dt"], axis=1)` discarded its result (DataFrame.drop
# returns a new frame), so it never removed the column; the dead call has been
# deleted to keep the exported dataset unchanged. If the column really should
# be dropped, assign the result: btc_raw_df = btc_raw_df.drop(columns=["btc_dt"]).

predicting_days = 1  # forecast horizon: the target is the close price N days ahead

##################
# Create Dataset #
##################
# Build the target on a new frame. assign() returns a copy, so btc_raw_df is
# no longer mutated through an alias when the "target" column is added.
btc_df_with_target = btc_raw_df.assign(
    target=btc_raw_df["btc_close"].shift(-predicting_days)
)

# Replace NaN values with the previous value, then with the next value for any
# leading gaps. ffill()/bfill() replace the deprecated fillna(method=...).
# NOTE(review): the last `predicting_days` rows have no future close, so their
# target is copied from the previous row rather than being a real label;
# confirm that downstream training/inference expects this.
btc_df_with_target = btc_df_with_target.ffill().bfill()

# Columns that still contain NaN after both fills hold no usable value at all.
nan_columns = btc_df_with_target.columns[btc_df_with_target.isna().any()].tolist()
if nan_columns:
    # Report what is about to be removed instead of dropping it silently.
    print(f"WARNING: dropping columns that are still NaN after filling: {nan_columns}")

# Drop columns containing NaN values even after filling.
btc_df_with_target = btc_df_with_target.dropna(axis=1)

# Invariant: no NULL(NaN) may remain once the NaN columns are dropped.
assert not btc_df_with_target.isnull().any().any(), "ERROR: NULL is in the dataset"

btc_df_with_target.to_csv(out_file)
btc_df_with_target

All dates exist in the DataFrame's index.


Unnamed: 0,btc_row_id,btc_id,btc_dt,btc_low,btc_high,btc_open,btc_close,btc_volume,target
1,1,BTC_USDT,2020-11-01,13620.57,13888.45,13790.61,13747.25,9018406.0,13558.84
2,2,BTC_USDT,2020-11-02,13207.51,13825.00,13749.99,13558.84,19482722.0,14014.59
3,3,BTC_USDT,2020-11-03,13287.61,14046.57,13556.05,14014.59,17432730.0,14133.45
4,4,BTC_USDT,2020-11-04,13532.19,14240.00,14014.59,14133.45,19503998.0,15586.95
5,5,BTC_USDT,2020-11-05,14092.32,15740.00,14143.70,15586.95,38253920.0,15579.32
...,...,...,...,...,...,...,...,...,...
1104,1104,BTC_USDT,2023-11-09,35500.00,37946.03,35616.56,36694.77,127368312.0,36583.10
1105,1105,BTC_USDT,2023-11-10,36333.42,36891.00,36697.40,36583.10,61001320.0,37120.00
1106,1106,BTC_USDT,2023-11-11,36450.00,37276.57,36600.00,37120.00,88956424.0,37054.26
1107,1107,BTC_USDT,2023-11-12,36790.00,37500.00,37104.63,37054.26,150024256.0,37104.15
