In [1]:
#!/usr/bin/python3
from binance.client import Client
from time import time
import pickle as pickle
from datetime import datetime
import pandas as pd
import pandas_ta as ta
from pycaret.classification import load_model, predict_model
from pycaret.classification import *
from pycaret.classification import ClassificationExperiment
from lightgbm import LGBMClassifier
import numpy as np
import random


def get_client():
    """Build a Binance ``Client`` from credentials stored in a local pickle file.

    The pickle is expected to hold a mapping with the keys ``'API_KEY'`` and
    ``'API_SECRET'``.

    Returns:
        Client: a client constructed from those two values.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file it
    reads; only use this with a key file you created yourself.
    """
    key_path = '../../key/binance-key.pickle'
    with open(key_path, 'rb') as key_file:
        credentials = pickle.load(key_file)
    api_key = credentials['API_KEY']
    api_secret = credentials['API_SECRET']
    return Client(api_key, api_secret)


# Shared Binance client, created once when this cell runs; get_historical_data()
# reads this global instead of taking a client argument.
client = get_client()


def get_unix_timestamp(date_string, tz=None):
    """
    Converts the input date string to Unix timestamp.

    Parameters:
        date_string (str): Input date string in the format "dd/mm/yyyy hh:mm:ss".
        tz (datetime.tzinfo, optional): Timezone the date string is expressed in,
            e.g. ``datetime.timezone.utc``. When omitted (the default) the date is
            treated as a naive datetime, which Python interprets in the machine's
            local timezone, so the result then depends on where the code runs.

    Returns:
        int: Unix timestamp (whole seconds) of the given date, or None when the
        string does not match the expected format (a message is printed).
    """
    try:
        date_obj = datetime.strptime(date_string, "%d/%m/%Y %H:%M:%S")
        if tz is not None:
            # Pin the wall-clock time to an explicit zone so the epoch value is
            # reproducible across machines.
            date_obj = date_obj.replace(tzinfo=tz)
        timestamp = int(date_obj.timestamp())
        return timestamp
    except ValueError:
        print("Invalid date format. Please use the format 'dd/mm/yyyy hh:mm:ss'.")
        return None

def get_historical_data(start_timestamp, end_timestamp, coin_pair): 
    """Download 15m futures klines for ``coin_pair`` in chunks and export them to CSV.

    The range [start_timestamp, end_timestamp] is walked in steps of 900*500
    seconds (presumably 500 candles of 900 s each, matching the '15m' interval
    and ``limit=500`` passed to the client - confirm). Each chunk is requested
    through the module-level ``client`` with the timestamps converted to
    milliseconds, and a progress line is printed per chunk.

    Parameters:
        start_timestamp (int): Range start, Unix seconds.
        end_timestamp (int): Range end, Unix seconds.
        coin_pair (str): Symbol passed to the client, also used in the CSV name.

    Returns:
        pandas.DataFrame: columns time (seconds), open, high, low, close, volume.
        The same frame is written to ``../../data/{coin_pair}-ohlcv.csv``.
    """
    chunk_span = 900*500
    data = []
    tot = (end_timestamp - start_timestamp)/chunk_span
    chunk_starts = range(start_timestamp, end_timestamp+1, chunk_span)
    for cntr, current_sts in enumerate(chunk_starts):
        # Clamp the last chunk so it never runs past the requested end.
        next_ets = min(current_sts + chunk_span, end_timestamp)
        if next_ets == end_timestamp:
            print(current_sts, next_ets, f'100% completed')
        else:
            print(current_sts, next_ets, f'{round(cntr*100/tot, 1)}% completed')
        # Futures market
        klines = client.futures_historical_klines(coin_pair, '15m', current_sts*1000, next_ets*1000, limit=500)
        # Spot market
        # klines = client.get_historical_klines(coin_pair, interval, current_sts*1000, next_ets*1000, limit=500)

        # kline[0] is the open time in ms; fields 1..5 are open, high, low, close, volume.
        data.extend(
            [kline[0]/1000] + [float(kline[pos]) for pos in range(1, 6)]
            for kline in klines
        )

    df = pd.DataFrame(data, columns=['time', 'open', 'high', 'low', 'close', 'volume'])
    df.to_csv(f'../../data/{coin_pair}-ohlcv.csv', index=False)
    # print('Data Exported')
    print(f'Historical Data of {coin_pair} Downloaded')
    return df


Download Historical Data

In [None]:
# coins_list = ['BTC', 'ETH', 'BNB', 'XRP', 'ADA', 'DOGE', 'SOL', 'TRX', 'DOT', 'MATIC', 'LTC', 'BCH', 'AVAX', 'XLM', 'LINK', 'UNI', 'XMR', 'ATOM', 'ETC', 'HBAR', 'ICP', 'FIL', 'LDO', 'APT', 'ARB', 'QNT', 'VET', 'NEAR', 'OP', 'MKR', 'GRT', 'AAVE', 'ALGO', 'AXS', 'EGLD', 'STX', 'SAND', 'XTZ', 'EOS', 'INJ', 'THETA', 'IMX', 'SNX', 'MANA', 'FTM', 'RUNE', 'APE', 'RNDR', 'NEO', 'KAVA', 'FLOW', 'CHZ', 'GALA', 'KLAY', 'SUI', 'FXS', 'ZEC', 'CFX', 'CRV', 'MINA', 'COMP', 'GMX', 'DYDX', 'WOO', 'ASTR']
coins_list = ['ETH']
# Download and export the OHLCV history of every listed coin, quoted in USDT.
for i, coin in enumerate(coins_list):
    coin_pair = f'{coin}USDT'
    print('working on', coin_pair)
    # start_timestamp = get_unix_timestamp('1/1/2016 00:00:00')
    start_timestamp = get_unix_timestamp('1/1/2023 00:00:00')
    end_timestamp = get_unix_timestamp('1/6/2023 00:00:00')
    get_historical_data(start_timestamp, end_timestamp, coin_pair)

In [4]:
import pandas as pd
import pandas_ta as ta
import numpy as np
import random


# Load the exported candles and append technical-indicator columns.
# Every indicator is called without arguments (pandas_ta defaults); the column
# names used here are the ones the multi-column results are indexed by.
ohlc_df = pd.read_csv('../../data/ETHUSDT-ohlcv.csv')

ohlc_df['LOGRET_1'] = list(ohlc_df.ta.log_return())
ohlc_df['NATR_14'] = ohlc_df.ta.natr()
ohlc_df['SLOPE_1'] = ohlc_df.ta.slope()

# Bollinger bands: keep only the bandwidth and percent columns.
bbands = ohlc_df.ta.bbands()
for bb_col in ('BBB_5_2.0', 'BBP_5_2.0'):
    ohlc_df[bb_col] = list(bbands[bb_col])

ohlc_df['PGO_14'] = list(ohlc_df.ta.pgo())

eri = ohlc_df.ta.eri()
for eri_col in ('BEARP_13', 'BULLP_13'):
    ohlc_df[eri_col] = list(eri[eri_col])

ohlc_df['EFI_13'] = ohlc_df.ta.efi()

# ADX: only the two directional-movement columns are kept.
adx = ohlc_df.ta.adx()
for adx_col in ('DMP_14', 'DMN_14'):
    ohlc_df[adx_col] = list(adx[adx_col])

ohlc_df['CG_10'] = ohlc_df.ta.cg()

trix = ohlc_df.ta.trix()
for trix_col in ('TRIX_30_9', 'TRIXs_30_9'):
    ohlc_df[trix_col] = trix[trix_col]

smi = ohlc_df.ta.smi()
for smi_col in ('SMI_5_20_5', 'SMIs_5_20_5', 'SMIo_5_20_5'):
    ohlc_df[smi_col] = smi[smi_col]

ohlc_df['AD'] = ohlc_df.ta.ad()
ohlc_df['TRUERANGE_1'] = ohlc_df.ta.true_range()
ohlc_df['PVT'] = ohlc_df.ta.pvt()
ohlc_df['EBSW_40_10'] = ohlc_df.ta.ebsw()
ohlc_df['PVOL'] = ohlc_df.ta.pvol()

thermo = ohlc_df.ta.thermo()
for thermo_col in ('THERMO_20_2_0.5', 'THERMOma_20_2_0.5'):
    ohlc_df[thermo_col] = thermo[thermo_col]

ohlc_df['CHOP_14_1_100'] = ohlc_df.ta.chop()
ohlc_df['PVI_1'] = ohlc_df.ta.pvi()
ohlc_df['VHF_28'] = ohlc_df.ta.vhf()



# Compute, for every candle, the best and worst percentage move available to a
# long and to a short position opened at the NEXT candle's open, looking ahead
# over the following candles. These are forward-looking values used for labels.
df = ohlc_df

ts = list(df['time'])
# Named `open_prices`, not `open`: rebinding `open` at notebook scope would shadow
# the builtin and break any later open(...) call in this kernel (e.g. get_client()).
open_prices = list(df['open'])
high = list(df['high'])
low = list(df['low'])
close = list(df['close'])
volume = list(df['volume'])
tot = len(ts)
long_runup_lst = []
long_drawdown_lst = []
short_runup_lst = []
short_drawdown_lst = []
candlestick_frame = 12
# Threshold is compared against the percentage values computed below.
pnl_threshold = 3

for idx in range(tot):
    if candlestick_frame <= idx <= tot - candlestick_frame:
        # Look-ahead slice idx+1 .. idx+candlestick_frame-1 (candlestick_frame - 1 candles).
        window = slice(idx + 1, idx + candlestick_frame)
        max_high = max(high[window])
        min_low = min(low[window])
        entry_price = open_prices[idx + 1]
        # All four values are percentage changes relative to the entry price
        # (long) or relative to the extreme price (short).
        long_runup_lst.append(round((max_high*100/entry_price)-100, 6))
        long_drawdown_lst.append(round((min_low*100/entry_price)-100, 6))
        short_runup_lst.append(round((entry_price*100/min_low)-100, 6))
        short_drawdown_lst.append(round((entry_price*100/max_high)-100, 6))
    else:
        # Rows too close to either end of the data get neutral placeholders.
        long_runup_lst.append(0)
        long_drawdown_lst.append(0)
        short_runup_lst.append(0)
        short_drawdown_lst.append(0)


long=[]
short=[]
dont_trade=[]
signal = []

# Label each candle: 'long' if the long run-up reaches the threshold, otherwise
# 'short' if the short run-up does, otherwise 'dont_trade'. Long wins ties, and
# rows outside the labelled range are always 'dont_trade'.
for idx in range(tot):
    in_range = candlestick_frame <= idx <= tot - candlestick_frame
    if in_range and long_runup_lst[idx] >= pnl_threshold:
        label = 'long'
    elif in_range and short_runup_lst[idx] >= pnl_threshold:
        label = 'short'
    else:
        label = 'dont_trade'
    signal.append(label)

df['signal'] = signal

# long_indices = df[df['signal'].str.contains('long', case=False)].index
# short_indices = df[df['signal'].str.contains('short', case=False)].index
# dont_trade_indices = list(df[df['signal'].str.contains('dont_trade', case=False)].index)
# num_indices_to_pick  = len(dont_trade_indices) - min([len(long_indices), len(short_indices)])
# random_indices = random.sample(dont_trade_indices, num_indices_to_pick)
# df = df.drop(random_indices)

# df.reset_index(drop=True, inplace=True)

# df = df.drop(columns = ['time', 'open', 'high', 'low', 'close', 'volume'], axis=1)
# Keep 'time', the indicator columns and 'signal'; then discard every row that
# still contains a NaN.
price_volume_cols = ['open', 'high', 'low', 'close', 'volume']
df = df.drop(columns=price_volume_cols).dropna()
df

Unnamed: 0,time,LOGRET_1,NATR_14,SLOPE_1,BBB_5_2.0,BBP_5_2.0,PGO_14,BEARP_13,BULLP_13,EFI_13,...,TRUERANGE_1,PVT,EBSW_40_10,PVOL,THERMO_20_2_0.5,THERMOma_20_2_0.5,CHOP_14_1_100,PVI_1,VHF_28,signal
96,1.672603e+09,0.000582,0.135551,0.70,0.244778,0.369547,0.724317,0.181239,1.271239,12774.348088,...,1.09,-1.342970e+03,0.421382,1.339800e+07,0.33,1.110772,43.226537,999.481713,0.439059,dont_trade
97,1.672604e+09,0.000665,0.141520,0.80,0.236105,0.659200,0.950988,0.833919,3.483919,13071.922818,...,2.65,-1.070910e+02,0.075447,2.234121e+07,2.49,1.242127,47.421792,999.548260,0.426417,dont_trade
98,1.672605e+09,-0.000882,0.142343,-1.06,0.166927,0.361469,0.113267,0.367645,2.187645,9625.104667,...,1.82,-1.026133e+03,0.054274,1.253593e+07,1.21,1.239067,52.064293,999.548260,0.428841,dont_trade
99,1.672606e+09,-0.000732,0.140242,-0.88,0.218700,0.165219,-0.440460,-0.380590,0.959410,6776.386023,...,1.34,-1.884428e+03,-0.346432,1.407925e+07,1.28,1.242965,59.187156,999.475043,0.426429,dont_trade
100,1.672607e+09,-0.000391,0.138546,-0.47,0.283316,0.156944,-0.718438,-1.109077,0.280923,4852.536729,...,1.39,-2.441496e+03,-0.812430,1.709025e+07,0.84,1.204588,58.870621,999.435910,0.421296,dont_trade
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14492,1.685560e+09,0.000113,0.237255,0.21,0.171857,0.232171,0.247141,-2.465251,0.974749,-4249.755565,...,3.44,1.114550e+07,0.098294,2.642970e+07,1.28,2.506997,50.888711,994.786340,0.337674,dont_trade
14493,1.685560e+09,0.000204,0.227929,0.38,0.148010,0.422425,0.236563,-0.120215,1.879785,-3058.487850,...,2.00,1.114572e+07,-0.884603,2.005379e+07,2.45,2.501569,49.988119,994.786340,0.342373,dont_trade
14494,1.685561e+09,0.002112,0.233194,3.94,0.356803,0.976240,0.966341,-0.743042,5.006958,5402.711092,...,5.75,1.114873e+07,0.028938,2.662411e+07,3.78,2.623324,52.302836,994.997760,0.328259,dont_trade
14495,1.685562e+09,-0.002477,0.243684,-4.62,0.376961,0.338047,-0.048199,-1.232607,5.707393,-9355.125839,...,6.94,1.114349e+07,0.494923,3.947682e+07,0.60,2.430627,53.194407,994.750374,0.308918,dont_trade


In [13]:
# Reshape the labelled frame into long format: one group of `candlestick_frame`
# rows per sample ('id'), each row carrying the indicator values of one candle
# in the window ending at that sample. Writes the windows and labels to CSV.
candlestick_frame = 12

tot = len(df)
dta_lst = []

# Feature columns: everything between the first column and the last ('signal') column.
columns = list(df.columns)[1:-1]
f_str = 'id,time,LOGRET_1,NATR_14,SLOPE_1,BBB_5_2.0,BBP_5_2.0,PGO_14,BEARP_13,BULLP_13,EFI_13,DMP_14,DMN_14,CG_10,TRIX_30_9,TRIXs_30_9,SMI_5_20_5,SMIs_5_20_5,SMIo_5_20_5,AD,TRUERANGE_1,PVT,EBSW_40_10,PVOL,THERMO_20_2_0.5,THERMOma_20_2_0.5,CHOP_14_1_100,PVI_1,VHF_28\n'
# Named `time_lst` (not `time`) so the `time` function imported at the top of the
# notebook is not overwritten in the kernel namespace.
time_lst = list(df['time'])
lst_columns = {}
y_tsfresh = []
y = list(df['signal'])
for c in columns:
    lst_columns[c] = list(df[c])
# `row_idx` instead of `id` keeps the builtin id() usable after this cell runs.
for row_idx in range(candlestick_frame, tot, 1):
    label = y[row_idx]
    y_tsfresh.append(label)
    app_id = row_idx - candlestick_frame + 1
    for i in range(candlestick_frame):
        # NOTE(review): 'time' = i is 0 for the newest candle (row_idx) and grows
        # towards older candles (row_idx - i). If the consumer sorts ascending on
        # this column (the extraction cell passes column_sort='time'), each window
        # is presented newest-first - confirm this is intended.
        app = {'signal': label, 'id': app_id, 'time': i}
        for c in columns:
            app[c] = lst_columns[c][row_idx-i]
        dta_lst.append(app)

app_df = pd.DataFrame(dta_lst)
training_y = list(app_df['signal'])
app_df = app_df.drop(columns = ['signal'], axis=1)
app_df.to_csv('../../data/df.csv', index=False)
# The label column is written under an empty header name.
pd.DataFrame({'': y_tsfresh}).to_csv('../../data/y.csv', index=False)
app_df

Unnamed: 0,id,time,LOGRET_1,NATR_14,SLOPE_1,BBB_5_2.0,BBP_5_2.0,PGO_14,BEARP_13,BULLP_13,...,AD,TRUERANGE_1,PVT,EBSW_40_10,PVOL,THERMO_20_2_0.5,THERMOma_20_2_0.5,CHOP_14_1_100,PVI_1,VHF_28
0,1,0,0.000907,0.126446,1.09,0.159488,0.585576,0.002785,-2.016611,0.933389,...,7.091819e+04,2.95,-9.144058e+02,-0.840170,2.627271e+07,0.70,0.919449,50.841402,999.407723,0.311526
1,1,1,-0.000816,0.117397,-0.98,0.207233,0.063137,-0.733877,-1.554380,0.265620,...,6.002450e+04,1.82,-2.899018e+03,-0.556091,1.804141e+07,1.60,0.942549,51.201628,999.316942,0.343707
2,1,2,-0.000191,0.114674,-0.23,0.100919,0.180159,-0.228181,-0.098443,0.301557,...,6.365713e+04,0.40,-1.673610e+03,0.577839,8.077065e+06,0.40,0.873344,53.845606,999.398495,0.348169
3,1,3,-0.000116,0.120911,-0.14,0.080615,0.270919,-0.145969,0.056817,0.696817,...,6.802612e+04,0.64,-1.544985e+03,0.884187,8.273187e+06,0.53,0.923170,57.857548,999.398495,0.348335
4,1,4,-0.000457,0.126101,-0.55,0.226948,0.568192,-0.004433,0.329619,1.259619,...,7.189805e+04,0.93,-1.464814e+03,0.945365,1.080960e+07,0.04,0.964556,58.033383,999.398495,0.387590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172663,14389,7,-0.000091,0.248966,-0.17,0.279342,0.606002,0.825165,-0.305839,3.144161,...,1.112730e+07,3.45,1.114683e+07,0.977801,3.386835e+07,2.10,2.769984,52.745632,994.865718,0.330976
172664,14389,8,-0.000177,0.253862,-0.33,0.466238,0.700304,0.912564,2.171522,3.991522,...,1.111840e+07,1.82,1.114699e+07,0.995309,2.190318e+07,1.67,2.840509,54.695772,994.874834,0.325862
172665,14389,9,0.000445,0.265836,0.83,0.516054,0.830723,1.067049,0.970109,4.720109,...,1.112188e+07,3.75,1.114720e+07,0.930165,4.405077e+07,1.71,2.963720,54.315926,994.874834,0.319993
172666,14389,10,0.001487,0.270940,2.77,0.426137,0.948435,1.052618,-0.138207,6.921793,...,1.111225e+07,7.06,1.114615e+07,0.839009,7.795027e+07,5.47,3.095691,48.071478,994.874834,0.319940


In [10]:
# Sanity check: number of labels in `y` (built from df['signal'] in the windowing cell).
len(y)

14401

In [11]:
# Sanity check: the windowing loop starts at row `candlestick_frame`, so the number
# of windows plus that offset should equal len(y). Uses the constant rather than a
# literal 12 so the check stays valid if the window length changes.
len(y_tsfresh) + candlestick_frame

14401

In [1]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
import pandas as pd

# Reload the long-format windows and labels, then extract tsfresh features in
# fixed-size row chunks, writing one CSV per chunk.
df = pd.read_csv('../../data/df.csv')
y = pd.read_csv('../../data/y.csv')

extraction_settings = ComprehensiveFCParameters()
chunk_size = 6000
# NOTE(review): chunks are cut by row count, so an id's rows stay together only
# while chunk_size is a multiple of the rows per id - confirm when changing either.
chunks = [df[start:start + chunk_size] for start in range(0, len(df), chunk_size)]
tot = len(chunks)
for idx, chunk_df in enumerate(chunks):
    remaining = tot - idx
    pct_done = round((idx*100/tot),1)
    print(f'{remaining} remaining \t\t {pct_done}% done')
    X = extract_features(
        chunk_df,
        column_id='id',
        column_sort='time',
        default_fc_parameters=extraction_settings,
        # impute is applied to the extracted feature matrix of this chunk
        # (NaN handling; see the tsfresh documentation for the exact rules).
        impute_function=impute,
    )
    # The index is not written, so rows are identified by position only.
    X.to_csv(f'../../data/chunks/{idx}.csv', index=False)

3 remaining 		 89.7% done


Feature Extraction: 100%|██████████| 50/50 [00:45<00:00,  1.09it/s]


2 remaining 		 93.1% done


Feature Extraction: 100%|██████████| 50/50 [00:58<00:00,  1.17s/it]


1 remaining 		 96.6% done


Feature Extraction: 100%|██████████| 50/50 [00:47<00:00,  1.04it/s]


In [3]:
# Read the per-chunk feature files back in chunk order and stack them into a
# single frame with a fresh 0..n-1 index, then export the result.
dataset_path = '../../data/chunks/'
dfs = [pd.read_csv(f'{dataset_path}{chunk_no}.csv') for chunk_no in range(len(chunks))]

concatenated_df = pd.concat(dfs, ignore_index=True)
concatenated_df.to_csv('../../data/tsfresh_testing_dataset.csv', index=False)


In [10]:
# Display the stacked feature matrix built from the chunk files.
concatenated_df

Unnamed: 0,LOGRET_1__variance_larger_than_standard_deviation,LOGRET_1__has_duplicate_max,LOGRET_1__has_duplicate_min,LOGRET_1__has_duplicate,LOGRET_1__sum_values,LOGRET_1__abs_energy,LOGRET_1__mean_abs_change,LOGRET_1__mean_change,LOGRET_1__mean_second_derivative_central,LOGRET_1__median,...,VHF_28__fourier_entropy__bins_5,VHF_28__fourier_entropy__bins_10,VHF_28__fourier_entropy__bins_100,VHF_28__permutation_entropy__dimension_3__tau_1,VHF_28__permutation_entropy__dimension_4__tau_1,VHF_28__permutation_entropy__dimension_5__tau_1,VHF_28__permutation_entropy__dimension_6__tau_1,VHF_28__permutation_entropy__dimension_7__tau_1,VHF_28__query_similarity_count__query_None__threshold_0.0,VHF_28__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,0.0,-0.000324,0.000005,0.000791,-0.000022,0.000164,-0.000154,...,0.796312,0.796312,1.549826,1.220607,1.676988,2.079442,1.945910,1.791759,0.0,0.414968
1,0.0,0.0,0.0,0.0,-0.002614,0.000007,0.000880,0.000067,-0.000134,-0.000291,...,0.410116,0.955700,1.945910,0.950271,1.427061,1.906155,1.945910,1.791759,0.0,0.409198
2,0.0,0.0,0.0,0.0,-0.001549,0.000007,0.001031,-0.000083,0.000073,-0.000154,...,0.410116,0.955700,1.549826,0.950271,1.427061,1.667462,1.747868,1.791759,0.0,0.397697
3,0.0,0.0,0.0,0.0,-0.000891,0.000006,0.001023,-0.000029,-0.000092,-0.000096,...,0.410116,0.796312,1.549826,0.950271,1.149060,1.386294,1.475076,1.560710,0.0,0.386517
4,0.0,0.0,0.0,0.0,-0.000950,0.000006,0.000914,0.000149,0.000037,-0.000096,...,0.796312,0.796312,1.475076,0.801819,1.002718,1.073543,1.153742,1.242453,0.0,0.375433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14384,0.0,0.0,0.0,0.0,0.001762,0.000006,0.000707,0.000055,0.000136,0.000102,...,0.955700,1.475076,1.747868,1.220607,1.581094,1.667462,1.747868,1.791759,0.0,0.333386
14385,0.0,0.0,0.0,0.0,0.001251,0.000006,0.000550,-0.000119,-0.000055,0.000102,...,1.153742,1.475076,1.747868,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333895
14386,0.0,0.0,0.0,0.0,0.004470,0.000009,0.000614,-0.000184,0.000092,0.000132,...,0.796312,1.153742,1.747868,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333976
14387,0.0,0.0,0.0,0.0,0.001902,0.000015,0.001026,0.000239,-0.000275,0.000132,...,0.796312,0.796312,1.475076,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333847


In [11]:
# Reload the labels from disk. The file was written with an empty column header,
# which is why the single column shows up as 'Unnamed: 0' below.
y = pd.read_csv('../../data/y.csv')
y

Unnamed: 0.1,Unnamed: 0
0,dont_trade
1,dont_trade
2,dont_trade
3,dont_trade
4,dont_trade
...,...
14384,dont_trade
14385,dont_trade
14386,dont_trade
14387,dont_trade
