In [1]:
#!/usr/bin/python3
from binance.client import Client
from time import time
import pickle as pickle
from datetime import datetime
import pandas as pd
import pandas_ta as ta
from pycaret.classification import load_model, predict_model
from pycaret.classification import *
from pycaret.classification import ClassificationExperiment
from lightgbm import LGBMClassifier
import numpy as np
import random


def get_client():
    """Build a Binance ``Client`` from credentials stored in a local pickle file.

    The pickle at ``../../key/binance-key.pickle`` is indexed with the keys
    ``'API_KEY'`` and ``'API_SECRET'``.

    Returns:
        Client: client constructed from the stored key/secret pair.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use this with a
    # key file you created yourself.
    with open('../../key/binance-key.pickle', 'rb') as key_file:
        credentials = pickle.load(key_file)
    api_key = credentials['API_KEY']
    api_secret = credentials['API_SECRET']
    return Client(api_key, api_secret)


# Module-level client, created once; get_historical_data() reads this global.
client = get_client()


def get_unix_timestamp(date_string):
    """
    Parse a "dd/mm/yyyy hh:mm:ss" string and return it as a Unix timestamp.

    The parsed datetime carries no timezone, so ``datetime.timestamp()``
    interprets it in the local timezone of the machine running the code.

    Parameters:
        date_string (str): Date in the format "dd/mm/yyyy hh:mm:ss".

    Returns:
        int | None: Whole-second Unix timestamp, or None (after printing a
        hint) when the string does not match the expected format.
    """
    expected_format = "%d/%m/%Y %H:%M:%S"
    try:
        return int(datetime.strptime(date_string, expected_format).timestamp())
    except ValueError:
        print("Invalid date format. Please use the format 'dd/mm/yyyy hh:mm:ss'.")
        return None

def get_historical_data(start_timestamp, end_timestamp, coin_pair):
    """Download 15-minute futures klines for ``coin_pair`` and save them as CSV.

    The range ``start_timestamp``..``end_timestamp`` is walked in windows of
    500 candles (900 s * 500). Each window is requested from the module-level
    ``client`` and the rows are collected into one DataFrame, which is written
    to ``../../data/{coin_pair}-ohlcv.csv``.

    Parameters:
        start_timestamp (int): Start of the range, Unix seconds.
        end_timestamp (int): End of the range, Unix seconds.
        coin_pair (str): Symbol passed to the API (e.g. ``'ETHUSDT'``); also
            used in the CSV file name.

    Returns:
        pandas.DataFrame: Columns ``time`` (kline open time in seconds),
        ``open``, ``high``, ``low``, ``close``, ``volume`` (floats), with one
        row per distinct ``time`` value.
    """
    candle_seconds = 900          # length of one '15m' candle
    candles_per_request = 500     # matches the `limit` sent to the API
    window_seconds = candle_seconds * candles_per_request

    data = []
    tot = (end_timestamp - start_timestamp) / window_seconds
    for cntr, current_sts in enumerate(range(start_timestamp, end_timestamp + 1, window_seconds)):
        if cntr > 0 and current_sts == end_timestamp:
            # The previous window already ended exactly at end_timestamp, so a
            # zero-length request here would only repeat its last candle.
            break
        next_ets = min(current_sts + window_seconds, end_timestamp)
        if next_ets == end_timestamp:
            progress = '100% completed'
        else:
            progress = f'{round(cntr*100/tot, 1)}% completed'
        print(current_sts, next_ets, progress)

        # Futures market. For the spot market, client.get_historical_klines
        # takes the same arguments.
        klines = client.futures_historical_klines(
            coin_pair, '15m', current_sts*1000, next_ets*1000, limit=candles_per_request)

        for kline in klines:
            data.append([
                kline[0]/1000,      # open time, ms -> s
                float(kline[1]),    # open
                float(kline[2]),    # high
                float(kline[3]),    # low
                float(kline[4]),    # close
                float(kline[5]),    # volume
            ])

    df = pd.DataFrame(data, columns=['time', 'open', 'high', 'low', 'close', 'volume'])
    # Consecutive windows share their boundary timestamp (one window's end is the
    # next window's start), so the candle opening at a boundary can be returned
    # by both requests. Keep a single row per open time.
    df = df.drop_duplicates(subset='time', keep='first').reset_index(drop=True)
    df.to_csv(f'../../data/{coin_pair}-ohlcv.csv', index=False)
    print(f'Historical Data of {coin_pair} Downloaded')
    return df


Download Historical Data

In [2]:
# Full candidate list, kept for reference when downloading more than one coin:
# coins_list = ['BTC', 'ETH', 'BNB', 'XRP', 'ADA', 'DOGE', 'SOL', 'TRX', 'DOT', 'MATIC', 'LTC', 'BCH', 'AVAX', 'XLM', 'LINK', 'UNI', 'XMR', 'ATOM', 'ETC', 'HBAR', 'ICP', 'FIL', 'LDO', 'APT', 'ARB', 'QNT', 'VET', 'NEAR', 'OP', 'MKR', 'GRT', 'AAVE', 'ALGO', 'AXS', 'EGLD', 'STX', 'SAND', 'XTZ', 'EOS', 'INJ', 'THETA', 'IMX', 'SNX', 'MANA', 'FTM', 'RUNE', 'APE', 'RNDR', 'NEO', 'KAVA', 'FLOW', 'CHZ', 'GALA', 'KLAY', 'SUI', 'FXS', 'ZEC', 'CFX', 'CRV', 'MINA', 'COMP', 'GMX', 'DYDX', 'WOO', 'ASTR']
coins_list = ['ETH']

# The date range is the same for every coin, so it is parsed once up front.
start_timestamp = get_unix_timestamp('1/1/2016 00:00:00')
end_timestamp = get_unix_timestamp('1/1/2023 00:00:00')
if start_timestamp is None or end_timestamp is None:
    # get_unix_timestamp() returns None on a malformed date; fail here instead
    # of with a TypeError inside the download loop.
    raise ValueError('Could not parse the download date range.')

for coin in coins_list:
    coin_pair = f'{coin}USDT'
    print('working on', coin_pair)
    get_historical_data(start_timestamp, end_timestamp, coin_pair)

working on ETHUSDT
1451592000 1452042000 0.0% completed
1452042000 1452492000 0.2% completed
1452492000 1452942000 0.4% completed
1452942000 1453392000 0.6% completed
1453392000 1453842000 0.8% completed
1453842000 1454292000 1.0% completed
1454292000 1454742000 1.2% completed
1454742000 1455192000 1.4% completed
1455192000 1455642000 1.6% completed
1455642000 1456092000 1.8% completed
1456092000 1456542000 2.0% completed
1456542000 1456992000 2.2% completed
1456992000 1457442000 2.4% completed
1457442000 1457892000 2.6% completed
1457892000 1458342000 2.9% completed
1458342000 1458792000 3.1% completed
1458792000 1459242000 3.3% completed
1459242000 1459692000 3.5% completed
1459692000 1460142000 3.7% completed
1460142000 1460592000 3.9% completed
1460592000 1461042000 4.1% completed
1461042000 1461492000 4.3% completed
1461492000 1461942000 4.5% completed
1461942000 1462392000 4.7% completed
1462392000 1462842000 4.9% completed
1462842000 1463292000 5.1% completed
1463292000 14637420

In [3]:
import pandas as pd
import pandas_ta as ta
import numpy as np
import random


# Raw candles as written by the download step.
ohlc_df = pd.read_csv('../../data/ETHUSDT-ohlcv.csv')

# (pandas_ta accessor method, output columns to keep), all with default
# parameters. The order matters: columns are appended to ohlc_df in exactly
# this order and later cells select feature columns by position.
indicator_plan = [
    ('log_return', ['LOGRET_1']),
    ('natr', ['NATR_14']),
    ('slope', ['SLOPE_1']),
    ('bbands', ['BBB_5_2.0', 'BBP_5_2.0']),
    ('pgo', ['PGO_14']),
    ('eri', ['BEARP_13', 'BULLP_13']),
    ('efi', ['EFI_13']),
    ('adx', ['DMP_14', 'DMN_14']),
    ('cg', ['CG_10']),
    ('trix', ['TRIX_30_9', 'TRIXs_30_9']),
    ('smi', ['SMI_5_20_5', 'SMIs_5_20_5', 'SMIo_5_20_5']),
    ('ad', ['AD']),
    ('true_range', ['TRUERANGE_1']),
    ('pvt', ['PVT']),
    ('ebsw', ['EBSW_40_10']),
    ('pvol', ['PVOL']),
    ('thermo', ['THERMO_20_2_0.5', 'THERMOma_20_2_0.5']),
    ('chop', ['CHOP_14_1_100']),
    ('pvi', ['PVI_1']),
    ('vhf', ['VHF_28']),
]

for method_name, output_columns in indicator_plan:
    indicator_result = getattr(ohlc_df.ta, method_name)()
    if isinstance(indicator_result, pd.DataFrame):
        # Multi-output indicator: keep only the named columns.
        for column_name in output_columns:
            ohlc_df[column_name] = list(indicator_result[column_name])
    else:
        # Single-output indicator: store the whole result under our column name.
        ohlc_df[output_columns[0]] = list(indicator_result)



# `df` is another name for `ohlc_df` (no copy is made).
df = ohlc_df

# Column values as plain Python lists for the index-based look-ahead loop below.
ts = list(df['time'])
# Named `open_prices` rather than `open` so the builtin open() is not rebound
# for the rest of the kernel session (get_client() relies on it).
open_prices = list(df['open'])
high = list(df['high'])
low = list(df['low'])
close = list(df['close'])
volume = list(df['volume'])
tot = len(ts)
long_runup_lst = []
long_drawdown_lst = []
short_runup_lst = []
short_drawdown_lst = []
candlestick_frame = 12   # look-ahead horizon, in candles
pnl_threshold = 3        # percent move used as the labelling threshold

# For every candle, measure the best and worst percentage move over the
# following candles, relative to the next candle's open (the assumed entry).
for idx in range(tot):
    if candlestick_frame <= idx <= tot - candlestick_frame:
        # NOTE(review): the slice idx+1 .. idx+candlestick_frame-1 covers
        # candlestick_frame - 1 candles (11 here); confirm this is intended.
        max_high = max(high[idx+1:idx+candlestick_frame])
        min_low = min(low[idx+1:idx+candlestick_frame])
        entry_price = open_prices[idx+1]
        long_runup_lst.append(round((max_high*100/entry_price)-100, 6))
        long_drawdown_lst.append(round((min_low*100/entry_price)-100, 6))
        short_runup_lst.append(round((entry_price*100/min_low)-100, 6))
        short_drawdown_lst.append(round((entry_price*100/max_high)-100, 6))
    else:
        # The first and last candlestick_frame candles get neutral placeholders.
        long_runup_lst.append(0)
        long_drawdown_lst.append(0)
        short_runup_lst.append(0)
        short_drawdown_lst.append(0)


# Label every candle from its look-ahead run-ups. 'long' wins when both the
# long and the short run-up reach the threshold. Candles outside the labelled
# index range are always 'dont_trade'.
signal = []
for idx in range(tot):
    in_labelled_range = candlestick_frame <= idx <= tot - candlestick_frame
    if in_labelled_range and long_runup_lst[idx] >= pnl_threshold:
        signal.append('long')
    elif in_labelled_range and short_runup_lst[idx] >= pnl_threshold:
        signal.append('short')
    else:
        signal.append('dont_trade')

df['signal'] = signal

# Down-sample the 'dont_trade' class to the size of the smaller of the two
# trade classes, then keep only the indicator columns and the label.
# NOTE(review): removing rows leaves the remaining rows non-contiguous in time,
# while the windowing cell that follows treats neighbouring rows as consecutive
# candles. Confirm this is intended.
long_indices = df.index[df['signal'] == 'long']
short_indices = df.index[df['signal'] == 'short']
dont_trade_indices = list(df.index[df['signal'] == 'dont_trade'])

# Never negative: if 'dont_trade' is already the smallest class, drop nothing
# (random.sample raises ValueError on a negative sample size).
num_indices_to_pick = max(
    0, len(dont_trade_indices) - min(len(long_indices), len(short_indices)))

# A dedicated, seeded generator makes the down-sampling reproducible across
# kernel restarts without touching the global `random` state.
balance_seed = 123
random_indices = random.Random(balance_seed).sample(dont_trade_indices, num_indices_to_pick)

df = df.drop(random_indices)
df = df.reset_index(drop=True)
df = df.drop(columns=['time', 'open', 'high', 'low', 'close', 'volume'])
# Indicator warm-up rows contain NaN and are removed here.
df = df.dropna()
df

Unnamed: 0,LOGRET_1,NATR_14,SLOPE_1,BBB_5_2.0,BBP_5_2.0,PGO_14,BEARP_13,BULLP_13,EFI_13,DMP_14,...,TRUERANGE_1,PVT,EBSW_40_10,PVOL,THERMO_20_2_0.5,THERMOma_20_2_0.5,CHOP_14_1_100,PVI_1,VHF_28,signal
17,-0.001437,0.967371,-0.22,0.821459,0.135169,0.407859,0.719557,1.309557,276.806201,37.537908,...,0.59,1.977424e+04,0.967839,3.006152e+05,0.33,1.328884,47.265699,1006.700576,0.391884,dont_trade
18,-0.005467,0.828126,-0.84,1.145127,0.078039,-0.342214,-0.513037,0.986963,-113.549510,33.706556,...,1.50,1.988586e+04,-0.965491,3.624824e+05,1.25,0.971797,59.065167,1006.867855,0.439567,dont_trade
19,-0.000130,0.658862,-0.02,0.334778,0.780226,0.057016,0.183918,0.773918,65.196045,27.412833,...,0.59,2.041040e+04,0.877197,3.222322e+05,0.27,0.691762,51.898354,1006.900584,0.383416,short
20,-0.006596,0.667391,-1.01,0.974769,0.028987,-0.622985,-0.639498,0.420502,-207.935448,25.642890,...,1.06,1.920841e+04,0.566475,2.790748e+05,0.93,0.714451,50.903606,1006.900584,0.357509,short
21,0.000720,0.658300,0.11,1.150117,0.234404,-0.492572,-0.713856,0.206144,-144.350226,24.183437,...,0.92,1.936380e+04,-0.296293,3.293090e+05,0.29,0.674028,51.871809,1006.972654,0.327286,short
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25011,-0.000400,0.212556,-0.48,0.757939,0.769221,1.394025,2.136751,4.086751,49669.734072,30.505192,...,1.95,-8.161827e+07,0.864766,3.495210e+07,2.06,1.553004,46.886044,560.135372,0.342519,dont_trade
25012,-0.001035,0.168459,-1.24,0.314769,0.234282,-0.658575,-1.156491,0.103509,2740.368121,19.333633,...,1.26,-8.162127e+07,-0.977898,9.960975e+06,0.57,1.048469,70.668653,560.035583,0.224651,dont_trade
25013,-0.000293,0.121261,-0.35,0.095815,0.107420,-0.501650,-1.108136,-0.188136,-32.534969,14.860744,...,0.92,-8.162221e+07,-0.503188,1.086783e+07,0.65,0.689524,55.324575,560.053167,0.244777,dont_trade
25014,0.000008,0.125687,0.01,0.244485,0.700647,0.265133,-0.172880,0.507120,-1664.038909,10.866634,...,0.68,-8.162572e+07,0.794275,6.637566e+06,0.32,0.953046,47.880039,559.831546,0.269850,dont_trade


In [4]:
# Reshape the per-candle feature table into tsfresh's "long" format: one block
# of `candlestick_frame` rows per window, identified by `id`, ordered by `time`.
candlestick_frame = 12

tot = len(df)
dta_lst = []

# Every column except the last one ('signal') is a candidate feature.
columns = list(df.columns)[:-1]
# NOTE(review): this drops the first remaining column by position. With the
# frame built in the previous cell that is LOGRET_1, so the training features
# lack it. Confirm that is intended and not a leftover from a frame that had
# an extra leading column.
columns.pop(0)
lst_columns = {c: list(df[c]) for c in columns}

y = list(df['signal'])
y_tsfresh = []
# `row_idx` rather than `id`, so the builtin id() is not shadowed.
for row_idx in range(candlestick_frame, tot):
    # One label per window: the label of the window's most recent row.
    y_tsfresh.append(y[row_idx])
    app_id = row_idx - candlestick_frame + 1
    for i in range(candlestick_frame):
        # NOTE(review): time == 0 is the most recent row and larger values go
        # further back, so sorting by `time` ascending yields newest-first
        # series. Confirm this orientation is intended.
        app = {'id': app_id, 'time': i}
        for c in columns:
            app[c] = lst_columns[c][row_idx - i]
        dta_lst.append(app)

app_df = pd.DataFrame(dta_lst)
app_df.to_csv('../../data/df.csv', index=False)
pd.DataFrame({'signal': y_tsfresh}).to_csv('../../data/y.csv', index=False)
app_df

Unnamed: 0,id,time,NATR_14,SLOPE_1,BBB_5_2.0,BBP_5_2.0,PGO_14,BEARP_13,BULLP_13,EFI_13,...,AD,TRUERANGE_1,PVT,EBSW_40_10,PVOL,THERMO_20_2_0.5,THERMOma_20_2_0.5,CHOP_14_1_100,PVI_1,VHF_28
0,1,0,0.592073,-1.81,1.804750,0.342655,-0.557163,-0.556792,1.273208,-331.420344,...,2.739264e+03,1.830000e+00,1.788448e+04,0.570038,1.473921e+05,1.81,0.718740,60.704908,1006.763303,0.190283
1,1,1,0.492575,1.69,1.768962,1.000000,1.018014,1.160409,1.420409,-94.903168,...,3.706404e+03,1.950000e+00,1.901964e+04,0.111786,5.631749e+03,1.95,0.603870,55.729785,1007.937027,0.216390
2,1,2,0.377958,0.00,0.398438,0.375000,-0.372140,-0.336190,-0.336190,-121.006829,...,3.742924e+03,2.220446e-16,1.897917e+04,-0.977357,0.000000e+00,0.00,0.462172,55.588697,1006.828976,0.248152
3,1,3,0.436105,0.00,0.654622,0.303944,-0.394786,-0.392221,-0.392221,-141.174634,...,3.742924e+03,2.220446e-16,1.897917e+04,-0.994511,0.000000e+00,0.00,0.510822,58.776048,1006.828976,0.237854
4,1,4,0.503198,0.00,0.602593,0.239278,-0.431663,-0.457591,-0.457591,-164.703740,...,3.742924e+03,2.220446e-16,1.897917e+04,-0.959453,0.000000e+00,0.39,0.564593,61.085799,1006.828976,0.222327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299839,24987,7,0.257983,-1.32,0.274894,0.528035,0.374460,0.332533,2.312533,18628.581851,...,5.243770e+07,1.980000e+00,-8.165499e+07,0.599871,1.826629e+07,0.16,2.170209,38.613214,559.634423,0.232202
299840,24987,8,0.312023,0.73,0.386825,0.285947,-0.468670,-3.886296,-0.606296,-64275.826355,...,5.250630e+07,3.280000e+00,-8.172160e+07,-0.604459,7.667558e+07,7.02,2.571027,44.032166,559.291514,0.210967
299841,24987,9,0.262286,-0.25,0.850926,0.272709,-2.201722,-5.383888,-3.443888,-86638.882226,...,5.225830e+07,1.940000e+00,-8.169571e+07,-0.992976,4.915321e+07,3.36,2.278200,38.099613,559.143221,0.431713
299842,24987,10,0.231811,-0.05,0.294491,0.506238,-0.340823,-0.678023,1.071977,-2157.608170,...,5.219952e+07,1.750000e+00,-8.161812e+07,0.902657,1.992332e+07,1.61,1.723168,56.712722,559.963291,0.255371


In [10]:
# Sanity check: number of labels currently held in `y`.
len(y)

14401

In [11]:
# Sanity check: the windowing loop starts at index candlestick_frame (12), so
# the window-label count plus 12 is compared against len(y).
len(y_tsfresh)+12

14401

In [5]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
import pandas as pd

import os

df = pd.read_csv('../../data/df.csv')
y = pd.read_csv('../../data/y.csv')  # loaded for reference; not used by the extraction below

extraction_settings = ComprehensiveFCParameters()

# Make sure the output directory exists before the first chunk is written.
chunks_dir = '../../data/chunks'
os.makedirs(chunks_dir, exist_ok=True)

# Chunk by window id rather than by raw row count, so that a window's rows are
# never split across two chunks whatever the window length is. With 12 rows per
# id, 500 ids per chunk is the same 6000-row chunking as before.
ids_per_chunk = 500
unique_ids = df['id'].unique()
chunks = [
    df[df['id'].isin(unique_ids[start:start + ids_per_chunk])]
    for start in range(0, len(unique_ids), ids_per_chunk)
]
tot = len(chunks)
for idx, chunk_df in enumerate(chunks):
    print(f'{tot - idx} remaining \t\t {round((idx*100/tot),1)}% done')
    X = extract_features(chunk_df, column_id='id', column_sort='time',
                        default_fc_parameters=extraction_settings,
                        # impute replaces NaN/inf feature values so the table is fully numeric
                        impute_function=impute)
    # The id index is not written; row order within and across chunk files is
    # what keeps features aligned with y.csv.
    X.to_csv(f'{chunks_dir}/{idx}.csv', index=False)

50 remaining 		 0.0% done


Feature Extraction: 100%|██████████| 40/40 [01:42<00:00,  2.57s/it]


49 remaining 		 2.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.34s/it]


48 remaining 		 4.0% done


Feature Extraction: 100%|██████████| 40/40 [01:32<00:00,  2.31s/it]


47 remaining 		 6.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.33s/it]


46 remaining 		 8.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.35s/it]


45 remaining 		 10.0% done


Feature Extraction: 100%|██████████| 40/40 [01:41<00:00,  2.53s/it]


44 remaining 		 12.0% done


Feature Extraction: 100%|██████████| 40/40 [01:49<00:00,  2.73s/it]


43 remaining 		 14.0% done


Feature Extraction: 100%|██████████| 40/40 [01:55<00:00,  2.88s/it]


42 remaining 		 16.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.33s/it]


41 remaining 		 18.0% done


Feature Extraction: 100%|██████████| 40/40 [01:34<00:00,  2.35s/it]


40 remaining 		 20.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.34s/it]


39 remaining 		 22.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.34s/it]


38 remaining 		 24.0% done


Feature Extraction: 100%|██████████| 40/40 [01:32<00:00,  2.32s/it]


37 remaining 		 26.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.33s/it]


36 remaining 		 28.0% done


Feature Extraction: 100%|██████████| 40/40 [01:35<00:00,  2.38s/it]


35 remaining 		 30.0% done


Feature Extraction: 100%|██████████| 40/40 [01:36<00:00,  2.42s/it]


34 remaining 		 32.0% done


Feature Extraction: 100%|██████████| 40/40 [01:35<00:00,  2.40s/it]


33 remaining 		 34.0% done


Feature Extraction: 100%|██████████| 40/40 [01:38<00:00,  2.45s/it]


32 remaining 		 36.0% done


Feature Extraction: 100%|██████████| 40/40 [01:48<00:00,  2.71s/it]


31 remaining 		 38.0% done


Feature Extraction: 100%|██████████| 40/40 [01:47<00:00,  2.70s/it]


30 remaining 		 40.0% done


Feature Extraction: 100%|██████████| 40/40 [01:42<00:00,  2.56s/it]


29 remaining 		 42.0% done


Feature Extraction: 100%|██████████| 40/40 [01:21<00:00,  2.04s/it]


28 remaining 		 44.0% done


Feature Extraction: 100%|██████████| 40/40 [01:21<00:00,  2.04s/it]


27 remaining 		 46.0% done


Feature Extraction: 100%|██████████| 40/40 [01:22<00:00,  2.06s/it]


26 remaining 		 48.0% done


Feature Extraction: 100%|██████████| 40/40 [01:25<00:00,  2.13s/it]


25 remaining 		 50.0% done


Feature Extraction: 100%|██████████| 40/40 [01:19<00:00,  2.00s/it]


24 remaining 		 52.0% done


Feature Extraction: 100%|██████████| 40/40 [01:09<00:00,  1.74s/it]


23 remaining 		 54.0% done


Feature Extraction: 100%|██████████| 40/40 [01:09<00:00,  1.73s/it]


22 remaining 		 56.0% done


Feature Extraction: 100%|██████████| 40/40 [01:36<00:00,  2.41s/it]


21 remaining 		 58.0% done


Feature Extraction: 100%|██████████| 40/40 [01:30<00:00,  2.26s/it]


20 remaining 		 60.0% done


Feature Extraction: 100%|██████████| 40/40 [01:53<00:00,  2.84s/it]


19 remaining 		 62.0% done


Feature Extraction: 100%|██████████| 40/40 [01:33<00:00,  2.34s/it]


18 remaining 		 64.0% done


Feature Extraction: 100%|██████████| 40/40 [01:28<00:00,  2.21s/it]


17 remaining 		 66.0% done


Feature Extraction: 100%|██████████| 40/40 [01:18<00:00,  1.97s/it]


16 remaining 		 68.0% done


Feature Extraction: 100%|██████████| 40/40 [01:12<00:00,  1.81s/it]


15 remaining 		 70.0% done


Feature Extraction: 100%|██████████| 40/40 [01:24<00:00,  2.12s/it]


14 remaining 		 72.0% done


Feature Extraction: 100%|██████████| 40/40 [01:20<00:00,  2.01s/it]


13 remaining 		 74.0% done


Feature Extraction: 100%|██████████| 40/40 [01:47<00:00,  2.68s/it]


12 remaining 		 76.0% done


Feature Extraction: 100%|██████████| 40/40 [01:49<00:00,  2.74s/it]


11 remaining 		 78.0% done


Feature Extraction: 100%|██████████| 40/40 [01:41<00:00,  2.55s/it]


10 remaining 		 80.0% done


Feature Extraction: 100%|██████████| 40/40 [01:30<00:00,  2.25s/it]


9 remaining 		 82.0% done


Feature Extraction: 100%|██████████| 40/40 [01:35<00:00,  2.38s/it]


8 remaining 		 84.0% done


Feature Extraction: 100%|██████████| 40/40 [01:42<00:00,  2.56s/it]


7 remaining 		 86.0% done


Feature Extraction: 100%|██████████| 40/40 [02:11<00:00,  3.29s/it]


6 remaining 		 88.0% done


Feature Extraction: 100%|██████████| 40/40 [02:15<00:00,  3.38s/it]


5 remaining 		 90.0% done


Feature Extraction: 100%|██████████| 40/40 [02:09<00:00,  3.25s/it]


4 remaining 		 92.0% done


Feature Extraction: 100%|██████████| 40/40 [02:53<00:00,  4.33s/it]


3 remaining 		 94.0% done


Feature Extraction: 100%|██████████| 40/40 [02:32<00:00,  3.81s/it]


2 remaining 		 96.0% done


Feature Extraction: 100%|██████████| 40/40 [02:28<00:00,  3.70s/it]


1 remaining 		 98.0% done


Feature Extraction: 100%|██████████| 40/40 [01:53<00:00,  2.83s/it]


In [2]:
import pandas as pd
from pathlib import Path

dataset_path = '../../data/chunks/'

# Discover the numbered chunk files instead of assuming there are exactly 50,
# and order them numerically (0, 1, 2, ... not 0, 1, 10, ...).
chunk_files = sorted(
    (p for p in Path(dataset_path).glob('*.csv') if p.stem.isdigit()),
    key=lambda p: int(p.stem),
)
if not chunk_files:
    raise FileNotFoundError(f'No chunk CSV files found in {dataset_path}')
# A gap in the numbering means a chunk is missing and rows would no longer line
# up with the labels. Stale files from an older, larger run cannot be detected
# here; clear the directory before re-extracting.
if [int(p.stem) for p in chunk_files] != list(range(len(chunk_files))):
    raise ValueError(f'Chunk files in {dataset_path} are not numbered 0..{len(chunk_files) - 1}')

dfs = [pd.read_csv(p) for p in chunk_files]

concatenated_df = pd.concat(dfs, ignore_index=True)  # ignore_index=True gives one continuous row index
concatenated_df.to_csv('../../data/tsfresh_training_dataset.csv', index=False)


In [3]:
# Display the concatenated feature table.
concatenated_df

Unnamed: 0,NATR_14__variance_larger_than_standard_deviation,NATR_14__has_duplicate_max,NATR_14__has_duplicate_min,NATR_14__has_duplicate,NATR_14__sum_values,NATR_14__abs_energy,NATR_14__mean_abs_change,NATR_14__mean_change,NATR_14__mean_second_derivative_central,NATR_14__median,...,VHF_28__fourier_entropy__bins_5,VHF_28__fourier_entropy__bins_10,VHF_28__fourier_entropy__bins_100,VHF_28__permutation_entropy__dimension_3__tau_1,VHF_28__permutation_entropy__dimension_4__tau_1,VHF_28__permutation_entropy__dimension_5__tau_1,VHF_28__permutation_entropy__dimension_6__tau_1,VHF_28__permutation_entropy__dimension_7__tau_1,VHF_28__query_similarity_count__query_None__threshold_0.0,VHF_28__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,0.0,7.033034,4.281606,0.061940,0.021459,0.013438,0.599485,...,0.796312,0.796312,1.747868,1.609438,2.043192,2.079442,1.94591,1.791759,0.0,0.316587
1,0.0,0.0,0.0,0.0,7.284589,4.761524,0.090881,-0.038256,0.023954,0.599485,...,1.153742,1.475076,1.945910,1.695743,2.197225,2.079442,1.94591,1.791759,0.0,0.300297
2,0.0,0.0,0.0,0.0,7.933540,6.037801,0.110845,-0.058220,0.011861,0.599485,...,1.153742,1.475076,1.945910,1.695743,2.043192,2.079442,1.94591,1.791759,0.0,0.309682
3,0.0,0.0,0.0,0.0,8.773670,7.865011,0.128173,-0.077202,0.011354,0.599485,...,0.796312,1.153742,1.945910,1.609438,1.889159,1.906155,1.94591,1.791759,0.0,0.326869
4,0.0,0.0,0.0,0.0,10.282562,12.128373,0.185656,-0.139660,0.034186,0.599485,...,1.277034,1.277034,1.945910,1.470808,1.889159,1.906155,1.94591,1.791759,0.0,0.339137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24982,0.0,0.0,0.0,0.0,2.780657,0.656029,0.025333,0.001018,0.001671,0.223786,...,1.475076,1.747868,1.945910,1.359237,1.831020,2.079442,1.94591,1.791759,0.0,0.310433
24983,0.0,0.0,0.0,0.0,2.725359,0.634341,0.028774,0.004459,-0.002753,0.220662,...,1.277034,1.747868,1.945910,1.359237,1.676988,1.906155,1.94591,1.791759,0.0,0.301148
24984,0.0,0.0,0.0,0.0,2.629109,0.601734,0.032069,0.009746,-0.001757,0.220114,...,1.549826,1.747868,1.945910,1.220607,1.522955,1.732868,1.94591,1.791759,0.0,0.301304
24985,0.0,0.0,0.0,0.0,2.526329,0.565334,0.031376,0.008248,-0.000149,0.214485,...,1.277034,1.747868,1.945910,0.950271,1.368922,1.732868,1.94591,1.791759,0.0,0.304885


In [4]:
# Reload the labels stored in y.csv and display them.
y = pd.read_csv('../../data/y.csv')
y

Unnamed: 0,signal
0,short
1,long
2,long
3,long
4,short
...,...
24982,dont_trade
24983,dont_trade
24984,dont_trade
24985,dont_trade


In [5]:
from pycaret.classification import load_model, predict_model
from pycaret.classification import *
from pycaret.classification import ClassificationExperiment
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd


# Train a LightGBM classifier on the tsfresh features with pycaret, save the
# fitted pipeline and plot diagnostics.
# `exp` is not used below; the calls that follow are pycaret's module-level functions.
exp = ClassificationExperiment()
dataset_path = '../../data/'
# data = pd.read_csv('../../data/tsfresh_training_dataset.csv')
# `data` is the same object as `concatenated_df` from an earlier cell, so the
# label column added below is added to `concatenated_df` too.
data = concatenated_df
y = pd.read_csv('../../data/y.csv')
# Labels are attached by position; this relies on the feature rows being in the
# same order as y.csv.
data['signal'] = list(y['signal'])
# data.replace([np.inf, -np.inf], np.nan, inplace=True)
# NOTE(review): neighbouring windows share most of their candles, so a shuffled
# train/validation split may leak information between the two — confirm how
# setup() splits the data here.
s = setup(data, target = 'signal', session_id = 123, use_gpu=True)
model = create_model(LGBMClassifier())
# model = compare_models(exclude='gbc')
# validation_scores = pull()
# accuracy_mean = validation_scores['Accuracy']['Mean']

# save pipeline
model_name = 'tsfresh'
save_model(model, f'../../models/{model_name}')
# print(f'{model_name} model saved. accuracy_mean={accuracy_mean}')
plot_model(model, plot = 'confusion_matrix', plot_kwargs = {'percent': True})
plot_model(model, plot = 'feature')

: 

In [1]:
import numpy as np
import pandas as pd


# Load the tsfresh feature table used for testing and display it.
data = pd.read_csv('../../data/tsfresh_testing_dataset.csv')
data

Unnamed: 0,LOGRET_1__variance_larger_than_standard_deviation,LOGRET_1__has_duplicate_max,LOGRET_1__has_duplicate_min,LOGRET_1__has_duplicate,LOGRET_1__sum_values,LOGRET_1__abs_energy,LOGRET_1__mean_abs_change,LOGRET_1__mean_change,LOGRET_1__mean_second_derivative_central,LOGRET_1__median,...,VHF_28__fourier_entropy__bins_5,VHF_28__fourier_entropy__bins_10,VHF_28__fourier_entropy__bins_100,VHF_28__permutation_entropy__dimension_3__tau_1,VHF_28__permutation_entropy__dimension_4__tau_1,VHF_28__permutation_entropy__dimension_5__tau_1,VHF_28__permutation_entropy__dimension_6__tau_1,VHF_28__permutation_entropy__dimension_7__tau_1,VHF_28__query_similarity_count__query_None__threshold_0.0,VHF_28__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,0.0,-0.000324,0.000005,0.000791,-0.000022,0.000164,-0.000154,...,0.796312,0.796312,1.549826,1.220607,1.676988,2.079442,1.945910,1.791759,0.0,0.414968
1,0.0,0.0,0.0,0.0,-0.002614,0.000007,0.000880,0.000067,-0.000134,-0.000291,...,0.410116,0.955700,1.945910,0.950271,1.427061,1.906155,1.945910,1.791759,0.0,0.409198
2,0.0,0.0,0.0,0.0,-0.001549,0.000007,0.001031,-0.000083,0.000073,-0.000154,...,0.410116,0.955700,1.549826,0.950271,1.427061,1.667462,1.747868,1.791759,0.0,0.397697
3,0.0,0.0,0.0,0.0,-0.000891,0.000006,0.001023,-0.000029,-0.000092,-0.000096,...,0.410116,0.796312,1.549826,0.950271,1.149060,1.386294,1.475076,1.560710,0.0,0.386517
4,0.0,0.0,0.0,0.0,-0.000950,0.000006,0.000914,0.000149,0.000037,-0.000096,...,0.796312,0.796312,1.475076,0.801819,1.002718,1.073543,1.153742,1.242453,0.0,0.375433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14384,0.0,0.0,0.0,0.0,0.001762,0.000006,0.000707,0.000055,0.000136,0.000102,...,0.955700,1.475076,1.747868,1.220607,1.581094,1.667462,1.747868,1.791759,0.0,0.333386
14385,0.0,0.0,0.0,0.0,0.001251,0.000006,0.000550,-0.000119,-0.000055,0.000102,...,1.153742,1.475076,1.747868,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333895
14386,0.0,0.0,0.0,0.0,0.004470,0.000009,0.000614,-0.000184,0.000092,0.000132,...,0.796312,1.153742,1.747868,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333976
14387,0.0,0.0,0.0,0.0,0.001902,0.000015,0.001026,0.000239,-0.000275,0.000132,...,0.796312,0.796312,1.475076,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333847


In [4]:
# Labels read from y.csv; compared against the model's predictions further down.
# NOTE(review): the training cells read the same y.csv path — confirm the file
# currently holds the labels that belong to the testing dataset.
signal = pd.read_csv('../../data/y.csv')['signal']
signal

0        dont_trade
1        dont_trade
2        dont_trade
3        dont_trade
4        dont_trade
            ...    
14384    dont_trade
14385    dont_trade
14386    dont_trade
14387    dont_trade
14388    dont_trade
Name: signal, Length: 14389, dtype: object

In [8]:
from pycaret.classification import load_model, predict_model


# Load the saved 'tsfresh' pipeline and score the feature table in `data`.
model = load_model('../../models/tsfresh')
predictions = predict_model(model, data=data)

Transformation Pipeline and Model Successfully Loaded


In [9]:
predictions

Unnamed: 0,LOGRET_1__variance_larger_than_standard_deviation,LOGRET_1__has_duplicate_max,LOGRET_1__has_duplicate_min,LOGRET_1__has_duplicate,LOGRET_1__sum_values,LOGRET_1__abs_energy,LOGRET_1__mean_abs_change,LOGRET_1__mean_change,LOGRET_1__mean_second_derivative_central,LOGRET_1__median,...,VHF_28__fourier_entropy__bins_100,VHF_28__permutation_entropy__dimension_3__tau_1,VHF_28__permutation_entropy__dimension_4__tau_1,VHF_28__permutation_entropy__dimension_5__tau_1,VHF_28__permutation_entropy__dimension_6__tau_1,VHF_28__permutation_entropy__dimension_7__tau_1,VHF_28__query_similarity_count__query_None__threshold_0.0,VHF_28__mean_n_absolute_max__number_of_maxima_7,prediction_label,prediction_score
0,0.0,0.0,0.0,0.0,-0.000324,0.000005,0.000791,-0.000022,0.000164,-0.000154,...,1.549826,1.220607,1.676988,2.079442,1.945910,1.791759,0.0,0.414968,dont_trade,0.9970
1,0.0,0.0,0.0,0.0,-0.002614,0.000007,0.000880,0.000067,-0.000134,-0.000291,...,1.945910,0.950271,1.427061,1.906155,1.945910,1.791759,0.0,0.409198,dont_trade,0.9972
2,0.0,0.0,0.0,0.0,-0.001549,0.000007,0.001031,-0.000083,0.000073,-0.000154,...,1.549826,0.950271,1.427061,1.667462,1.747868,1.791759,0.0,0.397697,dont_trade,0.9978
3,0.0,0.0,0.0,0.0,-0.000891,0.000006,0.001023,-0.000029,-0.000092,-0.000096,...,1.549826,0.950271,1.149060,1.386294,1.475076,1.560710,0.0,0.386517,dont_trade,0.9956
4,0.0,0.0,0.0,0.0,-0.000950,0.000006,0.000914,0.000149,0.000037,-0.000096,...,1.475076,0.801819,1.002718,1.073543,1.153742,1.242453,0.0,0.375433,dont_trade,0.9967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14384,0.0,0.0,0.0,0.0,0.001762,0.000006,0.000707,0.000055,0.000136,0.000102,...,1.747868,1.220607,1.581094,1.667462,1.747868,1.791759,0.0,0.333386,dont_trade,0.9983
14385,0.0,0.0,0.0,0.0,0.001251,0.000006,0.000550,-0.000119,-0.000055,0.000102,...,1.747868,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333895,dont_trade,0.9990
14386,0.0,0.0,0.0,0.0,0.004470,0.000009,0.000614,-0.000184,0.000092,0.000132,...,1.747868,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333976,dont_trade,0.9990
14387,0.0,0.0,0.0,0.0,0.001902,0.000015,0.001026,0.000239,-0.000275,0.000132,...,1.475076,1.088900,1.303092,1.386294,1.475076,1.560710,0.0,0.333847,dont_trade,0.9992


In [12]:
# Put true labels next to predicted labels and scores, and save the comparison.
predicted_labels = list(predictions['prediction_label'])
prediction_scores = list(predictions['prediction_score'])
true_labels = list(signal)

# Every column is placed by position. Check the lengths first so a mismatch is
# reported rather than silently padded with NaN or truncated.
if len(true_labels) != len(predicted_labels):
    raise ValueError(
        f'{len(true_labels)} labels but {len(predicted_labels)} predictions; '
        'y.csv does not match the scored dataset.')

analyse = pd.DataFrame({
    'correct_sig': true_labels,
    'predicted_sig': predicted_labels,
    'prediction_score': prediction_scores,
})

analyse.to_csv('../../data/analyse.csv', index=False)
analyse

Unnamed: 0,correct_sig,predicted_sig,prediction_score
0,dont_trade,dont_trade,0.9970
1,dont_trade,dont_trade,0.9972
2,dont_trade,dont_trade,0.9978
3,dont_trade,dont_trade,0.9956
4,dont_trade,dont_trade,0.9967
...,...,...,...
14384,dont_trade,dont_trade,0.9983
14385,dont_trade,dont_trade,0.9990
14386,dont_trade,dont_trade,0.9990
14387,dont_trade,dont_trade,0.9992
