In [6]:
import json

import talib as ta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.shell import spark
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import lag
from pyspark.sql.functions import col

# OHLCV data
input_file = open('../../data/futures/ADA_USDT-1h-futures.json')
json_array = json.load(input_file)

In [54]:
# data frame "df"
df = pd.DataFrame(json_array)
df.columns = ['Time', 'Open', 'High', 'Low', 'Close', 'Volume']

In [55]:
# indicators

df['index'] = df.index

# Ema 89 strategy
df['EMA89High']=ta.EMA(df.High, timeperiod=89)
df['EMA89Low']=ta.EMA(df.Low, timeperiod=89)

df['SMA5'] = ta.SMA(df.Close, timeperiod=5)
df['SMA10'] = ta.SMA(df.Close, timeperiod=10)
df['SMA20'] = ta.SMA(df.Close, timeperiod=20)
df['SMA50'] = ta.SMA(df.Close, timeperiod=50)
df['SMA100'] = ta.SMA(df.Close, timeperiod=100)
df['SMA200'] = ta.SMA(df.Close, timeperiod=200)

df['EMA200']=ta.EMA(df.Close, timeperiod=200)
df['EMA150']=ta.EMA(df.Close, timeperiod=150)
df['EMA100']=ta.EMA(df.Close, timeperiod=100)
df['EMA50']=ta.EMA(df.Close, timeperiod=50)

df['EMA32']=ta.EMA(df.Close, timeperiod=32)
df['EMA21']=ta.EMA(df.Close, timeperiod=21)
df['EMA13']=ta.EMA(df.Close, timeperiod=13)

df['RSI']=ta.RSI(df.Close, timeperiod=14)


# patterns
df['CDL_MARUBOZU'] = ta.CDLMARUBOZU(df.Open, df.High, df.Low, df.Close)
df['CDL_DOJI'] = ta.CDLDOJI(df.Open, df.High, df.Low, df.Close)
df['CDL_ENGULFING'] = ta.CDLENGULFING(df.Open, df.High, df.Low, df.Close)
df['CDL_EVENINGSTAR']  = ta.CDLEVENINGSTAR(df.Open, df.High, df.Low, df.Close)
df['CDL_EVENINGSTAR'] = ta.CDLEVENINGSTAR(df.Open, df.High, df.Low, df.Close)


# MACD
MACD_FAST_EMA = 12
MACD_SLOW_EMA = 26
MACD_SIGNAL_PERIOD = 9
df['MACD'],df['MACD_SIGNAL'],df['MACD_HIST'] = ta.MACD(df.Close, fastperiod=MACD_FAST_EMA, slowperiod=MACD_SLOW_EMA, signalperiod=MACD_SIGNAL_PERIOD)

# BB
df['SMA_BB'] = df.Close.rolling(window=20).mean()

# upper and lower bollinger bands: SMA +/- 2 * standard deviation
df['STD_DEV'] = df.Close.rolling(window=20).std()
df['UPPER_BB'] = df['SMA_BB'] + (2 * df['STD_DEV'])
df['LOWER_BB'] = df['SMA_BB'] - (2 * df['STD_DEV'])

# volume
df['OBV'] = ta.OBV(df.Close, df.Volume)

df['Time'] = pd.to_datetime(df['Time'], unit='ms')
# df['VOL1h%'] = (df.Volume.pct_change(periods=1).mul(100))
# df['VOL4h%'] = (df.Volume.pct_change(periods=4).mul(100))
# df['VOL8h%'] = (df.Volume.pct_change(periods=8).mul(100))
# df['VOL12h%'] = (df.Volume.pct_change(periods=12).mul(100))
# df['VOL24h%'] = (df.Volume.pct_change(periods=24).mul(100))

# df['1h%'] = (df.Close.pct_change(periods=1).mul(100))
# df['4h%'] = (df.Close.pct_change(periods=4).mul(100))
# df['8h%'] = (df.Close.pct_change(periods=8).mul(100))
# df['12h%'] = (df.Close.pct_change(periods=12).mul(100))
# df['24h%'] = (df.Close.pct_change(periods=24).mul(100))

df['NEXT_1h%'] = (df.Close.pct_change(periods=-1).mul(100))
# df['NEXT_4h%'] = (df.Close.pct_change(periods=-4).mul(100))
# df['NEXT_8h%'] = (df.Close.pct_change(periods=-8).mul(100))
# df['NEXT_12h%'] = (df.Close.pct_change(periods=-12).mul(100))
# df['NEXT_24h%'] = (df.Close.pct_change(periods=-24).mul(100))

df = df.assign(Long1h = lambda x: (x['NEXT_1h%'] > 2))
# df = df.assign(Short1h = lambda x: (x['NEXT_1h%'] < 2))

# df = df.assign(Long4h = lambda x: (x['NEXT_4h%'] > 5))
# df = df.assign(Short4h = lambda x: (x['NEXT_4h%'] < 5))

df = df.dropna()

# assert that we do not have any nan left
assert df.isnull().sum().sum() == 0

print(df.shape[0])
print('Long1h', df['Long1h'].value_counts()[True])
# print('Short1h', df['Short1h'].value_counts()[True])
# print('Long4h', df['Long4h'].value_counts()[True])
# print('Short4h', df['Short4h'].value_counts()[True])
print(df.columns[:-1])
df.head()

7064
Long1h 227
Index(['Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'index', 'EMA89High',
       'EMA89Low', 'SMA5', 'SMA10', 'SMA20', 'SMA50', 'SMA100', 'SMA200',
       'EMA200', 'EMA150', 'EMA100', 'EMA50', 'EMA32', 'EMA21', 'EMA13', 'RSI',
       'CDL_MARUBOZU', 'CDL_DOJI', 'CDL_ENGULFING', 'CDL_EVENINGSTAR', 'MACD',
       'MACD_SIGNAL', 'MACD_HIST', 'SMA_BB', 'STD_DEV', 'UPPER_BB', 'LOWER_BB',
       'OBV', 'NEXT_1h%'],
      dtype='object')


Unnamed: 0,Time,Open,High,Low,Close,Volume,index,EMA89High,EMA89Low,SMA5,...,MACD,MACD_SIGNAL,MACD_HIST,SMA_BB,STD_DEV,UPPER_BB,LOWER_BB,OBV,NEXT_1h%,Long1h
199,2022-01-09 07:00:00,1.18,1.1843,1.172,1.177,10199968.0,199,1.241099,1.219655,1.17932,...,-0.010083,-0.011113,0.00103,1.186205,0.028003,1.24221,1.1302,-343866300.0,0.986701,False
200,2022-01-09 08:00:00,1.1771,1.1829,1.1625,1.1655,14798267.0,200,1.239805,1.218385,1.17736,...,-0.010758,-0.011042,0.000284,1.182845,0.026097,1.235039,1.130651,-358664567.0,0.456818,False
201,2022-01-09 09:00:00,1.1655,1.1737,1.1563,1.1602,15315921.0,201,1.238336,1.217005,1.172,...,-0.011587,-0.011151,-0.000436,1.17932,0.023967,1.227254,1.131386,-373980488.0,-0.283627,False
202,2022-01-09 10:00:00,1.1601,1.1689,1.151,1.1635,14871076.0,202,1.236793,1.215538,1.16924,...,-0.011842,-0.011289,-0.000553,1.17629,0.021735,1.219759,1.132821,-359109412.0,0.034391,False
203,2022-01-09 11:00:00,1.1636,1.1744,1.1592,1.1631,13136258.0,203,1.235407,1.214286,1.16586,...,-0.011938,-0.011419,-0.000519,1.173695,0.01989,1.213474,1.133916,-372245670.0,-0.025786,False


In [None]:
def support(df1, l, n1, n2): #n1 n2 before and after candle l
    for i in range(l-n1+1, l+1):
        if(df1.low[i]>df1.low[i-1]):
            return 0
    for i in range(l+1,l+n2+1):
        if(df1.low[i]<df1.low[i-1]):
            return 0
    return 1

def resistance(df1, l, n1, n2): #n1 n2 before and after candle l
    for i in range(l-n1+1, l+1):
        if(df1.high[i]<df1.high[i-1]):
            return 0
    for i in range(l+1,l+n2+1):
        if(df1.high[i]>df1.high[i-1]):
            return 0
    return 1


In [None]:
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML

features = ['Open', 'High', 'Low', 'Close', 'Volume', 'EMA89High',
       'EMA89Low', 'SMA5', 'SMA10', 'SMA20', 'SMA50', 'SMA100', 'SMA200',
       'EMA200', 'EMA150', 'EMA100', 'EMA50', 'EMA32', 'EMA21', 'EMA13', 'RSI',
       'CDL_MARUBOZU', 'CDL_DOJI', 'CDL_ENGULFING', 'CDL_EVENINGSTAR', 'MACD',
       'MACD_SIGNAL', 'MACD_HIST', 'SMA_BB', 'STD_DEV', 'UPPER_BB', 'LOWER_BB',
       'OBV', 'NEXT_1h%']
X_train, X_test, y_train, y_test = train_test_split(df[features], df["Long1h"], test_size=0.25)

automl = AutoML(mode="Compete", features_selection=True)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

AutoML directory: AutoML_6
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree logloss 1e-06 trained in 6.17 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree logloss 0.005216 trained in 11.14 seconds
2_DecisionTree logloss 0.005216 trained in 10.28 seconds
3_DecisionTree logloss 0.005216 

In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

4_Linear logloss 0.009495 trained in 22.61 seconds
* Step default_algorithms will try to check up to 7 models
5_Default_LightGBM logloss 0.002256 trained in 33.96 seconds
6_Default_Xgboost logloss 0.004602 trained in 24.3 seconds
7_Default_CatBoost logloss 0.006108 trained in 34.06 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

8_Default_NeuralNetwork logloss 0.020889 trained in 125.95 seconds
9_Default_RandomForest logloss 0.002609 trained in 27.7 seconds
10_Default_ExtraTrees logloss 0.025588 trained in 22.93 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

11_Default_NearestNeighbors logloss 0.094415 trained in 19.09 seconds
* Step not_so_random will try to check up to 61 models
21_LightGBM logloss 0.002337 trained in 20.1 seconds
12_Xgboost logloss 0.004985 trained in 16.02 seconds
30_CatBoost logloss 0.006283 trained in 93.21 seconds
39_RandomForest logloss 0.003376 trained in 37.88 seconds
48_ExtraTrees logloss 0.052492 trained in 22.9 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

57_NeuralNetwork logloss 0.016689 trained in 89.11 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

66_NearestNeighbors logloss 0.091728 trained in 19.77 seconds
22_LightGBM logloss 0.002051 trained in 19.32 seconds
13_Xgboost logloss 0.004581 trained in 20.18 seconds
31_CatBoost logloss 0.006626 trained in 75.23 seconds
40_RandomForest logloss 0.000753 trained in 29.29 seconds
49_ExtraTrees logloss 0.029481 trained in 29.69 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

58_NeuralNetwork logloss 0.014059 trained in 142.78 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

67_NearestNeighbors logloss 0.08966 trained in 20.24 seconds
23_LightGBM logloss 0.002487 trained in 32.45 seconds
14_Xgboost logloss 0.017029 trained in 15.1 seconds
32_CatBoost logloss 0.006175 trained in 68.52 seconds
41_RandomForest logloss 0.001399 trained in 38.73 seconds
50_ExtraTrees logloss 0.056882 trained in 33.1 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

59_NeuralNetwork logloss 0.023269 trained in 112.62 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

68_NearestNeighbors logloss 0.08966 trained in 20.86 seconds
24_LightGBM logloss 0.002296 trained in 23.62 seconds
15_Xgboost logloss 0.072628 trained in 14.63 seconds
33_CatBoost logloss 0.005936 trained in 101.75 seconds
42_RandomForest logloss 0.000707 trained in 44.14 seconds
51_ExtraTrees logloss 0.045896 trained in 26.12 seconds


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newval

60_NeuralNetwork logloss 0.022787 trained in 162.81 seconds
* Step golden_features will try to check up to 3 models


In [None]:
print(predictions)