In [1]:
# load libraries
import pandas as pd
import yfinance as yf
from datetime import datetime
from datetime import timedelta
import plotly.graph_objects as go
import numpy as np
from pathlib import Path
import os
from tqdm import tqdm
from src.data_processing.data_downloader import yf_download_data, fgi_download_data
from src.data_processing.data_transformer import merge_price_fgi_data, generate_ta_data, generate_synth_data
from src.common_functions import load_config, get_backfill_range
from src.common_functions import save_forecast
from src.common_functions import get_fbp_accuracy, get_accuracy_tb
from src.common_functions import plot_prediction_tb5
from src.common_functions import get_coin_summary
from src.TripleBarrierLabel.label_price_data import tbl_form_label_all_coins
import src.lstm.wrapper as lstm

#import warnings
#warnings.filterwarnings('ignore')

# TODO 
- improve train/val/test data split mechanism
- Model saving, delete unnecessary models?
- Model selection (fix overfit), do not retrain all the time
- deploy to cloud
- add LSTM for ETH
- Model selection (fix overfit)

# Init

In [2]:
# BACKFILL_MODE: when True, the forecast is run for every date missing
# between the last stored run and today
BACKFILL_MODE = True
# TEST_MODE: when True, results are not saved
TEST_MODE = False

In [3]:
# day format shared by every date string handled in this notebook
DATE_FORMAT = '%Y-%m-%d'
# run date rendered with DATE_FORMAT; later cells parse it back with strptime
today = datetime.now().strftime(DATE_FORMAT)

In [4]:
# get config
# loaded once here; the same object is handed to the download, transform and
# accuracy helpers in the cells below
config = load_config(verbose=False)

The code below downloads price data from yfinance, runs the forecasts and saves all data

# 1 Download data

In [5]:
# download fresh data (skipped in test mode)
# `result` is overwritten by the second call and is not read by any visible cell
if not TEST_MODE:
    result = yf_download_data(config, verbose=False)
    result = fgi_download_data(config, verbose=False)

Note: 'info' dict is now fixed & improved, 'fast_info' no longer faster


# 2 Process RAW data

In [6]:
# merge the downloaded price and FGI data (skipped in test mode)
if not TEST_MODE:
    result = merge_price_fgi_data(config, verbose=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fgi_class[x] = mapping[fgi_class[x]]


In [7]:
# regenerate the synthetic data set (skipped in test mode)
if not TEST_MODE:
    result = generate_synth_data(config, verbose=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_btc_out.loc[:,'Open'] = df_btc_s.loc[:,'Open']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_btc_out.loc[:,'Volume'] = df_btc_s.loc[:,'Volume']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_btc_out.loc[:,'High'] = df_btc_s.loc[:,'High']
A value is trying to be set on a copy of a slice fr

In [8]:
# regenerate the technical-analysis data (skipped in test mode)
if not TEST_MODE:
    result = generate_ta_data(config, verbose=False)

In [9]:
# dates to forecast: the whole backfill range in backfill mode, otherwise today only
if BACKFILL_MODE:
    days_to_process = get_backfill_range(verbose=True)
else:
    days_to_process = [today]

Last forecast done for: 2023-05-29
Backfill will run 2x from 2023-05-30 to 2023-05-31.


# 3 Train & forecast

## LSTM 1
Trained on most of the historical data, including the wild peaks of 2020-21 and the synthetic data. See __generate_synthetic_data.ipynb__ v4.

In [10]:
# TODO 
#  - Model selection (fix overfit)
#  - Model saving, delete unnecessary models?
#  - improve train/val/test data split mechanism

In [11]:
# load all LSTM model configurations and pick the first one (LSTM 1)
lstm_configs = lstm.load_lstm_config(verbose=False)
lstm_config_1 = lstm_configs['models'][0]

# make sure the model save directory exists; exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating
os.makedirs(lstm_config_1['training']['model_save_dir'], exist_ok=True)

In [12]:
# get data: train / validation / test arrays for LSTM 1
# NOTE(review): with use_same_validation_test=True the logged validation and
# test sizes are identical, so the test score is presumably not an independent
# hold-out - confirm in lstm.get_data
x, y, x_val, y_val, x_test, y_test = lstm.get_data(lstm_config_1, use_same_train_validation=True, use_same_validation_test=True, verbose='Summary')

[Data] Train data loaded, size: (6238, 20, 43) 
[Data] Validation data loaded, size: (1541, 20, 43) 
[Data] Test data loaded, size: (1541, 20, 43) 


In [13]:
# reload not normalized for viz
# NOTE(review): this switches the flag on the shared lstm_config_1 dict in place,
# so the train_model / test_model / get_todays_data calls in later cells receive
# normalise=False as well - confirm those helpers do not depend on the flag
lstm_config_1['data']['normalise'] = False
# only x_test_viz / y_test_viz are used by later visible cells (passed to lstm.test_model)
x2, y2, x_val2, y_val2, x_test_viz, y_test_viz = lstm.get_data(lstm_config_1, use_same_train_validation=True, use_same_validation_test=True, verbose='Summary')

[Data] Train data loaded, size: (6238, 20, 43) 
[Data] Validation data loaded, size: (1541, 20, 43) 
[Data] Test data loaded, size: (1541, 20, 43) 


In [14]:
# train LSTM 1 on the arrays loaded above; `m` is the model object that the
# following cells pass to lstm.test_model and use for forecasting
m = lstm.train_model(lstm_config_1, x, y, x_val, y_val, verbose='Summary')

[Model] Model Compiled
Time taken: 0:00:04.227001
[Model] Training Started
[Model] 256 epochs, 32 batch size
[Model] Training Completed. Model saved as saved_models\31052023-110629-epoch{epoch:02d}-acc{val_accuracy:.2f}-loss{val_loss:.2f}.h5
Time taken: 0:02:55.297306
Max validation accuracy: 61.5% @ epoch: 15
Min validation loss: 1.0 @ epoch: 7


In [15]:
# evaluate LSTM 1 on the test arrays; the non-normalised *_viz arrays are passed
# alongside (presumably for plotting - confirm in lstm.test_model)
fig_acc = lstm.test_model(lstm_config_1, m, x_test, y_test, x_test_viz, y_test_viz, verbose='Summary')
fig_acc.show()

[Model] Predicting Sequences Multiple TBL ...
[Test] Test accuracy: 61.45%
pred > 0.7: 21.28487994808566%
pred > 0.5: 30.715985290936622%
pred < 0.5: 69.28401470906338%


In [16]:
# the run date does not change between iterations, so parse it once
run_date = datetime.strptime(today, DATE_FORMAT)
# class index -> label, target_names = ['-1', '0', '1'];
# every index other than 0 and 1 maps to '1.0'
label_by_class = {0: '-1.0', 1: '0.0'}

for day in tqdm(days_to_process):
    # how many days the forecast day lies before the run date
    days_shift = (run_date - datetime.strptime(day, DATE_FORMAT)).days

    # Forecast: the model is called with a list holding the single window
    todays_window = lstm.get_todays_data(lstm_config_1, days_shift, verbose='Summary')
    todays_window_temp = [todays_window]
    todays_pred = m.predict_sequence_full_tbl(todays_window_temp)

    # most probable class of the single prediction row
    predictions_argmax = np.argmax(todays_pred[:], axis=1)
    pred_direction = label_by_class.get(int(predictions_argmax[0]), '1.0')

    # one-row frame in the layout handed to save_forecast
    df_pred_lstm = pd.DataFrame([{
        'Date': day,
        'Coin': 'BTC-USD',
        'Label': pred_direction}])

    # save forecast (skipped in test mode)
    if not TEST_MODE:
        # save tbl forecast
        save_forecast(
            df_pred_lstm,
            model='lstm-tb',
            value_column='Label',
            verbose=False)
    else:
        print('Test mode - saving skipped')

  0%|          | 0/2 [00:00<?, ?it/s]

[Todays data] X data size: (20, 43)
[Model] Predicting Sequences Multiple TBL ...


 50%|█████     | 1/2 [00:00<00:00,  3.26it/s]

[Todays data] X data size: (20, 43)
[Model] Predicting Sequences Multiple TBL ...


100%|██████████| 2/2 [00:00<00:00,  3.77it/s]


In [17]:
# both names are left over from the last iteration of the forecast loop above,
# i.e. they describe the last day in days_to_process
print('Todays prediction confidence: {}'.format(todays_pred))
print('Todays prediction direction: {}'.format(pred_direction))

Todays prediction confidence: [[0.14580025 0.462726   0.3914738 ]]
Todays prediction direction: 0.0


In [18]:
# TODO
# best_model_epoch = (history.history['val_loss'].index(min(history.history['val_loss'])) + history.history['val_accuracy'].index(max(history.history['val_accuracy']))) / 2.0
# best_model_epoch

## LSTM2
Trained only on data that looks similar to the recent market development, including synthetic data. See __generate_synthetic_data.ipynb__ v5.5.

### generate data

In [19]:
# generate data for LSTM2
#TODO - move to wrapper

# settings - only used for V5
# input and output files live in the same folder, relative to the working directory
SOURCE_FOLDER = 'data'
OUTPUT_FOLDER = 'data'
SOURCE_FILE = 'processed_market_data_v7.csv'
OUTPUT_FILE = 'synth_market_data_v5_5_train.csv'
TEST_FILE = 'synth_market_data_v5_5_test.csv'

# `path` (current working directory) is reused by the export cells further down
path = Path.cwd()
price_path = path.absolute() / SOURCE_FOLDER / SOURCE_FILE
df_price_data = pd.read_csv(price_path)

In [20]:
# limits data to a subset (all values are row positions within the BTC-only frame)
# train set: TRAIN_FROM ---- CONNECTION_POINT_1 + CONNECTION_POINT_2 --- TRAIN_TO (repeated 4x)
# test set: TEST_FROM ---- end
# NOTE(review): TEST_FROM < TRAIN_TO, so rows TEST_FROM..TRAIN_TO-1 end up in
# both the train and the test set - confirm this overlap is intended
TRAIN_FROM = 1400    # 5.3 was 850
TRAIN_TO = 1659      # 5.3 was 1535
TEST_FROM = 1530     # 5.3 was TRAIN_TO

BB_LENGHT = 20
CONNECTION_POINT_1 = 1420 # 5.3 was 867
CONNECTION_POINT_2 = 1421 # 5.3 was 1362

# BTC rows only, then the two train segments stitched together
is_btc = df_price_data['Coin'] == 'BTC-USD'
df_btc_temp = df_price_data.loc[is_btc, :]
df_btc_cut = pd.concat(
    [df_btc_temp.iloc[TRAIN_FROM:CONNECTION_POINT_1],
     df_btc_temp.iloc[CONNECTION_POINT_2:TRAIN_TO]],
    ignore_index=True)

# viz: Open price of the stitched window
fig = go.Figure(data=[go.Scatter(y=df_btc_cut['Open'], name='Open')])
fig.update_layout(title_text='BTC Open CUT', template="plotly_dark")
fig.show()

In [21]:
def normalize(df):
    """Scale every column of ``df`` by that column's maximum.

    Args:
        df: DataFrame whose columns support ``max`` and division.

    Returns:
        A new DataFrame with the same index and columns in which each value
        is divided by the maximum of its column; ``df`` is left unchanged.
    """
    column_maxima = df.max()
    return df.div(column_maxima, axis='columns')

In [22]:
import talib

In [23]:
# Build the LSTM2 training set from the BTC rows of df_price_data:
#   1. scale OHLCV by the column maximum
#   2. stitch the selected window together four times, with Open, Low, Close
#      and High each taking the place of 'Open'
#   3. append a copy with gaussian noise on Open and Volume
#   4. recompute the TA values and TA signal flags on the resulting 'Open' series

# normalize
columns = ['Open','Close','High','Low','Volume']
# The scaled frames get a fresh 0..n-1 index so that the index-aligned
# assignments below match df_btc_out row by row, wherever the coin's rows sit
# in the source file.
df_btc = df_price_data[df_price_data['Coin'] == 'BTC-USD']
df_btc_s = normalize(df_btc[columns]).reset_index(drop=True)

df_eth = df_price_data[df_price_data['Coin'] == 'ETH-USD']
df_eth_s = normalize(df_eth[columns]).reset_index(drop=True)

# concatenate O+L+C+H
# 1/4 copy the whole data
df_temp_v5 = df_price_data.copy()
# reset_index() returns a new frame (no writes into a filtered slice) and keeps
# the previous row labels in an 'index' column, which is dropped further down
df_btc_out = df_temp_v5.loc[df_temp_v5['Coin'] == 'BTC-USD', :].reset_index()
for ohlcv_column in ['Open', 'Volume', 'High', 'Low', 'Close']:
    df_btc_out.loc[:, ohlcv_column] = df_btc_s.loc[:, ohlcv_column]
# 2/4 append Low to Open
df_btc_low = df_btc_out.assign(Open=df_btc_s['Low'])
# 3/4 append Close to Open
df_btc_close = df_btc_out.assign(Open=df_btc_s['Close'])
# 4/4 append High to Open
df_btc_high = df_btc_out.assign(Open=df_btc_s['High'])
# V5 - cut part of the series only
# concat High/low/open close: both window segments of each variant, in the
# order open, low, close, high
df_signal_v5 = pd.concat(
    [segment
     for df_variant in (df_btc_out, df_btc_low, df_btc_close, df_btc_high)
     for segment in (df_variant.iloc[TRAIN_FROM:CONNECTION_POINT_1],
                     df_variant.iloc[CONNECTION_POINT_2:TRAIN_TO])],
    ignore_index=True)

# add noise
df_signal_v5['Coin'] = 'Synth'
# ignore_index=True above already produced a clean 0..n-1 index; only the
# leftover 'index' column has to go
df_signal_v5 = df_signal_v5.drop(columns=['index'])
# 1/3 copy the whole data
df_noise_v5 = df_signal_v5.copy()
#df_noise_v5 = df_noise_v5.iloc[(CONNECTION_POINT-START):]
# 2/3 add noise (gausian)
# NOTE(review): no random seed is set in this notebook, so the noisy half
# differs between runs unless a seed is set elsewhere
mu = 0
sigma_open =  df_noise_v5['Open'].mean()*0.015  # 1.5%
sigma_vol =  df_noise_v5['Volume'].mean()*0.05  # 5%

noise_open = np.random.normal(mu, sigma_open, [df_noise_v5.shape[0]])
noise_vol = np.random.normal(mu, sigma_vol, [df_noise_v5.shape[0]])

df_noise_v5['Open'] = df_noise_v5['Open'] + noise_open
df_noise_v5['Volume'] = df_noise_v5['Volume'] + noise_vol
# 3/3 concatenate: clean signal first, noisy copy second
df_out_v5 = pd.concat([df_signal_v5, df_noise_v5], ignore_index=True)

# add TA values for new signal
# SMA
for period in (5, 10, 20, 50, 200):
    df_out_v5.loc[:, 'sma{}'.format(period)] = talib.SMA(df_out_v5.loc[:, 'Open'], timeperiod=period)
# EMA
for period in (30, 50, 144):
    df_out_v5.loc[:, 'ema{}'.format(period)] = talib.EMA(df_out_v5.loc[:, 'Open'], timeperiod=period)
# BB
bb_upperband, bb_middleband, bb_lowerband = talib.BBANDS(df_out_v5.loc[:, 'Open'], timeperiod=BB_LENGHT, nbdevup=2, nbdevdn=2, matype=0)
df_out_v5.loc[:, 'bb_upperband'] = bb_upperband
df_out_v5.loc[:, 'bb_middleband'] = bb_middleband
df_out_v5.loc[:, 'bb_lowerband'] = bb_lowerband
# MACD
macd, macdsignal, macdhist = talib.MACD(df_out_v5.loc[:, 'Open'], fastperiod=12, slowperiod=26, signalperiod=9)
df_out_v5.loc[:, 'macd'] = macd
df_out_v5.loc[:, 'macdsignal'] = macdsignal
df_out_v5.loc[:, 'macdhist'] = macdhist
# RSI
df_out_v5.loc[:, 'rsi'] = talib.RSI(df_out_v5.loc[:, 'Open'], timeperiod=14)

# add TA signal values
# SMA / EMA: flag is 1 where Open lies above the moving average, else 0
for ma_name in ('sma5', 'sma10', 'sma20', 'sma50', 'sma200', 'ema30', 'ema50', 'ema144'):
    above_name = 'above_' + ma_name
    df_out_v5[above_name] = 0
    df_out_v5.loc[df_out_v5[ma_name] < df_out_v5['Open'], above_name] = 1
# BB: position of Open relative to the bands
df_out_v5['bb_signal_above_h'] = 0
df_out_v5['bb_signal_between'] = 0
df_out_v5['bb_signal_below_l'] = 0
df_out_v5.loc[df_out_v5['Open'] > df_out_v5['bb_upperband'], 'bb_signal_above_h'] = 1
df_out_v5.loc[df_out_v5['Open'] < df_out_v5['bb_lowerband'], 'bb_signal_below_l'] = 1
df_out_v5.loc[(df_out_v5['Open'] <= df_out_v5['bb_upperband']) & (df_out_v5['Open'] >= df_out_v5['bb_lowerband']), 'bb_signal_between'] = 1
# MACD: 1 where the MACD line is above its signal line
df_out_v5['macd_signal'] = 0
df_out_v5.loc[df_out_v5['macdsignal'] < df_out_v5['macd'], 'macd_signal'] = 1
# RSI: one flag per 10-point band
# NOTE(review): the bands are inclusive on both ends, so an RSI of exactly
# 20, 30, ... 80 sets two flags; kept unchanged here
for rsi_flag in ('rsi_signal_b20', 'rsi_signal_b30', 'rsi_signal_b40', 'rsi_signal_b50',
                 'rsi_signal_b60', 'rsi_signal_b70', 'rsi_signal_b80', 'rsi_signal_a80'):
    df_out_v5[rsi_flag] = 0
df_out_v5.loc[df_out_v5['rsi'] <= 20, 'rsi_signal_b20'] = 1
for band_low in (20, 30, 40, 50, 60, 70):
    band_high = band_low + 10
    df_out_v5.loc[(df_out_v5['rsi'] <= band_high) & (df_out_v5['rsi'] >= band_low), 'rsi_signal_b{}'.format(band_high)] = 1
df_out_v5.loc[(df_out_v5['rsi'] >= 80), 'rsi_signal_a80'] = 1

# remove NaN (~200 values from start)
df_out_v5 = df_out_v5.dropna()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [24]:
# viz: Open series of the generated training set
fig = go.Figure(data=[go.Scatter(y=df_out_v5['Open'], name='Open')])
fig.update_layout(title_text='BTC Open OUT', template="plotly_dark")
fig.show()

In [25]:
# save - train
for message in ('new data shape: {}'.format(df_out_v5.shape),
                'SUCCESS - Synthetic data generated'):
    print(message)

# <cwd>/<OUTPUT_FOLDER>/<OUTPUT_FILE>; an existing file is overwritten
data_path = path.absolute()
data_path_export = data_path.joinpath(OUTPUT_FOLDER, OUTPUT_FILE)

df_out_v5.to_csv(data_path_export, header=True, mode='w', index=False)

new data shape: (1865, 57)
SUCCESS - Synthetic data generated


In [26]:
# test split: every BTC row from position TEST_FROM onwards.
# Built as one chain that returns a new frame, so nothing is assigned to or
# dropped in place on a slice of df_btc_out (the source of the
# SettingWithCopyWarning). The leftover 'index' column is removed, the rows are
# renumbered from 0 and the coin is relabelled as 'Synth'.
df_test_v5 = (
    df_btc_out.iloc[TEST_FROM:]
    .drop(columns=['index'])
    .reset_index(drop=True)
    .assign(Coin='Synth')
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
# save - test
for message in ('new data shape: {}'.format(df_test_v5.shape),
                'SUCCESS - Synthetic data generated (test)'):
    print(message)

# <cwd>/<OUTPUT_FOLDER>/<TEST_FILE>; an existing file is overwritten
data_path = path.absolute()
data_path_export = data_path.joinpath(OUTPUT_FOLDER, TEST_FILE)
df_test_v5.to_csv(data_path_export, header=True, mode='w', index=False)

new data shape: (217, 57)
SUCCESS - Synthetic data generated (test)


### Train / forecast

In [28]:
# TODO - fix the yaml structure (separate test/train)

In [29]:
# reload the LSTM configurations and pick the second model (LSTM2);
# lstm_configs is reused further down to select the third model
lstm_configs = lstm.load_lstm_config(verbose=False)
lstm_config_2 = lstm_configs['models'][1]

In [30]:
# get data: train / validation / test arrays for LSTM2
x, y, x_val, y_val, x_test, y_test = lstm.get_data(lstm_config_2, use_same_train_validation=True, use_same_validation_test=True, verbose='Summary')
# reload not normalized for viz
# NOTE(review): the flag is changed on the shared lstm_config_2 dict in place, so
# the later train_model / test_model / get_todays_data calls see normalise=False
# too - confirm those helpers do not depend on it
lstm_config_2['data']['normalise'] = False
x2, y2, x_val2, y_val2, x_test_viz, y_test_viz = lstm.get_data(lstm_config_2, use_same_train_validation=True, use_same_validation_test=True, verbose='Summary')

[Data] Train data loaded, size: (1467, 20, 28) 
[Data] Validation data loaded, size: (348, 20, 28) 
[Data] Test data loaded, size: (348, 20, 28) 
[Data] Train data loaded, size: (1467, 20, 28) 
[Data] Validation data loaded, size: (348, 20, 28) 
[Data] Test data loaded, size: (348, 20, 28) 


In [31]:
# train LSTM2; note that `m` is rebound here, so from this cell on it refers to
# the LSTM2 model and no longer to the LSTM 1 model trained earlier
m = lstm.train_model(lstm_config_2, x, y, x_val, y_val, verbose='Summary')

[Model] Model Compiled
Time taken: 0:00:02.529998
[Model] Training Started
[Model] 256 epochs, 32 batch size
[Model] Training Completed. Model saved as saved_models\31052023-111157-epoch{epoch:02d}-acc{val_accuracy:.2f}-loss{val_loss:.2f}.h5
Time taken: 0:01:19.490547
Max validation accuracy: 76.1% @ epoch: 18
Min validation loss: 0.7 @ epoch: 25


In [32]:
# evaluate LSTM2 on the test arrays and show the figure returned by lstm.test_model
fig_acc = lstm.test_model(lstm_config_2, m, x_test, y_test, x_test_viz, y_test_viz, verbose='Summary')
fig_acc.show()

[Model] Predicting Sequences Multiple TBL ...
[Test] Test accuracy: 76.15%
pred > 0.7: 24.137931034482758%
pred > 0.5: 32.088122605363985%
pred < 0.5: 67.91187739463601%


In [33]:
# the run date does not change between iterations, so parse it once
run_date = datetime.strptime(today, DATE_FORMAT)
# class index -> label, target_names = ['-1', '0', '1'];
# every index other than 0 and 1 maps to '1.0'
label_by_class = {0: '-1.0', 1: '0.0'}

for day in tqdm(days_to_process):
    # how many days the forecast day lies before the run date
    days_shift = (run_date - datetime.strptime(day, DATE_FORMAT)).days

    # Forecast: the model is called with a list holding the single window
    todays_window = lstm.get_todays_data(lstm_config_2, days_shift, verbose='Summary')
    todays_window_temp = [todays_window]
    todays_pred = m.predict_sequence_full_tbl(todays_window_temp)

    # most probable class of the single prediction row
    predictions_argmax = np.argmax(todays_pred[:], axis=1)
    pred_direction = label_by_class.get(int(predictions_argmax[0]), '1.0')

    # one-row frame in the layout handed to save_forecast
    df_pred_lstm = pd.DataFrame([{
        'Date': day,
        'Coin': 'BTC-USD',
        'Label': pred_direction}])

    # save forecast (skipped in test mode)
    if not TEST_MODE:
        # save tbl forecast
        save_forecast(
            df_pred_lstm,
            model='lstm-tb-2',
            value_column='Label',
            verbose=False)
    else:
        print('Test mode - saving skipped')

# values of the last processed day
for caption, value in (('Todays prediction confidence: {}', todays_pred),
                       ('Todays prediction direction: {}', pred_direction)):
    print(caption.format(value))
# TODO
# best_model_epoch = (history.history['val_loss'].index(min(history.history['val_loss'])) + history.history['val_accuracy'].index(max(history.history['val_accuracy']))) / 2.0
# best_model_epoch

  0%|          | 0/2 [00:00<?, ?it/s]

[Todays data] X data size: (20, 28)
[Model] Predicting Sequences Multiple TBL ...


 50%|█████     | 1/2 [00:00<00:00,  6.10it/s]

[Todays data] X data size: (20, 28)
[Model] Predicting Sequences Multiple TBL ...


100%|██████████| 2/2 [00:00<00:00,  6.29it/s]

Todays prediction confidence: [[0.09145322 0.1779547  0.7305921 ]]
Todays prediction direction: 1.0





## LSTM3
- based on ema7-smoothed & shuffled train/test data
- TBL is 14 days +/-5% (ema7) = 7% (Open)
- data included: ema7, vol_ema7, f&g

In [34]:
# third model configuration (LSTM3), taken from the lstm_configs loaded for LSTM2
lstm_config_3 = lstm_configs['models'][2]

In [35]:
# get data: train / validation / test arrays for LSTM3
# (these rebind the x / y names that held the LSTM2 arrays)
x, y, x_val, y_val, x_test, y_test = lstm.get_data(lstm_config_3, use_same_train_validation=True, use_same_validation_test=True, verbose='Summary')

[Data] Train & Val data shuffled !!
[Data] Train data loaded, size: (1161, 30, 7) 
[Data] Validation data loaded, size: (498, 30, 7) 
[Data] Test data loaded, size: (481, 30, 7) 


In [36]:
# train LSTM3; kept under its own name `m3`, so `m` still refers to the LSTM2 model
m3 = lstm.train_model(lstm_config_3, x, y, x_val, y_val, verbose='Summary')

[Model] Model Compiled
Time taken: 0:00:04.799998
[Model] Training Started
[Model] 256 epochs, 32 batch size
[Model] Training Completed. Model saved as saved_models\31052023-111404-epoch{epoch:02d}-acc{val_accuracy:.2f}-loss{val_loss:.2f}.h5
Time taken: 0:04:24.216985
Max validation accuracy: 92.6% @ epoch: 66
Min validation loss: 0.3 @ epoch: 42


In [37]:
# the run date does not change between iterations, so parse it once
run_date = datetime.strptime(today, DATE_FORMAT)
# class index -> label, target_names = ['-1', '0', '1'];
# every index other than 0 and 1 maps to '1.0'
label_by_class = {0: '-1.0', 1: '0.0'}

for day in tqdm(days_to_process):
    # how many days the forecast day lies before the run date
    days_shift = (run_date - datetime.strptime(day, DATE_FORMAT)).days

    # Forecast: the model is called with a list holding the single window
    todays_window = lstm.get_todays_data(lstm_config_3, days_shift, verbose='Summary')
    todays_window_temp = [todays_window]
    todays_pred = m3.predict_sequence_full_tbl(todays_window_temp)

    # most probable class of the single prediction row
    predictions_argmax = np.argmax(todays_pred[:], axis=1)
    pred_direction = label_by_class.get(int(predictions_argmax[0]), '1.0')

    # one-row frame in the layout handed to save_forecast
    df_pred_lstm = pd.DataFrame([{
        'Date': day,
        'Coin': 'BTC-USD',
        'Label': pred_direction}])

    # save forecast (skipped in test mode)
    if not TEST_MODE:
        # save tbl forecast
        save_forecast(
            df_pred_lstm,
            model='lstm-tb-3',
            value_column='Label',
            verbose=False)
    else:
        print('Test mode - saving skipped')

  0%|          | 0/2 [00:00<?, ?it/s]

[Todays data] X data size: (30, 7)
[Model] Predicting Sequences Multiple TBL ...


 50%|█████     | 1/2 [00:02<00:02,  2.93s/it]

[Todays data] X data size: (30, 7)
[Model] Predicting Sequences Multiple TBL ...


100%|██████████| 2/2 [00:03<00:00,  1.57s/it]


In [38]:
# both names are left over from the last iteration of the LSTM3 forecast loop,
# i.e. they describe the last day in days_to_process
print('Todays prediction confidence: {}'.format(todays_pred))
print('Todays prediction direction: {}'.format(pred_direction))

Todays prediction confidence: [[1.3881042e-03 2.4954521e-04 9.9836236e-01]]
Todays prediction direction: 1.0


# 3 Backtest
- take latest price data and validate with stored forecast data

### accuracy

In [39]:
# get forecast error of LSTM 1 (forecasts stored under model 'lstm-tb')
# df_accuracy_lstm feeds the summary cell, df_accuracy_data_lstm the accuracy chart
df_accuracy_lstm, df_accuracy_data_lstm = get_accuracy_tb(model='lstm-tb', config=config, lstm_config=lstm_config_1, verbose=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Confusion matrix: 
[[10  3 13]
 [52 23 25]
 [14  9 17]]
              precision    recall  f1-score   support

        -1.0      0.132     0.385     0.196        26
         0.0      0.657     0.230     0.341       100
         1.0      0.309     0.425     0.358        40

    accuracy                          0.301       166
   macro avg      0.366     0.347     0.298       166
weighted avg      0.491     0.301     0.322       166

BTC-USD: TB classification accuracy over 5 days forecast: 30.12%



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [40]:
# get forecast error of LSTM2 (forecasts stored under model 'lstm-tb-2')
# df_accuracy_data_lstm_2 feeds the LSTM2 accuracy chart below
df_accuracy_lstm_2, df_accuracy_data_lstm_2 = get_accuracy_tb(model='lstm-tb-2', config=config, lstm_config=lstm_config_2, verbose=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Confusion matrix: 
[[ 7  2  0]
 [26 20  3]
 [13  3  1]]
              precision    recall  f1-score   support

        -1.0      0.152     0.778     0.255         9
         0.0      0.800     0.408     0.541        49
         1.0      0.250     0.059     0.095        17

    accuracy                          0.373        75
   macro avg      0.401     0.415     0.297        75
weighted avg      0.598     0.373     0.405        75

BTC-USD: TB classification accuracy over 5 days forecast: 37.33%



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [41]:
# get forecast error of LSTM3 (forecasts stored under model 'lstm-tb-3')
# df_accuracy_data_lstm_3 feeds the LSTM3 accuracy chart below
df_accuracy_lstm_3, df_accuracy_data_lstm_3 = get_accuracy_tb(model='lstm-tb-3', config=config, lstm_config=lstm_config_3, verbose=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy






The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Real data accuracy - lstm 1

In [42]:
# accuracy chart for LSTM 1, built from the back-test data returned by get_accuracy_tb
fig = lstm.viz_generate_accuracy_chart(df_accuracy_data_lstm)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Real data accuracy - lstm 2

In [43]:
# accuracy chart for LSTM2, built from the back-test data returned by get_accuracy_tb
fig = lstm.viz_generate_accuracy_chart(df_accuracy_data_lstm_2)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [44]:
# accuracy chart for LSTM3, built from the back-test data returned by get_accuracy_tb
fig = lstm.viz_generate_accuracy_chart(df_accuracy_data_lstm_3)
fig.show()

# 4 Summary

In [45]:
# TODO fix todays_pred, df_pred_lstm and df_accuracy_lstm are recycled for all three LSTMs 

In [46]:
# Summary of the most recent forecast.
# NOTE(review): pred_direction / todays_pred come from the last forecast loop
# that ran (LSTM3), while df_accuracy_lstm is the back-test accuracy of LSTM 1
# ('lstm-tb'); the printed line therefore mixes two models.

# The label is taken from pred_direction instead of df_pred_lstm['Value']:
# this notebook builds df_pred_lstm with the columns Date / Coin / Label only,
# so a 'Value' column exists only if save_forecast adds it (presumably in
# place), and that call is skipped in test mode.
btc_pred_label = pred_direction
# .iloc[0] takes the first BTC row by position; a plain [0] is a label lookup
# and only works when that row happens to carry the index label 0
btc_hist_accuracy = df_accuracy_lstm.loc[df_accuracy_lstm['Coin'] == 'BTC-USD', 'Accuracy'].iloc[0]

print('--------- BTC Summary -----------------------')
print('LSTM prediction: {} ({:2.0f}% certain) @ {:2.0f}% (hist. acc.)'
    .format(btc_pred_label,
        todays_pred.max()*100,
        btc_hist_accuracy*100))
print('Random guess: 33%')
print('--------- ETH Summary -----------------------')
# print('LSTM prediction: {} @ {:2.0f}% '.format(df_pred_lstm[df_pred_lstm['Coin'] == 'BTC-USD']['Label'][0],todays_pred.max()*100))
print('-- no prediction --')

--------- BTC Summary -----------------------
LSTM prediction: 1.0 (100% certain) @ 30% (hist. acc.)
Random guess: 33%
--------- ETH Summary -----------------------
-- no prediction --


In [47]:
# simplified above for script environment
# df_summary = get_coin_summary(df_tbl, df_accuracy, df_mape, df_accuracy_lstm)
# df_summary

The data above relate to forecasts using the triple barrier label (TBL): 5% thresholds and a 5-day forecast window
- Facebook Prophet forecast and its accuracy - MAPE (Mean Absolute Percentage Error) over all forecasts done so far
- Triple barrier label: 1 = will grow; -1 = will decline; 0 = will not exceed 5% either way
- TBL accuracy

testing zone