#### BUILDING A MODEL WITH XGBOOST INSTEAD OF LSTM 

In [1]:
import os 
from datetime import datetime

# Take the current working directory, rewrite backslashes as forward slashes,
# and chdir into the result.
# NOTE(review): with the override below disabled this chdir targets the
# directory the kernel is already in — confirm it is still needed.
directory = os.getcwd().replace("\\", "/")
# Machine-specific override, left disabled:
#directory = "C:/Users/ktsar/Downloads/Python codes/Python codes/Git_Repos/ATS_Development/Strat_2"
os.chdir(directory)

import pandas as pd 
from pathlib import Path
from ALGO_KT1 import Preprocessing_functions as pf 
from ALGO_KT1 import LSTM_Architecture as ls
from techinical_analysis import * 

In [159]:

# Symbol to model. 'BTC-USD' is read from a local CSV; any other symbol is
# fetched through the project's download helper.
ticker = 'BTC-USD'

if ticker == 'BTC-USD':

    # Try the path prefixed with Strat_2/ first, then the one without it.
    try:
        df = pd.read_csv('Strat_2/data/BTC-USD/BTCUSD_15.csv')
    except FileNotFoundError:
        df = pd.read_csv('data/BTC-USD/BTCUSD_15.csv')

    # Remove 'Timestamp' and 'datetime.1', then index the rows by 'Date'.
    del df['Timestamp'], df['datetime.1']
    df = df.rename(columns={'datetime': 'Date'}).set_index('Date')

else:

    df = pf.downlaod_symbol_data(ticker, period='130mo')

# Feature engineering, applied in this order. Rows containing NaN are dropped
# right after the technical indicators are added.
df = pf.create_momentum_feat(df, ticker)
df = pf.technical_indicators(df).dropna()
df = reversal_patterns(df)
df = continuation_patterns(df)
df = magic_doji(df)

# Make the index datetime-like (project helper for non-BTC symbols).
if ticker == 'BTC-USD':
    df.index = pd.to_datetime(df.index)
else:
    df = pf.format_idx_date(df)

# Target: 1 when a bar closes at or above its open, else 0, then moved up one
# row so every row carries the label of the row that follows it. The final
# row therefore has no label (NaN).
df['labels'] = ((df['Close'] - df['Open']) >= 0).astype(int).shift(-1)

# Flip the sign of open_high.
df['open_high'] = -df['open_high']

print("0 - red bar,  1 - green bar", df['labels'].value_counts())


### Slice dataframe up to a given time 
#df = df[df.index <= '2024-02-01']

df.head()

0 - red bar,  1 - green bar labels
1.0    157471
0.0    143661
Name: count, dtype: int64


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Trades,open_low,open_close,open_high,high_low,...,stalled_pattern,counterattack,tasuki,rf_three_methods,separating_lines,long_legged_doji,gravestone_doji,dragonfly_doji,tristar_doji,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-10 19:15:00,448.15998,448.15998,448.15998,448.15998,0.06153,1,0.0,0.0,-0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.0
2016-01-10 19:30:00,447.32002,449.49,447.3101,449.49,0.3031,6,0.002218,-0.485107,0.485107,0.484972,...,0,0,0,0,0,0,0,0,0,0.0
2016-01-10 21:15:00,446.51,446.51,446.34001,446.34001,2.0,3,0.038071,0.038071,-0.0,0.038071,...,0,0,0,0,0,0,0,0,0,1.0
2016-01-10 22:00:00,447.49,447.49,447.49,447.49,0.0447,1,0.0,0.0,-0.0,0.0,...,0,0,0,0,0,0,0,0,0,1.0
2016-01-10 22:30:00,447.32001,447.32001,447.32001,447.32001,0.527,1,0.0,0.0,-0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.0


In [160]:
#### CREATE LAGGED VARIABLES 

def add_lags(dataframe, feature_name, n_lags = 5):
    """Append lagged copies of one column to ``dataframe``.

    For every k from 1 to ``n_lags`` a column named ``"<feature_name>_<k>"``
    is written, holding ``dataframe[feature_name]`` shifted down by k rows,
    so the first k entries of that column are missing.

    Parameters
    ----------
    dataframe : frame that receives the new columns (modified in place).
    feature_name : name of the column to lag.
    n_lags : number of lag columns to create; values below 1 create none.

    Returns
    -------
    The same ``dataframe`` object that was passed in.
    """
    for shift_by in range(1, n_lags + 1):
        lagged_column = f"{feature_name}_{shift_by}"
        dataframe[lagged_column] = dataframe[feature_name].shift(shift_by)

    return dataframe


# Columns that get lagged copies; each uses add_lags' default lag count.
features = [
    'Volume',
    'Trades',
    'open_low',
    'open_close',
    'open_high',
    'high_low',
    'low_close',
    'high_close',
]

print('df shape before: ', df.shape)

for feature in features:
    df = add_lags(df, feature)

# Shape after the lag columns were appended.
df.shape

df shape before:  (301133, 52)


(301133, 92)

In [161]:
# =============================================================================
# BAR STATS 
# =============================================================================
# Summary statistics of open_close / open_high / open_low, computed once with
# the second argument set to 1 and once with 0.
# NOTE(review): presumably that argument selects the 'labels' class
# (1 = green, 0 = red) — confirm against pf.cluster_stats.
df_green, green_day_stats = pf.cluster_stats(
    df, 1, "open_close", "open_high", "open_low"
)
df_Red, red_day_stats = pf.cluster_stats(
    df, 0, "open_close", "open_high", "open_low"
)

# Suffix the column names so both tables can sit side by side.
green_day_stats.columns = ['open_close_green', "open_high_green", "open_low_green"]
red_day_stats.columns = ['open_close_red', "open_high_red", "open_low_red"]

# Align the two tables on their shared index.
stats = green_day_stats.merge(
    right=red_day_stats,
    left_index=True,
    right_index=True,
)
stats

Unnamed: 0,open_close_green,open_high_green,open_low_green,open_close_red,open_high_red,open_low_red
count,157471.0,157471.0,157471.0,143661.0,143661.0,143661.0
min,-12.4475,-0.0,0.0,-19.814,-0.0,0.0
max,13.502,41.6638,21.7703,16.6542,22.5918,24.5048
mean,0.0085,0.2093,0.2262,-0.0142,0.2251,0.2183
median,0.0,0.1028,0.1128,-0.0027,0.1177,0.1085
std,0.4078,0.3711,0.4011,0.4149,0.3813,0.3774
skew,0.4813,15.1907,9.8975,-0.562,10.7367,8.9374
kurtosis,46.9066,1096.7762,261.2927,98.3286,353.971,255.3406


In [162]:
# Remove the raw OHLC price columns; a missing one still raises KeyError.
# 'Capital Gains' and 'Stock Splits' may be absent from the frame, so they are
# dropped only if present. errors='ignore' handles each of them independently,
# so a frame that contains just one of the two no longer keeps it.
# dropna() then removes every row that still contains a missing value (for
# example rows left incomplete by the lag columns or the shifted label).
df = (
    df.drop(columns=['Open', 'High', 'Low', 'Close'])
      .drop(columns=['Capital Gains', 'Stock Splits'], errors='ignore')
      .dropna()
)

In [163]:
# Display the column index of df as it stands after the previous cell.
df.columns

Index(['Volume', 'Trades', 'open_low', 'open_close', 'open_high', 'high_low',
       'low_close', 'high_close', 'BTC-USD_mom1', 'BTC-USD_mom2',
       'BTC-USD_mom3', 'BTC-USD_mom4', 'BTC-USD_mom5', 'BTC-USD_mom10',
       'BTC-USD_mom15', 'BTC-USD_mom20', 'BTC-USD_mom60', 'BTC-USD_mom120',
       'BTC-USD_mom180', 'BTC-USD_mom240', 'ATR', 'MACD', 'hammer',
       'hanging_man', 'engulfing_pattern', 'dark_cloud', 'piercing_line',
       'morning_star', 'evening_star', 'shooting_star', 'inverted_hammer',
       'harami', 'harami_cross', 'belt_hold', 'upsidegap_two_crows',
       'three_black_crows', 'three_white_soldiers', 'advance_block',
       'stalled_pattern', 'counterattack', 'tasuki', 'rf_three_methods',
       'separating_lines', 'long_legged_doji', 'gravestone_doji',
       'dragonfly_doji', 'tristar_doji', 'labels', 'Volume_1', 'Volume_2',
       'Volume_3', 'Volume_4', 'Volume_5', 'Trades_1', 'Trades_2', 'Trades_3',
       'Trades_4', 'Trades_5', 'open_low_1', 'open_low_2', '

In [164]:
# Split df into predictors X and target y, leaving df itself unchanged.
# The first print reports the full width, i.e. predictors plus 'labels'.
print('X shape: ', df.shape)

X = df.drop(columns='labels')
y = df['labels'].copy()

print('y shape: ', y.shape)
print('X shape after pop: ', X.shape)

X shape:  (301127, 88)
y shape:  (301127,)
X shape after pop:  (301127, 87)


In [166]:
# Positional split with no shuffling: the first 70% of rows are used for
# training and the remaining rows are held out.
# NOTE(review): assumes the rows are in chronological order, so the held-out
# block is the most recent data — confirm.
train_size = int(0.7 * len(X))

X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

print('Training set shape', X_train.shape)
print('Test set shape', X_test.shape)
print('Labels shape', y_train.shape)
print('Labels shape', y_test.shape)

Training set shape (210788, 87)
Test set shape (90339, 87)
Labels shape (210788,)
Labels shape (90339,)


In [117]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report   

In [167]:
# Report the last and first index value of the held-out predictors, then of
# the held-out labels, to check that both cover the same window.
for held_out in (X_test, y_test):
    print('End date for test set', held_out.index.max())
    print('Start date for test set', held_out.index.min())

End date for test set 2024-09-30 23:30:00
Start date for test set 2022-03-03 19:00:00
End date for test set 2024-09-30 23:30:00
Start date for test set 2022-03-03 19:00:00


In [168]:
# Create and train the XGBoost model
model = XGBClassifier(random_state = 42,
                     # max_depth = 100,
                     # max_leaves = 200,
                     # n_estimators = 500,
                     # learning_rate = 0.001
                      )
model.fit(X_train, y_train)

In [169]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.524889582572311
              precision    recall  f1-score   support

         0.0       0.51      0.53      0.52     44080
         1.0       0.54      0.52      0.53     46259

    accuracy                           0.52     90339
   macro avg       0.52      0.52      0.52     90339
weighted avg       0.53      0.52      0.52     90339



(747,)

In [170]:
# True labels next to the predictions, both re-indexed 0..n-1 so the rows
# line up by position.
pd.concat(
    [
        y_test.reset_index(drop=True),
        pd.Series(y_pred).to_frame(),
    ],
    axis='columns',
)

Unnamed: 0,labels,0
0,0.0,1
1,0.0,0
2,0.0,1
3,0.0,0
4,1.0,1
...,...,...
90334,0.0,0
90335,1.0,1
90336,0.0,1
90337,0.0,1
