In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, date, time, timezone
import tensorflow as tf

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
def file_name(time_frame):
  return ( f'btcusdt_{time_frame}m.csv' if time_frame < 60 else f'btcusdt_1h.csv')

In [4]:
df_3 = pd.read_csv(file_name(3))
df_5 = pd.read_csv(file_name(5))
df_15 = pd.read_csv(file_name(15))
df_30 = pd.read_csv(file_name(30))
df_60 = pd.read_csv(file_name(60))

In [5]:
dataframes = [df_3,df_5,df_15,df_30,df_60]

In [6]:
for x in dataframes:
    x['datetime']=pd.to_datetime(x['datetime'])
    x['just_Date']=x['datetime'].dt.date
    start_date = date(2018, 1, 1)
    end_date = date(2022, 12, 31)
    x = x[(x['datetime'].dt.date >= start_date) & (x['datetime'].dt.date <= end_date)]
    x = x.sort_values(by='datetime', ascending=True)
    x = x.sort_values(by='datetime', ascending=True)

In [7]:
df = pd.DataFrame()
df = df_5.copy(deep=True)

In [8]:
def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    data['ema_short'] = data['close'].ewm(span=short_window, adjust=False).mean()
    data['ema_long'] = data['close'].ewm(span=long_window, adjust=False).mean()
    data['macd'] = data['ema_short'] - data['ema_long']
    data['signal_line'] = data['macd'].ewm(span=signal_window, adjust=False).mean()
    data['macd_sig'] = np.where(data['macd'] > data['signal_line'], 1, 0)

def calculate_cci(data, window=20):
    tp = (data['high'] + data['low'] + data['close']) / 3
    data['tp'] = tp
    data['tp_mean'] = tp.rolling(window=window).mean()
    data['mad'] = (data['tp'] - data['tp_mean']).abs().rolling(window=window).mean()
    data['cci'] = (data['tp'] - data['tp_mean']) / (0.015 * data['mad'])

    
def calculate_rsi(data, window=14):
    data['delta'] = data['close'].diff(1)
    gain = data['delta'].apply(lambda x: x if x > 0 else 0)
    loss = -data['delta'].apply(lambda x: x if x < 0 else 0)

    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()

    rs = avg_gain / avg_loss
    data['rsi'] = 100 - (100 / (1 + rs))

def calculate_bollinger_bands(data, window=20, num_std_dev=2):
    data['rolling_mean'] = data['close'].rolling(window=window).mean()
    data['upper_band'] = data['rolling_mean'] + (num_std_dev * data['close'].rolling(window=window).std())
    data['lower_band'] = data['rolling_mean'] - (num_std_dev * data['close'].rolling(window=window).std())

def calculate_vwap(df):
    typical_price = (df['high'] + df['low'] + df['close']) / 3
    vwap = (typical_price * df['volume']).cumsum() / df['volume'].cumsum()
    df['VWAP'] = vwap


In [9]:
# Define the time span for EMA (adjust as needed)
span = 12

# Calculate EMA for 'open', 'high', 'low', 'close' columns
df['ema_open'] = df['open'].ewm(span=span, adjust=False).mean()
df['ema_high'] = df['high'].ewm(span=span, adjust=False).mean()
df['ema_low'] = df['low'].ewm(span=span, adjust=False).mean()
df['ema_close'] = df['close'].ewm(span=span, adjust=False).mean()

In [10]:
calculate_macd(df)
calculate_cci(df)
calculate_rsi(df)
calculate_bollinger_bands(df)
calculate_vwap(df)

def calculate_ma5(df):
    return df['close'].rolling(window=5).mean()

def calculate_ma8(df):
    return df['close'].rolling(window=8).mean()

def calculate_ma13(df):
    return df['close'].rolling(window=13).mean()


In [11]:
df['Target'] = df['close'].shift(-1) - df['close']
df['TargetClass'] = df['Target'].apply(lambda x: 1 if x > 0 else 0)
df['TargetNextclose'] = df['close'].shift(-1)

In [12]:

df.reset_index(inplace = True)
df.drop(['just_Date','index'], axis=1, inplace=True)

In [13]:
def calculate_average(df,days):
    avg_list = []

    for i in range(len(df)):
        if i < days:
            avg_list.append(df['volume'][:i].mean())
        else:
            avg_list.append(df['volume'][i-days:i].mean())
    df['adv20'] = avg_list
#calculate_average(df,1000)

In [14]:
df.dropna(inplace=True)

In [15]:
df.reset_index(inplace = True)

In [16]:
df_set = df.iloc[:,:]#.values
df_set.head(20)


Unnamed: 0,index,datetime,open,high,low,close,volume,ema_open,ema_high,ema_low,...,cci,delta,rsi,rolling_mean,upper_band,lower_band,VWAP,Target,TargetClass,TargetNextclose
0,38,2018-01-01 08:40:00,13352.0,13450.76,13346.75,13444.99,35.581784,13329.479253,13392.188651,13286.029679,...,53.700266,94.99,54.818403,13332.627,13487.33097,13177.92303,13416.656561,13.08,1,13458.07
1,39,2018-01-01 08:45:00,13440.0,13534.96,13440.0,13458.07,52.851111,13346.482445,13414.153474,13309.717421,...,91.445161,13.08,58.774706,13329.736,13471.652746,13187.819254,13418.963433,51.93,1,13510.0
2,40,2018-01-01 08:50:00,13468.0,13534.9,13468.0,13510.0,24.817059,13365.177453,13432.729862,13334.068587,...,100.97319,51.93,57.969625,13333.2355,13489.343794,13177.127206,13420.451903,52.37,1,13562.37
3,41,2018-01-01 08:55:00,13540.75,13611.27,13510.11,13562.37,34.605136,13392.188614,13460.197576,13361.151881,...,123.95287,52.37,68.973085,13342.854,13529.264542,13156.443458,13423.795049,27.61,1,13589.98
4,42,2018-01-01 09:00:00,13562.37,13611.01,13500.0,13589.98,45.012178,13418.370366,13483.399487,13382.51313,...,117.461007,27.61,69.536935,13357.3425,13572.608135,13142.076865,13428.085331,-144.78,0,13445.2
5,43,2018-01-01 09:05:00,13589.95,13594.96,13400.0,13445.2,40.055787,13444.767233,13500.562643,13385.203418,...,65.959086,-144.78,63.659372,13369.4495,13575.20659,13163.69241,13429.43486,54.8,1,13500.0
6,44,2018-01-01 09:10:00,13473.91,13529.42,13446.79,13500.0,20.551341,13449.250735,13505.002236,13394.678277,...,69.094024,54.8,63.285872,13376.1495,13590.001214,13162.297786,13430.258412,-10.04,0,13489.96
7,45,2018-01-01 09:15:00,13510.0,13510.0,13465.9,13489.96,42.405044,13458.596776,13505.771123,13405.635465,...,63.08577,-10.04,68.243499,13384.6475,13602.573312,13166.721688,13431.799941,-53.98,0,13435.98
8,46,2018-01-01 09:20:00,13475.01,13489.97,13400.04,13435.98,45.49796,13461.121887,13503.340181,13404.774624,...,33.02607,-53.98,58.292676,13387.0965,13606.231125,13167.961875,13432.080952,-25.95,0,13410.03
9,47,2018-01-01 09:25:00,13411.35,13438.65,13410.02,13410.03,25.2719,13453.464674,13493.387846,13405.581605,...,16.43587,-25.95,53.267935,13393.2255,13607.42666,13179.02434,13431.892276,16.92,1,13426.95


In [17]:
#columns_to_include = ['open', 'close','volume', 'ema_short', 'ema_long','signal_line',  'cci','rsi','upper_band', 'lower_band']
columns_to_include = ['ema_open', 'ema_high','ema_low',  'ema_close']
num_lags = 10
for col in columns_to_include:
  for i in range(1, num_lags+1):
    df_set[f'{col}_prev{i}'] = df_set[col].shift(i)

In [18]:
df.dropna(how='any',inplace = True)

In [19]:
columns_and_lags = [f'{col}_prev{i}' for col in columns_to_include for i in range(1, num_lags + 1)]

In [20]:
df_set

Unnamed: 0,index,datetime,open,high,low,close,volume,ema_open,ema_high,ema_low,...,ema_close_prev1,ema_close_prev2,ema_close_prev3,ema_close_prev4,ema_close_prev5,ema_close_prev6,ema_close_prev7,ema_close_prev8,ema_close_prev9,ema_close_prev10
0,38,2018-01-01 08:40:00,13352.00,13450.76,13346.75,13444.99,35.581784,13329.479253,13392.188651,13286.029679,...,,,,,,,,,,
1,39,2018-01-01 08:45:00,13440.00,13534.96,13440.00,13458.07,52.851111,13346.482445,13414.153474,13309.717421,...,13343.851969,,,,,,,,,
2,40,2018-01-01 08:50:00,13468.00,13534.90,13468.00,13510.00,24.817059,13365.177453,13432.729862,13334.068587,...,13361.423974,13343.851969,,,,,,,,
3,41,2018-01-01 08:55:00,13540.75,13611.27,13510.11,13562.37,34.605136,13392.188614,13460.197576,13361.151881,...,13384.281824,13361.423974,13343.851969,,,,,,,
4,42,2018-01-01 09:00:00,13562.37,13611.01,13500.00,13589.98,45.012178,13418.370366,13483.399487,13382.513130,...,13411.680005,13384.281824,13361.423974,13343.851969,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422293,422336,2022-01-12 05:05:00,42704.17,42794.82,42693.00,42794.81,49.649990,42759.355940,42807.591207,42723.560214,...,42759.926418,42770.065766,42766.439542,42764.339459,42751.675724,42729.271310,42711.022458,42697.068359,42694.148061,42687.713163
422294,422337,2022-01-12 05:10:00,42794.82,42822.21,42753.01,42790.03,61.098870,42764.811949,42809.840252,42728.090950,...,42765.293123,42759.926418,42770.065766,42766.439542,42764.339459,42751.675724,42729.271310,42711.022458,42697.068359,42694.148061
422295,422338,2022-01-12 05:15:00,42790.04,42819.89,42718.77,42736.01,39.718990,42768.693187,42811.386367,42726.656958,...,42769.098796,42765.293123,42759.926418,42770.065766,42766.439542,42764.339459,42751.675724,42729.271310,42711.022458,42697.068359
422296,422339,2022-01-12 05:20:00,42736.01,42736.02,42633.97,42674.32,55.094370,42763.665005,42799.791542,42712.397426,...,42764.008212,42769.098796,42765.293123,42759.926418,42770.065766,42766.439542,42764.339459,42751.675724,42729.271310,42711.022458


In [21]:
from sklearn.preprocessing import MinMaxScaler


In [22]:
numeric_columns = df_set.columns[2:]
df_set_numeric = df_set[numeric_columns]

In [23]:
scaler=MinMaxScaler(feature_range=(0,1))
df_set_numeric = pd.DataFrame(scaler.fit_transform(df_set_numeric), columns=df_set_numeric.columns)

In [24]:
y = df_set['TargetClass'].values

In [25]:
X = df_set_numeric.drop(['TargetClass','Target','TargetNextclose'],axis=1).values

In [26]:
df_set_numeric

Unnamed: 0,open,high,low,close,volume,ema_open,ema_high,ema_low,ema_close,ema_short,...,ema_close_prev1,ema_close_prev2,ema_close_prev3,ema_close_prev4,ema_close_prev5,ema_close_prev6,ema_close_prev7,ema_close_prev8,ema_close_prev9,ema_close_prev10
0,0.155400,0.156217,0.155759,0.156795,0.004299,0.155313,0.156020,0.154897,0.155534,0.155534,...,,,,,,,,,,
1,0.156742,0.157496,0.157184,0.156995,0.006385,0.155573,0.156355,0.155260,0.155802,0.155802,...,0.155534,,,,,,,,,
2,0.157169,0.157495,0.157612,0.157787,0.002998,0.155859,0.156639,0.155633,0.156152,0.156152,...,0.155802,0.155534,,,,,,,,
3,0.158278,0.158655,0.158256,0.158586,0.004181,0.156272,0.157059,0.156047,0.156571,0.156571,...,0.156152,0.155802,0.155534,,,,,,,
4,0.158608,0.158651,0.158101,0.159007,0.005438,0.156672,0.157413,0.156374,0.156990,0.156990,...,0.156571,0.156152,0.155802,0.155534,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422293,0.603031,0.601948,0.604309,0.604403,0.005998,0.605283,0.605256,0.605456,0.605375,0.605375,...,0.605293,0.605448,0.605392,0.605360,0.605167,0.604824,0.604545,0.604332,0.604287,0.604189
422294,0.604414,0.602364,0.605226,0.604330,0.007382,0.605367,0.605291,0.605525,0.605433,0.605433,...,0.605375,0.605293,0.605448,0.605392,0.605360,0.605167,0.604824,0.604545,0.604332,0.604287
422295,0.604341,0.602329,0.604703,0.603506,0.004799,0.605426,0.605314,0.605503,0.605355,0.605355,...,0.605433,0.605375,0.605293,0.605448,0.605392,0.605360,0.605167,0.604824,0.604545,0.604332
422296,0.603517,0.601055,0.603406,0.602566,0.006656,0.605349,0.605137,0.605285,0.605144,0.605144,...,0.605355,0.605433,0.605375,0.605293,0.605448,0.605392,0.605360,0.605167,0.604824,0.604545


In [27]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)


In [28]:
y_train

array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

In [29]:
X_train.shape

(380068, 64)

In [30]:
y_train.shape

(380068,)

In [31]:
# Train model
model = XGBClassifier(
    booster='gbtree',
    learning_rate=0.3,
    gamma=0,
    max_depth=6,
    reg_lambda=1,
    reg_alpha=0
)
model.fit(X_train, y_train)

In [48]:
# Evaluate model
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f"Train accuracy: {train_acc:.3f}")
print(f"Test accuracy: {test_acc:.3f}")

Train accuracy: 0.902
Test accuracy: 0.844


In [33]:
y_pred = model.predict(X)

In [34]:
df['y_pred'] = y_pred

In [35]:
new_df = pd.DataFrame()

In [36]:
new_df = df[['close', 'open','datetime','open','high','low','volume']].copy()
new_df['y_pred'] = y_pred

In [37]:
new_df.reset_index(inplace = True)

In [38]:
new_df.drop('index', axis=1,inplace = True)

In [39]:
def generate_signals(df, initial_balance=1000):
    positions = []
    position = {'entry_index': None, 'entry_price': None, 'exit_index': None, 'exit_price': None, 'pnl': None}
    balance = initial_balance

    for i in range(1, len(df)):
        if df['y_pred'][i] == 1 and df['volume'][i]>df['adv20'][i] :
            if position['entry_index'] is None:
                position['entry_index'] = df.index[i]
                position['entry_price'] = df['close'][i]
                position['exit_index'] = None
                position['exit_price'] = None
                position['pnl'] = None


        elif df['y_pred'][i] == 0:
            if position['entry_index'] is not None:
                position['exit_index'] = df.index[i]
                position['exit_price'] = df['close'][i]
                position['pnl'] = (position['exit_price'] - position['entry_price'])*1000/position['entry_price']
                balance += position['pnl']
                positions.append(position.copy())

                position = {'entry_index': None, 'entry_price': None, 'exit_index': None, 'exit_price': None, 'pnl': None}
    if position['entry_index'] is not None:
        positions.append(position.copy())
    signals_df = pd.DataFrame(positions)

    return balance, signals_df

In [40]:
final_balance, signals = generate_signals(new_df)

In [41]:
signals.dropna(inplace=True)

In [42]:
# Create a new DataFrame with all values initialized to 0
signal_df = pd.DataFrame(0, index=new_df.index, columns=['signal'])
# Update values based on entry_index and exit_index
signal_df.loc[signals['entry_index'], 'signal'] = 1
signal_df.loc[signals['exit_index'], 'signal'] = -1

In [43]:
# Concatenate new_df and signal_df
strategy = pd.concat([new_df, signal_df], axis=1)

In [44]:
buy_type = []
for index,row in strategy.iterrows():
  buy_t = np.nan if row['signal'] == 0 else 'close' if row['signal'] == 1 else 'close'
  buy_type.append(buy_t)
strategy['buy type'] = buy_type

In [45]:
strategy.drop('y_pred',axis=1,inplace=True)

In [47]:
def backtesting1(strategy_):
  st_temp = strategy_.dropna()
  st_temp = st_temp.reset_index(drop=True)
  cum_pnl = 0
  index = 0
  while index < len(st_temp)-1:
    entry_row = st_temp.iloc[index]
    exit_row = st_temp.iloc[index+1]
    assert(entry_row['signal'] != exit_row['signal'])
    entry_price = (entry_row['open'] if entry_row['buy type'] == 'open' else entry_row['close'])
    exit_price = (exit_row['open'] if exit_row['buy type'] == 'open' else exit_row['close'])
    num_shares = 1000/entry_price
    returns = (exit_price-entry_price)*num_shares
    if exit_row['signal'] == 1:
      returns *= -1
    cum_pnl += returns

    index += 2

  return cum_pnl

backtesting1(strategy)

51210.17289895803