#### BACKTESTING FEB 2025

In [1]:

#import datetime
import joblib

from pathlib import Path
from Preprocessing_functions import *
from techinical_analysis import * 

In [2]:
import os
import numpy as np 
import pandas as pd
import Preprocessing_functions as pf  
import torch
import torch.nn as nn
import torch.optim 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime 

In [3]:
# Prepare data for LSTM model
def prepare_data(data, sequence_length):
    import numpy as np
    y = data.pop('labels')
    
    data = data.dropna()
    
    features = list(data.columns)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(data[features])
    
     # Create sequences
    X, y_seq = [], []
    for i in range(len(X_scaled) - sequence_length):
        X.append(X_scaled[i:i + sequence_length])
        y_seq.append(y.iloc[i + sequence_length - 1])

    return np.array(X), np.array(y_seq)

In [4]:

class CustomLSTMModel_2(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        """
        Args:
            input_size (int): The number of input features.
            hidden_sizes (list): A list of hidden sizes for each LSTM layer (e.g., [128, 64]).
            output_size (int): The number of output features.
        """
        super(CustomLSTMModel_2, self).__init__()
        
        # Define the first LSTM layer
        self.lstm1 = nn.LSTM(input_size, hidden_sizes[0], batch_first=True)
        
        # Define the second LSTM layer
        self.lstm2 = nn.LSTM(hidden_sizes[0], hidden_sizes[1], batch_first=True)
        
        # Fully connected layer for final output
        self.fc = nn.Linear(hidden_sizes[1], output_size)

    def forward(self, x):
        # Pass through the first LSTM layer
        out, _ = self.lstm1(x)
        
        # Pass through the second LSTM layer
        out, _ = self.lstm2(out)
        
        # Pass the final output through the fully connected layer
        out = self.fc(out[:, -1, :])  # Use the last time-step's output
        return out

In [5]:

ticker = "SPY"
n_clusters = 3 
#time_period = "360mo" # must be the same as in 1_Data_Acquisition or larger

# LOAD DF FOR MODEL BUILDING TO CHECK DATE RANGES 
DF_PATH = f"Data/{ticker}/df/"
DF_FILES = os.listdir(DF_PATH)
try:
    DF_FILES.remove('Junk')
except ValueError:
    print(' ')
print("DataFrames for model building: ", DF_FILES)
idx = 0 if len(DF_FILES) < 2 else int(input("Select file index: "))
DF_NAME = DF_FILES[idx] 
print("Chosen DataFrame file: ", DF_NAME)
df_dates = pd.read_parquet(DF_PATH + DF_NAME)
df_dates = format_idx_date(df_dates)

DataFrames for model building:  ['df_SPY_k3_202402012133.parquet']
Chosen DataFrame file:  df_SPY_k3_202402012133.parquet


In [6]:

### LOAD KMEANS MODEL ###
KMEANS_PATH = f"kmeans_models/{ticker}/"
KMEANS_MODEL_PATH = os.listdir(KMEANS_PATH)
try:
    KMEANS_MODEL_PATH.remove('Junk')
except ValueError:
    print(' ')    

print(KMEANS_MODEL_PATH)
idx = 0 if len(KMEANS_MODEL_PATH) < 2 else int(input("Select file index: "))
KMEANS_NAME = KMEANS_MODEL_PATH[idx]
print("Chosen K_MEANS MODEL file: ", KMEANS_NAME)
FILE = KMEANS_PATH + KMEANS_NAME
loaded_kmeans = joblib.load(FILE)

### LOAD FEAT LIST TO ORDER THE DATA ###
FEAT_PATH = f"model_features/{ticker}/"
FEAT_FILES = os.listdir(FEAT_PATH)
try:
    FEAT_FILES.remove('Junk')
except ValueError:
    print(' ')
print(FEAT_FILES)
idx = 0 if len(FEAT_FILES) < 2 else int(input("Select file index (e.g. 0,1,2)"))
FEAT_NAME = FEAT_FILES[idx]
MODEL_FEAT = pd.read_csv(FEAT_PATH + FEAT_NAME)['0'].to_list()
#MODEL_FEAT.pop(-1)

# Cluster stats
STATS_PATH = f"Data/{ticker}/k_stats/"
STATS_FILES = os.listdir(STATS_PATH)
try:
    STATS_FILES.remove('Junk')
except ValueError:
    print(' ')
print("KMEANS Stats files: ", STATS_FILES)
idx = 0 if len(STATS_FILES) < 2 else int(input("Select file index: "))
STATS_NAME = STATS_FILES[idx]
print("Chosen K_STATS file: ", STATS_NAME)
cluster_stats = pd.read_csv(STATS_PATH + STATS_NAME).set_index("Unnamed: 0")


# LOAD LSTM MODEL STATE DICT  
MODEL_PATH = f"lstm_models/Testing/{ticker}/"
LSTM_FILES = os.listdir(MODEL_PATH)
try:
    LSTM_FILES.remove('Junk')
except ValueError:
    print(' ')
print("LSTM Files: ",LSTM_FILES)
idx = 0 if len(LSTM_FILES) < 2 else int(input("Select file index: "))
MODEL_NAME = LSTM_FILES[idx]
print("Chosen LSTM, MODEL file: ", MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


['kmeans_model_df_SPY_k3_202402012133.joblib']
Chosen K_MEANS MODEL file:  kmeans_model_df_SPY_k3_202402012133.joblib
['LSTM_df_SPY_k3_202402012133_NFEAT23.csv']
KMEANS Stats files:  ['KMEANS_Stats_df_SPY_k3_202402012133.csv']
Chosen K_STATS file:  KMEANS_Stats_df_SPY_k3_202402012133.csv
 
LSTM Files:  ['LSTM_Class_Epoch_349_TestAcc_0.98_TrainAcc_0.99_202502051454']
Chosen LSTM, MODEL file:  LSTM_Class_Epoch_349_TestAcc_0.98_TrainAcc_0.99_202502051454


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:

#df = downlaod_symbol_data(ticker, period = time_period)

df = download_data(ticker, days = 360*25)
df = format_idx_date(df)

#df = df[df.index <= "2024-02-01"]

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,open_low,open_close,open_high,high_low,low_close,high_close,gap,Dividends
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2005-02-25,96.1403,97.2558,96.052,97.0723,76530080.0,0.091845,-0.969417,-1.160283,1.237767,-1.062237,-0.188678,,0
2005-02-28,96.8433,96.9693,95.9469,96.409,86854660.0,0.925619,0.448456,-0.130107,1.054354,-0.481621,-0.577812,-0.235907,0
2005-03-01,96.5438,97.1487,96.5438,96.9107,59512990.0,0.0,-0.380035,-0.626555,0.622654,-0.380035,-0.244985,0.139821,0
2005-03-02,96.534,97.468,96.4288,96.8632,80356790.0,0.108977,-0.34102,-0.967535,1.066196,-0.450488,-0.620511,-0.388708,0
2005-03-03,97.2558,97.4482,96.4774,96.9018,77690770.0,0.800364,0.363989,-0.197829,0.996222,-0.439896,-0.560708,0.405314,0


In [8]:
### SET TIME WINDOW FOR THE BACKTEST 
# REMOVE DATA SNOOPING 
out_sample = True
manual = False

if out_sample is True:
    
    if manual != True:
        start_date = df_dates.index.min()
        df = df[df.index <= start_date]
    
    else: 
        #Manually choosing the cutoff date
        df = df[df.index <= '2016-01-01']
        #df = df[df.index >= '2010-01-01']
    
    del DF_NAME, df_dates 

In [9]:

df = create_momentum_feat(df, ticker).dropna() # MOMENTUM FEATURES CREATE A LOT OF NANS WHICH WILL EXHAUST THE DATA IF ABOVE FILTER IS AFTER 
df = momentum_oscillators(df)
df = volatility(df)
df = reversal_patterns(df) 
df = continuation_patterns(df)
df = magic_doji(df)

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,open_low,open_close,open_high,high_low,low_close,...,advance_block,stalled_pattern,counterattack,tasuki,rf_three_methods,separating_lines,long_legged_doji,gravestone_doji,dragonfly_doji,tristar_doji
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-02-08,100.569,101.333,100.381,101.203,74344450.0,0.186936,-0.630413,-0.759677,0.939477,-0.81888,...,0,0,0,0,0,0,0,0,0,0
2006-02-09,101.401,102.037,100.995,101.025,77594820.0,0.400391,0.370805,-0.627213,1.021198,-0.029704,...,0,0,0,0,0,0,0,0,0,0
2006-02-10,101.035,101.649,100.281,101.223,80702450.0,0.746276,-0.186074,-0.60771,1.345808,-0.93936,...,0,0,0,0,0,0,0,0,0,0
2006-02-13,101.173,101.343,100.619,101.025,65450650.0,0.547577,0.146284,-0.168029,0.714406,-0.403502,...,0,0,0,0,0,0,0,0,0,0
2006-02-14,101.065,102.344,100.827,102.125,113835900.0,0.235492,-1.04883,-1.265522,1.482256,-1.287354,...,0,0,0,0,0,0,0,0,0,0


In [10]:
### ASSIGN CLUSTER TO OBSERVATION ###
data = df[["open_low", "open_close", "gap"]].dropna()
print(data.shape)
k_predictions = pd.DataFrame(loaded_kmeans.predict(data), columns = ["labels"], index = data.index)
#data = data.merge(k_predictions, left_index = True, right_index = True)#.reset_index()
del FILE, KMEANS_NAME, KMEANS_PATH, loaded_kmeans

df_model = df.merge(k_predictions, left_index = True, right_index = True)

df_model.head()

(2249, 3)


Unnamed: 0_level_0,Open,High,Low,Close,Volume,open_low,open_close,open_high,high_low,low_close,...,stalled_pattern,counterattack,tasuki,rf_three_methods,separating_lines,long_legged_doji,gravestone_doji,dragonfly_doji,tristar_doji,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-02-08,100.569,101.333,100.381,101.203,74344450.0,0.186936,-0.630413,-0.759677,0.939477,-0.81888,...,0,0,0,0,0,0,0,0,0,1
2006-02-09,101.401,102.037,100.995,101.025,77594820.0,0.400391,0.370805,-0.627213,1.021198,-0.029704,...,0,0,0,0,0,0,0,0,0,2
2006-02-10,101.035,101.649,100.281,101.223,80702450.0,0.746276,-0.186074,-0.60771,1.345808,-0.93936,...,0,0,0,0,0,0,0,0,0,2
2006-02-13,101.173,101.343,100.619,101.025,65450650.0,0.547577,0.146284,-0.168029,0.714406,-0.403502,...,0,0,0,0,0,0,0,0,0,2
2006-02-14,101.065,102.344,100.827,102.125,113835900.0,0.235492,-1.04883,-1.265522,1.482256,-1.287354,...,0,0,0,0,0,0,0,0,0,1


In [11]:

# Create last day feature
#end_date = df_model.index.max()
#df_model['last_day'] = (df_model.index == end_date).astype(int)
del df, data, k_predictions

#seq_length =  1 # in original Backtesting.py 
df_model = df_model.sort_index(ascending = False)

# preserve the price features to use in the backtest data
drop_cols = ['Open', 'High', 'Low', 'Close']
df1 = df_model[drop_cols]
df1.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-15,174.969,175.276,172.579,172.708
2015-01-14,173.254,174.513,172.311,174.305
2015-01-13,177.131,178.321,174.007,175.346
2015-01-12,177.349,177.518,175.198,175.842
2015-01-09,179.084,179.104,176.606,177.23


In [12]:
df_model.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,open_low,open_close,open_high,high_low,low_close,...,stalled_pattern,counterattack,tasuki,rf_three_methods,separating_lines,long_legged_doji,gravestone_doji,dragonfly_doji,tristar_doji,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-15,174.969,175.276,172.579,172.708,202811800.0,1.365956,1.292229,-0.17546,1.538716,-0.074748,...,0,0,0,0,0,0,0,0,0,0
2015-01-14,173.254,174.513,172.311,174.305,222080600.0,0.544288,-0.606624,-0.726679,1.261797,-1.15721,...,0,0,0,0,0,0,0,0,0,2
2015-01-13,177.131,178.321,174.007,175.346,247207600.0,1.763666,1.007729,-0.671819,2.419233,-0.769509,...,0,0,0,0,0,0,0,0,0,0
2015-01-12,177.349,177.518,175.198,175.842,166364800.0,1.212863,0.849737,-0.095292,1.30691,-0.367584,...,0,0,0,0,0,0,0,0,0,0
2015-01-09,179.084,179.104,176.606,177.23,182710700.0,1.383708,1.035268,-0.011168,1.39472,-0.353329,...,0,0,0,0,0,0,0,0,0,0


In [13]:

##### RENAME MODEL FEATURES WHERE TICKERS DO NOT COINCIDE FOR TESTING SPY MODEL ON OTHERS 
MODEL_FEAT = [i.replace('SPY', ticker) for i in MODEL_FEAT]

if 'last_day' in MODEL_FEAT:
    MODEL_FEAT.remove('last_day')
#####
df_model = df_model[MODEL_FEAT]
df2 = df_model.copy()

df2.head()

Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-15,0,1.365956,1.292229,0.380941,-0.17546,-0.074748,-1.465118,1.538716,0,202811800.0,...,-1.78,-2.55,-3.35,-3.18,-4.22,1.13,5.15,1.68,7.27,16.52
2015-01-14,2,0.544288,-0.606624,-1.19307,-0.726679,-1.15721,-0.119189,1.261797,0,222080600.0,...,-1.65,-2.45,-0.71,-3.26,-3.19,1.23,7.17,2.14,8.6,14.95
2015-01-13,0,1.763666,1.007729,0.733044,-0.671819,-0.769509,-1.66834,2.419233,0,247207600.0,...,-1.87,-0.11,1.12,-3.16,-2.14,1.13,9.07,2.76,9.75,14.94
2015-01-12,0,1.212863,0.849737,0.067144,-0.095292,-0.367584,-0.94413,1.30691,0,166364800.0,...,0.17,1.4,0.45,-2.77,-1.48,-0.23,9.29,3.27,10.42,16.47
2015-01-09,0,1.383708,1.035268,0.222176,-0.011168,-0.353329,-1.046319,1.39472,0,182710700.0,...,2.2,1.25,-0.58,-1.72,1.77,1.1,9.42,4.56,10.37,16.29


In [14]:
# Training parameters
INPUT_SIZE = 21
#HIDDEN_SIZE = 50
#HIDDEN_SIZES = [128,64,32] # for custom lstm 
HIDDEN_SIZES = [32,16] # for custom lstm 
OUTPUT_SIZE = 3
#NUM_LAYERS = 2
LR = 0.001
SEQUENCE_LENGTH = 1


# INSTANTIATE MODEL 
model = CustomLSTMModel_2(INPUT_SIZE, HIDDEN_SIZES, OUTPUT_SIZE)
# LOAD LSTM MODEL STATE DICT  
model.load_state_dict(torch.load(f = MODEL_PATH + MODEL_NAME ))

<All keys matched successfully>

In [15]:
X, y = prepare_data(df_model, SEQUENCE_LENGTH)

print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (2248, 1, 21)
y shape:  (2248,)


In [16]:

############################ PREDICTION #######################################

X_tensor = torch.from_numpy(X).type(torch.float).to('cpu').squeeze(0)

#### PREDICTION #### 
model.eval()

with torch.inference_mode():

    output = model(X_tensor)
    pred = torch.softmax(output, dim = 1).argmax(dim = 1)


## possible mistake in creating the predictions df - dates might not align properly
predictions = pd.DataFrame(pred.to("cpu").numpy(), columns = ["predictions"], index = df_model.index[:-1])
#predictions = pd.DataFrame(pred.to('cuda').numpy(), columns = ["predictions"], index = df_model.index[:-1])

predictions.head()


Unnamed: 0_level_0,predictions
Date,Unnamed: 1_level_1
2015-01-15,0
2015-01-14,2
2015-01-13,0
2015-01-12,2
2015-01-09,2


In [17]:

# Dangerous merge - frequently causes duplicate columns
df2 = df2.merge(predictions, left_index = True, right_index = True)
df1 = df1.merge(df2, left_index = True, right_index = True)
#df1 = df2.copy()
del pred, output, predictions

cluster_stats = pd.read_csv(STATS_PATH + STATS_NAME).set_index("Unnamed: 0")



ACC = (df1['labels'] == df1['predictions']).sum() / df1.shape[1]
ACC = accuracy_score(df1['labels'], df1['predictions'])
print('Model Accuracy: ', ACC)
print('Value Counts: ',df1.predictions.value_counts())

Model Accuracy:  0.9230427046263345
Value Counts:  predictions
2    1407
1     584
0     257
Name: count, dtype: int64


In [18]:
df2.head()

Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240,predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-15,0,1.365956,1.292229,0.380941,-0.17546,-0.074748,-1.465118,1.538716,0,202811800.0,...,-2.55,-3.35,-3.18,-4.22,1.13,5.15,1.68,7.27,16.52,0
2015-01-14,2,0.544288,-0.606624,-1.19307,-0.726679,-1.15721,-0.119189,1.261797,0,222080600.0,...,-2.45,-0.71,-3.26,-3.19,1.23,7.17,2.14,8.6,14.95,2
2015-01-13,0,1.763666,1.007729,0.733044,-0.671819,-0.769509,-1.66834,2.419233,0,247207600.0,...,-0.11,1.12,-3.16,-2.14,1.13,9.07,2.76,9.75,14.94,0
2015-01-12,0,1.212863,0.849737,0.067144,-0.095292,-0.367584,-0.94413,1.30691,0,166364800.0,...,1.4,0.45,-2.77,-1.48,-0.23,9.29,3.27,10.42,16.47,2
2015-01-09,0,1.383708,1.035268,0.222176,-0.011168,-0.353329,-1.046319,1.39472,0,182710700.0,...,1.25,-0.58,-1.72,1.77,1.1,9.42,4.56,10.37,16.29,2


In [19]:

df1 = df1.sort_index()

# removes momentum features from the model - why would i do that? - experiment
#df1_cols = [i for i in df1.columns if "mom" not in i]
#df1 = df1[df1_cols]
#del df1_cols

capital = 1e4
tc = 3

#create a list of clusters to use in the backtesting df1
k_names = []

for n in range(0,3):
    
    open_low = cluster_stats[f'open_low_{n}']['median']
    open_close = cluster_stats[f'open_close_{n}']['median']
    
    if open_low and open_close >= 0:
        k_names.append(n)
        
    if open_low > open_close and open_low > 0 and abs(open_close)*3 < open_low:
        if n not in k_names:
            k_names.append(n)

In [20]:
 
################### ADDING KELLY ######################################
# historic returns for this strategy 
#print('End date: ', end_date)
print('Start date: ', df1.index.min())
#df1 = df1[df1.index >= '2024-02-02']

df1.head()

Start date:  2006-02-09 00:00:00


Unnamed: 0_level_0,Open,High,Low,Close,labels,open_low,open_close,gap,open_high,low_close,...,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240,predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-02-09,101.401,102.037,100.995,101.025,2,0.400391,0.370805,0.195646,-0.627213,-0.029704,...,0.12,-0.41,-0.81,-1.47,-2.27,2.13,3.44,5.74,4.79,2
2006-02-10,101.035,101.649,100.281,101.223,2,0.746276,-0.186074,0.009899,-0.60771,-0.93936,...,0.03,0.32,-1.47,0.58,-1.68,2.39,3.4,6.04,4.45,2
2006-02-13,101.173,101.343,100.619,101.025,2,0.547577,0.146284,-0.049396,-0.168029,-0.403502,...,0.73,-0.17,-1.58,-0.01,-1.78,2.56,3.2,5.28,4.3,2
2006-02-14,101.065,102.344,100.827,102.125,1,0.235492,-1.04883,0.039594,-1.265522,-1.287354,...,0.91,1.83,0.16,0.96,-0.42,3.45,4.52,6.25,5.39,1
2006-02-15,102.086,102.542,101.749,102.453,2,0.330114,-0.359501,-0.038188,-0.446682,-0.691899,...,1.41,1.24,-0.14,1.2,0.25,2.82,5.79,7.26,4.41,2


In [21]:

start_capital = 1e4
no_trade_k = [i for i in range(0,3) if i not in k_names][0]
df = pd.DataFrame()

try:
    half_kelly = kelly_criterion(ticker, df1.index.min()) / 2 
except FileNotFoundError:
    half_kelly = 1



for date, row in df1.iterrows():
    
    try:
        half_kelly = kelly_criterion(ticker, date) / 2
    except FileNotFoundError:
        half_kelly =  1
        print(date)
        
        
    for k in range(len(k_names)):
        
        #row['bp_used'] = (start_capital * half_kelly, 2)
        row['shares'] = (start_capital * half_kelly) // row['Close'] ## you need to divide cluster stats from target with USO - check clusters stats df for % or decimals 
        row[f'target_{k_names[k]}'] = round((1 - cluster_stats.loc["median" , f"open_low_{k_names[k]}"]/100) * row['Open'], 2) 
        row[f'k{k_names[k]}_true'] = (row[f'target_{k_names[k]}'] >= row['Low']) 
        row[f'k{k_names[k]}_profit'] = (row[f'k{k_names[k]}_true'] * (row['Open'] - row[f'target_{k_names[k]}']))* row['shares']
        row[f'k{k_names[k]}_loss'] = round(((row['Open'] - row['Close']) * row['shares']),4)
        row[f'k{k_names[k]}_pnl'] = np.where(row[f'k{k_names[k]}_true'] == True, row[f'k{k_names[k]}_profit'], row[f'k{k_names[k]}_loss'])
        del row[f'k{k_names[k]}_profit'], row[f'k{k_names[k]}_loss']
        
    
    row[f'k{k_names[0]}_k{k_names[1]}'] = np.where(row['predictions'] == 0, row[f'k{k_names[0]}_pnl'], row[f'k{k_names[1]}_pnl'])
    row['k0_k1_k2'] = np.where(row['predictions'] == no_trade_k, 0, row[f'k{k_names[0]}_k{k_names[1]}'] )
    row['net_pnl'] = np.where(row['k0_k1_k2'] != 0, row['k0_k1_k2'] - tc, 0)
    row['eod_equity'] = start_capital + row['net_pnl']
    row['daily_ret'] = row['eod_equity'] / start_capital - 1
    row['half_kelly'] = half_kelly
    
    start_capital += row['net_pnl']
    df = pd.concat([df, row.to_frame().transpose()], axis= 0)

#### SET DATATYPES IN THE NEW DF
for col in list(df.columns):
    
    if ("true" or "last_day") in col:
        df[col] = df[col].astype("bool")
        
    elif ("labels" or "Volume" or "predictions") in col:
        df[col] = df[col].astype("int32")
    
    else:
        df[col] = df[col].astype("float64")
    
del df1

df1 = df.copy()

df1['pnl_cumsum'] = df1['net_pnl'].cumsum()

Kelly Calculation window: From: 2005-08-09 00:00:00 To: 2006-02-09 00:00:00
Kelly Calculation window: From: 2005-08-09 00:00:00 To: 2006-02-09 00:00:00
Kelly Calculation window: From: 2005-08-10 00:00:00 To: 2006-02-10 00:00:00
Kelly Calculation window: From: 2005-08-15 00:00:00 To: 2006-02-13 00:00:00
Kelly Calculation window: From: 2005-08-15 00:00:00 To: 2006-02-14 00:00:00
Kelly Calculation window: From: 2005-08-15 00:00:00 To: 2006-02-15 00:00:00
Kelly Calculation window: From: 2005-08-16 00:00:00 To: 2006-02-16 00:00:00
Kelly Calculation window: From: 2005-08-17 00:00:00 To: 2006-02-17 00:00:00
Kelly Calculation window: From: 2005-08-22 00:00:00 To: 2006-02-21 00:00:00
Kelly Calculation window: From: 2005-08-22 00:00:00 To: 2006-02-22 00:00:00
Kelly Calculation window: From: 2005-08-23 00:00:00 To: 2006-02-23 00:00:00
Kelly Calculation window: From: 2005-08-24 00:00:00 To: 2006-02-24 00:00:00
Kelly Calculation window: From: 2005-08-29 00:00:00 To: 2006-02-27 00:00:00
Kelly Calcul

In [22]:

#####################################################################
 # PERFORMANCE EVALUATION 
#####################################################################


#####   MAX DRAWDOWN
from calculateMaxDD import calculateMaxDD

cum_ret = np.cumprod(1+ df1['daily_ret']) - 1
maxDrawdown, maxDrawdownDuration, startDrawdownDay=calculateMaxDD(cum_ret.values)

#####   SHARPE RATIO
sharpe_ratio = round(np.sqrt(252) * np.mean(df1['daily_ret']) / np.std(df1['daily_ret']),2)

#####   AVG YEARLY RETURN AND STD
mean_ret = df1['daily_ret'].mean() * 252
std = df1['daily_ret'].std()*np.sqrt(252)

import numpy as np
p_change = df1['Close'].pct_change().dropna() #/ df1['Close'].shift(1)
corr = np.corrcoef(p_change, df1['Close'][1:])

print(f"Correlation Price / Return: " , round(corr[1][0], 2))
print(f'Sharpe Ratio: {sharpe_ratio}')
print(f'Maximum Drawdown: {round(maxDrawdown,4)}')
print(f'Max Drawdown Duration: {maxDrawdownDuration} days' )
print(f'Start day Drawdown: {startDrawdownDay}')
print(f"Average Yearly Return: {round(mean_ret*100, 2)} %")

Correlation Price / Return:  0.03
Sharpe Ratio: 8.2
Maximum Drawdown: -0.0984
Max Drawdown Duration: 48.0 days
Start day Drawdown: 2117
Average Yearly Return: 255.13 %


In [None]:

# Create figure and axis objects
plt.rcParams.update({'font.size': 12})

fig, ax1 = plt.subplots(figsize=(10, 7))
plt.title(f"Backtest Short Open Strategy - {ticker}")

# Plot data on the first y-axis
ax1.plot(df1.index, df1['Close'], 'g-', alpha = 0.5)
ax1.plot(df1.index, df1['half_kelly'], 'red', alpha = 0.5)
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price ', color='g')

# Create a second y-axis
ax2 = ax1.twinx()
ax2.plot(df1.index, df1['eod_equity'], 'b-', alpha = 0.3)
ax2.set_ylabel('Equity USD', color='b')

# Add black dotted line at y=0
#ax1.axhline(y=0, color='k', linestyle='--')
ax2.axhline(y=1e4, color='k', linestyle='--')

#Remove box lines around the chart area
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.spines['bottom'].set_visible(False)
ax1.spines['left'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
ax2.spines['bottom'].set_visible(False)
ax2.spines['left'].set_visible(False)

# Add text box
stats_text = f'Sharpe Ratio: {sharpe_ratio} :\n'
stats_text += f'Maximum Drawdown: {round(maxDrawdown*100,2)}% \n'
stats_text += f'Start day Drawdown: {startDrawdownDay} day \n'
stats_text += f"Drawdown Duration: {int(maxDrawdownDuration)} days \n"
stats_text += f"Average Yearly Return: {round(mean_ret*100, 2)} % \n"
stats_text += f"Average Yearly STD: {round(std*100, 2)} % \n"
fig.text(0.1, 0.03, stats_text, fontsize=12,
         verticalalignment='top', horizontalalignment='left',
         bbox=dict(facecolor='white', alpha=0.5,edgecolor='none'))

In [None]:

# Create figure and axis objects
plt.rcParams.update({'font.size': 12})

fig, ax1 = plt.subplots(figsize=(10, 7))
plt.title(f"Backtest Short Open Strategy - {ticker}")

# Plot data on the first y-axis
ax1.plot(df1.index, df1['Close'], 'g-', alpha = 0.5)
ax1.plot(df1.index, df1['half_kelly'], 'red', alpha = 0.5)
ax1.set_xlabel('Date')
ax1.set_ylabel('Close Price ', color='g')

# Create a second y-axis
ax2 = ax1.twinx()
ax2.plot(df1.index, df1['daily_ret'].cumsum(), 'b-', alpha = 0.3)
ax2.set_ylabel('Cumulative Return', color='b')

# Add black dotted line at y=0
#ax1.axhline(y=0, color='k', linestyle='--')
ax2.axhline(y=0, color='k', linestyle='--')

#Remove box lines around the chart area
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.spines['bottom'].set_visible(False)
ax1.spines['left'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
ax2.spines['bottom'].set_visible(False)
ax2.spines['left'].set_visible(False)

# Add text box
stats_text = f'Sharpe Ratio: {sharpe_ratio} :\n'
stats_text += f'Maximum Drawdown: {round(maxDrawdown*100,2)}% \n'
stats_text += f'Start day Drawdown: {startDrawdownDay} day \n'
stats_text += f"Drawdown Duration: {int(maxDrawdownDuration)} days \n"
stats_text += f"Average Yearly Return: {round(mean_ret*100, 2)} % \n"
stats_text += f"Average Yearly STD: {round(std*100, 2)} % \n"
fig.text(0.1, 0.03, stats_text, fontsize=12,
         verticalalignment='top', horizontalalignment='left',
         bbox=dict(facecolor='white', alpha=0.5,edgecolor='none'))