### NEW PREDICTIONS

In [15]:

import os 
import joblib
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn

from datetime import datetime, timedelta 
from mpl_toolkits import mplot3d
from scipy.stats import skew, norm, kurtosis
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

try:
    from Preprocessing_functions import *;
    from LSTM_Architecture import LSTM
    
except ModuleNotFoundError:
    from Strat_1.Preprocessing_functions import *;
    from Strat_1.LSTM_Architecture import LSTM

# Current working directory with Windows backslashes rewritten as forward slashes.
cwd = os.getcwd().replace("\\", "/"  )
# NOTE(review): this changes into the directory the process is already in, so it
# looks like a no-op — confirm whether it is still needed.
os.chdir(cwd)

In [2]:
# Display the working directory; the relative model/data paths used below are resolved against it.
cwd

'c:/Users/User/Documents/ATS_Development/Strat_1'

In [16]:
# Prepare data for LSTM model
def prepare_data(data, sequence_length):
    """Scale the feature columns and slice them into fixed-length windows.

    Args:
        data (pd.DataFrame): Frame holding a 'labels' column; every other
            column is treated as a model feature. The frame is NOT modified.
        sequence_length (int): Number of consecutive rows per window.

    Returns:
        tuple[np.ndarray, np.ndarray]:
            X     - windows of scaled features, one per start row.
            y_seq - the label of the last row of each window.
            ``len(rows) - sequence_length`` windows are produced, i.e. the
            window that would end on the final row is not emitted (callers
            index the result with that length in mind).

    Raises:
        KeyError: if ``data`` has no 'labels' column.
    """
    # Read the labels without popping them, so the caller's frame keeps its
    # 'labels' column and the calling cell can be executed more than once.
    labels = data['labels']
    feature_frame = data.drop(columns='labels')

    # Drop incomplete feature rows from BOTH features and labels. Filtering
    # only the features would shift every later label against its window.
    complete_rows = feature_frame.notna().all(axis=1)
    feature_frame = feature_frame[complete_rows]
    labels = labels[complete_rows]

    # NOTE(review): the scaler is fitted on the data passed in here. If the
    # network was trained with a scaler fitted on the training set, that fitted
    # scaler should be persisted and reused instead - confirm against training.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(feature_frame)

    # Create sequences
    n_windows = max(len(X_scaled) - sequence_length, 0)
    X = [X_scaled[start:start + sequence_length] for start in range(n_windows)]
    y_seq = [labels.iloc[start + sequence_length - 1] for start in range(n_windows)]

    return np.array(X), np.array(y_seq)

In [3]:

ticker = "SPY"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Strip stray whitespace and validate immediately: a malformed answer would
# otherwise only fail later, inside the date comparison on the index.
prediction_date = input("Choose date to predict for: today or YYYY-MM-DD: ").strip()

if prediction_date.lower() == "today":
    # Later cells compare against the exact literal "today".
    prediction_date = "today"
else:
    try:
        datetime.strptime(prediction_date, "%Y-%m-%d")
    except ValueError as err:
        raise ValueError(
            f"prediction_date must be 'today' or YYYY-MM-DD, got {prediction_date!r}"
        ) from err


In [5]:
# =============================================================================
# PULL DATA FROM DB
# =============================================================================
# Fetch the symbol's data, add the momentum features, then pass the result
# through the index date formatter (helpers come from Preprocessing_functions).
df = format_idx_date(
    create_momentum_feat(
        downlaod_symbol_data(ticker),  # period = "1day"
        ticker,
    )
)

# For an explicit date, keep only the rows strictly before it.
if prediction_date != "today":
    df = df.loc[df.index < prediction_date]

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,open_low,open_close,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-06,173.54,174.11,172.16,172.69,125672000,0.0,0.0,0.0,0.795206,0.489801,...,,,,,,,,,,
2015-02-09,172.04,172.77,171.51,171.92,87219000,0.0,0.0,0.0,0.308068,0.069751,...,,,,,,,,,,
2015-02-10,172.97,174.01,171.96,173.75,96164200,0.0,0.0,0.0,0.583916,-0.450945,...,,,,,,,,,,
2015-02-11,173.58,174.29,172.93,173.85,91087800,0.0,0.0,0.0,0.374467,-0.155548,...,0.67,,,,,,,,,
2015-02-12,174.66,175.58,173.89,175.53,97545900,0.0,0.0,0.0,0.440857,-0.498111,...,2.1,1.64,,,,,,,,


In [12]:

# =============================================================================
# LOAD KMEANS MODEL FOR LABELLING 
# =============================================================================
### LOAD KMEANS MODEL ###
KMEANS_PATH = f"kmeans_models/{ticker}/"
# Filter out the 'Junk' folder instead of list.remove('Junk') (which raises
# ValueError when the folder is absent), and sort because os.listdir returns
# entries in arbitrary order. The list is filtered BEFORE it is printed so the
# positions the user sees are the positions that get indexed.
KMEANS_FILES = sorted(name for name in os.listdir(KMEANS_PATH) if name != 'Junk')
if not KMEANS_FILES:
    raise FileNotFoundError(f"No KMeans model file found in {KMEANS_PATH}")
print('Choose a file for clustering: ', KMEANS_FILES)
idx = 0 if len(KMEANS_FILES) < 2 else int(input("Select file index: "))
KMEANS_NAME = KMEANS_FILES[idx]
print("Chosen K_MEANS MODEL file: ", KMEANS_NAME)
FILE = KMEANS_PATH + KMEANS_NAME
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files produced by this project.
loaded_kmeans = joblib.load(FILE)
del KMEANS_PATH, KMEANS_NAME, idx, FILE, KMEANS_FILES

### ASSIGN CLUSTER TO OBSERVATION
# The clustering model is applied to these three columns; rows with a missing
# value in any of them are left out.
data = df[["open_low", "open_close", "gap"]].dropna()
k_predictions = pd.DataFrame(
    {"labels": loaded_kmeans.predict(data)},
    index = data.index,
)

# Index-on-index inner merge: rows without a cluster label are discarded.
df_model = pd.merge(df, k_predictions, left_index = True, right_index = True)
del data, k_predictions, loaded_kmeans

Choose a file for clustering:  ['Junk', 'kmeans_model_df_SPY_k3_202402012133.joblib']
Chosen K_MEANS MODEL file:  kmeans_model_df_SPY_k3_202402012133.joblib


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
# =============================================================================
# DATA TRANSFORMATION  - SCALING AND LSTM FORMATTING 
# =============================================================================
### LOAD FEAT LIST TO ORDER THE DATA ###
FEAT_PATH = f"model_features/{ticker}/"
# Filter rather than list.remove('Junk'): remove() raises ValueError when the
# folder is absent. Sorted because os.listdir order is arbitrary, so a given
# index always refers to the same file.
FEAT_FILES = sorted(name for name in os.listdir(FEAT_PATH) if name != 'Junk')
if not FEAT_FILES:
    raise FileNotFoundError(f"No feature list found in {FEAT_PATH}")
print('Choose a features list to use:', FEAT_FILES)
idx = 0 if len(FEAT_FILES) < 2 else int(input("Select file index (e.g. 0,1,2)"))
FEAT_NAME = FEAT_FILES[idx]
print('Selected Feature list: ', FEAT_NAME)
# The CSV stores the feature names in a column called '0'.
MODEL_FEAT = pd.read_csv(FEAT_PATH + FEAT_NAME)['0'].to_list()
    

Choose a features list to use: ['LSTM_df_SPY_k3_202402012133_NFEAT23.csv']
Selected Feature list:  LSTM_df_SPY_k3_202402012133_NFEAT23.csv


In [14]:
# Rebuild the list instead of calling MODEL_FEAT.remove('last_day'): remove()
# raises ValueError once the entry is gone, so the cell failed when re-run.
MODEL_FEAT = [feat for feat in MODEL_FEAT if feat != 'last_day']
end_date = df_model.index.max()
# Keep only the model's columns, in the stored order, and drop incomplete rows.
df_model = df_model[MODEL_FEAT].dropna()
df_model.columns

Index(['labels', 'open_low', 'open_close', 'gap', 'open_high', 'low_close',
       'high_close', 'high_low', 'Dividends', 'Volume', 'SPY_mom1', 'SPY_mom2',
       'SPY_mom3', 'SPY_mom4', 'SPY_mom5', 'SPY_mom10', 'SPY_mom15',
       'SPY_mom20', 'SPY_mom60', 'SPY_mom120', 'SPY_mom180', 'SPY_mom240'],
      dtype='object')

In [17]:
# Might not be required (sequence length).
# NOTE(review): the later cells shown here use SEQUENCE_LENGTH, not seq_length — confirm this is still needed.
seq_length =  1
# Order rows by index, descending, so position 0 holds the largest (latest) date.
df_model = df_model.sort_index(ascending = False)
df_model.head()

Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-05,1,0.342968,-0.596031,-0.189438,-0.621004,-0.94223,-0.024819,0.958022,0.0,30567900,...,0.4,-0.14,0.4,-0.37,3.78,1.49,2.57,12.19,15.39,23.08
2025-02-04,1,0.091999,-0.660723,0.010037,-0.747704,-0.753415,-0.086336,0.833472,0.0,33457800,...,-0.54,-0.0,-0.45,-0.21,3.51,1.66,4.7,13.57,14.69,22.69
2025-02-03,1,0.367827,-0.860513,-1.520388,-1.285707,-1.232874,-0.419797,1.632544,0.0,65857200,...,-0.67,-1.12,-0.27,0.03,2.98,2.25,5.26,12.88,15.34,21.21
2025-01-31,0,1.061728,0.934979,0.406585,-0.404938,-0.128109,-1.334514,1.460752,0.0,66671500,...,-0.45,0.41,-1.01,1.72,2.09,2.69,5.75,14.14,16.65,21.42
2025-01-30,2,0.536459,-0.17882,0.357256,-0.437115,-0.719137,-0.257171,0.969337,0.0,39281300,...,0.95,-0.48,-0.77,2.07,2.79,2.86,6.76,17.41,17.29,22.91


In [18]:

class CustomLSTMModel_2(nn.Module):
    """Two chained LSTM layers followed by a linear output layer.

    The attribute names ``lstm1``, ``lstm2`` and ``fc`` become the state-dict
    key prefixes, so they must stay as they are for saved weights to load.
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        """
        Args:
            input_size (int): The number of input features.
            hidden_sizes (list): Hidden sizes of the LSTM layers (e.g. [128, 64]).
                Only the first two entries are read.
            output_size (int): The number of output features.
        """
        super().__init__()

        first_hidden = hidden_sizes[0]
        second_hidden = hidden_sizes[1]

        # Layers are created in the order lstm1 -> lstm2 -> fc.
        self.lstm1 = nn.LSTM(input_size, first_hidden, batch_first=True)
        self.lstm2 = nn.LSTM(first_hidden, second_hidden, batch_first=True)
        self.fc = nn.Linear(second_hidden, output_size)

    def forward(self, x):
        """Run ``x`` through both LSTMs and project the last time step."""
        # nn.LSTM returns (output, state); only the output sequence is used.
        sequence = self.lstm1(x)[0]
        sequence = self.lstm2(sequence)[0]

        # Feed only the final time step to the linear layer.
        final_step = sequence[:, -1, :]
        return self.fc(final_step)

In [19]:
# LOAD LSTM MODEL STATE DICT  
MODEL_PATH = f"lstm_models/Testing/{ticker}/"
# Filter out the optional 'Junk' folder (no try/except needed) and sort,
# because os.listdir returns entries in arbitrary order.
LSTM_FILES = sorted(name for name in os.listdir(MODEL_PATH) if name != 'Junk')
if not LSTM_FILES:
    raise FileNotFoundError(f"No LSTM state dict found in {MODEL_PATH}")
print("LSTM Files: ",LSTM_FILES)
idx = 0 if len(LSTM_FILES) < 2 else int(input("Select file index: "))
MODEL_NAME = LSTM_FILES[idx]
print("Chosen LSTM, MODEL file: ", MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture / training parameters. The layer sizes must match the values
# used when the state dict was saved, otherwise load_state_dict raises.
INPUT_SIZE = 21          # number of feature columns fed to the network
HIDDEN_SIZES = [32,16]   # for custom lstm
OUTPUT_SIZE = 3          # one logit per cluster label
LR = 0.001
SEQUENCE_LENGTH = 1


# INSTANTIATE MODEL
model = CustomLSTMModel_2(INPUT_SIZE, HIDDEN_SIZES, OUTPUT_SIZE)
# LOAD LSTM MODEL STATE DICT
# map_location="cpu": a checkpoint saved from a GPU run otherwise fails to
# deserialize on a machine without CUDA. The model is built on the CPU here.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(f = MODEL_PATH + MODEL_NAME, map_location = "cpu"))

 
LSTM Files:  ['LSTM_Class_Epoch_349_TestAcc_0.98_TrainAcc_0.99_202502051454']
Chosen LSTM, MODEL file:  LSTM_Class_Epoch_349_TestAcc_0.98_TrainAcc_0.99_202502051454


<All keys matched successfully>

In [20]:
# Pass a copy: prepare_data pops the 'labels' column from the frame it is
# given, so handing it df_model directly made this cell raise KeyError when
# it was executed a second time.
X, y = prepare_data(df_model.copy(), SEQUENCE_LENGTH)

print('X shape: ', X.shape)
print('y shape: ', y.shape)

X shape:  (2274, 1, 21)
y shape:  (2274,)


In [23]:
# No .squeeze(0): X is already (n_windows, seq_len, n_features). With exactly
# one window, squeeze(0) would drop the batch axis and the model's
# out[:, -1, :] indexing would fail.
X_tensor = torch.from_numpy(X).type(torch.float).to('cpu')

#### PREDICTION ####
model.eval()

with torch.inference_mode():

    output = model(X_tensor)
    pred = torch.softmax(output, dim = 1).argmax(dim = 1)


# Window i covers rows i .. i + SEQUENCE_LENGTH - 1, so prediction i belongs to
# the date at position i + SEQUENCE_LENGTH - 1. For SEQUENCE_LENGTH == 1 this is
# the same slice as index[:-1], but it stays aligned for longer windows.
first_row = SEQUENCE_LENGTH - 1
prediction_index = df_model.index[first_row:first_row + len(pred)]
predictions = pd.DataFrame(pred.to("cpu").numpy(), columns = ["predictions"], index = prediction_index)

predictions.head()

Unnamed: 0_level_0,predictions
Date,Unnamed: 1_level_1
2025-02-05,1
2025-02-04,1
2025-02-03,1
2025-01-31,0
2025-01-30,2


In [24]:

# Cluster stats
STATS_PATH = f"Data/{ticker}/k_stats/"
# Filter out the optional 'Junk' folder before printing, so the positions shown
# match the list that is indexed and a missing folder does not raise
# ValueError. Sorted because os.listdir order is arbitrary.
STATS_FILES = sorted(name for name in os.listdir(STATS_PATH) if name != 'Junk')
if not STATS_FILES:
    raise FileNotFoundError(f"No cluster stats file found in {STATS_PATH}")
print("KMEANS Stats files: ", STATS_FILES)
idx = 0 if len(STATS_FILES) < 2 else int(input("Select file index: "))
STATS_NAME = STATS_FILES[idx]
print("Chosen K_STATS file: ", STATS_NAME)
# The statistic names are stored in the CSV's unnamed first column; use it as the index.
cluster_stats = pd.read_csv(STATS_PATH + STATS_NAME).set_index("Unnamed: 0")

del STATS_PATH, idx, STATS_NAME

KMEANS Stats files:  ['Junk', 'KMEANS_Stats_df_SPY_k3_202402012133.csv']
Chosen K_STATS file:  KMEANS_Stats_df_SPY_k3_202402012133.csv


In [25]:

n_clusters = 3

# Cluster id -> action, decided from each cluster's "mean" row in cluster_stats.
actions = {}

for cluster in range(n_clusters):
    mean_profit = cluster_stats.loc["mean", f"open_low_{cluster}"]
    mean_loss = cluster_stats.loc["mean", f"open_close_{cluster}"]

    # SELL only when open_low's mean exceeds open_close's mean and the latter is positive.
    actions[cluster] = "SELL" if mean_profit > mean_loss > 0 else "HOLD"

# Report the action for the first prediction in `pred`.
print(ticker, actions[pred[0].item()])

SPY HOLD


In [27]:

# Rebuild the predictions frame with a default integer index (replaces the
# `predictions` variable created earlier).
predictions = pd.DataFrame({"predictions": pred.to("cpu").numpy()})
predictions
    

Unnamed: 0,predictions
0,1
1,1
2,1
3,0
4,2
...,...
2269,0
2270,0
2271,1
2272,0


In [33]:
# How many times each cluster label was predicted.
predictions.value_counts()

predictions
2              1327
1               666
0               281
Name: count, dtype: int64