In [9]:
import os
import numpy as np 
import pandas as pd 
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime 
from pathlib import Path
from Preprocessing_functions import *
from techinical_analysis import * 

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import label_binarize

In [3]:
ticker = "SPY"

# LOAD DF FOR MODEL BUILDING 
FILE_PATH = f"Data/{ticker}/df/"
print("DataFrames for model building: ", os.listdir(FILE_PATH))
idx = 0 if len(os.listdir(FILE_PATH)) < 3 else int(input("Select file index: "))
DF_NAME = os.listdir(FILE_PATH)[idx] 
FILE_PATH_NAME = FILE_PATH + DF_NAME

df_model = pd.read_parquet(FILE_PATH_NAME)
df_model = format_idx_date(df_model)
df_model.head()

DataFrames for model building:  ['df_SPY_k3_202402012133.parquet', 'Junk']


Unnamed: 0_level_0,labels,open_low,open_close,gap,open_high,low_close,high_close,high_low,Dividends,Volume,...,SPY_mom3,SPY_mom4,SPY_mom5,SPY_mom10,SPY_mom15,SPY_mom20,SPY_mom60,SPY_mom120,SPY_mom180,SPY_mom240
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-15,0,1.363875,1.293933,0.380295,-0.192341,-0.070909,-1.483421,1.553229,0.0,176613900,...,-1.79,-2.56,-3.34,-3.17,-4.2,1.12,5.16,1.69,7.28,16.5
2015-01-16,1,0.112333,-1.43668,-0.124004,-1.531276,-1.550755,-0.09317,1.61882,0.0,211879600,...,-0.23,-0.5,-1.28,-1.85,-2.96,0.47,4.47,2.98,8.67,17.21
2015-01-20,2,1.103176,0.168379,0.384683,-0.156767,-0.945224,-0.324638,1.257971,0.0,130991100,...,0.6,-0.01,-0.29,0.17,-3.06,-1.74,5.44,3.64,9.06,17.61
2015-01-21,1,0.274116,-0.787356,-0.279167,-1.073137,-1.06439,-0.282747,1.332949,0.0,122942700,...,2.04,1.11,0.49,1.64,-2.7,-1.66,4.76,4.15,9.41,16.66
2015-01-22,1,0.812306,-1.036986,0.445576,-1.111879,-1.864436,-0.07407,1.903025,0.0,174356000,...,2.22,3.56,2.61,1.88,-0.72,-0.66,5.5,7.83,12.01,16.95


In [4]:
ticker = "SPY"
n_clusters = 3 
time_period = "360mo" # must be the same as in 1_Data_Acquisition or larger

In [5]:
### LOAD KMEANS MODEL ###
KMEANS_PATH = f"kmeans_models/{ticker}/"
#print(os.getcwd())
KMEANS_MODEL_PATH = os.listdir(KMEANS_PATH)
try:
    KMEANS_MODEL_PATH.remove('Junk')
except ValueError:
    print(' ')    

print(KMEANS_MODEL_PATH)
idx = 0 if len(KMEANS_MODEL_PATH) < 2 else int(input("Select file index: "))
KMEANS_NAME = KMEANS_MODEL_PATH[idx]
print("Chosen K_MEANS MODEL file: ", KMEANS_NAME)
FILE = KMEANS_PATH + KMEANS_NAME
loaded_kmeans = joblib.load(FILE)

### LOAD FEAT LIST TO ORDER THE DATA ###
FEAT_PATH = f"model_features/{ticker}/"
FEAT_FILES = os.listdir(FEAT_PATH)
try:
    FEAT_FILES.remove('Junk')
except ValueError:
    print(' ')
print(FEAT_FILES)
idx = 0 if len(FEAT_FILES) < 2 else int(input("Select file index (e.g. 0,1,2)"))
FEAT_NAME = FEAT_FILES[idx]
MODEL_FEAT = pd.read_csv(FEAT_PATH + FEAT_NAME)['0'].to_list()
#MODEL_FEAT.pop(-1)

# Cluster stats
STATS_PATH = f"Data/{ticker}/k_stats/"
STATS_FILES = os.listdir(STATS_PATH)
try:
    STATS_FILES.remove('Junk')
except ValueError:
    print(' ')
print("KMEANS Stats files: ", STATS_FILES)
idx = 0 if len(STATS_FILES) < 2 else int(input("Select file index: "))
STATS_NAME = STATS_FILES[idx]
print("Chosen K_STATS file: ", STATS_NAME)
cluster_stats = pd.read_csv(STATS_PATH + STATS_NAME).set_index("Unnamed: 0")

# LOAD DF FOR MODEL BUILDING TO CHECK DATE RANGES 
DF_PATH = f"Data/{ticker}/df/"
DF_FILES = os.listdir(DF_PATH)
try:
    DF_FILES.remove('Junk')
except ValueError:
    print(' ')
print("DataFrames for model building: ", DF_FILES)
idx = 0 if len(DF_FILES) < 2 else int(input("Select file index: "))
DF_NAME = DF_FILES[idx] 
print("Chosen DataFrame file: ", DF_NAME)
df_dates = pd.read_parquet(DF_PATH + DF_NAME)
df_dates = format_idx_date(df_dates)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

['kmeans_model_df_SPY_k3_202402012133.joblib']
Chosen K_MEANS MODEL file:  kmeans_model_df_SPY_k3_202402012133.joblib
['LSTM_df_SPY_k3_202402012133_NFEAT23.csv']
KMEANS Stats files:  ['KMEANS_Stats_df_SPY_k3_202402012133.csv']
Chosen K_STATS file:  KMEANS_Stats_df_SPY_k3_202402012133.csv
DataFrames for model building:  ['df_SPY_k3_202402012133.parquet']
Chosen DataFrame file:  df_SPY_k3_202402012133.parquet


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
### DOWNLOAD DATA ###
df = downlaod_symbol_data(ticker, period = time_period)
# Use an alternative to yfinance
#df = download_data(ticker, days = 7200)
df = format_idx_date(df)
print('Start date: ',df.index.min())
df.head()

Start date:  1995-05-01 00:00:00


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,open_low,open_close,open_high,high_low,low_close,high_close,gap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1995-05-01,30.36,30.42,30.3,30.3,518700,0.0,0.0,0.0,0.197628,0.197628,-0.197628,0.394477,0.0,-0.394477,
1995-05-02,30.33,30.41,30.26,30.37,228400,0.0,0.0,0.0,0.230795,-0.131883,-0.263765,0.493259,-0.363516,-0.131536,0.09901
1995-05-03,30.47,30.79,30.47,30.79,724700,0.0,0.0,0.0,0.0,-1.050213,-1.050213,1.039298,-1.050213,0.0,0.329272
1995-05-04,30.83,31.03,30.7,30.77,311400,0.0,0.0,0.0,0.421667,0.194616,-0.648719,1.063487,-0.228013,-0.837899,0.129912
1995-05-05,30.9,30.9,30.67,30.73,314900,0.0,0.0,0.0,0.744337,0.550162,0.0,0.744337,-0.195631,-0.550162,0.422489


In [7]:
# REMOVE DATA SNOOPING 
out_sample = True
manual = True

if out_sample is True:
    
    if manual != True:
        start_date = df_dates.index.min()
        df = df[df.index <= start_date]
    
    else: 
        #Manually choosing the cutoff date
        df = df[df.index >= '2022-07-01']
        #df = df[df.index >= '2010-01-01']
    
    del DF_NAME, df_dates 

In [10]:
df = create_momentum_feat(df, ticker).dropna()
df = momentum_oscillators(df)
df = volatility(df)
df = reversal_patterns(df) 
df = continuation_patterns(df)
df = magic_doji(df)
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
       'Capital Gains', 'open_low', 'open_close', 'open_high', 'high_low',
       'low_close', 'high_close', 'gap', 'SPY_mom1', 'SPY_mom2', 'SPY_mom3',
       'SPY_mom4', 'SPY_mom5', 'SPY_mom10', 'SPY_mom15', 'SPY_mom20',
       'SPY_mom60', 'SPY_mom120', 'SPY_mom180', 'SPY_mom240', 'rsi',
       'macd_values', 'macd_signal_line', 'atr', 'bband_up', 'bband_mid',
       'bband_low', 'hammer', 'hanging_man', 'engulfing_pattern', 'dark_cloud',
       'piercing_line', 'morning_star', 'evening_star', 'shooting_star',
       'inverted_hammer', 'harami', 'harami_cross', 'belt_hold',
       'upsidegap_two_crows', 'three_black_crows', 'three_white_soldiers',
       'advance_block', 'stalled_pattern', 'counterattack', 'tasuki',
       'rf_three_methods', 'separating_lines', 'long_legged_doji',
       'gravestone_doji', 'dragonfly_doji', 'tristar_doji'],
      dtype='object')

In [22]:

class DeepNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DeepNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim)
        )

    def forward(self, x):
        return self.net(x)

# Load 
input_dim = 21
output_dim = 2
model = DeepNN(input_dim, output_dim)
model.load_state_dict(torch.load("NN_models/simple_nn_multiclass.pt"))
model.eval()

DeepNN(
  (net): Sequential(
    (0): Linear(in_features=21, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=2, bias=True)
  )
)

In [23]:
### ASSIGN CLUSTER TO OBSERVATION ###
data = df[["open_low", "open_close", "gap"]].dropna()
print(data.shape)
k_predictions = pd.DataFrame(loaded_kmeans.predict(data), columns = ["labels"], index = data.index)
#data = data.merge(k_predictions, left_index = True, right_index = True)#.reset_index()
del FILE, KMEANS_PATH, loaded_kmeans

df_model = df.merge(k_predictions, left_index = True, right_index = True)
df_model.head()

(229, 3)


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,open_low,open_close,...,stalled_pattern,counterattack,tasuki,rf_three_methods,separating_lines,long_legged_doji,gravestone_doji,dragonfly_doji,tristar_doji,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-05-30,517.93,518.6,514.78,516.04,46468500,0.0,0.0,0.0,0.60819,0.364914,...,0,0,0,0,0,0,0,0,0,2
2024-05-31,517.01,520.87,511.85,520.74,90785800,0.0,0.0,0.0,0.998046,-0.721456,...,0,0,0,0,0,0,0,0,0,1
2024-06-03,522.37,522.66,516.03,521.17,46835700,0.0,0.0,0.0,1.213699,0.229722,...,0,0,0,0,0,0,0,0,0,2
2024-06-04,519.84,522.5,518.36,521.75,34632700,0.0,0.0,0.0,0.284703,-0.367421,...,0,0,0,0,0,0,0,0,0,2
2024-06-05,524.1,527.97,522.09,527.95,47610400,0.0,0.0,0.0,0.383515,-0.734593,...,0,0,0,0,0,0,0,0,0,1


In [38]:
df_model

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains,open_low,open_close,...,stalled_pattern,counterattack,tasuki,rf_three_methods,separating_lines,long_legged_doji,gravestone_doji,dragonfly_doji,tristar_doji,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-05-30,517.93,518.60,514.78,516.04,46468500,0.0,0.0,0.0,0.608190,0.364914,...,0,0,0,0,0,0,0,0,0,2
2024-05-31,517.01,520.87,511.85,520.74,90785800,0.0,0.0,0.0,0.998046,-0.721456,...,0,0,0,0,0,0,0,0,0,1
2024-06-03,522.37,522.66,516.03,521.17,46835700,0.0,0.0,0.0,1.213699,0.229722,...,0,0,0,0,0,0,0,0,0,2
2024-06-04,519.84,522.50,518.36,521.75,34632700,0.0,0.0,0.0,0.284703,-0.367421,...,0,0,0,0,0,0,0,0,0,2
2024-06-05,524.10,527.97,522.09,527.95,47610400,0.0,0.0,0.0,0.383515,-0.734593,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-23,540.43,545.43,533.88,535.42,90590700,0.0,0.0,0.0,1.211998,0.927040,...,0,0,0,0,0,0,0,0,0,0
2025-04-24,536.72,547.43,535.45,546.69,64150400,0.0,0.0,0.0,0.236622,-1.857579,...,0,0,0,0,0,0,0,0,0,1
2025-04-25,546.65,551.05,543.69,550.64,61119600,0.0,0.0,0.0,0.541480,-0.729900,...,0,0,0,0,0,0,0,0,0,1
2025-04-28,551.39,553.55,545.02,550.85,47476700,0.0,0.0,0.0,1.155262,0.097934,...,0,0,0,0,0,100,0,0,0,2


In [None]:
y_pred = df_model['labels'].values

array([2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 1, 0,
       0, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 0, 1, 2, 2, 2, 2, 1, 0,
       2, 2, 0, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2,
       2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 1, 1, 1, 2, 2, 2, 2, 0, 1, 2, 0, 2,
       0, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 0, 1, 1, 1, 2, 0, 2, 1,
       1, 1, 2, 2, 1, 2, 0, 2, 2, 2, 0, 1, 0, 2, 1, 2, 1, 0, 0, 0, 0, 1,
       1, 2, 1, 2, 1, 1, 2, 0, 2, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 2,
       0, 2, 0, 1, 0, 1, 1, 2, 1])

In [47]:

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(df_model[MODEL_FEAT[1:]].values)
y_true = df_model['labels'].values

# Convert to torch tensors (no batches!)
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)

with torch.no_grad():
    logits = model(X_tensor)
    probs = torch.softmax(logits, dim=1)
    preds = torch.argmax(probs, dim=1).numpy()
    probs_np = probs.numpy()
    
#print(f"\n--- {dataset_name} Performance ---")
print("Accuracy:", accuracy_score(y_true, preds))
print("Precision (macro):", precision_score(y_true, preds, average="macro"))
print("Recall (macro):", recall_score(y_true, preds, average="macro"))
print("F1 Score (macro):", f1_score(y_true, preds, average="macro"))

Accuracy: 0.3231441048034934
Precision (macro): 0.19305417082087073
Recall (macro): 0.398562091503268
F1 Score (macro): 0.2591763587155292


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
