In [None]:
import os, glob
import numpy as np
import pandas as pd
from datetime import time
from pytz import timezone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [None]:
# 1) —— Data Import & Preprocessing —— #

# Directory holding the raw "*.txt" price files; passed to load_data() below.
# The path is relative, so it resolves against the notebook's working directory.
DATA_DIR = "../SPX"
# CSV destination paths. They are not referenced in this part of the notebook —
# presumably later cells write the feature table / full data set here (TODO confirm).
OUTPUT_FEATURE_CSV = "../csvfiles_python/features_360_raw.csv"
OUTPUT_ALLSET_CSV  = "../csvfiles_python/allSet_360_raw.csv"

def load_data(path):
    """Load every ``*.txt`` price file under ``path`` into one intraday frame.

    Each file is read as a header-less, comma-separated table with the columns
    DateTime, Open, High, Low, Close. Files are concatenated in sorted filename
    order, the timestamps are localized to US/Eastern, and only rows whose
    time of day lies between 09:30 and 16:00 are returned.

    Parameters
    ----------
    path : str
        Directory that contains the ``*.txt`` files.

    Returns
    -------
    pandas.DataFrame
        Open/High/Low/Close columns indexed by the tz-aware ``DateTime``.

    Raises
    ------
    ValueError
        If ``path`` contains no ``*.txt`` file.
    """
    files = sorted(glob.glob(os.path.join(path, "*.txt")))
    if not files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # fail early with a message that names the directory instead.
        raise ValueError(f"No '*.txt' data files found in '{path}'")

    column_names = ["DateTime", "Open", "High", "Low", "Close"]
    dfs = [
        pd.read_csv(f, names=column_names, sep=",", parse_dates=["DateTime"])
        for f in files
    ]
    spx = pd.concat(dfs, ignore_index=True)
    # Attach the US/Eastern zone (covers both EST and EDT) to the parsed
    # timestamps; tz_localize expects them to be timezone-naive.
    spx["DateTime"] = spx["DateTime"].dt.tz_localize(timezone("US/Eastern"))
    spx = spx.set_index("DateTime")
    # Keep regular trading hours only, 09:30–16:00
    return spx.between_time("09:30", "16:00")

# Build the combined intraday frame from every file under DATA_DIR
spx = load_data(DATA_DIR)

# When a timestamp occurs more than once, keep only its first row
spx = spx.loc[~spx.index.duplicated(keep="first")]

In [None]:
# 2) —— Create daily labels —— #

# Tag every bar with the calendar date of its timestamp and bucket bars by day
spx["Date"] = spx.index.date
groups = spx.groupby("Date")

# Close of the last bar of each day
lmP = groups["Close"].last()

# Mean close over all bars of the day except the final 30 rows.
# NOTE(review): this equals "start to (end − 30 mins)" only when bars are one
# minute apart with none missing — confirm for shortened or gappy sessions.
# "Close" is selected before apply so the lambda never receives the grouping
# column; DataFrameGroupBy.apply on grouping columns is deprecated (pandas 2.2).
avgP = groups["Close"].apply(lambda close: close.iloc[:-30].mean())
avgP.name = None  # groups.apply(...) returned an unnamed Series; keep it that way

# Binary label: 1 if avgP < lmP else 0.
# A day with 30 bars or fewer has avgP = NaN, which compares False and yields 0.
y = (avgP < lmP).astype(int)
y.name = "Y"

In [None]:
# 3) —— Technical‐Indicator Functions —— #

# Basic EMAs & SMAs
def SMA(x, n):
    """Simple moving average of ``x`` over a trailing window of ``n`` rows.

    Positions whose window holds fewer than ``n`` observations are NaN.
    """
    window = x.rolling(window=n, min_periods=n)
    return window.mean()

def EMA(x, n):
    """Exponential moving average of ``x`` with span ``n``.

    Uses the recursive form (``adjust=False``), so the first output equals the
    first observation.
    """
    smoother = x.ewm(span=n, adjust=False)
    return smoother.mean()

def DEMA(x, n):
    """Double exponential moving average: ``2 * EMA(x) - EMA(EMA(x))``.

    Both smoothing passes use the same span ``n``.
    """
    single = EMA(x, n)
    double = EMA(single, n)
    return 2 * single - double

# True Range & ATR
def ATR(H, L, C, n=14):
    """Average true range: the span-``n`` EMA of the true range.

    The true range of a bar is the largest of high − low, |high − previous
    close| and |low − previous close|. The first bar has no previous close, so
    its true range is simply high − low (NaN candidates are skipped by max).
    """
    prior_close = C.shift(1)
    bar_range = H - L
    gap_from_high = (H - prior_close).abs()
    gap_from_low = (L - prior_close).abs()
    true_range = pd.concat(
        [bar_range, gap_from_high, gap_from_low], axis=1
    ).max(axis=1)
    return EMA(true_range, n)

# Average Directional Index (ADX)
def ADX(H, L, C, n=14):
    """Average directional index: the span-``n`` EMA of the directional index DX.

    Steps: directional movements (+DM / −DM) from bar-to-bar changes in the
    high and low, each EMA-smoothed and divided by the EMA-smoothed true range
    to give +DI / −DI (scaled by 100); DX = 100 * |+DI − −DI| / (+DI + −DI).
    Rows where both DI values are zero give DX = NaN (0/0).
    """
    rise = H.diff()
    fall = -L.diff()

    # A move counts only if it is positive and larger than the opposite move
    plus_dm = pd.Series(
        np.where((rise > fall) & (rise > 0), rise, 0.0), index=H.index
    )
    minus_dm = pd.Series(
        np.where((fall > rise) & (fall > 0), fall, 0.0), index=H.index
    )

    prior_close = C.shift(1)
    true_range = pd.concat(
        [H - L, (H - prior_close).abs(), (L - prior_close).abs()], axis=1
    ).max(axis=1)
    smoothed_tr = EMA(true_range, n)

    plus_di = 100 * EMA(plus_dm, n) / smoothed_tr
    minus_di = 100 * EMA(minus_dm, n) / smoothed_tr

    di_gap = (plus_di - minus_di).abs()
    di_total = plus_di + minus_di
    dx = 100 * (di_gap / di_total)
    return EMA(dx, n)

In [None]:
# Aroon
def aroon(xH, xL, n=14):
    """Aroon up/down lines over a trailing window of ``n`` rows.

    For each window the score is ``((n - bars_since_extreme) / n) * 100``,
    where the extreme is the window's maximum of ``xH`` (up) or minimum of
    ``xL`` (down); on ties the earliest occurrence is used.

    Returns a DataFrame with the columns ``aroonUp`` and ``aroonDn``.
    """
    def _score(window, extreme_pos):
        # Number of rows between the extreme and the newest row of the window
        bars_since = len(window) - 1 - extreme_pos
        return ((n - bars_since) / n) * 100

    def _up(window):
        return _score(window, window.argmax())

    def _down(window):
        return _score(window, window.argmin())

    up_line = xH.rolling(n).apply(_up, raw=True)
    down_line = xL.rolling(n).apply(_down, raw=True)
    return pd.DataFrame({"aroonUp": up_line, "aroonDn": down_line})

# Bollinger Bands
def BBands(x, n=20, k=2, ddof=1):
    """Bollinger Bands: an ``n``-row moving average ± ``k`` rolling std devs.

    Parameters
    ----------
    x : pandas.Series
        Input series (e.g. closing prices).
    n : int
        Window length for both the moving average and the standard deviation.
    k : float
        Band width in standard deviations.
    ddof : int
        Delta degrees of freedom for the rolling standard deviation. The
        default of 1 (sample std) is what pandas uses and what this function
        has always produced; pass 0 for the population standard deviation.

    Returns
    -------
    pandas.DataFrame
        Columns ``bb_up``, ``bb_mid`` and ``bb_low``; rows with fewer than
        ``n`` observations are NaN.
    """
    mid = SMA(x, n)
    spread = k * x.rolling(n).std(ddof=ddof)
    return pd.DataFrame({
        "bb_up":  mid + spread,
        "bb_mid": mid,
        "bb_low": mid - spread,
    })

# Commodity Channel Index
def CCI(H, L, C, n=20):
    """Commodity Channel Index over a trailing window of ``n`` rows.

    CCI = (TP − SMA(TP)) / (0.015 × mean absolute deviation of TP), where the
    typical price TP is the average of high, low and close.
    """
    typical = (H + L + C) / 3
    baseline = SMA(typical, n)

    def _mean_abs_dev(window):
        # Average distance of the window's values from the window mean
        return np.mean(np.abs(window - window.mean()))

    mean_dev = typical.rolling(n).apply(_mean_abs_dev, raw=True)
    return (typical - baseline) / (0.015 * mean_dev)

# Chaikin Volatility
def chaikin_volatility(H, L, n=10, ema_n=10):
    """Chaikin volatility: percent change of the smoothed high−low range.

    The range ``H - L`` is smoothed with a span-``ema_n`` EMA and compared with
    its own value ``n`` rows earlier; the first ``n`` rows are NaN.
    """
    smoothed = EMA(H - L, ema_n)
    lagged = smoothed.shift(n)
    change = smoothed - lagged
    return change / lagged * 100

# Close Location Value
def CLV(H, L, C):
    """Close Location Value: where the close sits inside the bar's range.

    CLV = ((C − L) − (H − C)) / (H − L): +1 when the close equals the high,
    −1 when it equals the low.

    For pandas inputs, bars with a zero range (H == L) are assigned 0 instead
    of the NaN/inf that the division would produce, so flat bars do not leak
    missing values into downstream features. Rows whose inputs are NaN stay
    NaN. Non-pandas inputs are evaluated with the plain formula.
    """
    bar_range = H - L
    clv = ((C - L) - (H - C)) / bar_range
    if isinstance(bar_range, (pd.Series, pd.DataFrame)):
        # Flat bar: the close is neither nearer the high nor the low -> neutral 0
        clv = clv.mask(bar_range == 0, 0.0)
    return clv