In [None]:
# %pip install -r requirements.txt

In [19]:
import os, glob
import numpy as np
import pandas as pd
import pywt
from datetime import time
from pytz import timezone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [20]:
# 1) —— Data Import & Preprocessing —— #

DATA_DIR = "../SPX"
DATA_DIR_new = "../SPX_new"
FEATURE_CSV = "../csvfiles_new/features_360.csv"
ALLSET_CSV  = "../csvfiles_new/allSet_360.csv"

def load_data(path):
    files = glob.glob(os.path.join(path, "*.txt"))
    dfs = []
    for f in sorted(files):
        df = pd.read_csv(f, names=["DateTime","Open","High","Low","Close"],
                         sep=",", parse_dates=["DateTime"])
        dfs.append(df)
    spx = pd.concat(dfs, ignore_index=True)
    # localize to EST
    spx["DateTime"] = spx["DateTime"].dt.tz_localize(timezone("US/Eastern"))
    spx.set_index("DateTime", inplace=True)
    #Keep dates up until 2020
    spx = spx.loc[:'2020']
    # filter trading hours 9:30–16:00
    return spx.between_time("09:30","16:00")

spx = load_data(DATA_DIR)

def load_data_new(path):
    file_name = "SPX_2021_2025.csv"
    full_file_path = os.path.join(path, file_name)
    spx_new = pd.read_csv(full_file_path, 
                     names=["DateTime", "Open", "High", "Low", "Close"],
                     sep=",", 
                     parse_dates=["DateTime"],
                     header=0)
    # Set dates to datetime
    spx_new["DateTime"] = pd.to_datetime(spx_new["DateTime"], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
    # localize to EST
    spx_new["DateTime"] = spx_new["DateTime"].dt.tz_localize(timezone("US/Eastern"))
    spx_new.set_index("DateTime", inplace=True)
    # filter trading hours 9:30–16:00
    spx_new = spx_new.between_time("09:30", "16:00")

    print(f"Pomyślnie załadowano i przetworzono plik: {file_name}")
    return spx_new

spx_new = load_data_new(DATA_DIR_new)

# Drop duplicates
spx = spx[~spx.index.duplicated(keep='first')]

spx = pd.concat([spx, spx_new], ignore_index=False)

Pomyślnie załadowano i przetworzono plik: SPX_2021_2025.csv


In [21]:
# 1a) —— Discrete Wavelet Shrinkage on each price series —— #

def wav_shrink(x, wavelet="db2", level=1, mode="soft"):
    """
    Decompose x via DWT, apply universal soft-threshold to detail coeffs,
    and reconstruct.
    """
    coeffs = pywt.wavedec(x, wavelet, level=level)
    # estimate noise sigma from finest detail coeffs
    detail = coeffs[-1]
    sigma = np.median(np.abs(detail)) / 0.6745
    # universal threshold
    thr = sigma * np.sqrt(2 * np.log(len(x)))
    # threshold detail coefficients only
    coeffs_shrunk = [coeffs[0]] + [pywt.threshold(c, thr, mode=mode) for c in coeffs[1:]]
    return pywt.waverec(coeffs_shrunk, wavelet)[: len(x)]

# apply to each column, keep pandas Series with same index
dwt_Close = pd.Series(
    wav_shrink(spx["Close"].values), index=spx.index, name="Close")
dwt_High  = pd.Series(
    wav_shrink(spx["High"].values),  index=spx.index, name="High")
dwt_Low   = pd.Series(
    wav_shrink(spx["Low"].values),   index=spx.index, name="Low")
dwt_Open  = pd.Series(
    wav_shrink(spx["Open"].values),  index=spx.index, name="Open")

# override spx with the denoised (DWT) series
spx_dwt = spx.copy()
for col, series in [("High", dwt_High), ("Low", dwt_Low),
                    ("Close", dwt_Close), ("Open", dwt_Open)]:
    spx_dwt[col] = series
spx = spx_dwt

In [22]:
# 2) —— Create daily labels based on DWT-filtered Close —— #

spx["Date"] = spx.index.date
groups = spx.groupby("Date")

# last-minute dwt_Close each day
lmP = groups["Close"].last()

# average from first bar up to bar-30 (i.e. minus 30 minutes)
avgP = groups.apply(lambda df: df["Close"].iloc[:-30].mean())

y = (avgP < lmP).astype(int)
y.name = "Y"

  avgP = groups.apply(lambda df: df["Close"].iloc[:-30].mean())


In [23]:
# 3) —— Indicator definitions (copied from first translation) —— #

def SMA(x, n=20):
    return x.rolling(n, min_periods=n).mean()

def EMA(x, n=20):
    return x.ewm(span=n, adjust=False).mean()

def DEMA(x, n=20):
    e = EMA(x, n)
    return 2*e - EMA(e, n)

def ATR(H, L, C, n=14):
    prevC = C.shift(1)
    tr = pd.concat([
        H - L,
        (H - prevC).abs(),
        (L - prevC).abs()
    ], axis=1).max(axis=1)
    return EMA(tr, n)

def ADX(H, L, C, n=14):
    up = H.diff()
    down = -L.diff()
    plusDM  = np.where((up>down)&(up>0), up, 0.0)
    minusDM = np.where((down>up)&(down>0), down, 0.0)
    tr = pd.concat([
        H-L,
        (H - C.shift(1)).abs(),
        (L - C.shift(1)).abs()
    ], axis=1).max(axis=1)
    atr = EMA(tr, n)
    plusDI  = 100 * EMA(pd.Series(plusDM, index=H.index), n) / atr
    minusDI = 100 * EMA(pd.Series(minusDM, index=H.index), n) / atr
    dx = 100 * ((plusDI - minusDI).abs() / (plusDI + minusDI))
    return EMA(dx, n)

In [24]:
def aroon(xH, xL, n=14):
    def _aroon_up(series):
        idx = series.argmax()
        return ((n - (len(series)-1 - idx)) / n) * 100
    def _aroon_dn(series):
        idx = series.argmin()
        return ((n - (len(series)-1 - idx)) / n) * 100
    au = xH.rolling(n).apply(_aroon_up, raw=True)
    ad = xL.rolling(n).apply(_aroon_dn, raw=True)
    return pd.DataFrame({"aroonUp": au, "aroonDn": ad})

def BBands(x, n=20, k=2):
    m  = SMA(x, n)
    sd = x.rolling(n).std()
    return pd.DataFrame({
        "bb_up":  m + k*sd,
        "bb_mid": m,
        "bb_dn":  m - k*sd
    })

def CCI(H, L, C, n=20):
    TP = (H + L + C) / 3
    M  = SMA(TP, n)
    MD = TP.rolling(n).apply(lambda s: np.mean(np.abs(s - s.mean())), raw=True)
    return (TP - M) / (0.015 * MD)

def chaikin_volatility(H, L, n=10, ema_n=10):
    hl = H - L
    e1 = EMA(hl, ema_n)
    e2 = e1.shift(n)
    return (e1 - e2) / e2 * 100

def CLV(H, L, C):
    return ((C - L) - (H - C)) / (H - L)

In [25]:
def CMO(x, n=14):
    diff = x.diff()
    up = diff.where(diff>0, 0.0).rolling(n).sum()
    dn = (-diff).where(diff<0, 0.0).rolling(n).sum()
    return 100 * (up - dn) / (up + dn)

def CTI(C, n=10):
    m   = SMA(C, n)
    dev = (C - m).abs()
    return (C - m) / SMA(dev, n)

def DonchianChannel(H, L, n=20):
    up  = H.rolling(n).max()
    dn  = L.rolling(n).min()
    mid = (up + dn) / 2
    return pd.DataFrame({"dc_up": up, "dc_mid": mid, "dc_dn": dn})

def DPO(x, n=20):
    m     = SMA(x, n)
    shift = int(n/2 + 1)
    return x.shift(shift) - m

def DVI(C, n=14):
    rng  = C.diff().abs()
    prev = rng.shift(n)
    dvi  = (rng - prev) / prev * 100
    return dvi.where(prev != 0)

In [26]:
def GMMA(x):
    short = [3,5,8,10,12,15]
    long  = [30,35,40,45,50,60]
    df = {}
    for i in short: df[f"gmma_s{i}"] = EMA(x, i)
    for i in long:  df[f"gmma_l{i}"] = EMA(x, i)
    return pd.DataFrame(df)

def KST(C, r1=10,r2=15,r3=20,r4=30,n1=10,n2=10,n3=10,n4=15):
    roc1 = C.diff(r1)/C.shift(r1)
    roc2 = C.diff(r2)/C.shift(r2)
    roc3 = C.diff(r3)/C.shift(r3)
    roc4 = C.diff(r4)/C.shift(r4)
    kst    = (SMA(roc1,n1)*1 + SMA(roc2,n2)*2 +
              SMA(roc3,n3)*3 + SMA(roc4,n4)*4)
    signal = SMA(kst, 9)
    return pd.DataFrame({"kst": kst, "signal": signal})

def lags(H, L, C, n=1):
    return C.diff(n)

def MACD(C, n_fast=12, n_slow=26, n_sig=9):
    fast = EMA(C, n_fast)
    slow = EMA(C, n_slow)
    macd = fast - slow
    sig  = EMA(macd, n_sig)
    hist = macd - sig
    return pd.DataFrame({"macd": macd, "signal": sig, "hist": hist})

In [27]:
def PBands(C, n=20, pct=0.025):
    m = SMA(C, n)
    return pd.DataFrame({
        "pb_up":  m * (1 + pct),
        "pb_mid": m,
        "pb_dn":  m * (1 - pct)
    })

def ROC(C, n=1):
    return (C - C.shift(n)) / C.shift(n) * 100

def momentum(C, n=1):
    return C - C.shift(n)

def RSI(C, n=14):
    diff  = C.diff()
    up    = diff.where(diff>0, 0.0)
    dn    = -diff.where(diff<0,0.0)
    ma_up = EMA(up, n)
    ma_dn = EMA(dn, n)
    rs    = ma_up / ma_dn
    return 100 - (100 / (1 + rs))

def runSum(C, n=10):    return C.rolling(n).sum()
def runMin(C, n=10):    return C.rolling(n).min()
def runMax(C, n=10):    return C.rolling(n).max()
def runMedian(C,n=10):  return C.rolling(n).median()

In [28]:
def SAR(H, L, af0=0.02, af_step=0.02, af_max=0.2):
    up = True
    af = af0
    ep = H.iloc[0]
    sar = [L.iloc[0]]
    for i in range(1, len(H)):
        prev = sar[-1]
        if up:
            candidate = prev + af*(ep - prev)
            candidate = min(candidate, L.iloc[i-1], L.iloc[i])
            if H.iloc[i] > ep:
                ep = H.iloc[i]; af = min(af+af_step, af_max)
            if L.iloc[i] < candidate:
                up = False; sar.append(ep); ep = L.iloc[i]; af = af0
            else:
                sar.append(candidate)
        else:
            candidate = prev + af*(ep - prev)
            candidate = max(candidate, H.iloc[i-1], H.iloc[i])
            if L.iloc[i] < ep:
                ep = L.iloc[i]; af = min(af+af_step, af_max)
            if H.iloc[i] > candidate:
                up = True; sar.append(ep); ep = H.iloc[i]; af = af0
            else:
                sar.append(candidate)
    return pd.Series(sar, index=H.index)

def volatility(H, L, C, n=10):
    lr = np.log(C / C.shift(1))
    return lr.rolling(n).std() * np.sqrt(252 * (390/(16*60)))

def ultimate_oscillator(H, L, C, s1=7,s2=14,s3=28,w1=4,w2=2,w3=1):
    bp  = C - np.minimum(L, C.shift(1))
    tr  = np.maximum(H-L, np.maximum((H-C.shift(1)).abs(), (L-C.shift(1)).abs()))
    avg1 = bp.rolling(s1).sum() / tr.rolling(s1).sum()
    avg2 = bp.rolling(s2).sum() / tr.rolling(s2).sum()
    avg3 = bp.rolling(s3).sum() / tr.rolling(s3).sum()
    return 100 * (w1*avg1 + w2*avg2 + w3*avg3) / (w1+w2+w3)

In [29]:
def VHF(C, n=28):
    num = C.rolling(n).max() - C.rolling(n).min()
    den = C.diff().abs().rolling(n).sum()
    return num / den

def williamsAD(H, L, C):
    return ((C - L) - (H - C)) / (H - L)

def WPR(H, L, C, n=14):
    highest = H.rolling(n).max()
    lowest  = L.rolling(n).min()
    return (highest - C) / (highest - lowest) * -100

def ZigZag(H, L, C, pct=0.05):
    zz = [np.nan]*len(C)
    last_pivot = C.iloc[0]
    last_dir   = None
    for i in range(1, len(C)):
        change = (C.iloc[i] - last_pivot) / last_pivot
        if last_dir is None:
            if abs(change) > pct:
                last_dir, last_pivot = np.sign(change), C.iloc[i]
                zz[i] = last_pivot
        else:
            if (last_dir>0 and C.iloc[i]> last_pivot) or \
               (last_dir<0 and C.iloc[i]< last_pivot):
                last_pivot = C.iloc[i]; zz[i] = last_pivot
            elif abs(change) > pct:
                last_dir, last_pivot = -last_dir, C.iloc[i]
                zz[i] = last_pivot
    return pd.Series(zz, index=C.index)

In [30]:
def TRIX(C, n=9):
    ema1 = EMA(C, n)
    ema2 = EMA(ema1, n)
    ema3 = EMA(ema2, n)
    return ema3.pct_change() * 100

def TDI(C, n_rsi=13, n_sig=2, n_bb=34, sd=1.618):
    rsi    = RSI(C, n_rsi)
    signal = SMA(rsi, n_sig)
    bb     = BBands(rsi, n_bb, sd)
    return pd.DataFrame({
        "rsi":    rsi,
        "signal": signal,
        "bb_up":  bb["bb_up"],
        "bb_mid": bb["bb_mid"],
        "bb_dn":  bb["bb_dn"]
    })

def SMI(H, L, C, n=14, n_fast=3, n_slow=25, n_sig=9):
    m = (H.rolling(n).max() + L.rolling(n).min())/2
    d = (H.rolling(n).max() - L.rolling(n).min())/2
    num = EMA(EMA(C-m, n_fast), n_slow)
    den = EMA(EMA(d,      n_fast), n_slow)
    smi    = 100 * (num/den)
    signal = EMA(smi, n_sig)
    return pd.DataFrame({"smi": smi, "signal": signal})

In [31]:
# 4) —— Compute & aggregate “first-n-bar” daily averages —— #

def compute_daily_averages(spx):
    H, L, C, O = spx["High"], spx["Low"], spx["Close"], spx["Open"]
    day_slices = []
    for date, df in spx.groupby("Date"):
        idx = df.index
        day_slices.append((idx[0], idx[-31]))
    # precompute all indicators
    inds = {
        "avg_ADX":               ADX(H,L,C),
        "avg_aroonUp":           aroon(H,L)["aroonUp"],
        "avg_aroonDn":           aroon(H,L)["aroonDn"],
        "avg_ATR":               ATR(H,L,C),
        "avg_BBands_up":         BBands(C)["bb_up"],
        "avg_BBands_mid":        BBands(C)["bb_mid"],
        "avg_BBands_dn":         BBands(C)["bb_dn"],
        "avg_CCI":               CCI(H,L,C),
        "avg_chaikinVolatility": chaikin_volatility(H,L),
        "avg_CLV":               CLV(H,L,C),
        "avg_CMOClose":          CMO(C),
        "avg_CTI":               CTI(C),
        "avg_Donchian_up":       DonchianChannel(H,L)["dc_up"],
        "avg_Donchian_mid":      DonchianChannel(H,L)["dc_mid"],
        "avg_Donchian_dn":       DonchianChannel(H,L)["dc_dn"],
        "avg_DPOClose":          DPO(C),
        "avg_DVIClose":          DVI(C),
        "avg_GMMAClose":         GMMA(C).mean(axis=1),
        "avg_KSTClose":          KST(C)["kst"],
        "avg_lagsClose":         lags(H,L,C),
        "avg_MACD":              MACD(C)["macd"],
        "avg_PBands_up":         PBands(C)["pb_up"],
        "avg_PBands_mid":        PBands(C)["pb_mid"],
        "avg_PBands_dn":         PBands(C)["pb_dn"],
        "avg_ROCClose":          ROC(C),
        "avg_momentumClose":     momentum(C),
        "avg_RSIClose":          RSI(C),
        "avg_runSum":            runSum(C,10),
        "avg_runMin":            runMin(C,10),
        "avg_runMax":            runMax(C,10),
        "avg_runMedian":         runMedian(C,10),
        "avg_SAR":               SAR(H,L),
        "avg_SMAClose":          SMA(C,20),
        "avg_EMAClose":          EMA(C,20),
        "avg_DEMAClose":         DEMA(C,20),
        "avg_SNR":               VHF(C,28),
        "avg_SMI":               SMI(H,L,C)["smi"],
        "avg_TDI":               TDI(C)["rsi"],
        "avg_TRIX":              TRIX(C),
        "avg_ultimateOscillator":ultimate_oscillator(H,L,C),
        "avg_VHF":               VHF(C),
        "avg_volatility":        volatility(H,L,C),
        "avg_williamsAD":        williamsAD(H,L,C),
        "avg_WPR":               WPR(H,L,C),
        "avg_ZigZag":            ZigZag(H,L,C)
    }
    data = {}
    for name, series in inds.items():
        if isinstance(series, pd.DataFrame):
            for col in series.columns:
                key = f"{name}_{col}"
                data[key] = [
                    series[col].loc[s:e].mean() for s,e in day_slices
                ]
        else:
            data[name] = [
                series.loc[s:e].mean() for s,e in day_slices
            ]
    return pd.DataFrame(data, index=y.index)

# compute & save feature matrix
features = compute_daily_averages(spx)
features["Y"] = y
features.to_csv(FEATURE_CSV, index_label="Date")

In [32]:
# 5) —— Build allSet & Modeling —— #

allset = features.dropna().reset_index(drop=True)
allset.to_csv(ALLSET_CSV, index=False)

# prepare data
X   = allset.drop("Y", axis=1).values
y_  = allset["Y"].values
nx  = len(allset)
train_n = int(np.floor(nx * 2/3))

X_train, X_test = X[:train_n], X[train_n:]
y_train, y_test = y_[:train_n], y_[train_n:]

In [33]:
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

def eval_model(X_tr, y_tr, X_te, y_te):
    rf = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
    _ = cross_val_score(rf, X_tr, y_tr, cv=cv, scoring="accuracy")
    rf.fit(X_tr, y_tr)
    y_pred = rf.predict(X_te)
    return {
        "accuracy": accuracy_score(y_te, y_pred),
        "precision": precision_score(y_te, y_pred, pos_label=0),
        "recall": recall_score(y_te, y_pred, pos_label=0),
        "f1": f1_score(y_te, y_pred, pos_label=0),
        "cm": confusion_matrix(y_te, y_pred)
    }

In [34]:
results = {}
# original
results["original"] = eval_model(X_train_s, y_train, X_test_s, y_test)
# undersample
rus = RandomUnderSampler(random_state=1)
X_ru, y_ru = rus.fit_resample(X_train_s, y_train)
results["under"] = eval_model(X_ru, y_ru, X_test_s, y_test)
# oversample
ros = RandomOverSampler(random_state=1)
X_ro, y_ro = ros.fit_resample(X_train_s, y_train)
results["over"] = eval_model(X_ro, y_ro, X_test_s, y_test)
# SMOTE
sm = SMOTE(random_state=1)
X_sm, y_sm = sm.fit_resample(X_train_s, y_train)
results["smote"] = eval_model(X_sm, y_sm, X_test_s, y_test)

# summarize
comp = pd.DataFrame({
    m: {
        "Accuracy":  results[m]["accuracy"],
        "Precision": results[m]["precision"],
        "Recall":    results[m]["recall"],
        "F1":        results[m]["f1"]
    } for m in results
}).T

print(comp)

comp.to_csv("../csvfiles_new/comparison_360.csv")


          Accuracy  Precision    Recall        F1
original  0.601923   0.547231  0.711864  0.618785
under     0.623077   0.563694  0.750000  0.643636
over      0.696154   0.683962  0.614407  0.647321
smote     0.607692   0.552632  0.711864  0.622222
