# 1_feature_engineering.ipynb

Build ML‑ready feature matrix and target labels from:
1. EEG signals (data/raw/eeg_raw.csv)
2. 1‑minute S&P 500 prices(data/raw/btc_1min.csv)

Output:
* Feature matrix X (data/processed/X_features.csv)
* Price direction label y (data/processed/y_labels.csv)

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from typing import Final

In [2]:
# load paths
RAW_DIR = Path("data/raw")
OUT_DIR = Path("data/processed"); OUT_DIR.mkdir(parents=True, exist_ok=True)

# set column names
BANDS = [
    "Delta", "Theta", "LowAlpha", "HighAlpha",
    "LowBeta", "HighBeta", "LowGamma", "MidGamma",
]
PRICE_COL = "btc_close"

# window (minutes) for z‑scoring EEG bands
ROLL_Z = 20
# lookback length for RSI
RSI_L = 14
# window for moving-average
SMA_FAST, SMA_SLOW = 5, 20
# window for realized volatility
VOL_WIN = 20
# future return label
K: Final[int] = 5

# 1. Load data

In [3]:
# load EEG & price CSV
eeg = pd.read_csv(RAW_DIR / "eeg_1min.csv", index_col=0, parse_dates=True)
btc_close = pd.read_csv(RAW_DIR / "btc_1min.csv", index_col=0, parse_dates=True)["Close"].rename("btc_close")

# merge on time index
common_idx = eeg.index.intersection(btc_close.index)
eeg = eeg.loc[common_idx]
btc_close = btc_close.loc[common_idx]
df = eeg.join(btc_close)

In [4]:
# check
df.head()

Unnamed: 0_level_0,Ts,HighMomentum,LowMomentum,FastMomentum,Volatility,NormClose,NormVolume,Delta,HighAlpha,HighBeta,LowAlpha,LowBeta,LowGamma,Meditation,MidGamma,Theta,btc_close
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-01-24 05:39:00+00:00,1643002740,1.007071,0.989536,1.0,0.000623,0.565208,0.058927,4387.0,621.0,4255.0,3531.0,3068.0,1888.0,24.0,595.0,12156.0,34996.19
2022-01-24 05:40:00+00:00,1643002800,1.00526,0.993904,0.999567,0.000554,0.30681,0.047043,14240.0,3749.0,4866.0,33206.0,13332.0,4680.0,50.0,1586.0,38160.0,34996.19
2022-01-24 05:41:00+00:00,1643002860,1.003738,0.98803,1.001863,0.000523,0.177404,0.011224,5236.0,2923.0,2158.0,2288.0,3139.0,1053.0,40.0,1245.0,10817.0,35028.9
2022-01-24 05:42:00+00:00,1643002920,1.001051,0.990933,0.999251,0.000451,0.106995,0.028177,23797.0,10226.0,5048.0,4931.0,3411.0,4687.0,57.0,758.0,9297.0,35032.4
2022-01-24 05:43:00+00:00,1643002980,1.001129,0.995166,0.999608,0.000406,0.789535,0.041188,9658.0,2392.0,17737.0,6608.0,4482.0,8600.0,43.0,2865.0,20818.0,35094.45


# 2. Features (X): neural bands features

In [5]:
# rolling z‑score for neural bands
for band in BANDS:
    roll_mean = df[band].rolling(ROLL_Z, min_periods=ROLL_Z).mean()
    roll_std = df[band].rolling(ROLL_Z, min_periods=ROLL_Z).std()

    # avoid division by zero: if std == 0, set z‑score to 0
    z = (df[band] - roll_mean) / roll_std.replace(0, np.nan)
    df[f"z_{band}"] = z.fillna(0)

In [6]:
df.head()

Unnamed: 0_level_0,Ts,HighMomentum,LowMomentum,FastMomentum,Volatility,NormClose,NormVolume,Delta,HighAlpha,HighBeta,...,Theta,btc_close,z_Delta,z_Theta,z_LowAlpha,z_HighAlpha,z_LowBeta,z_HighBeta,z_LowGamma,z_MidGamma
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-24 05:39:00+00:00,1643002740,1.007071,0.989536,1.0,0.000623,0.565208,0.058927,4387.0,621.0,4255.0,...,12156.0,34996.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-24 05:40:00+00:00,1643002800,1.00526,0.993904,0.999567,0.000554,0.30681,0.047043,14240.0,3749.0,4866.0,...,38160.0,34996.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-24 05:41:00+00:00,1643002860,1.003738,0.98803,1.001863,0.000523,0.177404,0.011224,5236.0,2923.0,2158.0,...,10817.0,35028.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-24 05:42:00+00:00,1643002920,1.001051,0.990933,0.999251,0.000451,0.106995,0.028177,23797.0,10226.0,5048.0,...,9297.0,35032.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-01-24 05:43:00+00:00,1643002980,1.001129,0.995166,0.999608,0.000406,0.789535,0.041188,9658.0,2392.0,17737.0,...,20818.0,35094.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Features (X): Bitcoin price

In [7]:
## relative strength index (RSI-14)
# price change per bar
price_delta = df[PRICE_COL].diff()

# upward/downward moves
up = price_delta.clip(lower=0)
down = -price_delta.clip(upper=0)

# smoothed average gains/losses over 14-bar window
roll_up = up.rolling(RSI_L).mean()
roll_down = down.rolling(RSI_L).mean()

# relative strength = average gain / average loss
rs = roll_up / roll_down.replace(0, np.nan) # avoid /0 = inf
rs = rs.replace([np.inf, -np.inf], np.nan)

# RSI = 100 - 100/(1+RS)
df["rsi14"] = 100 - (100 / (1 + rs))

# average loss = 0, RS = 0 = lower bound
mask_down0 = roll_down == 0
df.loc[mask_down0, "rsi14"] = 0

# average gain = 0, RS = inf = upper bound
mask_up0   = roll_up == 0
df.loc[mask_down0, "rsi14"] = 100

# price flat (gain/loss = 0), RSI = neutral
both0 = mask_down0 & mask_up0
df.loc[both0, "rsi14"] = 50

In [8]:
# moving averages crossover strength
df["sma_fast"] = df[PRICE_COL].rolling(SMA_FAST).mean()
df["sma_slow"] = df[PRICE_COL].rolling(SMA_SLOW).mean()
df["sma_diff"] = df["sma_fast"] - df["sma_slow"]

In [9]:
# volatility & Bollinger-band width proxy
log_ret = np.log(df[PRICE_COL] / df[PRICE_COL].shift(1))
df["rv20"] = log_ret.rolling(VOL_WIN).std() * np.sqrt(VOL_WIN)
df["log_ret1"] = np.log(df[PRICE_COL] / df[PRICE_COL].shift(1))
df["bb_width"] = (df["sma_fast"].rolling(20).std() /
                  df["sma_slow"]).fillna(0)

In [10]:
df.head()
# first 14 will be NaN

Unnamed: 0_level_0,Ts,HighMomentum,LowMomentum,FastMomentum,Volatility,NormClose,NormVolume,Delta,HighAlpha,HighBeta,...,z_HighBeta,z_LowGamma,z_MidGamma,rsi14,sma_fast,sma_slow,sma_diff,rv20,log_ret1,bb_width
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-24 05:39:00+00:00,1643002740,1.007071,0.989536,1.0,0.000623,0.565208,0.058927,4387.0,621.0,4255.0,...,0.0,0.0,0.0,,,,,,,0.0
2022-01-24 05:40:00+00:00,1643002800,1.00526,0.993904,0.999567,0.000554,0.30681,0.047043,14240.0,3749.0,4866.0,...,0.0,0.0,0.0,,,,,,0.0,0.0
2022-01-24 05:41:00+00:00,1643002860,1.003738,0.98803,1.001863,0.000523,0.177404,0.011224,5236.0,2923.0,2158.0,...,0.0,0.0,0.0,,,,,,0.000934,0.0
2022-01-24 05:42:00+00:00,1643002920,1.001051,0.990933,0.999251,0.000451,0.106995,0.028177,23797.0,10226.0,5048.0,...,0.0,0.0,0.0,,,,,,0.0001,0.0
2022-01-24 05:43:00+00:00,1643002980,1.001129,0.995166,0.999608,0.000406,0.789535,0.041188,9658.0,2392.0,17737.0,...,0.0,0.0,0.0,,35029.626,,,,0.00177,0.0


In [11]:
df.tail()

Unnamed: 0_level_0,Ts,HighMomentum,LowMomentum,FastMomentum,Volatility,NormClose,NormVolume,Delta,HighAlpha,HighBeta,...,z_HighBeta,z_LowGamma,z_MidGamma,rsi14,sma_fast,sma_slow,sma_diff,rv20,log_ret1,bb_width
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-25 02:26:00+00:00,1645755960,1.00563,0.989866,1.000658,0.000524,0.839036,0.173049,50925.0,10694.0,3445.0,...,-0.558931,1.603885,-0.141813,73.198551,38686.79,38585.399,101.391,0.004768,-0.001708,0.002199
2022-02-25 02:27:00+00:00,1645756020,1.014298,0.985345,1.002956,0.000561,1.0,0.198382,7490.0,5176.0,6430.0,...,0.269226,0.046705,0.58759,63.593756,38673.58,38592.0965,81.4835,0.004631,0.0,0.002183
2022-02-25 02:28:00+00:00,1645756080,1.012688,0.987471,0.99986,0.000535,0.822648,0.104419,15415.0,1398.0,5504.0,...,0.066827,0.088274,-0.140276,70.715977,38674.052,38601.577,72.475,0.004898,0.001769,0.00216
2022-02-25 02:29:00+00:00,1645756140,1.001944,0.995406,0.998592,0.000439,0.079081,0.022989,43100.0,2419.0,1330.0,...,-1.105129,-1.354079,-0.833809,69.448014,38669.392,38609.7745,59.6175,0.004977,-0.000663,0.002126
2022-02-25 02:30:00+00:00,1645756200,1.005359,0.992635,1.000345,0.000462,0.951671,0.066967,19821.0,6602.0,8330.0,...,0.833866,3.218505,0.149241,66.962679,38662.306,38620.475,41.831,0.004632,-0.000314,0.002054


# 4. Target label (Y): next k minute direction

In [12]:
# k minute log-return
fwd_ret = np.log(btc_close.shift(-K) / btc_close)

# {-1, 0, 1, NaN}
y = np.sign(fwd_ret) # {-1, 0, 1, NaN}

df["y"] = y

# keep only non-zero signs +-1
df = df[np.isfinite(df["y"]) & (df["y"] != 0)]
df["y"] = df["y"].astype(int)

In [13]:
df.head()

Unnamed: 0_level_0,Ts,HighMomentum,LowMomentum,FastMomentum,Volatility,NormClose,NormVolume,Delta,HighAlpha,HighBeta,...,z_LowGamma,z_MidGamma,rsi14,sma_fast,sma_slow,sma_diff,rv20,log_ret1,bb_width,y
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-24 05:39:00+00:00,1643002740,1.007071,0.989536,1.0,0.000623,0.565208,0.058927,4387.0,621.0,4255.0,...,0.0,0.0,,,,,,,0.0,1
2022-01-24 05:40:00+00:00,1643002800,1.00526,0.993904,0.999567,0.000554,0.30681,0.047043,14240.0,3749.0,4866.0,...,0.0,0.0,,,,,,0.0,0.0,1
2022-01-24 05:41:00+00:00,1643002860,1.003738,0.98803,1.001863,0.000523,0.177404,0.011224,5236.0,2923.0,2158.0,...,0.0,0.0,,,,,,0.000934,0.0,1
2022-01-24 05:42:00+00:00,1643002920,1.001051,0.990933,0.999251,0.000451,0.106995,0.028177,23797.0,10226.0,5048.0,...,0.0,0.0,,,,,,0.0001,0.0,1
2022-01-24 05:43:00+00:00,1643002980,1.001129,0.995166,0.999608,0.000406,0.789535,0.041188,9658.0,2392.0,17737.0,...,0.0,0.0,,35029.626,,,,0.00177,0.0,1


# 5. Final feature matrix & label vector

In [14]:
# price & neural context features
feat_cols = [c for c in df.columns if c.startswith("z_") or c in {"rsi14", "sma_diff", "rv20"}]

# target labels
Xy = df[feat_cols + ["y"]].dropna()

print(f"Final dataset: {Xy.shape[0]} rows, {len(feat_cols)} features")

Final dataset: 293 rows, 11 features


In [15]:
# output
Xy[feat_cols].to_csv(OUT_DIR / "X_features.csv")
Xy[["y"]].to_csv(OUT_DIR / "y_labels.csv")
print("Features: ", OUT_DIR / "X_features.csv")
print("Labels: ", OUT_DIR / "y_labels.csv")

Features:  data\processed\X_features.csv
Labels:  data\processed\y_labels.csv
