In [1]:
# Make the repository root importable so the project modules used below can be found.
import os
import sys
import warnings

# NOTE: this silences every warning for the whole session, deprecation notices included.
warnings.filterwarnings("ignore")

# The repository root is taken to be two directories above the current working directory.
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
# Add the path only once so re-running this cell does not pile up duplicate sys.path entries.
if root_path not in sys.path:
    sys.path.append(root_path)

In [2]:
from core.data_sources.clob import CLOBDataSource

# Instantiate the CLOB data source; its candle cache is loaded and read in the following cells.
clob = CLOBDataSource()

In [3]:
# Populate clob.candles_cache (presumably from files stored under root_path — confirm in CLOBDataSource).
clob.load_candles_cache(root_path)

In [4]:
# Look up one cached candle set. Judging by the attributes read from `candles` when the features are
# saved, the key is (connector_name, trading_pair, interval). A missing key presumably raises KeyError.
candles = clob.candles_cache[("binance", "BTC-USDT", "1s")]

In [5]:
# Candle data as a DataFrame.
# NOTE(review): this binds a reference, not an explicit copy; in-place edits to `df` may also change
# the cached candles — confirm whether `candles.data` returns a copy.
df = candles.data

In [22]:
# Sanity check: (rows, columns) of the loaded candle frame.
df.shape

(259201, 11)

In [23]:
from core.backtesting.triple_barrier_method import triple_barrier_method

# Triple-barrier labelling parameters, kept in one place so they are easy to find and tune.
# NOTE(review): the units are defined by triple_barrier_method; tp/sl look like multipliers of a
# rolling volatility estimate and tl like a time limit in seconds — confirm against that function.
TBM_PARAMS = dict(tp=3.5, sl=3.5, tl=300, std_span=200, trade_cost=0.0)

# Label every candle with a constant side of 1 (presumably "long" — confirm).
# `assign` returns a new frame instead of writing the column into `df`, so the frame taken from the
# candle cache is not modified and this cell produces the same result when re-run.
df_with_tbm = triple_barrier_method(df.assign(side=1), **TBM_PARAMS)

In [24]:
# Count how often each barrier outcome label occurs.
# NOTE(review): presumably 1 = take profit hit, -1 = stop loss hit, 0 = time limit reached — confirm
# against triple_barrier_method.
df_with_tbm.close_type.value_counts()

close_type
 0    120953
 1     69658
-1     68391
Name: count, dtype: int64

In [25]:
# Summary statistics of the `target` column returned by triple_barrier_method
# (presumably the rolling volatility used to size the barriers — confirm).
df_with_tbm.target.describe()

count       259002
mean    0.00029277
std     0.00023568
min              0
25%       0.000147
50%     0.00022979
75%     0.00036489
max     0.00276736
Name: target, dtype: float64

In [26]:
from sklearn.preprocessing import StandardScaler

# Feature engineering pipeline: indicators -> returns -> volume ratio -> cleaning -> scaling.
# Produces `df_processed` (model-ready frame) and `scaler` (fitted StandardScaler).

# The `.ta` DataFrame accessor is registered as a side effect of importing pandas_ta, which this
# notebook never imports directly. Fail early with a clear message if it is missing.
if not hasattr(df_with_tbm, "ta"):
    raise RuntimeError("DataFrame has no `.ta` accessor; import pandas_ta before running this cell.")

# Work on a copy so the labelled frame (df_with_tbm) stays untouched and this cell can be re-run:
# every `append=True` call below adds its columns to df_with_indicators in place.
df_with_indicators = df_with_tbm.copy()

# Bollinger Bands with different lengths
df_with_indicators.ta.bbands(length=20, std=2, append=True)  # Standard BB
df_with_indicators.ta.bbands(length=50, std=2, append=True)  # Longer term BB

# MACD with different parameters
df_with_indicators.ta.macd(fast=12, slow=26, signal=9, append=True)  # Standard MACD
df_with_indicators.ta.macd(fast=8, slow=21, signal=5, append=True)  # Faster MACD

# RSI with different lengths
df_with_indicators.ta.rsi(length=14, append=True)  # Standard RSI
df_with_indicators.ta.rsi(length=21, append=True)  # Longer RSI

# Moving averages
df_with_indicators.ta.sma(length=20, append=True)  # Short MA
df_with_indicators.ta.sma(length=50, append=True)  # Medium MA
df_with_indicators.ta.ema(length=20, append=True)  # Short EMA
df_with_indicators.ta.ema(length=50, append=True)  # Medium EMA

# Volatility and momentum indicators
df_with_indicators.ta.atr(length=14, append=True)  # ATR
df_with_indicators.ta.stoch(k=14, d=3, append=True)  # Stochastic
df_with_indicators.ta.adx(length=14, append=True)  # ADX

# 1. Remove columns that are not used as model inputs.
#    `drop` already returns a new frame, so no extra copy of df_with_indicators is needed.
columns_to_drop = [
    "timestamp",
    "taker_buy_base_volume",
    "volume",
    "close_time",
    "real_class",
    "ret",
    "tp",
    "sl",
    "take_profit_time",
    "stop_loss_time",
    "tl",
    "side",
]
df_processed = df_with_indicators.drop(columns=columns_to_drop)

# 2. Replace raw prices with one-period percentage returns (<col>_ret).
price_columns = ["open", "high", "low", "close"]
for col in price_columns:
    df_processed[f"{col}_ret"] = df_processed[col].pct_change()
df_processed = df_processed.drop(columns=price_columns)

# 3. Buy volume ratio: taker-buy quote volume divided by total quote volume.
#    Rows whose quote_asset_volume is 0 yield 0/0 = NaN (or inf for a non-zero numerator).
df_processed["buy_volume_ratio"] = df_processed["taker_buy_quote_volume"] / df_processed["quote_asset_volume"]
df_processed = df_processed.drop(columns=["taker_buy_quote_volume"])

# 4. Drop every incomplete row. This removes more than the first row: indicator warm-up rows,
#    rows without a label/target and rows with an undefined buy_volume_ratio all go here.
#    Infinite values are turned into NaN first because StandardScaler rejects them.
df_processed = df_processed.replace([float("inf"), float("-inf")], float("nan")).dropna()
assert not df_processed.empty, "no complete rows left after dropping NaN/inf values"

# 5. Collect the numeric columns to scale, leaving out the label `close_type`.
#    Filtering by name (instead of list.remove) cannot fail if the label has another dtype.
#    NOTE(review): the `target` column is scaled together with the features — confirm this is intended.
numeric_columns = [
    col for col in df_processed.select_dtypes(include="number").columns if col != "close_type"
]

# 6. Standardise the selected columns.
#    NOTE(review): the scaler is fitted on the full history; if this frame is later split into
#    train/test sets, fit on the training part only to avoid look-ahead leakage.
scaler = StandardScaler()
df_processed[numeric_columns] = scaler.fit_transform(df_processed[numeric_columns])

# Show the first few rows of the processed dataset
print("Processed dataset shape:", df_processed.shape)
df_processed.head()

Processed dataset shape: (240656, 37)


Unnamed: 0_level_0,quote_asset_volume,n_trades,target,close_type,BBL_20_2.0,BBM_20_2.0,BBU_20_2.0,BBB_20_2.0,BBP_20_2.0,BBL_50_2.0,...,STOCHk_14_3_3,STOCHd_14_3_3,ADX_14,DMP_14,DMN_14,open_ret,high_ret,low_ret,close_ret,buy_volume_ratio
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-18 16:20:10,-0.17323493,-0.27235869,3.70589097,0,-0.94909502,-0.94882988,-0.94842439,0.15942773,-0.12233273,-0.93843517,...,0.00328006,-0.48793767,-1.13720323,0.05946534,0.09147632,0.95714205,-0.00608311,0.96638133,-0.00232992,-0.2627434
2025-02-18 16:20:11,-0.14077429,1.18096547,3.66904575,0,-0.94909502,-0.94882988,-0.94842439,0.15942773,0.54074167,-0.93842534,...,0.52590183,0.01921942,-1.1568574,0.42010618,-0.26705432,-0.00291423,1.69027923,0.00279518,1.66164269,1.20429389
2025-02-18 16:20:12,5.62551405,0.53735048,3.63117929,0,-0.94909502,-0.94882988,-0.94842439,0.15942773,0.54074167,-0.9384501,...,0.85837151,0.47667918,-1.17492212,0.42010618,-0.26705432,1.67560285,-0.00608311,1.69250618,-0.00232992,1.20429389
2025-02-18 16:20:13,0.02445977,-0.28273958,3.59182243,0,-0.94909502,-0.94882988,-0.94842439,0.15942773,0.54074167,-0.93847355,...,1.19084119,0.88457741,-1.19166855,0.41947157,-0.26641645,-0.00490991,-0.00608311,-0.00122777,-0.00232992,1.18995635
2025-02-18 16:20:14,-0.16962355,-0.33464401,3.5551323,0,-0.94909502,-0.94882988,-0.94842439,0.15942773,0.53995323,-0.93849558,...,1.19044587,1.11283205,-1.20708365,0.41878862,-0.26681826,-0.00091855,-0.00608311,0.00078361,-0.00430831,-0.66041691


In [28]:
# Persist the processed features as <root>/data/features_df/<connector>|<pair>|<interval>.parquet.
candles_path = os.path.join(root_path, "data", "features_df")
# Create the output folder on first use; to_parquet does not create missing parent directories.
os.makedirs(candles_path, exist_ok=True)
# NOTE: "|" is not a valid file name character on Windows; the name is kept as is so existing
# readers of these files keep working.
filename = os.path.join(candles_path, f"{candles.connector_name}|{candles.trading_pair}|{candles.interval}.parquet")
# index=True stores the frame's index in the file alongside the columns.
df_processed.to_parquet(filename, engine="pyarrow", compression="snappy", index=True)

In [29]:
# Persist the fitted scaler so the same scaling can be re-applied to new data later.
import joblib

scaler_path = os.path.join(root_path, "models", "scaler.pkl")
# Make sure <root>/models exists first; dumping into a missing folder fails.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)

['/Users/dman/Documents/code/quants-lab/data/scaler.pkl']