In [None]:
# ============ IMPORTS ============
import pandas as pd
import numpy as np
import sys
from scipy.signal import find_peaks
from src.triple_barrier import triple_barrier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# ============ LOAD CACHED DATA ============
# Read the pickled cache and pull out the two frames this notebook works with.
# NOTE: unpickling can execute arbitrary code - only load cache files you trust.
processed_data_cache = pd.read_pickle('data_cache/processed_data.pkl')
bitcoin_price_and_features = processed_data_cache['bitcoin_price_and_features']
bitcoin_features = processed_data_cache['bitcoin_features']

# Working frame: an independent copy of the feature table with the 'Close'
# column (aligned on the index) attached for reference.
labelled_df = bitcoin_features.assign(Close=bitcoin_price_and_features['Close'])

In [None]:
# ============ PEAK DETECTION & LABELING (DISABLED) ============
# The peak-detection code below is commented out, so this cell creates no 'Near_Peak' column.
# peak_indices, properties = find_peaks(
#     labelled_df['Close'].values,
#     prominence = labelled_df['Close'].mean() * 0.10,
#     distance=30
# )

# # Initialize 'Near_Peak' column
# labelled_df['Near_Peak'] = 0

# # Label near-peak regions
# WINDOW_DAYS = 15
# for peak_idx in peak_indices:
#     start = max(0, peak_idx - WINDOW_DAYS)
#     end = min(len(labelled_df), peak_idx + WINDOW_DAYS)
#     labelled_df.iloc[start:end, labelled_df.columns.get_loc('Near_Peak')] = 1

In [None]:
# ============ TRIPLE BARRIER LABELLING ============
# ---------- configuration ----------
window = 7                    # holding period handed to triple_barrier (days)
profit_target = 0.7           # profit-taking barrier, as a multiple of volatility
stop_loss = 0.2               # stop-loss barrier, as a multiple of volatility
min_return_threshold = 0.005  # returns above this are marked +1 (original note: currently disabled in triple_barrier.py)
# -----------------------------------

# Label every row using the Close price and the EWMA volatility column.
labels, returns, hit_day = triple_barrier(
    price_series=labelled_df['Close'],
    volatility_series=labelled_df['Volatility_EWMA'],
    holding_period=window,
    profit_mult=profit_target,
    stop_mult=stop_loss,
    min_ret_threshold=min_return_threshold,
)

# Attach the three outputs, then drop every row that has a NaN in any column
# (presumably rows without a complete holding window - TODO confirm against
# triple_barrier's output).
labelled_df = labelled_df.assign(
    Actual_Return_7day=returns,
    Barrier_Hit_Day=hit_day,
    Label_7day=labels,
).dropna()

In [None]:
# ============ SPLIT DATA ============
# Positional 80/20 split: the first 80% of rows become "Train", the rest "Test".
# No shuffling is done (assumes rows are already in chronological order - TODO confirm).
split_idx = int(len(labelled_df) * 0.8)

# Assign the split by row position rather than by index label, so duplicate
# index values cannot mark the wrong rows as "Test".
labelled_df["Set"] = np.where(np.arange(len(labelled_df)) < split_idx, "Train", "Test")

# Define X, y
target_col = "Label_7day"
# Everything that is not a model input: the target, raw price, labelling
# by-products and the split marker. Names absent from the frame are simply ignored.
drop_cols = [target_col, "Close", "Barrier_Hit_Day", "Near_Peak", "Actual_Return_7day", "Set"]

feature_cols = [col for col in labelled_df.columns if col not in drop_cols]

train_mask = labelled_df["Set"] == "Train"
test_mask = labelled_df["Set"] == "Test"

X_train = labelled_df.loc[train_mask, feature_cols]
y_train = labelled_df.loc[train_mask, target_col]
X_test = labelled_df.loc[test_mask, feature_cols]
y_test = labelled_df.loc[test_mask, target_col]

# ============ SCALE DATA ============
# Fit the scaler on the training features ONLY and reuse it for the test set.
# Fitting a second scaler on X_test (as was done before) maps train and test
# into different feature spaces and leaks test-set min/max into preprocessing.
# Consequence: scaled test values may fall outside [0, 1] when the test period
# exceeds the range seen during training - that is expected.
feature_scaler = MinMaxScaler().fit(X_train)

X_train = pd.DataFrame(
    feature_scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    feature_scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# =========== CACHE LABELLED DATA ============
# Bundle the labelled frame and the train/test splits into one dictionary and
# pickle it to data_cache/labelled_data.pkl.
pd.to_pickle(
    dict(
        labelled_df=labelled_df,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    ),
    'data_cache/labelled_data.pkl',
)