# BTC Feature Engineering - CORRECTED APPROACH

## Overview
This notebook implements the CORRECTED approach: Calculate everything together, split on save.

**Key Changes:**
1. Load full data (including buffer) for complete calculations
2. Calculate all indicators and features on full data
3. Create feature sets A0→A4 with proper temporal alignment
4. Split clean data only at save step

**Benefits:**
- Complete historical context for all calculations
- Proper temporal alignment with full data
- No missing data for lag features
- Clean final output without buffer data


In [None]:
# Import required libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import talib
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

# Bounds of the train/test window. The date filters in this notebook apply them
# inclusively (index >= train_start and index <= test_end).
train_start='2020-05-12'
test_end='2025-09-19'

In [None]:
# Step 1: Load Full Data (Including Buffer) for Complete Calculations
def load_full_data():
    """Read the 4-hour, daily and weekly BTC parquet files (buffer rows included).

    Each frame's index is passed through ``pd.to_datetime`` and a one-line
    summary (row count, first and last index value) is printed per timeframe.

    Returns:
        Tuple ``(h4_full, d1_full, w1_full)`` of DataFrames.
    """
    sources = (
        ('H4', '../data_collection/data/btc_4h_20251028.parquet'),
        ('D1', '../data_collection/data/btc_1d_20251028.parquet'),
        ('W1', '../data_collection/data/btc_1w_20251028.parquet'),
    )

    # Read everything first, then normalise the indexes.
    frames = {label: pd.read_parquet(path) for label, path in sources}
    for frame in frames.values():
        frame.index = pd.to_datetime(frame.index)

    print(f"📊 Full data loaded (including buffer):")
    for label, frame in frames.items():
        print(f"  {label}: {len(frame)} records ({frame.index[0]} to {frame.index[-1]})")

    return frames['H4'], frames['D1'], frames['W1']

# Load full data for complete calculations
h4_full, d1_full, w1_full = load_full_data()


In [None]:
# Step 2: Technical Indicator Functions
def extract_ohlcv_features(data):
    """Copy the raw OHLCV columns and add a 20-period volume SMA (6 features).

    Args:
        data: DataFrame with 'open', 'high', 'low', 'close' and 'volume' columns.

    Returns:
        DataFrame on ``data.index`` with the five raw columns plus 'volume_MA_20'.
    """
    features = pd.DataFrame(index=data.index)
    for column in ('open', 'high', 'low', 'close', 'volume'):
        features[column] = data[column]
    features['volume_MA_20'] = talib.SMA(data['volume'], timeperiod=20)
    return features

def calculate_moving_averages(data, periods=[7, 14, 20, 60, 120]):
    """Simple moving averages of the close price, one 'MA_<period>' column each.

    Args:
        data: DataFrame with a 'close' column.
        periods: SMA window lengths (five by default, hence five features).

    Returns:
        DataFrame on ``data.index`` with one column per period.
    """
    sma_by_name = {
        f'MA_{window}': talib.SMA(data['close'], timeperiod=window)
        for window in periods
    }
    features = pd.DataFrame(index=data.index)
    for name, series in sma_by_name.items():
        features[name] = series
    return features

def calculate_rsi(data, period=14):
    """RSI of the close price (1 feature).

    Args:
        data: DataFrame with a 'close' column.
        period: RSI look-back length passed to ``talib.RSI``.

    Returns:
        DataFrame on ``data.index`` with a single column. The column is always
        named 'RSI_14', whatever ``period`` is.
    """
    rsi_values = talib.RSI(data['close'], timeperiod=period)
    features = pd.DataFrame(index=data.index)
    features['RSI_14'] = rsi_values
    return features

def calculate_macd(data, fast=12, slow=26, signal=9):
    """MACD line, signal line and histogram of the close price (3 features).

    Args:
        data: DataFrame with a 'close' column.
        fast, slow, signal: periods forwarded to ``talib.MACD``.

    Returns:
        DataFrame on ``data.index`` with 'MACD_line', 'MACD_signal', 'MACD_hist'.
    """
    outputs = talib.MACD(
        data['close'],
        fastperiod=fast,
        slowperiod=slow,
        signalperiod=signal,
    )
    features = pd.DataFrame(index=data.index)
    # talib.MACD returns (line, signal, histogram) in that order.
    for name, values in zip(('MACD_line', 'MACD_signal', 'MACD_hist'), outputs):
        features[name] = values
    return features

def calculate_ichimoku(data):
    """Ichimoku components built only from current and past bars (5 features).

    Columns produced:
        conversion_line: midpoint of the 9-bar high/low range.
        baseline:        midpoint of the 26-bar high/low range.
        leading_span_A:  mean of the two lines above, shifted 26 bars forward.
        leading_span_B:  midpoint of the 52-bar high/low range, shifted 26 bars forward.
        lagging_span:    the close from 26 bars earlier (``close.shift(26)``).

    Args:
        data: DataFrame with 'high', 'low' and 'close' columns.

    Returns:
        DataFrame on ``data.index`` with the five columns above.
    """
    displacement = 26

    def rolling_midpoint(window):
        highest = data['high'].rolling(window=window).max()
        lowest = data['low'].rolling(window=window).min()
        return (highest + lowest) / 2

    features = pd.DataFrame(index=data.index)
    features['conversion_line'] = rolling_midpoint(9)
    features['baseline'] = rolling_midpoint(26)

    blended = (features['conversion_line'] + features['baseline']) / 2
    features['leading_span_A'] = blended.shift(displacement)

    # The 52-bar channel uses the TA-Lib rolling extrema.
    channel_top = talib.MAX(data['high'], timeperiod=52)
    channel_bottom = talib.MIN(data['low'], timeperiod=52)
    features['leading_span_B'] = ((channel_top + channel_bottom) / 2).shift(displacement)

    features['lagging_span'] = data['close'].shift(displacement)

    return features

def calculate_all_indicators(data, timeframe_name):
    """Run every indicator helper on one timeframe and join the results column-wise.

    With the helpers' default settings this yields 20 columns: OHLCV plus
    volume SMA (6), moving averages (5), RSI (1), MACD (3) and Ichimoku (5).

    Args:
        data: OHLCV DataFrame for the timeframe.
        timeframe_name: label used only in the progress messages; column
            names are not prefixed here.

    Returns:
        DataFrame with all indicator columns.
    """
    print(f"Calculating indicators for {timeframe_name}...")

    builders = (
        extract_ohlcv_features,
        calculate_moving_averages,
        calculate_rsi,
        calculate_macd,
        calculate_ichimoku,
    )
    all_features = pd.concat([build(data) for build in builders], axis=1)

    print(f"✅ {timeframe_name}: {len(all_features.columns)} features created")
    return all_features


In [None]:

from typing import List, Optional
from itertools import combinations

def normalize_moving_averages(data: pd.DataFrame,
                              periods: Optional[List[int]] = None) -> pd.DataFrame:
    """Add scale-free moving-average features to ``data`` (in place).

    Two families of columns are added:
      * ``MA_<p>_norm``          = close / MA_<p> - 1, for every period ``p``;
      * ``MA_<s>_MA_<l>_norm``   = MA_<s> / MA_<l> - 1, for every pair of
        periods taken in the order given.

    Infinite ratios (zero denominator) are replaced by 0; NaN values are kept.
    A missing ``MA_<p>`` column is reported with a printed warning and skipped.

    Args:
        data: DataFrame with a 'close' column and the ``MA_<p>`` columns.
        periods: MA periods to normalise. ``None`` (the default) means
            ``[7, 14, 20, 60, 120]``.

    Returns:
        The same ``data`` object, with the new columns added.
    """
    # A None sentinel avoids a shared mutable default and makes the
    # Optional annotation truthful (None used to raise TypeError).
    if periods is None:
        periods = [7, 14, 20, 60, 120]
    else:
        periods = list(periods)

    new_features = {}

    # Close price relative to each moving average.
    for period in periods:
        ma_col = f'MA_{period}'
        if ma_col not in data.columns:
            print(f"Warning: {ma_col} not found. Skipping.")
            continue
        ratio = (data['close'] / data[ma_col]) - 1
        new_features[f'{ma_col}_norm'] = ratio.replace([np.inf, -np.inf], 0)

    # First-listed MA relative to second-listed MA, for every pair of periods.
    for p_short, p_long in combinations(periods, 2):
        ma_col_short = f'MA_{p_short}'
        ma_col_long = f'MA_{p_long}'
        if ma_col_short not in data.columns or ma_col_long not in data.columns:
            print(f"Warning: {ma_col_short} or {ma_col_long} not found. Skipping.")
            continue
        ratio = (data[ma_col_short] / data[ma_col_long]) - 1
        new_features[f'{ma_col_short}_{ma_col_long}_norm'] = ratio.replace([np.inf, -np.inf], 0)

    # Attach everything only after all ratios have been computed.
    for col_name, values in new_features.items():
        data[col_name] = values

    return data

def normalize_ichimoku(data: pd.DataFrame) -> pd.DataFrame:
    """Add scale-free Ichimoku features to ``data`` (in place).

    For every Ichimoku line present, ``close_vs_<line>_pct`` = close / line - 1
    is added; a missing line is reported with a printed warning. Two cross
    features are added when both inputs exist:
    ``conversion_vs_baseline_pct`` and ``span_A_vs_span_B_pct`` (the cloud).
    Infinite and NaN results are replaced by 0.

    Args:
        data: DataFrame with 'close' and the raw Ichimoku line columns.

    Returns:
        The same ``data`` object, with the new columns added.
    """
    def pct_gap(numerator, denominator):
        # Relative distance; inf and NaN are all mapped to 0.
        gap = (numerator / denominator) - 1
        return gap.replace([np.inf, -np.inf, np.nan], 0)

    line_names = (
        'conversion_line',
        'baseline',
        'leading_span_A',
        'leading_span_B',
        'lagging_span',
    )
    # (numerator line, denominator line, output column)
    line_pairs = (
        ('conversion_line', 'baseline', 'conversion_vs_baseline_pct'),
        ('leading_span_A', 'leading_span_B', 'span_A_vs_span_B_pct'),
    )

    derived = {}

    # 1. Close versus each raw Ichimoku line.
    for name in line_names:
        if name in data.columns:
            derived[f'close_vs_{name}_pct'] = pct_gap(data['close'], data[name])
        else:
            print(f"Warning: {name} not found. Skipping.")

    # 2. Line versus line (crosses).
    for top, bottom, label in line_pairs:
        if top in data.columns and bottom in data.columns:
            derived[label] = pct_gap(data[top], data[bottom])

    # 3. Attach all computed features to the input frame.
    for label, values in derived.items():
        data[label] = values

    return data



def normalize_candle_features(data: pd.DataFrame) -> pd.DataFrame:
    """Add normalised candle-shape features computed from OHLC (in place).

    Columns added:
        candle_body_pct: (close - open) / open
        high_wick_pct:   (high - max(open, close)) / close
        low_wick_pct:    (min(open, close) - low) / close
        range_pct:       (high - low) / low
    Rows whose denominator is 0 get the value 0.

    Args:
        data: DataFrame with 'open', 'high', 'low' and 'close' columns.

    Returns:
        The same ``data`` object, with the four columns added.
    """
    def ratio_or_zero(numerator, denominator):
        # np.where evaluates both branches; the mask picks 0 where the
        # denominator is 0.
        return np.where(denominator != 0, numerator / denominator, 0)

    opens = data['open']
    closes = data['close']
    highs = data['high']
    lows = data['low']

    # Top and bottom edge of the candle body, whichever way the bar closed.
    upper_body = np.maximum(opens, closes)
    lower_body = np.minimum(opens, closes)

    computed = {
        'candle_body_pct': ratio_or_zero(closes - opens, opens),
        'high_wick_pct': ratio_or_zero(highs - upper_body, closes),
        'low_wick_pct': ratio_or_zero(lower_body - lows, closes),
        'range_pct': ratio_or_zero(highs - lows, lows),
    }

    # Attach all computed features to the input frame.
    for name, values in computed.items():
        data[name] = values

    return data


def normalize_volume_features(data: pd.DataFrame, ma_periods: Optional[List[int]] = None) -> pd.DataFrame:
    """Add normalised volume features to ``data`` (in place).

    1. ``volume_vs_volume_MA_<p>_pct`` = volume / volume_MA_<p> - 1 for each
       period whose MA column exists (a missing one is reported and skipped).
    2. ``volume_change_pct`` = bar-over-bar relative change of volume.

    Infinite and NaN results are replaced by 0 in both families.

    Args:
        data: DataFrame with 'volume' and the ``volume_MA_<p>`` columns.
        ma_periods: volume-MA periods to normalise against. ``None`` (the
            default) means ``[20]``.

    Returns:
        The same ``data`` object, with the new columns added.
    """
    # A None sentinel avoids a shared mutable default and makes the
    # Optional annotation truthful (None used to raise TypeError).
    if ma_periods is None:
        ma_periods = [20]

    new_features = {}
    has_volume = 'volume' in data.columns

    # --- 1. Gap between volume and its moving average ---
    if has_volume:
        for period in ma_periods:
            ma_col = f'volume_MA_{period}'
            new_feature_name = f'volume_vs_{ma_col}_pct'

            if ma_col in data.columns:
                normalized_val = (data['volume'] / data[ma_col]) - 1
                new_features[new_feature_name] = normalized_val.replace([np.inf, -np.inf, np.nan], 0)
            else:
                print(f"Warning: {ma_col} not found for normalization. Skipping {new_feature_name}.")

    # --- 2. Volume change versus the previous bar ---
    if has_volume:
        # pct_change() leaves NaN in the first row and yields inf after a
        # zero-volume bar; both become 0.
        new_features['volume_change_pct'] = (
            data['volume'].pct_change().fillna(0).replace([np.inf, -np.inf], 0)
        )
    else:
        print("Warning: 'volume' column not found. Skipping volume_change_pct.")

    # --- 3. Attach all computed features to the input frame ---
    for col_name, values in new_features.items():
        data[col_name] = values

    return data

def normalize_all_features(data: pd.DataFrame) -> pd.DataFrame:
    """Apply every normalisation step in order and return the resulting frame.

    The steps run as: moving averages, Ichimoku, candle shape, volume.
    """
    steps = (
        normalize_moving_averages,
        normalize_ichimoku,
        normalize_candle_features,
        normalize_volume_features,
    )
    for step in steps:
        data = step(data)
    return data


In [None]:
h4_full.columns

In [None]:
import pandas as pd
from typing import List, Optional

def remove_absolute_value_features(data: pd.DataFrame,
                                   ma_periods: Optional[List[int]] = None,
                                   volume_ma_periods: Optional[List[int]] = None) -> pd.DataFrame:
    """
    Removes the original absolute value columns (OHLC, Volume, MA, Ichimoku)
    from the DataFrame, keeping only the remaining (normalized) features.

    Args:
        data: DataFrame containing both original and normalized features.
        ma_periods: List of periods used for price moving averages.
            ``None`` (the default) means ``[7, 14, 20, 60, 120]``.
        volume_ma_periods: List of periods used for volume moving averages.
            ``None`` (the default) means ``[20]``.

    Returns:
        A new DataFrame with the absolute value columns removed; ``data``
        itself is not modified. Candidate columns that are absent are ignored.
    """
    # None sentinels avoid shared mutable defaults and make the Optional
    # annotations truthful (None used to raise TypeError).
    if ma_periods is None:
        ma_periods = [7, 14, 20, 60, 120]
    if volume_ma_periods is None:
        volume_ma_periods = [20]

    # Candidates, in a fixed order so the printed list is stable:
    # OHLC, volume, price MAs, Ichimoku lines, volume MAs.
    cols_to_remove = ['open', 'high', 'low', 'close', 'volume']
    cols_to_remove.extend(f'MA_{period}' for period in ma_periods)
    cols_to_remove.extend([
        'conversion_line',
        'baseline',
        'leading_span_A',
        'leading_span_B',
        'lagging_span'
    ])
    cols_to_remove.extend(f'volume_MA_{period}' for period in volume_ma_periods)

    # Only drop what is actually present, so a partially built frame works.
    existing_cols_to_remove = [col for col in cols_to_remove if col in data.columns]

    print(f"Removing {len(existing_cols_to_remove)} absolute value columns: {existing_cols_to_remove}")
    data = data.drop(columns=existing_cols_to_remove)

    return data

In [None]:
# Step 3: Calculate Indicators on Full Data (Including Buffer)
def calculate_indicators_on_full_data(h4_full, d1_full, w1_full):
    """Build indicator features for H4, D1 and W1 from the full (buffered) data.

    Three stages run for all timeframes in turn: indicator calculation,
    normalisation, and removal of the absolute-value columns. The column
    index of every frame is printed after each stage.

    Returns:
        Tuple ``(h4_indicators, d1_indicators, w1_indicators)``.
    """
    print("🔄 Calculating indicators on full data (including buffer)...")

    def show_columns(frames):
        for frame in frames:
            print(frame.columns)

    labelled_inputs = (('H4', h4_full), ('D1', d1_full), ('W1', w1_full))

    # Stage 1: raw indicators.
    frames = [calculate_all_indicators(raw, label) for label, raw in labelled_inputs]
    show_columns(frames)

    # Stage 2: scale-free versions.
    frames = [normalize_all_features(frame) for frame in frames]
    show_columns(frames)

    # Stage 3: drop the price-level columns.
    frames = [remove_absolute_value_features(frame) for frame in frames]
    show_columns(frames)

    print("✅ All indicators calculated on full data")
    h4_indicators_full, d1_indicators_full, w1_indicators_full = frames
    return h4_indicators_full, d1_indicators_full, w1_indicators_full

# Calculate indicators on full data
h4_indicators_full, d1_indicators_full, w1_indicators_full = calculate_indicators_on_full_data(
    h4_full, d1_full, w1_full
)


In [None]:
h4_indicators_full.columns

In [None]:
h4_used_range=h4_indicators_full[(h4_indicators_full.index>=train_start)& (h4_indicators_full.index<=test_end)] 
print(h4_used_range.isnull().sum().sum())
print(d1_indicators_full[(d1_indicators_full.index>=train_start)& (d1_indicators_full.index<=test_end)].isnull().sum().sum())
print(w1_indicators_full[(w1_indicators_full.index>=train_start)& (w1_indicators_full.index<=test_end)].isnull().sum().sum())


In [None]:
import calendar


def get_previous_month_timestamp(timestamp):
    """Return the 10th day of the month preceding ``timestamp``.

    The input is parsed with ``pd.to_datetime``; only year, month and day are
    replaced, so the time-of-day component is carried over. January rolls
    back to December of the previous year.
    """
    dt = pd.to_datetime(timestamp)
    if dt.month == 1:
        year, month = dt.year - 1, 12
    else:
        year, month = dt.year, dt.month - 1
    return dt.replace(year=year, month=month, day=10)



In [None]:
# # Step 3.5: Remove Problematic Indicators After Calculation
# def remove_problematic_indicators(h4_indicators_full, d1_indicators_full, w1_indicators_full, m1_indicators_full):
#     """
#     Remove indicators that cannot be calculated with available data
#     - W1: Remove 120 MA (needs 2.3 years of data)
#     - M1: Remove 120 MA, 60 MA, leading_span_A, leading_span_B (need 5-10 years of data)
#     """
#     print("🧹 Removing problematic indicators...")
    
#     # W1: Remove 120 MA
#     w1_indicators_clean = w1_indicators_full.copy()
#     if 'W1_MA_120' in w1_indicators_clean.columns:
#         w1_indicators_clean = w1_indicators_clean.drop('W1_MA_120', axis=1)
#         print("✅ Removed W1_MA_120 (needs 2.3 years of data)")
    
#     # M1: Remove 120 MA, 60 MA, leading_span_A, leading_span_B
#     m1_indicators_clean = m1_indicators_full.copy()
#     problematic_m1_cols = ['M1_MA_120', 'M1_MA_60', 'M1_leading_span_A', 'M1_leading_span_B', 'M1_MACD_line', 'M1_MACD_signal', 'M1_MACD_hist']
    
#     for col in problematic_m1_cols:
#         if col in m1_indicators_clean.columns:
#             m1_indicators_clean = m1_indicators_clean.drop(col, axis=1)
#             print(f"✅ Removed {col} (needs 5-10 years of data)")
    
#     print(f"📊 Cleaned indicators:")
#     print(f"  H4: {len(h4_indicators_full.columns)} features (no changes)")
#     print(f"  D1: {len(d1_indicators_full.columns)} features (no changes)")
#     print(f"  W1: {len(w1_indicators_clean.columns)} features (removed 1)")
#     print(f"  M1: {len(m1_indicators_clean.columns)} features (removed 7)")
    
#     return h4_indicators_full, d1_indicators_full, w1_indicators_clean, m1_indicators_clean

# # Remove problematic indicators
# h4_indicators_clean, d1_indicators_clean, w1_indicators_clean = remove_problematic_indicators(
#     h4_indicators_full, d1_indicators_full, w1_indicators_full
# )


In [None]:
# Step 4: Temporal Alignment Functions (CORRECTED VERSION)
def align_timeframe_data(base_data, target_data, base_timeframe, target_timeframe):
    """
    Align slower-timeframe rows onto the base timeframe's timestamps.

    For every base timestamp ``t`` the row of ``target_data`` with the latest
    index value ``<= t - offset`` is used, where ``offset`` is one day for
    'D1'/'D1_lags' and one week for 'W1'/'W1_lags'. Base timestamps with no
    such row get NaN in every column.

    Args:
        base_data: frame whose index supplies the output timestamps (H4 data).
        target_data: frame to align (D1/W1 indicators or their lags). Its
            index does not need to be sorted; if a label occurs more than
            once, the last occurrence wins.
        base_timeframe: label of the base timeframe; used only in messages.
        target_timeframe: 'D1', 'W1', 'D1_lags' or 'W1_lags'.

    Returns:
        DataFrame indexed like ``base_data`` with the columns of
        ``target_data``. Column dtypes follow ``target_data`` (upcast where
        NaN has to be inserted) instead of all being ``object``.

    Raises:
        KeyError: if ``target_timeframe`` is not one of the supported labels.
    """
    print(f"🔄 Aligning {target_timeframe} data with {base_timeframe} timestamps...")

    # How far back from a base timestamp a target row must be stamped.
    timeframe_offsets = {
        'D1': pd.Timedelta(days=1),
        'W1': pd.Timedelta(weeks=1),
        'D1_lags': pd.Timedelta(days=1),
        'W1_lags': pd.Timedelta(weeks=1)
    }
    if target_timeframe not in timeframe_offsets:
        raise KeyError(
            f"Unsupported target_timeframe {target_timeframe!r}; "
            f"expected one of {sorted(timeframe_offsets)}"
        )
    offset = timeframe_offsets[target_timeframe]

    if len(target_data) == 0:
        # Nothing to look up: every base row is missing.
        aligned_data = pd.DataFrame(np.nan, index=base_data.index, columns=target_data.columns)
    else:
        # reindex(method='ffill') needs a sorted, unique index. The stable
        # sort plus keep='last' means the last of any duplicated label wins.
        lookup = target_data.sort_index(kind='mergesort')
        lookup = lookup[~lookup.index.duplicated(keep='last')]

        # One vectorised "as-of" lookup replaces the per-row Python loop.
        # 'ffill' selects the last row at or before each cutoff; it does not
        # fill NaN values stored inside the selected row.
        cutoff_times = base_data.index - offset
        aligned_data = lookup.reindex(cutoff_times, method='ffill')
        aligned_data.index = base_data.index

    print(f"✅ {target_timeframe} data aligned: {len(aligned_data.columns)} features, {len(aligned_data)} records")
    return aligned_data


In [None]:
# Step 5: Create Feature Sets A0→A3 with Cleaned Indicators
def create_feature_sets_with_cleaned_indicators(h4, d1, w1):
    """Build the nested feature sets A0, A1 and A2 on the H4 index.

    A0: H4 indicators only.
    A1: A0 plus D1 indicators aligned to H4 timestamps.
    A2: A1 plus W1 indicators aligned to H4 timestamps.

    Column names get the prefixes 'H4_', 'D1_' and 'W1_'. The prefixes are
    applied to copies, so the caller's frames keep their original column
    names and re-running the cell does not stack prefixes ('H4_H4_...').

    Args:
        h4, d1, w1: indicator frames for the three timeframes.

    Returns:
        Tuple ``(A0, A1, A2)``.
    """
    # add_prefix returns new frames; the inputs are left untouched.
    h4_prefixed = h4.add_prefix("H4_")
    d1_prefixed = d1.add_prefix("D1_")
    w1_prefixed = w1.add_prefix("W1_")

    # A0: H4 indicators only
    A0 = h4_prefixed.copy()

    # A1: H4 + D1 indicators - Align D1 with H4 timestamps
    d1_aligned = align_timeframe_data(A0, d1_prefixed, 'H4', 'D1')
    A1 = pd.concat([h4_prefixed, d1_aligned], axis=1)

    # A2: H4 + D1 + W1 indicators - Align W1 with H4 timestamps
    w1_aligned = align_timeframe_data(A0, w1_prefixed, 'H4', 'W1')
    A2 = pd.concat([h4_prefixed, d1_aligned, w1_aligned], axis=1)

    print(f"✅ Feature sets A0→A3 created with cleaned indicators:")
    print(f"  A0: {len(A0.columns)} features, {len(A0)} records")
    print(f"  A1: {len(A1.columns)} features, {len(A1)} records")
    print(f"  A2: {len(A2.columns)} features, {len(A2)} records")

    return A0, A1, A2

# Create feature sets with cleaned indicators
A0, A1, A2 = create_feature_sets_with_cleaned_indicators(
    h4_indicators_full, d1_indicators_full, w1_indicators_full
)


In [None]:
def validate_A0_to_A3(A0, A1, A2):
    """Print the total number of missing values of A0, A1 and A2 inside the
    fixed window 2020-05-12 .. 2025-09-19 (both bounds inclusive)."""
    print("🔍 Feature Set Validation:")
    window_start, window_end = '2020-05-12', '2025-09-19'

    def clip(frame):
        return frame[(frame.index >= window_start) & (frame.index <= window_end)]

    # Restrict all three sets first, then report.
    clipped = [(label, clip(frame)) for label, frame in (('A0', A0), ('A1', A1), ('A2', A2))]

    for label, frame in clipped:
        print(f"{label}_focused.isnull().sum().sum(): {frame.isnull().sum().sum()}")

validate_A0_to_A3(A0, A1, A2)

In [None]:
# Step 6: Create Historical Lag Features (Using Full Data with Buffer)
def create_lag_features(indicators_full, timeframe_name, lag_periods):
    """Create lagged copies of every column of ``indicators_full``.

    For each lag ``k`` in ``lag_periods`` and each column ``c`` the output
    has a column ``"<c>_lag_<k>"`` holding ``c`` shifted down by ``k`` rows.
    Columns are ordered lag by lag, then in source column order.

    Args:
        indicators_full: frame to lag (including any buffer rows).
        timeframe_name: label used only in the progress message.
        lag_periods: iterable of integer row shifts; repeated values are
            used once.

    Returns:
        DataFrame on ``indicators_full.index`` with
        ``len(lag_periods) * n_columns`` columns (none if ``lag_periods``
        is empty).
    """
    # Shift the whole frame once per lag and join the blocks in one concat.
    # Inserting hundreds of columns one by one fragments the frame.
    shifted_blocks = [
        indicators_full.shift(lag).add_suffix(f"_lag_{lag}")
        for lag in dict.fromkeys(lag_periods)
    ]

    if shifted_blocks:
        lag_features = pd.concat(shifted_blocks, axis=1)
    else:
        # pd.concat rejects an empty list; keep the index, no columns.
        lag_features = pd.DataFrame(index=indicators_full.index)

    print(f"✅ {timeframe_name} lags: {len(lag_features.columns)} features created")
    return lag_features

def create_all_lag_features_with_buffer(h4_indicators_clean, d1_indicators_clean, w1_indicators_clean):
    """Create lag features for the three inputs with fixed lag ranges.

    Lags used: 1..6 for the first (H4) frame, 1..7 for the second (D1) and
    1..4 for the third (W1). Each lag is a row shift of the frame it is
    applied to.

    Returns:
        Tuple ``(h4_lags, d1_lags, w1_lags)``.
    """
    print("⏰ Creating historical lag features using full data (including buffer)...")

    # (label, source frame, lags)
    plan = (
        ('H4', h4_indicators_clean, range(1, 7)),
        ('D1', d1_indicators_clean, range(1, 8)),
        ('W1', w1_indicators_clean, range(1, 5)),
    )
    lagged = {
        label: create_lag_features(frame, label, lags)
        for label, frame, lags in plan
    }

    print(f"✅ All lag features created using full data:")
    for label, frame in lagged.items():
        print(f"  {label} lags: {len(frame.columns)} features, {len(frame)} records")

    return lagged['H4'], lagged['D1'], lagged['W1']

# Create historical lag features using full data (including buffer)
# NOTE(review): the arguments are the combined sets A0/A1/A2, which all share the
# H4 index (A1 and A2 also contain the H4 columns), not per-timeframe indicator
# frames. The "D1" and "W1" lags are therefore row shifts of A1/A2 counted in H4
# bars - confirm this is intended.
h4_lags_full, d1_lags_full, w1_lags_full = create_all_lag_features_with_buffer(
    A0,A1,A2
)


In [None]:
h4_lags_focused=h4_lags_full[(h4_lags_full.index >= train_start) & (h4_lags_full.index <= test_end)]
print(h4_lags_focused.isnull().sum().sum())
print(h4_lags_focused.index.min(), h4_lags_focused.index.max())

d1_lags_focused = d1_lags_full[(d1_lags_full.index >= train_start)
                               & (d1_lags_full.index <= test_end)]
print(d1_lags_focused.isnull().sum().sum())
print(d1_lags_focused.index.min(), d1_lags_focused.index.max())

w1_lags_focused = w1_lags_full[(w1_lags_full.index >= train_start)
                              & (w1_lags_full.index <= test_end)]
print(w1_lags_focused.isnull().sum().sum())
print(w1_lags_focused.index.min(), w1_lags_focused.index.max())


In [None]:
# Step 7: Create A4 Feature Set with Temporal Alignment (Using Full Data)
def create_a4_features_with_temporal_alignment(A2:pd.DataFrame, h4_lags_full:pd.DataFrame, d1_lags_full:pd.DataFrame, w1_lags_full:pd.DataFrame):
    """Append the lag features to A2 and return the combined set.

    The D1 and W1 lag frames are first aligned to A2's timestamps with
    ``align_timeframe_data``; the H4 lag frame is used as is. Lag columns
    get the prefixes 'H4_lags_', 'D1_lags_' and 'W1_lags_'. The progress
    messages call the result "A4"; the notebook stores it as ``A3``.

    Returns:
        DataFrame with A2's columns followed by all lag columns.
    """
    # Bring the slower-timeframe lags onto the H4 timestamps.
    d1_on_h4 = align_timeframe_data(A2, d1_lags_full, 'H4', 'D1_lags')
    w1_on_h4 = align_timeframe_data(A2, w1_lags_full, 'H4', 'W1_lags')

    # Prefix each family so the column names stay distinct after the join.
    lag_blocks = [
        h4_lags_full.add_prefix("H4_lags_"),
        d1_on_h4.add_prefix("D1_lags_"),
        w1_on_h4.add_prefix("W1_lags_"),
    ]
    A3 = pd.concat([A2, *lag_blocks], axis=1)

    current_count = len(A2.columns)
    total_count = len(A3.columns)
    print(f"✅ A4 feature set created with temporal alignment:")
    print(f"  A4: {total_count} features, {len(A3)} records")
    print(f"  - Current indicators: {current_count}")
    print(f"  - Historical lags: {total_count - current_count}")

    return A3

# Create A4 feature set with temporal alignment
A3 = create_a4_features_with_temporal_alignment(A2, h4_lags_full, d1_lags_full, w1_lags_full)


In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(A3.columns)

In [None]:
# Step 8: Data Validation & Quality Checks
def validate_feature_sets(A0,
                          A1,
                          A2,
                          A3,
                          train_start='2020-05-12',
                          test_end='2025-09-19'):
    """Report feature counts and missing values inside the train/test window.

    Each set is restricted to ``train_start <= index <= test_end``. The
    function prints the column count against a hard-coded expectation, then
    the number and percentage of missing cells, the covered period and the
    row count. A set with no rows in the window is reported as such instead
    of raising.

    Args:
        A0, A1, A2, A3: feature-set DataFrames with a datetime index.
        train_start, test_end: inclusive window bounds.

    Returns:
        ``(feature_counts, (A0_focused, A1_focused, A2_focused, A3_focused))``
        where ``feature_counts`` maps set name to column count.
    """
    print("🔍 Feature Set Validation (Train/Test Period Only):")
    print(f"📅 Period: {train_start} to {test_end}")

    A0_focused = A0[(A0.index >= train_start) & (A0.index <= test_end)]
    A1_focused = A1[(A1.index >= train_start) & (A1.index <= test_end)]
    A2_focused = A2[(A2.index >= train_start) & (A2.index <= test_end)]
    A3_focused = A3[(A3.index >= train_start) & (A3.index <= test_end)]

    feature_counts = {
        'A0': len(A0_focused.columns),
        'A1': len(A1_focused.columns),
        'A2': len(A2_focused.columns),
        'A3': len(A3_focused.columns)
    }

    # NOTE(review): these expectations look stale relative to the current
    # pipeline (normalised columns are added and absolute ones removed), so a
    # ❌ here may not indicate a problem - confirm the intended counts.
    expected_counts = {'A0': 19, 'A1': 38, 'A2': 56, 'A3': 68}

    print("🔍 Feature Set Validation:")
    for set_name, count in feature_counts.items():
        expected = expected_counts[set_name]
        status = "✅" if count == expected else "❌"
        print(f"  {status} {set_name}: {count}/{expected} features")

    # Check for missing values in focused period only
    print("\n🔍 Missing Values Check (Train/Test Period Only):")
    for set_name, features in [('A0', A0_focused), ('A1', A1_focused),
                               ('A2', A2_focused), ('A3', A3_focused)]:
        missing_count = features.isnull().sum().sum()
        total_cells = features.shape[0] * features.shape[1]
        # Guard the empty case: no cells means nothing can be missing.
        missing_percentage = (missing_count / total_cells) * 100 if total_cells else 0.0
        print(
            f"  {set_name}: {missing_count:,} missing values ({missing_percentage:.2f}%)"
        )
        if len(features) > 0:
            print(f"    Period: {features.index[0]} to {features.index[-1]}")
        else:
            # index[0] would raise IndexError on an empty window.
            print("    Period: n/a (no rows in the requested window)")
        print(f"    Records: {len(features)}")

    return feature_counts, (A0_focused, A1_focused, A2_focused, A3_focused)

# Validate feature sets
validation_results_focused, focused_sets = validate_feature_sets(A0, A1, A2, A3)


In [None]:
# Keep only the last focused frame (A3) returned by validate_feature_sets.
*_,A3_focused=focused_sets

# Check if M1 alignment is working correctly
# NOTE(review): the monthly (M1) file read is commented out in load_full_data, so
# the 'M1_' filters below presumably match no columns and report 0 - confirm or remove.
print("M1 temporal alignment check:")
print(f"A3 missing values: {A3_focused.isnull().sum().sum()}")
print(
    f"A3 M1 columns missing: {A3_focused.filter(regex='M1_').isnull().sum().sum()}"
)

# Check lag feature missing values
# NOTE(review): filter(regex=...) matches anywhere in the column name, so a pattern
# such as 'H4_.*_lag_' also matches columns like 'D1_lags_H4_..._lag_1' if those
# exist; the per-timeframe counts may overlap - verify against A3.columns.
print("Lag feature missing values:")
print(f"A4 H4 lags missing: {A3_focused.filter(regex='H4_.*_lag_').isnull().sum().sum()}")
print(f"A4 D1 lags missing: {A3_focused.filter(regex='D1_.*_lag_').isnull().sum().sum()}")
print(f"A4 W1 lags missing: {A3_focused.filter(regex='W1_.*_lag_').isnull().sum().sum()}")
print(f"A4 M1 lags missing: {A3_focused.filter(regex='M1_.*_lag_').isnull().sum().sum()}")

In [None]:
duplicated_cols = A3.columns[A3.columns.duplicated()]
print(f"\n중복된 컬럼명: {duplicated_cols.tolist()}")

In [None]:
import pandas as pd
import numpy as np

def create_target_variable_first_threshold(h4_full: pd.DataFrame,
                                                window_size: int = 30,
                                                upper_threshold: float = 0.10, # +10%
                                                lower_threshold: float = -0.15, # -15%
                                                train_start: str = '2020-05-12',
                                                test_end: str = '2025-09-19') -> pd.DataFrame:
    """
    Create the binary target with "first threshold hit" logic.

    For each bar ``i`` the next ``window_size`` bars are scanned in order:
      * y=1 if a future 'low' falls to ``lower_threshold`` (relative to the
        close of bar ``i``) before a future 'close' rises to
        ``upper_threshold``, or on the same bar;
      * y=0 if the upper threshold is hit first, or neither is hit.
    Bars without a full future window, and bars whose close is 0, keep y=0.

    Args:
        h4_full: DataFrame with 'close' and 'low' columns and a datetime index.
        window_size: number of future bars to inspect.
        upper_threshold: relative rise of a future close that counts as a hit.
        lower_threshold: relative drop of a future low that counts as a hit.
        train_start, test_end: inclusive bounds applied to the result.

    Returns:
        DataFrame with a single integer 'target' column, restricted to the
        requested date range.
    """

    print(f"🎯 Creating target variable ({window_size}-period window, first threshold logic)...")

    # Plain arrays instead of per-row .iloc lookups; the arithmetic is the same.
    closes = h4_full['close'].to_numpy()
    lows = h4_full['low'].to_numpy()
    total_len = len(h4_full)

    # Default label 0 (REST/BUY state) everywhere.
    targets = np.zeros(total_len, dtype=np.int64)

    print("🔄 Calculating target labels (this might take a while)...")

    for i in range(total_len):
        # Print progress
        if i % 1000 == 0:
            print(f"   Processing {i}/{total_len} records...")

        # Not enough future data for a full window: keep the default y=0.
        if i + window_size >= total_len:
            continue

        current_close = closes[i]

        # Avoid division by zero: keep the default y=0.
        if current_close == 0:
            continue

        window = slice(i + 1, i + 1 + window_size)

        # Drops are measured on future lows, rises on future closes.
        drop_hit = (lows[window] - current_close) / current_close <= lower_threshold
        if not drop_hit.any():
            # Upper threshold first or nothing hit: both mean y=0.
            continue

        rise_hit = (closes[window] - current_close) / current_close >= upper_threshold
        first_drop = drop_hit.argmax()
        first_rise = rise_hit.argmax() if rise_hit.any() else window_size

        # '<=' because the lower threshold takes precedence when both
        # thresholds are hit on the same bar.
        if first_drop <= first_rise:
            targets[i] = 1  # Lower hit first -> SELL

    y_full = pd.DataFrame({'target': targets}, index=h4_full.index)

    # Labels are computed on the full history first and filtered afterwards.
    print("Filtering results to the specified date range...")
    y_focused = y_full[(y_full.index >= train_start)
                       & (y_full.index <= test_end)].copy()

    print(f"✅ Target variable created:")
    print(f"   Total records in focused period: {len(y_focused)}")
    print(f"   Sell labels (target=1): {y_focused['target'].sum()}")
    print(f"   Rest/Buy labels (target=0): {len(y_focused) - y_focused['target'].sum()}")

    return y_focused

# --- 함수 호출 예시 ---
# y_target = create_target_variable_first_threshold_slow(h4_full)
# print(y_target.head())
# print(y_target['target'].value_counts())

# create_target_variable_first_threshold(h4_full)

In [None]:
def save_focused_feature_sets_complete(A0, A1, A2, A3, h4_full, train_start='2020-05-12', test_end='2025-09-19'):
    """
    Trim feature sets A0-A3 to the focused period, build the target from the
    full H4 frame, and write everything to ../features as parquet.

    Args:
        A0, A1, A2, A3: Feature frames; rows are selected by comparing the
            index against train_start / test_end (both bounds inclusive).
        h4_full: Full H4 frame handed to create_target_variable_first_threshold.
        train_start: Lower bound of the focused period.
        test_end: Upper bound of the focused period.

    Returns:
        Tuple (A0_focused, A1_focused, A2_focused, A3_focused, y_focused).
    """
    print("💾 Saving focused feature sets...")

    # Output location (created on demand)
    out_dir = Path('../features')
    out_dir.mkdir(exist_ok=True)

    def _clip(frame):
        # Keep only rows whose index lies inside [train_start, test_end]
        in_range = (frame.index >= train_start) & (frame.index <= test_end)
        return frame[in_range]

    # Trim every feature set first ...
    clipped = {
        label: _clip(frame)
        for label, frame in (('A0', A0), ('A1', A1), ('A2', A2), ('A3', A3))
    }

    # ... then build the target from the FULL H4 data (the target function
    # applies the date filter itself).
    y_focused = create_target_variable_first_threshold(h4_full, train_start=train_start, test_end=test_end)

    # Nothing is written until all frames and the target exist
    for label, frame in clipped.items():
        frame.to_parquet(out_dir / f'{label}.parquet')
    y_focused.to_parquet(out_dir / 'y.parquet')

    print("🎉 All feature sets saved successfully!")
    return clipped['A0'], clipped['A1'], clipped['A2'], clipped['A3'], y_focused

# Run the complete save function: trims A0-A3 to the focused period, builds the
# target from the full H4 frame, and writes A0-A3 plus y.parquet to ../features.
# A0-A3 and h4_full are not defined in this cell — presumably produced by earlier cells.
save_focused_feature_sets_complete(A0, A1, A2, A3,  h4_full, train_start='2020-05-12', test_end='2025-09-19')

In [None]:
def validate_saved_feature_sets(features_dir='../features'):
    """
    Print a validation report for the saved feature-set and target files.

    For each expected parquet file that exists, the report covers shape, index
    range, missing values, dtypes, infinite values, and file-specific checks
    (label distribution for y.parquet, feature counts for A*.parquet).

    Args:
        features_dir: Path to features directory

    Returns:
        None. Every finding is printed; a file that fails to load or inspect
        is reported and the remaining files are still processed.
    """
    banner = "=" * 50
    print("🔍 Validating Saved Feature Sets...")
    print(banner)

    # Bail out early when there is nothing to inspect
    root = Path(features_dir)
    if not root.exists():
        print("❌ Features directory not found!")
        return

    # Expected column count per feature set; also defines which files to look for
    feature_counts = {'A0': 19, 'A1': 38, 'A2': 56, 'A3': 67, 'A4': 416}
    file_names = [f'{stem}.parquet' for stem in feature_counts] + ['y.parquet']

    print("📁 File Existence Check:")
    for name in file_names:
        if (root / name).exists():
            verdict = f"  ✅ {name} - Found"
        else:
            verdict = f"  ❌ {name} - Missing!"
        print(verdict)

    print("\n📊 Data Validation:")

    def _report(name, frame):
        # Basic info
        n_rows, n_cols = frame.shape
        print(f"  📏 Shape: {n_rows} records × {n_cols} features")
        print(f"  📅 Period: {frame.index[0]} to {frame.index[-1]}")

        # Missing values, overall and per column
        nulls_per_column = frame.isnull().sum()
        missing_count = nulls_per_column.sum()
        if missing_count == 0:
            print(f"  ✅ Missing values: {missing_count} (0.00%)")
        else:
            total_cells = n_rows * n_cols
            missing_percentage = (missing_count / total_cells) * 100 if total_cells > 0 else 0
            print(f"  ⚠️ Missing values: {missing_count} ({missing_percentage:.2f}%)")

            affected = nulls_per_column[nulls_per_column > 0]
            if len(affected) > 0:
                print(f"    Columns with missing values:")
                for col, count in affected.items():
                    print(f"      {col}: {count} missing")

        # Data types
        print(f"  📋 Data types: {frame.dtypes.value_counts().to_dict()}")

        # Infinite values (numeric columns only)
        numeric_part = frame.select_dtypes(include=[np.number])
        inf_count = np.isinf(numeric_part).sum().sum()
        marker = "✅" if inf_count == 0 else "⚠️"
        print(f"  {marker} Infinite values: {inf_count}")

        # Target-specific checks
        if name == 'y.parquet':
            print(f"  🎯 Target variable validation:")
            labels = frame['target']
            print(f"    Unique values: {labels.unique()}")
            print(f"    Value counts: {labels.value_counts().to_dict()}")
            print(f"    Sell percentage: {labels.mean()*100:.2f}%")

        # Feature-set-specific checks
        if name.startswith('A'):
            print(f"  🔢 Feature set validation:")
            print(f"    Feature count: {len(frame.columns)}")
            print(f"    Sample features: {list(frame.columns[:5])}")

            stem = name.replace('.parquet', '')
            if stem in feature_counts:
                expected = feature_counts[stem]
                actual = len(frame.columns)
                if actual != expected:
                    print(f"    ⚠️ Feature count mismatch: {actual} (expected {expected})")
                else:
                    print(f"    ✅ Feature count matches expected: {actual}")

    for name in file_names:
        target = root / name
        if not target.exists():
            continue

        print(f"\n🔍 Validating {name}:")

        # One broken file must not stop the report for the others
        try:
            _report(name, pd.read_parquet(target))
        except Exception as e:
            print(f"  ❌ Error loading {name}: {e}")

    print("\n" + banner)
    print("🎉 Validation Complete!")

# Run validation on the default output directory; the report is printed and nothing is returned.
validate_saved_feature_sets('../features')

In [35]:
# Target Variable Distribution Analysis
def analyze_target_distribution(y_data, title="Target Variable Distribution"):
    """
    Print a text report on the SELL/REST balance of a target frame and return
    the headline numbers.

    Args:
        y_data: DataFrame with a 'target' column (a value of 1 is reported as
            SELL, anything else as REST). The index must provide `.year` and
            `.to_period('M')`, i.e. behave like a DatetimeIndex.
        title: Heading printed at the top of the report.

    Returns:
        dict with keys 'total_samples', 'sell_samples', 'rest_samples',
        'sell_percentage', 'imbalance_ratio', 'yearly_distribution',
        'monthly_distribution' and 'consecutive_sells'.
    """
    print(f"🎯 {title}")
    print("=" * 60)

    labels = y_data['target']

    # Headline counts
    total_samples = len(y_data)
    sell_samples = labels.sum()
    rest_samples = total_samples - sell_samples
    sell_percentage = (sell_samples / total_samples) * 100

    print(f"📊 Basic Statistics:")
    print(f"  Total samples: {total_samples:,}")
    print(f"  SELL (target=1): {sell_samples:,} ({sell_percentage:.2f}%)")
    print(f"  REST (target=0): {rest_samples:,} ({100-sell_percentage:.2f}%)")

    # REST rows per SELL row
    imbalance_ratio = rest_samples / sell_samples
    print(f"  Class imbalance ratio: {imbalance_ratio:.2f}:1 (REST:SELL)")

    # Per-class counts, ordered by label value
    print(f"\n📈 Value Counts:")
    for value, count in labels.value_counts().sort_index().items():
        label = "REST" if value != 1 else "SELL"
        percentage = (count / total_samples) * 100
        print(f"  {label} (target={value}): {count:,} ({percentage:.2f}%)")

    # Calendar breakdowns are computed on a copy so the caller's frame is untouched
    print(f"\n📅 Temporal Distribution:")
    dated = y_data.assign(
        year=y_data.index.year,
        year_month=y_data.index.to_period('M'),
    )

    def _sell_breakdown(key):
        # rows per bucket ('count'), SELL rows ('sum'), plus derived share and REST rows
        table = dated.groupby(key)['target'].agg(['count', 'sum']).reset_index()
        table['sell_pct'] = (table['sum'] / table['count']) * 100
        table['rest_count'] = table['count'] - table['sum']
        return table

    yearly_dist = _sell_breakdown('year')
    print(f"  Yearly Distribution:")
    for year, sells, rests, pct in zip(yearly_dist['year'], yearly_dist['sum'],
                                       yearly_dist['rest_count'], yearly_dist['sell_pct']):
        print(f"    {int(year)}: {int(sells)} SELL / {int(rests)} REST ({pct:.2f}% SELL)")

    monthly_dist = _sell_breakdown('year_month')
    print(f"\n  Monthly Distribution (Last 12 months):")
    recent = monthly_dist.tail(12)
    for period, sells, rests, pct in zip(recent['year_month'], recent['sum'],
                                         recent['rest_count'], recent['sell_pct']):
        print(f"    {period}: {int(sells)} SELL / {int(rests)} REST ({pct:.2f}% SELL)")

    # Lengths of uninterrupted runs of SELL labels
    print(f"\n🔍 Consecutive SELL Analysis:")
    consecutive_sells = []
    streak = 0
    for flag in labels.values:
        if flag == 1:
            streak += 1
            continue
        if streak > 0:
            consecutive_sells.append(streak)
        streak = 0
    if streak > 0:  # series ended while still inside a SELL run
        consecutive_sells.append(streak)

    if not consecutive_sells:
        print(f"  No consecutive SELL sequences found")
    else:
        max_consecutive = max(consecutive_sells)
        avg_consecutive = sum(consecutive_sells) / len(consecutive_sells)
        print(f"  Max consecutive SELLs: {max_consecutive}")
        print(f"  Average consecutive SELLs: {avg_consecutive:.2f}")
        print(f"  Total consecutive SELL sequences: {len(consecutive_sells)}")

    # Gaps, in hours, between successive SELL timestamps
    print(f"\n🎯 SELL Clustering Analysis:")
    sell_times = y_data[labels == 1].index
    if len(sell_times) > 1:
        gaps = [
            (later - earlier).total_seconds() / 3600
            for earlier, later in zip(sell_times[:-1], sell_times[1:])
        ]

        if gaps:
            avg_time_between_sells = sum(gaps) / len(gaps)
            min_time_between_sells = min(gaps)
            max_time_between_sells = max(gaps)

            print(f"  Average time between SELLs: {avg_time_between_sells:.1f} hours")
            print(f"  Min time between SELLs: {min_time_between_sells:.1f} hours")
            print(f"  Max time between SELLs: {max_time_between_sells:.1f} hours")

            # SELLs no more than 5 H4 periods (20 hours) after the previous SELL
            close_sells = len([gap for gap in gaps if gap <= 20])
            print(f"  SELLs within 20 hours: {close_sells} ({close_sells/len(gaps)*100:.1f}%)")

    return {
        'total_samples': total_samples,
        'sell_samples': sell_samples,
        'rest_samples': rest_samples,
        'sell_percentage': sell_percentage,
        'imbalance_ratio': imbalance_ratio,
        'yearly_distribution': yearly_dist,
        'monthly_distribution': monthly_dist,
        'consecutive_sells': consecutive_sells
    }

# Load and analyze target distribution
# NOTE(review): this reads whichever y.parquet was written last — both
# save_focused_feature_sets_complete and create_and_save_y write to this path.
print("🔍 Loading target variable for analysis...")
y_analysis = pd.read_parquet('../features/y.parquet')

# Analyze target distribution (prints the report and returns the summary dict)
target_stats = analyze_target_distribution(y_analysis, "BTC Sell Signal Target Distribution")

# Text summary of the class imbalance (no plot is drawn here). Only the two
# numbers are taken from target_stats; the surrounding wording is fixed.
print(f"\n📊 Target Distribution Summary:")
print(f"  This is a highly imbalanced dataset with {target_stats['sell_percentage']:.2f}% SELL signals")
print(f"  Class imbalance ratio: {target_stats['imbalance_ratio']:.1f}:1")
print(f"  This imbalance will require careful handling in model training")
print(f"  Consider using techniques like:")
print(f"    - Class weights (scale_pos_weight)")
print(f"    - SMOTE or other oversampling")
print(f"    - Focal Loss")
print(f"    - Stratified sampling")


🔍 Loading target variable for analysis...
🎯 BTC Sell Signal Target Distribution
📊 Basic Statistics:
  Total samples: 11,737
  SELL (target=1): 1,560 (13.29%)
  REST (target=0): 10,177 (86.71%)
  Class imbalance ratio: 6.52:1 (REST:SELL)

📈 Value Counts:
  REST (target=0): 10,177 (86.71%)
  SELL (target=1): 1,560 (13.29%)

📅 Temporal Distribution:
  Yearly Distribution:
    2020: 77 SELL / 1327 REST (5.48% SELL)
    2021: 637 SELL / 1553 REST (29.09% SELL)
    2022: 464 SELL / 1726 REST (21.19% SELL)
    2023: 90 SELL / 2100 REST (4.11% SELL)
    2024: 198 SELL / 1998 REST (9.02% SELL)
    2025: 94 SELL / 1473 REST (6.00% SELL)

  Monthly Distribution (Last 12 months):
    2024-10: 0 SELL / 186 REST (0.00% SELL)
    2024-11: 0 SELL / 180 REST (0.00% SELL)
    2024-12: 12 SELL / 174 REST (6.45% SELL)
    2025-01: 23 SELL / 163 REST (12.37% SELL)
    2025-02: 43 SELL / 125 REST (25.60% SELL)
    2025-03: 23 SELL / 163 REST (12.37% SELL)
    2025-04: 5 SELL / 175 REST (2.78% SELL)
    2025

## Task 2.2 Implementation Complete! ✅

### **Summary of CORRECTED Implementation**

**Approach**: Calculate everything together, then split only at the save step.

**Steps Completed**:
1. ✅ **Load Full Data**: Complete dataset including buffer (2020-03-01 to 2025-10-28)
2. ✅ **Technical Indicators**: Calculated all 19 indicators per timeframe on full data
3. ✅ **Feature Sets A0→A3**: Created incremental feature sets with proper temporal alignment
4. ✅ **Historical Lag Features**: Created lag features using full data for complete historical context
5. ✅ **A4 Feature Set**: Combined A3 + all historical lags (437 features)
6. ✅ **Data Validation**: Verified feature counts and missing values
7. ✅ **Save Clean Data**: Split clean period (2020-05-12 to 2025-09-19) only at save step

### **Key Fixes Applied**:
1. **Full Data Calculations**: All indicators and lags calculated on complete dataset
2. **Proper Temporal Alignment**: Enhanced alignment logic with timeframe-specific offsets
3. **Complete Historical Context**: W1 data from 2020-05-04, M1 data from 2020-05-01
4. **No Missing Data**: Full historical context for all lag features
5. **Clean Final Output**: Buffer data used for calculations but not stored

### **Temporal Alignment Logic**:
- **H4 timestamp 2020-05-11 00:00:00** (candle closed at 2020-05-11 04:00:00):
  - **D1 data**: `base_timestamp - 1d` → Use 2020-05-10 00:00:00 (previous day's close) ✅
  - **W1 data**: `base_timestamp - 1w` → Use 2020-05-04 00:00:00 (previous week's close) ✅
  - **M1 data**: `base_timestamp - 1m` → Use 2020-04-11 00:00:00 (previous month's close) ✅
- **Uses timeframe-specific offsets** to ensure proper temporal alignment
- **Ensures no future data leakage** and realistic trading scenarios

### **Expected Outputs**
- **A0.parquet**: 19 features (H4 only)
- **A1.parquet**: 38 features (H4 + D1)
- **A2.parquet**: 57 features (H4 + D1 + W1)
- **A3.parquet**: 76 features (H4 + D1 + W1 + M1)
- **A4.parquet**: 437 features (A3 + all historical lags)

### **Next Steps**
- **Step 3**: Train/test split based on timeline
- **Step 4**: Ablation Study experiments (A0→A4_Pruned)
- **Step 5**: Results analysis and RQ answers

**Ready to proceed to Step 3!** 🚀


In [33]:
# Test specific combinations: Window 40 (+10/-10), Window 50 (+10/-12)

def test_specific_combinations(h4_full: pd.DataFrame, configs=None,
                               train_start='2020-05-12', test_end='2025-09-19'):
    """
    Build the target for a handful of labeling configurations and compare
    their SELL ratios.

    By default the two requested combinations are tested:
    - Window: 40, Upper: +10.0%, Lower: -10.0%
    - Window: 50, Upper: +10.0%, Lower: -12.0%

    Args:
        h4_full: Full H4 frame passed to create_target_variable_first_threshold.
        configs: Optional list of dicts with keys 'window', 'upper', 'lower'
            and 'name'. None selects the two default combinations above.
        train_start: Start of the labeled period passed to the target function.
        test_end: End of the labeled period passed to the target function.

    Returns:
        List with one dict per configuration that ran successfully, holding
        'window', 'upper', 'lower', 'name', 'overall_ratio' (overall SELL %),
        'consistency' (% of months with at least one SELL), 'avg_monthly' and
        'std_monthly' (mean / std of the monthly SELL %).
    """
    print("🧪 Testing Specific Combinations")
    print("=" * 60)

    if configs is None:
        configs = [
            {"window": 40, "upper": 0.10, "lower": -0.10, "name": "40w +10/-10"},
            {"window": 50, "upper": 0.10, "lower": -0.12, "name": "50w +10/-12"},
        ]

    results = []

    for config in configs:
        print(f"\n🧪 Window={config['window']}, Upper=+{config['upper']*100:.1f}%, Lower={config['lower']*100:.1f}% ({config['name']})")
        print("-" * 60)

        try:
            y_result = create_target_variable_first_threshold(
                h4_full,
                window_size=config['window'],
                upper_threshold=config['upper'],
                lower_threshold=config['lower'],
                train_start=train_start,
                test_end=test_end
            )

            # Monthly aggregation: one grouped pass instead of a Python loop
            # over every row. Only months present in the index form a group,
            # so every group has at least one row.
            is_sell = y_result["target"] == 1
            month_key = y_result.index.to_period('M')
            sells_per_month = is_sell.groupby(month_key).sum()
            rows_per_month = is_sell.groupby(month_key).size()
            ratios = (sells_per_month / rows_per_month * 100).to_numpy(dtype=float)

            # Summary
            total_samples = len(y_result)
            total_sell = int(y_result["target"].sum())
            overall_ratio = total_sell / total_samples * 100 if total_samples else 0
            n_months = len(ratios)
            months_with_sell = int((ratios > 0).sum())
            consistency = months_with_sell / n_months * 100 if n_months else 0
            avg_ratio = float(np.mean(ratios)) if n_months else 0.0
            std_ratio = float(np.std(ratios)) if n_months else 0.0

            print(f"   📊 Overall SELL: {overall_ratio:.2f}% ({total_sell:,}/{total_samples:,})")
            print(f"   📅 Monthly consistency: {consistency:.1f}% ({months_with_sell}/{n_months} months)")
            print(f"   Avg monthly SELL: {avg_ratio:.2f}% | Std: {std_ratio:.2f}%")

            results.append({
                "window": config['window'],
                "upper": config['upper'],
                "lower": config['lower'],
                "name": config['name'],
                "overall_ratio": overall_ratio,
                "consistency": consistency,
                "avg_monthly": avg_ratio,
                "std_monthly": std_ratio,
            })

        except Exception as e:
            # Best effort: report the failing configuration and move on, so
            # one bad configuration does not discard the other results.
            print(f"   ❌ Error: {e}")
            continue

    # Summary table
    print("\n📊 Summary (Specific Combinations)")
    print("=" * 80)
    print(f"{'Config':<15} {'Overall%':<10} {'MonthlyAvg%':<12} {'Consistency%':<12} {'Std%':<8}")
    print("-" * 80)
    for r in results:
        print(f"{r['name']:<15} {r['overall_ratio']:>8.2f}% {r['avg_monthly']:>10.2f}% {r['consistency']:>10.1f}% {r['std_monthly']:>6.2f}%")

    return results

# Compare the candidate labeling configurations. Each configuration recomputes
# the labels over the whole H4 frame, so this cell can take a while.
print("🚀 Testing specific combinations...")
specific_results = test_specific_combinations(h4_full)

🚀 Testing specific combinations...
🧪 Testing Specific Combinations

🧪 Window=40, Upper=+10.0%, Lower=-10.0% (40w +10/-10)
------------------------------------------------------------
🎯 Creating target variable (40-period window, first threshold logic)...
🔄 Calculating target labels (this might take a while)...
   Processing 0/12403 records...
   Processing 1000/12403 records...
   Processing 2000/12403 records...
   Processing 3000/12403 records...
   Processing 4000/12403 records...
   Processing 5000/12403 records...
   Processing 6000/12403 records...
   Processing 7000/12403 records...
   Processing 8000/12403 records...
   Processing 9000/12403 records...
   Processing 10000/12403 records...
   Processing 11000/12403 records...
   Processing 12000/12403 records...
Filtering results to the specified date range...
✅ Target variable created:
   Total records in focused period: 11737
   Sell labels (target=1): 1763
   Rest/Buy labels (target=0): 9974
   📊 Overall SELL: 15.02% (1,763/1

In [38]:
# Show the last five H4 candles to check where the loaded data ends.
h4_full.tail(5)

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-27 08:00:00,115554.59,115607.48,114814.69,115362.02,3701.92326
2025-10-27 12:00:00,115362.02,115437.48,114503.99,114969.68,4493.43933
2025-10-27 16:00:00,114969.68,115790.0,114790.81,114942.64,2258.20754
2025-10-27 20:00:00,114942.64,114942.64,113830.01,114107.65,2236.06411
2025-10-28 00:00:00,114107.65,114547.2,113777.01,113777.01,1435.83458


In [34]:
def create_and_save_y(h4_full, train_start='2020-05-12', test_end='2025-09-19',
                      window_size=50, upper_threshold=0.10, lower_threshold=-0.12,
                      features_dir='../features'):
    """
    Build the target variable from the full H4 data and save it as y.parquet.

    Only the target is written; feature sets are not touched. An existing
    y.parquet in `features_dir` is replaced.

    Args:
        h4_full: Full H4 frame passed to create_target_variable_first_threshold.
        train_start: Start of the labeled period passed to the target function.
        test_end: End of the labeled period passed to the target function.
        window_size: Look-ahead window in H4 periods (default 50).
        upper_threshold: Upper price-change threshold (default +10%).
        lower_threshold: Lower price-change threshold (default -12%).
        features_dir: Directory that receives y.parquet (default '../features').

    Returns:
        The target frame produced by create_target_variable_first_threshold.
    """
    print("💾 Saving focused feature sets...")

    # Create features directory (including missing parents for custom locations)
    features_dir = Path(features_dir)
    features_dir.mkdir(parents=True, exist_ok=True)

    # Create target variable using FULL H4 data
    y_focused = create_target_variable_first_threshold(
        h4_full,
        window_size=window_size,
        upper_threshold=upper_threshold,
        lower_threshold=lower_threshold,
        train_start=train_start, test_end=test_end)

    # Save the target only
    y_focused.to_parquet(features_dir / 'y.parquet')

    print("🎉 All feature sets saved successfully!")
    return y_focused

# Regenerate and save only the target (50-period window, +10% / -12% thresholds).
# This writes ../features/y.parquet — the same path save_focused_feature_sets_complete writes to.
create_and_save_y(h4_full, train_start='2020-05-12', test_end='2025-09-19')

💾 Saving focused feature sets...
🎯 Creating target variable (50-period window, first threshold logic)...
🔄 Calculating target labels (this might take a while)...
   Processing 0/12403 records...
   Processing 1000/12403 records...
   Processing 2000/12403 records...
   Processing 3000/12403 records...
   Processing 4000/12403 records...
   Processing 5000/12403 records...
   Processing 6000/12403 records...
   Processing 7000/12403 records...
   Processing 8000/12403 records...
   Processing 9000/12403 records...
   Processing 10000/12403 records...
   Processing 11000/12403 records...
   Processing 12000/12403 records...
Filtering results to the specified date range...
✅ Target variable created:
   Total records in focused period: 11737
   Sell labels (target=1): 1560
   Rest/Buy labels (target=0): 10177
🎉 All feature sets saved successfully!


Unnamed: 0_level_0,target
timestamp,Unnamed: 1_level_1
2020-05-12 00:00:00,0
2020-05-12 04:00:00,0
2020-05-12 08:00:00,0
2020-05-12 12:00:00,0
2020-05-12 16:00:00,0
...,...
2025-09-18 08:00:00,0
2025-09-18 12:00:00,0
2025-09-18 16:00:00,0
2025-09-18 20:00:00,0
