In [None]:
# 每次运行时重新加载config.py
from importlib import reload
import config as c
reload(c)

print(f"已加载配置：factor={c.factor}, factor2={c.factor2}, operation={c.operation}, preprocess={c.preprocess}, USE_ALL_MODELS={c.USE_ALL_MODELS}" )

## Resampling 1-minute data to the desired timeframe

In [None]:
import os
import pandas as pd
import config as c

def prepare_price_data(
    csv_path: str,  # full path of the input CSV file
    datasource: str = 'bybit_btcusdt',
    factor: str = 'price',
    timeframe: str = '1D',
    delay_minutes: int = 0
):
    """
    Resample 1-minute candle data to ``timeframe`` and save the result as CSV.

    The CSV is read with its ``Time`` column parsed as datetimes and used as
    the index. The index can be shifted before resampling, the candles are
    aggregated (first/max/min/last for OHLC, sum for volume columns), buckets
    with any missing value are dropped, and a ``start_time`` column holding
    the bucket start as a Unix timestamp in milliseconds is appended.

    Args:
        csv_path: Full path of the 1-minute input CSV. It must contain a
            ``Time`` column plus ``Open``/``High``/``Low``/``Close``/``Volume``
            (and ``Turnover`` when ``c.candle_exchange == 'bybit'``).
        datasource: Data source label used in the output file name
            (e.g. ``bybit_btcusdt``).
        factor: Factor label. Currently unused; kept so existing callers that
            pass it keep working.
        timeframe: Target resampling frequency, e.g. ``'1H'`` or ``'1D'``.
        delay_minutes: Minutes to shift the time index by before resampling
            (positive = later, negative = earlier).

    Returns:
        The resampled ``pandas.DataFrame``. When it is not empty it is also
        written to
        ``./data/resample_{datasource}_{timeframe}_-{c.shift_candle_minite}m.csv``
        under the current working directory. Note that the suffix comes from
        ``c.shift_candle_minite``, not from ``delay_minutes``.
    """
    # 1. Read the CSV and use the parsed 'Time' column as the index.
    df = pd.read_csv(csv_path, parse_dates=['Time'])
    df.set_index('Time', inplace=True)

    # 2. Optional time shift (delay / advance).
    if delay_minutes != 0:
        df.index = df.index + pd.Timedelta(minutes=delay_minutes)

    # 3. Aggregation rules; bybit files carry an extra 'Turnover' column.
    ohlc_dict = {
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum',
    }
    if c.candle_exchange == 'bybit':
        ohlc_dict['Turnover'] = 'sum'

    # 4. Resample and drop buckets with any missing value.
    df_resampled = df.resample(timeframe).agg(ohlc_dict).dropna(how='any')

    # 5. Bucket start as Unix milliseconds. Subtracting the epoch and dividing
    #    by a 1 ms Timedelta is independent of the index's datetime resolution,
    #    unlike casting to int64 and dividing by 10**6 (which assumes ns).
    epoch = pd.Timestamp(0, tz=df_resampled.index.tz)
    df_resampled['start_time'] = (df_resampled.index - epoch) // pd.Timedelta(milliseconds=1)

    if df_resampled.empty:
        print("⚠️ Resampled DataFrame 為空，未產生輸出檔案！")
        return df_resampled

    # 6. Build the output path under the current working directory and make
    #    sure the target directory exists before writing.
    output_filename = f"./data/resample_{datasource}_{timeframe}_-{c.shift_candle_minite}m.csv"
    output_path = os.path.join(os.getcwd(), output_filename)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # 7. Write the CSV.
    df_resampled.to_csv(output_path)
    print(f"✅ 檔案已儲存：{output_path}")

    return df_resampled

# Resample the configured exchange/symbol 1-minute price file to c.interval.
# delay_minutes is the negated c.shift_candle_minite, so a positive config
# value moves the time index earlier before resampling.
df_r = prepare_price_data(
    csv_path=f"./data/{c.candle_exchange}_{c.symbol.lower()}usdt_price_1m.csv",
    datasource=f'{c.candle_exchange}_{c.symbol.lower()}',
    factor='price',
    timeframe=c.interval,
    delay_minutes=-c.shift_candle_minite
)

# Quick look at the first resampled rows.
print(df_r.head()) 

## Data Visualization

In [None]:
from utilsnumpy import load_all_data, combines_data, data_processing
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import config as c
from datetime import datetime

def visualize_factors():
    """
    Load the candle and factor data and plot them, with optional factor
    combination and preprocessing.

    Draws three stacked panels (close price, raw factor(s), processed
    factor), saves the figure when ``c.save_plot`` exists and is truthy, and
    shows it.

    Returns:
        dict: ``'candle'`` (candle DataFrame), ``'factor'`` (factor DataFrame
        including any combined/processed column) and ``'factor_name'`` (name
        of the column plotted in the third panel).
    """
    print("Loading data...")
    # Load the raw data
    raw_candle, raw_factor = load_all_data(c.candle_file, c.factor_file, c.factor2_file, c.factor, c.factor2)
    
    # Convert the millisecond 'start_time' values to datetimes for plotting
    raw_factor['time'] = pd.to_datetime(raw_factor['start_time'], unit='ms')
    raw_candle['time'] = pd.to_datetime(raw_candle['start_time'], unit='ms')
    
    # Date range of the candle data, shown in the info box of the third panel
    start_date = raw_candle['time'].min().strftime('%Y-%m-%d')
    end_date = raw_candle['time'].max().strftime('%Y-%m-%d')
    date_range = f"{start_date} to {end_date}"
    
    # Create the figure: three rows sharing the x axis
    fig, axes = plt.subplots(3, 1, figsize=(15, 18), sharex=True, dpi=100)
    
    # Panel 1: close price
    axes[0].plot(raw_candle['time'], raw_candle['Close'], color='#1f77b4', linewidth=1.5)
    axes[0].set_title(f'Price Data: {c.symbol.upper()}', fontsize=16, fontweight='bold')
    axes[0].set_ylabel('Price (USD)', fontsize=14)
    axes[0].grid(True, alpha=0.3)
    
    # Annotate the percentage change between the first and last close
    price_change = (raw_candle['Close'].iloc[-1] - raw_candle['Close'].iloc[0]) / raw_candle['Close'].iloc[0] * 100
    price_text = f"Price Change: {price_change:.2f}%"
    axes[0].text(0.99, 0.05, price_text, transform=axes[0].transAxes, 
                ha='right', fontsize=12, 
                bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8))
    
    # Panel 2: raw factor data (second factor only when its column is present)
    axes[1].plot(raw_factor['time'], raw_factor[c.factor], color='#2ca02c', linewidth=1.5, label=c.factor)
    if c.factor2 in raw_factor.columns:
        axes[1].plot(raw_factor['time'], raw_factor[c.factor2], color='#d62728', linewidth=1.5, label=c.factor2)
    axes[1].set_title('Raw Factor Data', fontsize=16, fontweight='bold')
    axes[1].set_ylabel('Factor Value', fontsize=14)
    axes[1].grid(True, alpha=0.3)
    axes[1].legend(loc='best', frameon=True, framealpha=0.8)
    
    # Work on a copy for combination / preprocessing
    factor_data = raw_factor.copy()
    factor_to_display = c.factor
    factor_label = c.factor
    
    # Combine the two factors when an operation is configured
    if c.operation != 'none' and c.factor2 in factor_data.columns:
        print(f"Combining factors with operation: {c.operation}")
        factor_data, combined_name = combines_data(
            factor_data,
            c.factor,
            c.factor2,
            c.operation
        )
        factor_to_display = combined_name
        factor_label = f"{c.factor} {c.operation} {c.factor2}"
    
    # Apply preprocessing unless the factor is used directly
    if c.preprocess != "direct":
        print(f"Applying preprocessing: {c.preprocess}")
        # NOTE(review): two call shapes of data_processing are handled here;
        # presumably it returns (DataFrame, name) in newer versions and only
        # the processed values in older ones — confirm against utilsnumpy.
        try:
            # Newer API: returns the DataFrame and the new column name
            processed_df, new_factor_name = data_processing(factor_data, c.preprocess, factor_to_display)
            factor_data = processed_df
            factor_to_display = new_factor_name
        except ValueError:
            # Older API: returns the processed values directly
            processed_values = data_processing(factor_data[factor_to_display], c.preprocess, factor_to_display)
            factor_data[f"{factor_to_display}_{c.preprocess}"] = processed_values
            factor_to_display = f"{factor_to_display}_{c.preprocess}"
        
        factor_label = f"{factor_label} ({c.preprocess})"
    
    # Panel 3: processed factor data
    axes[2].plot(factor_data['time'], factor_data[factor_to_display], color='#9467bd', linewidth=1.5)
    axes[2].set_title('Processed Factor Data', fontsize=16, fontweight='bold')
    axes[2].set_ylabel('Factor Value', fontsize=14)
    axes[2].set_xlabel('Date', fontsize=14)
    axes[2].grid(True, alpha=0.3)
    
    # Factor description box in the top-left corner of the panel
    axes[2].text(0.01, 0.95, f"Factor: {factor_label}", transform=axes[2].transAxes, 
                 fontsize=12, verticalalignment='top', 
                 bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    # Row count and date range box in the bottom-right corner
    factor_data_description = f"""
    Data points: {len(factor_data)}
    Date range: {date_range}
    """
    axes[2].text(0.99, 0.05, factor_data_description, transform=axes[2].transAxes,
                ha='right', fontsize=10, va='bottom',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    # Format the x-axis dates: one major tick every three months
    for ax in axes:
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
    
    # Overall title stamped with the current local time
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    plt.suptitle(f'Factor Analysis - {c.symbol.upper()} ({timestamp})', fontsize=20, y=0.98)
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    
    # Save the figure when the config asks for it
    if hasattr(c, 'save_plot') and c.save_plot:
        filename = f"factor_analysis_{c.symbol}_{c.factor}_{datetime.now().strftime('%Y%m%d')}.png"
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"圖像已保存為: {filename}")
    
    plt.show()
    
    # Return the processed data for further analysis
    return {
        'candle': raw_candle,
        'factor': factor_data,
        'factor_name': factor_to_display
    }

def visualize_factor_correlation(data, n_bins=20):
    """
    Visualise the relationship between the factor and price changes.

    The candle and factor frames are joined on the nearest ``time``; 1-, 5-
    and 10-period percentage changes of ``Close`` are computed (the 5/10
    period ones only when there are more than 10 rows) and correlated with
    the factor. A 2x2 figure is drawn: scatter with trend line, density
    scatter, 2D histogram, and either a bar chart of the correlations or the
    factor histogram.

    Only rows where the factor and the price-change columns are finite are
    used; missing values in unrelated candle columns no longer discard rows.

    Args:
        data: Dict returned by ``visualize_factors`` with the keys
            ``'candle'``, ``'factor'`` and ``'factor_name'``.
        n_bins: Number of bins per axis for the 2D histogram.

    Returns:
        dict with ``'correlations'`` (list of formatted strings) and
        ``'merged_data'`` (the cleaned merged DataFrame), or ``None`` when
        ``data`` is ``None`` or fewer than two usable rows remain.
    """
    if data is None:
        print("沒有數據可用於相關性分析")
        return

    candle_df = data['candle']
    factor_df = data['factor']
    factor_name = data['factor_name']

    # Join price and factor data on the nearest timestamp.
    merged_df = pd.merge_asof(
        candle_df.sort_values('time'),
        factor_df[['time', factor_name]].sort_values('time'),
        on='time',
        direction='nearest'
    )

    # Price change over one period, plus 5 and 10 periods when there is
    # enough data.
    merged_df['price_change_1d'] = merged_df['Close'].pct_change()
    analysis_cols = [factor_name, 'price_change_1d']
    if len(merged_df) > 10:
        merged_df['price_change_5d'] = merged_df['Close'].pct_change(5)
        merged_df['price_change_10d'] = merged_df['Close'].pct_change(10)
        analysis_cols += ['price_change_5d', 'price_change_10d']

    # Clean only the columns that take part in the analysis: treat +/-inf as
    # missing (polyfit, KDE and hist2d cannot handle it) and drop those rows.
    merged_df[analysis_cols] = merged_df[analysis_cols].replace([np.inf, -np.inf], np.nan)
    merged_df = merged_df.dropna(subset=analysis_cols)

    # A trend line and a correlation need at least two observations.
    if len(merged_df) < 2:
        print("有效數據不足，無法進行相關性分析")
        return

    has_multi_horizon = 'price_change_5d' in merged_df.columns

    factor_values = merged_df[factor_name]
    change_1d = merged_df['price_change_1d']

    # Correlation coefficients for each horizon.
    correlation_1d = change_1d.corr(factor_values)
    corr_results = [f"1-day: {correlation_1d:.4f}"]

    if has_multi_horizon:
        correlation_5d = merged_df['price_change_5d'].corr(factor_values)
        corr_results.append(f"5-day: {correlation_5d:.4f}")
        correlation_10d = merged_df['price_change_10d'].corr(factor_values)
        corr_results.append(f"10-day: {correlation_10d:.4f}")

    # 2x2 grid holding the different correlation views.
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    plt.subplots_adjust(hspace=0.3, wspace=0.3)

    # Top-left: plain scatter plot.
    axes[0, 0].scatter(factor_values, change_1d,
                       alpha=0.6, c='#1f77b4', edgecolors='none')
    axes[0, 0].axhline(y=0, color='r', linestyle='-', alpha=0.3)
    axes[0, 0].axvline(x=0, color='r', linestyle='-', alpha=0.3)

    # Linear trend line across the observed factor range.
    trend = np.poly1d(np.polyfit(factor_values, change_1d, 1))
    x_range = np.linspace(factor_values.min(), factor_values.max(), 100)
    axes[0, 0].plot(x_range, trend(x_range), "r--", alpha=0.8)

    axes[0, 0].set_title(f'Price Change vs {factor_name}\nCorrelation: {correlation_1d:.4f}', fontsize=14)
    axes[0, 0].set_xlabel(factor_name, fontsize=12)
    axes[0, 0].set_ylabel('1-Day Price Change (%)', fontsize=12)
    axes[0, 0].grid(True, alpha=0.3)

    # Top-right: scatter coloured by a 2D kernel density estimate.
    from scipy.stats import gaussian_kde

    xy = np.vstack([factor_values, change_1d])
    try:
        density = gaussian_kde(xy)(xy)
        # Sort by density so the densest points are drawn last (on top).
        order = density.argsort()
        scatter = axes[0, 1].scatter(factor_values.iloc[order], change_1d.iloc[order],
                                     c=density[order], cmap='viridis',
                                     edgecolor='none', alpha=0.8, s=30)
        plt.colorbar(scatter, ax=axes[0, 1], label='Density')
    except Exception as e:
        # Best effort: fall back to a plain scatter when the KDE cannot be
        # computed (e.g. singular data).
        print(f"無法創建密度圖: {e}")
        axes[0, 1].scatter(factor_values, change_1d,
                           alpha=0.6, c='#2ca02c', edgecolors='none')

    axes[0, 1].axhline(y=0, color='r', linestyle='-', alpha=0.3)
    axes[0, 1].axvline(x=0, color='r', linestyle='-', alpha=0.3)
    axes[0, 1].set_title('Density Plot', fontsize=14)
    axes[0, 1].set_xlabel(factor_name, fontsize=12)
    axes[0, 1].set_ylabel('1-Day Price Change (%)', fontsize=12)
    axes[0, 1].grid(True, alpha=0.3)

    # Bottom-left: 2D histogram.
    h = axes[1, 0].hist2d(factor_values, change_1d,
                          bins=n_bins, cmap='Blues')
    plt.colorbar(h[3], ax=axes[1, 0], label='Count')
    axes[1, 0].set_title('2D Histogram', fontsize=14)
    axes[1, 0].set_xlabel(factor_name, fontsize=12)
    axes[1, 0].set_ylabel('1-Day Price Change (%)', fontsize=12)
    axes[1, 0].grid(True, alpha=0.3)

    # Bottom-right: correlation per horizon, or the factor distribution when
    # the longer horizons were not computed.
    if has_multi_horizon:
        correlations = [
            correlation_1d,
            correlation_5d,
            correlation_10d
        ]
        labels = ['1-Day', '5-Day', '10-Day']
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

        axes[1, 1].bar(labels, correlations, color=colors, alpha=0.7)
        axes[1, 1].axhline(y=0, color='r', linestyle='-', alpha=0.3)
        axes[1, 1].set_title('Correlation Across Timeframes', fontsize=14)
        axes[1, 1].set_ylabel('Correlation Coefficient', fontsize=12)
        axes[1, 1].grid(True, alpha=0.3)

        # Print each coefficient just above (or below) its bar.
        for i, v in enumerate(correlations):
            axes[1, 1].text(i, v + 0.02 if v >= 0 else v - 0.08,
                            f"{v:.4f}", ha='center', fontsize=10)
    else:
        axes[1, 1].hist(factor_values, bins=30, alpha=0.7, color='#1f77b4')
        axes[1, 1].set_title(f'Distribution of {factor_name}', fontsize=14)
        axes[1, 1].set_xlabel(factor_name, fontsize=12)
        axes[1, 1].set_ylabel('Frequency', fontsize=12)
        axes[1, 1].grid(True, alpha=0.3)

    # Figure-level title.
    plt.suptitle(f'Correlation Analysis: {c.symbol.upper()} vs {factor_name}',
                 fontsize=16, y=0.98, fontweight='bold')

    # Save the figure when the config asks for it.
    if hasattr(c, 'save_plot') and c.save_plot:
        filename = f"correlation_{c.symbol}_{factor_name}_{datetime.now().strftime('%Y%m%d')}.png"
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"圖像已保存為: {filename}")

    plt.show()

    # Return the formatted coefficients together with the cleaned data.
    return {
        'correlations': corr_results,
        'merged_data': merged_df
    }

def visualize_factor_distribution(data):
    """
    Visualise the distribution of the factor values.

    Draws a 2x2 figure (histogram, kernel density estimate, boxplot with a
    statistics box, empirical CDF), optionally saves it, then prints skewness,
    kurtosis and two normality tests.

    Missing and non-finite (+/-inf) values are excluded before any statistic
    or plot is computed.

    Args:
        data: Dict returned by ``visualize_factors`` with the keys
            ``'factor'`` and ``'factor_name'``.

    Returns:
        dict with ``mean``, ``median``, ``std``, ``min``, ``max``, ``count``
        and ``quartiles`` (25th/50th/75th percentiles), or ``None`` when
        ``data`` is ``None`` or no usable value remains.
    """
    if data is None:
        print("沒有數據可用於分佈分析")
        return

    factor_df = data['factor']
    factor_name = data['factor_name']

    # Drop NaN and +/-inf: hist/KDE/percentile cannot work with them.
    factor_values = factor_df[factor_name].dropna().to_numpy(dtype=float)
    factor_values = factor_values[np.isfinite(factor_values)]
    if factor_values.size == 0:
        print("沒有有效的因子數據可用於分佈分析")
        return

    # Summary statistics, computed once and reused below.
    mean_val = np.mean(factor_values)
    median_val = np.median(factor_values)
    std_val = np.std(factor_values)
    min_val = np.min(factor_values)
    max_val = np.max(factor_values)
    q1, q2, q3 = np.percentile(factor_values, [25, 50, 75])

    # Quartile (Bowley) skewness: the quartile asymmetry normalised by the
    # interquartile range so the number does not depend on the factor's scale.
    iqr = q3 - q1
    quartile_skew = (q3 - 2 * q2 + q1) / iqr if iqr > 0 else float('nan')

    # 2x2 grid holding the different distribution views.
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    plt.subplots_adjust(hspace=0.3, wspace=0.3)

    # 1. Histogram (top-left) with mean and median markers.
    axes[0, 0].hist(factor_values, bins=50, alpha=0.7, color='#1f77b4',
                    edgecolor='black', linewidth=0.5)
    axes[0, 0].axvline(mean_val, color='r', linestyle='dashed', linewidth=1.5, label=f'Mean: {mean_val:.4f}')
    axes[0, 0].axvline(median_val, color='g', linestyle='dashed', linewidth=1.5, label=f'Median: {median_val:.4f}')

    axes[0, 0].set_title('Histogram', fontsize=14)
    axes[0, 0].set_xlabel(factor_name, fontsize=12)
    axes[0, 0].set_ylabel('Frequency', fontsize=12)
    axes[0, 0].grid(True, alpha=0.3)
    axes[0, 0].legend()

    # 2. Kernel density estimate (top-right); falls back to a normalised
    #    histogram when the KDE cannot be computed.
    try:
        from scipy.stats import gaussian_kde
        density = gaussian_kde(factor_values)
        x = np.linspace(min_val, max_val, 1000)
        axes[0, 1].plot(x, density(x), 'r-', linewidth=2)
        axes[0, 1].fill_between(x, density(x), alpha=0.3, color='#ff7f0e')
        axes[0, 1].set_title('Kernel Density Estimation', fontsize=14)
    except Exception as e:
        print(f"無法創建密度圖: {e}")
        axes[0, 1].hist(factor_values, bins=50, alpha=0.7, color='#ff7f0e',
                        density=True, edgecolor='black', linewidth=0.5)
        axes[0, 1].set_title('Normalized Histogram', fontsize=14)

    axes[0, 1].axvline(mean_val, color='r', linestyle='dashed', linewidth=1.5)
    axes[0, 1].axvline(median_val, color='g', linestyle='dashed', linewidth=1.5)
    axes[0, 1].set_xlabel(factor_name, fontsize=12)
    axes[0, 1].set_ylabel('Density', fontsize=12)
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Boxplot (bottom-left).
    axes[1, 0].boxplot(factor_values, vert=False, showmeans=True,
                       meanprops={'marker':'o', 'markerfacecolor':'red', 'markeredgecolor':'red'},
                       flierprops={'marker':'x', 'markerfacecolor':'red', 'markeredgecolor':'red'})
    axes[1, 0].set_title('Boxplot', fontsize=14)
    axes[1, 0].set_xlabel(factor_name, fontsize=12)
    axes[1, 0].grid(True, alpha=0.3)

    # 4. Empirical cumulative distribution function (bottom-right).
    sorted_data = np.sort(factor_values)
    ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
    axes[1, 1].step(sorted_data, ecdf, linewidth=2, color='#2ca02c')

    # Vertical lines at the quartiles.
    percentiles = [25, 50, 75]
    colors = ['#ff9896', '#9467bd', '#8c564b']

    for p, percentile_val, color in zip(percentiles, (q1, q2, q3), colors):
        axes[1, 1].axvline(percentile_val, color=color, linestyle='--',
                           label=f'{p}th percentile: {percentile_val:.4f}')

    axes[1, 1].set_title('Empirical Cumulative Distribution', fontsize=14)
    axes[1, 1].set_xlabel(factor_name, fontsize=12)
    axes[1, 1].set_ylabel('Cumulative Probability', fontsize=12)
    axes[1, 1].grid(True, alpha=0.3)
    axes[1, 1].legend(loc='lower right')

    # Statistics box drawn inside the boxplot panel.
    stats_text = f"""
    Mean: {mean_val:.4f}
    Median: {median_val:.4f}
    Std Dev: {std_val:.4f}
    Skewness (quartile): {quartile_skew:.4f}
    Min: {min_val:.4f}
    Max: {max_val:.4f}
    Count: {len(factor_values)}
    """

    axes[1, 0].text(0.05, 0.95, stats_text, transform=axes[1, 0].transAxes,
                    fontsize=10, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Figure-level title.
    plt.suptitle(f'Distribution Analysis: {factor_name}',
                 fontsize=16, y=0.98, fontweight='bold')

    # Save the figure when the config asks for it.
    if hasattr(c, 'save_plot') and c.save_plot:
        filename = f"distribution_{c.symbol}_{factor_name}_{datetime.now().strftime('%Y%m%d')}.png"
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"圖像已保存為: {filename}")

    plt.show()

    # Moment-based statistics and normality tests (printed only).
    from scipy import stats

    try:
        factor_skewness = stats.skew(factor_values)
        factor_kurtosis = stats.kurtosis(factor_values)
        shapiro_test = stats.shapiro(factor_values)
        ks_normal_test = stats.kstest(factor_values, 'norm', args=(mean_val, std_val))

        print(f"分佈統計資訊：")
        print(f"  偏度: {factor_skewness:.4f} ({'正偏' if factor_skewness > 0 else '負偏'})")
        print(f"  峰度: {factor_kurtosis:.4f} ({'尖峰' if factor_kurtosis > 0 else '平峰'})")
        print(f"  Shapiro-Wilk正態性檢驗 p值: {shapiro_test.pvalue:.6f} ({'可能是正態分佈' if shapiro_test.pvalue > 0.05 else '非正態分佈'})")
        print(f"  KS正態性檢驗 p值: {ks_normal_test.pvalue:.6f} ({'可能是正態分佈' if ks_normal_test.pvalue > 0.05 else '非正態分佈'})")
    except Exception as e:
        # Best effort: the summary below is still returned when a test fails.
        print(f"計算高級統計量時發生錯誤: {e}")

    # Basic statistics for the caller.
    return {
        "mean": mean_val,
        "median": median_val,
        "std": std_val,
        "min": min_val,
        "max": max_val,
        "count": len(factor_values),
        "quartiles": [q1, q2, q3]
    }

def visualize_factor_time_series(data):
    """
    Advanced time-series view of the factor.

    Joins candle and factor data on the nearest ``time`` and draws a 2x2
    figure: the factor with moving averages, its percentage change, the ECDF
    of four consecutive time segments, and a price/factor overlay. The figure
    is saved when ``c.save_plot`` exists and is truthy.

    Args:
        data: Dict returned by ``visualize_factors`` with the keys
            ``'candle'``, ``'factor'`` and ``'factor_name'``.

    Returns:
        The merged DataFrame indexed by ``time`` (including the moving-average
        and change columns added here), or ``None`` when ``data`` is ``None``.
    """
    if data is None:
        print("沒有數據可用於時間序列分析")
        return
    
    candle_df = data['candle']
    factor_df = data['factor']
    factor_name = data['factor_name']
    
    # Make sure the time columns are datetimes.
    # NOTE: this assigns into the frames held by the caller's dict.
    factor_df['time'] = pd.to_datetime(factor_df['time'])
    candle_df['time'] = pd.to_datetime(candle_df['time'])
    
    # Join price and factor data on the nearest timestamp
    merged_df = pd.merge_asof(
        candle_df.sort_values('time'), 
        factor_df[['time', factor_name]].sort_values('time'),
        on='time',
        direction='nearest'
    )
    
    # Use time as the index
    merged_df.set_index('time', inplace=True)
    
    # Compute moving averages when there are enough rows.
    # NOTE(review): the 60-period average is also computed with only 30+ rows,
    # so it is all-NaN for fewer than 60 rows.
    has_ma = len(merged_df) >= 30
    if has_ma:
        merged_df[f'{factor_name}_MA30'] = merged_df[factor_name].rolling(window=30).mean()
        merged_df[f'{factor_name}_MA60'] = merged_df[factor_name].rolling(window=60).mean()
    
    # Create the 2x2 grid
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    plt.subplots_adjust(hspace=0.3, wspace=0.3)
    
    # 1. Main time series (top-left)
    axes[0, 0].plot(merged_df.index, merged_df[factor_name], color='#1f77b4', linewidth=1.5, label=factor_name)
    
    if has_ma:
        axes[0, 0].plot(merged_df.index, merged_df[f'{factor_name}_MA30'], 
                       color='#ff7f0e', linewidth=1.5, label='30-Period MA')
        axes[0, 0].plot(merged_df.index, merged_df[f'{factor_name}_MA60'], 
                       color='#2ca02c', linewidth=1.5, label='60-Period MA')
    
    axes[0, 0].set_title(f'{factor_name} Time Series', fontsize=14)
    axes[0, 0].set_ylabel('Value', fontsize=12)
    axes[0, 0].grid(True, alpha=0.3)
    axes[0, 0].legend(loc='best')
    axes[0, 0].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    axes[0, 0].xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    plt.setp(axes[0, 0].xaxis.get_majorticklabels(), rotation=45)
    
    # 2. Factor percentage change (top-right)
    merged_df[f'{factor_name}_chg'] = merged_df[factor_name].pct_change()
    axes[0, 1].plot(merged_df.index, merged_df[f'{factor_name}_chg'], 
                   color='#d62728', linewidth=1.5)
    axes[0, 1].set_title(f'{factor_name} Percent Change', fontsize=14)
    axes[0, 1].set_ylabel('% Change', fontsize=12)
    axes[0, 1].grid(True, alpha=0.3)
    axes[0, 1].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    axes[0, 1].xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    plt.setp(axes[0, 1].xaxis.get_majorticklabels(), rotation=45)
    
    # 3. How the cumulative distribution changes over time (bottom-left)
    try:
        # NOTE(review): these three imports are not referenced in this block
        # (the colormap below is taken from plt.cm).
        import matplotlib.cm as cm
        import matplotlib.colors as mcolors
        from matplotlib.collections import LineCollection
        
        # Split the series into consecutive segments
        n_segments = 4
        segment_size = len(merged_df) // n_segments
        
        # One colour per segment, in chronological order
        colors = plt.cm.viridis(np.linspace(0, 1, n_segments))
        
        # Draw the ECDF of each segment
        for i in range(n_segments):
            start_idx = i * segment_size
            # The last segment absorbs the remainder rows
            end_idx = start_idx + segment_size if i < n_segments - 1 else len(merged_df)
            
            segment_data = merged_df[factor_name].iloc[start_idx:end_idx].dropna().values
            if len(segment_data) > 0:
                sorted_data = np.sort(segment_data)
                ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
                
                # Label the segment with its first timestamp
                time_label = merged_df.index[start_idx].strftime('%Y-%m-%d')
                
                axes[1, 0].step(sorted_data, ecdf, linewidth=2, color=colors[i], 
                              label=f'Period {i+1}: from {time_label}')
        
        axes[1, 0].set_title('ECDF Evolution Over Time', fontsize=14)
        axes[1, 0].set_xlabel(factor_name, fontsize=12)
        axes[1, 0].set_ylabel('Cumulative Probability', fontsize=12)
        axes[1, 0].grid(True, alpha=0.3)
        axes[1, 0].legend(loc='best')
    except Exception as e:
        print(f"無法創建累積分佈變化圖: {e}")
        # Fallback panel: a simple rolling standard deviation
        if has_ma:
            axes[1, 0].plot(merged_df.index, merged_df[f'{factor_name}'].rolling(window=30).std(), 
                         color='#9467bd', linewidth=1.5, label='30-Period Std')
            axes[1, 0].set_title(f'{factor_name} Rolling Volatility', fontsize=14)
            axes[1, 0].set_ylabel('Standard Deviation', fontsize=12)
            axes[1, 0].grid(True, alpha=0.3)
            axes[1, 0].legend(loc='best')
            axes[1, 0].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
            axes[1, 0].xaxis.set_major_locator(mdates.MonthLocator(interval=3))
            plt.setp(axes[1, 0].xaxis.get_majorticklabels(), rotation=45)
        else:
            axes[1, 0].text(0.5, 0.5, 'Insufficient data for rolling metrics', 
                          transform=axes[1, 0].transAxes, ha='center', va='center',
                          fontsize=12)
    
    # 4. Price and factor overlay on twin y axes (bottom-right)
    ax1 = axes[1, 1]
    ax2 = ax1.twinx()
    
    # Price line on the left axis
    price_line, = ax1.plot(merged_df.index, merged_df['Close'], color='#1f77b4', linewidth=1.5, label='Price')
    ax1.set_ylabel('Price', fontsize=12, color='#1f77b4')
    ax1.tick_params(axis='y', colors='#1f77b4')
    
    # Factor line on the right axis
    factor_line, = ax2.plot(merged_df.index, merged_df[factor_name], color='#d62728', linewidth=1.5, label=factor_name)
    ax2.set_ylabel(factor_name, fontsize=12, color='#d62728')
    ax2.tick_params(axis='y', colors='#d62728')
    
    # One legend covering the lines of both axes
    lines = [price_line, factor_line]
    ax1.legend(lines, [line.get_label() for line in lines], loc='upper left')
    
    ax1.set_title('Price vs Factor Overlay', fontsize=14)
    ax1.grid(True, alpha=0.3)
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax1.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45)
    
    # Figure-level title
    plt.suptitle(f'Advanced Time Series Analysis: {factor_name}', 
                fontsize=16, y=0.98, fontweight='bold')
    
    # Save the figure when the config asks for it
    if hasattr(c, 'save_plot') and c.save_plot:
        filename = f"time_series_{c.symbol}_{factor_name}_{datetime.now().strftime('%Y%m%d')}.png"
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"圖像已保存為: {filename}")
    
    plt.show()
    
    return merged_df

if __name__ == "__main__":
    # Announce the run; the second factor is mentioned only when a
    # combination operation is configured.
    print(
        f"正在執行 {c.symbol} 的因子分析，使用因子: {c.factor}"
        + (f" 和 {c.factor2}" if c.factor2 and c.operation != 'none' else "")
    )

    # Load/plot the factor, then run the correlation and distribution views.
    data = visualize_factors()
    correlation_results = visualize_factor_correlation(data)
    stats = visualize_factor_distribution(data)

    # The time-series view is optional: report a failure and carry on.
    try:
        print("\n執行時間序列分析...")
        merged_data = visualize_factor_time_series(data)
        print("分析完成！")
    except Exception as err:
        print(f"時間序列分析時發生錯誤: {err}")

    # Summary of whatever results are available.
    print("\n分析總結:")
    print(f"- 因子: {data['factor_name']}")
    if correlation_results:
        if 'correlations' in correlation_results:
            print(f"- 相關性: {', '.join(correlation_results['correlations'])}")
    if stats:
        print(f"- 平均值: {stats['mean']:.4f}, 中位數: {stats['median']:.4f}, 標準差: {stats['std']:.4f}")
        print(f"- 範圍: {stats['min']:.4f} 到 {stats['max']:.4f}, 樣本數: {stats['count']}")

## Train Split Loop + Heatmap (Including Preprocess Loop)

In [None]:
from utilsnumpy import nan_count, load_all_data, combines_data, data_processing, precompute_rolling_stats, backtest_cached
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import config as c
import numpy as np
import pandas as pd
import seaborn as sns
import sys
from dask import delayed, compute
from dask.diagnostics import ProgressBar

def parse_manual_selection(filepath, all_models):
    """
    Parse a manually written selection of models and their entry methods.

    The file is read line by line; blank lines are ignored and every line is
    stripped. A line that is found in ``all_models`` starts (or restarts) that
    model's list; every other line is appended to the list of the most recent
    model. Lines before the first model line are ignored.

    Args:
        filepath: Path of the text file to read.
        all_models: Container of known model names.

    Returns:
        dict mapping each model name found in the file to its list of entries.
    """
    selection = {}
    active_model = None
    with open(filepath, "r") as handle:
        for raw_line in handle:
            token = raw_line.strip()
            if not token:
                continue
            if token in all_models:
                # A model header opens a fresh, empty entry list.
                active_model = token
                selection[active_model] = []
                continue
            if active_model:
                selection[active_model].append(token)
    return selection

def plot_heatmaps(sr_threshold=1.5, preprocess_method=None, highlight_x=0.5, highlight_y=50):
    """
    Plot one Sharpe-ratio heatmap per (model, entry) pair in ``plot_data``.

    Reads the module-level ``plot_data``, an iterable of
    ``(model, entry, backtest_df)`` tuples where ``backtest_df`` has the
    columns ``rolling_window``, ``threshold`` and ``SR``. A heatmap is skipped
    when all SR values are NaN or when none exceeds the entry's threshold.

    Args:
        sr_threshold: Minimum SR that must be exceeded somewhere in the grid
            for the heatmap to be drawn. Entries whose name starts with
            ``'S'`` always use 1.2 instead; the override applies to that
            entry only.
        preprocess_method: Label included in the plot title.
        highlight_x: ``threshold`` value (column) of the cell to highlight,
            or ``None`` to disable highlighting.
        highlight_y: ``rolling_window`` value (row) of the cell to highlight,
            or ``None`` to disable highlighting.
    """
    for model, entry, backtest_df in plot_data:
        # Per-entry threshold. Kept in a local so an 'S' entry does not
        # change the threshold used for the entries that follow it.
        entry_threshold = 1.2 if entry.startswith('S') else sr_threshold

        # rolling_window x threshold grid of mean SR.
        sr_pivot_data = backtest_df.groupby(['rolling_window', 'threshold'])['SR'].mean().unstack()

        # Nothing to draw when the whole grid is NaN.
        if sr_pivot_data.isna().all().all():
            print(f"⚠️ Skipping {model}_{entry} heatmap: All SR values are NaN.")
            continue

        # Only draw grids with at least one SR above the threshold.
        if not np.any(sr_pivot_data.to_numpy() > entry_threshold):
            print(f"⚠️ Skipping {model}_{entry} heatmap: No SR value exceeds {entry_threshold}.")
            continue

        plt.figure(figsize=(20, 16))

        # Round the column labels so float noise from np.arange does not show
        # up in the ticks or break the highlight lookup.
        sr_pivot_data.columns = sr_pivot_data.columns.round(2)

        ax = sns.heatmap(sr_pivot_data, annot=True, fmt=".2f", cmap="RdYlGn", linewidths=0.3, cbar_kws={'label': 'Sharpe Ratio'})
        plt.xticks(np.arange(len(sr_pivot_data.columns)) + 0.5, [f"{col:.2f}" for col in sr_pivot_data.columns], rotation=45)
        plt.yticks(np.arange(len(sr_pivot_data.index)) + 0.5, [f"{row:.2f}" for row in sr_pivot_data.index], rotation=0)

        # Optionally frame the 3x3 neighbourhood of (highlight_x, highlight_y).
        if highlight_x is not None and highlight_y is not None:
            try:
                # highlight_x is a column (threshold), highlight_y a row
                # (rolling_window).
                col_idx = sr_pivot_data.columns.get_loc(highlight_x)
                row_idx = sr_pivot_data.index.get_loc(highlight_y)
            except KeyError:
                print(f"⚠️ Cannot highlight ({highlight_x}, {highlight_y}): value not found in pivot.")
            else:
                # Clamp the neighbourhood to the grid so the highlighted cell
                # stays in the middle and the frame only shrinks at an edge.
                left = max(col_idx - 1, 0)
                top = max(row_idx - 1, 0)
                right = min(col_idx + 2, len(sr_pivot_data.columns))
                bottom = min(row_idx + 2, len(sr_pivot_data.index))
                ax.add_patch(Rectangle(
                    (left, top),          # top-left corner in cell units
                    right - left, bottom - top,
                    fill=False,           # outline only
                    edgecolor='red',
                    linewidth=2
                ))
                print(f"✨ Highlighted 3x3 block centered at x={highlight_x}, y={highlight_y}")

        plt.title(f"{model}_{preprocess_method}_{entry} Train Period BackTest SR Heatmap", fontsize=14)

        # Save into the current working directory when the config asks for it.
        # NOTE(review): the file name does not include preprocess_method, so
        # runs with different preprocess methods write to the same file.
        if getattr(c, 'save_plot', False):
            save_path = f"{model}_{entry}_heatmap.png"
            plt.savefig(save_path, dpi=300, bbox_inches='tight', pad_inches=0.1)
            print(f"📁 Heatmap saved to: {save_path}")

        plt.show()
        plt.close()  # free the figure's memory before the next plot

def process_and_validate(factor_series, method, factor_name):
    """
    Preprocess a factor series and reject it when too many values are NaN.

    Parameters:
        factor_series (pd.Series): Raw factor values.
        method (str): Preprocessing method. "direct" uses a copy of the input
            unchanged; any other value is forwarded to data_processing.
        factor_name (str): Name of the factor column, also used in messages.

    Returns:
        pd.Series or None: The processed series with NaN entries dropped when
        the NaN ratio does not exceed c.nan_perc. None when the ratio exceeds
        c.nan_perc, or when the processed series is empty (nothing to use).
    """
    if method != "direct":
        # data_processing returns the transformed frame and the (possibly
        # renamed) factor column.
        factor_df, new_factor_name = data_processing(
            pd.DataFrame({factor_name: factor_series}), method, factor_name
        )
        processed = factor_df[new_factor_name]
    else:
        processed = factor_series.copy()

    total = len(processed)
    if total == 0:
        # Avoid a 0/0 ratio silently passing the check below.
        print(f"{factor_name} after {method} transformation is empty. Skipping this preprocess method.")
        return None

    nan_ratio = processed.isna().sum() / total
    if nan_ratio > c.nan_perc:
        preprocess_nan_count = nan_count(processed)
        print(f"nan count After {method} Preprocessing: {preprocess_nan_count}")
        print(f"{factor_name} after {method} transformation exceed {c.nan_perc:.1%} NaN. Skipping this preprocess method.")
        return None

    # Report the ratio as a real percentage (0.01 -> "1.00%").
    print(f"NaN% after preprocess: {nan_ratio:.2%}.")
    # Return a new object instead of dropping in place on a column that may
    # still be tied to the frame it came from.
    return processed.dropna()

def main(candle_data, factor_data, factor, interval, operation, model, entry,
         window_start, window_end, window_step, threshold_start, threshold_end,
         threshold_step, rolling_stats, preprocess_method, date_range):
    """
    Run the backtest over a (rolling window x threshold) parameter grid.

    Parameters:
        candle_data (pd.DataFrame): Must contain 'start_time' and 'Close'.
        factor_data (pd.DataFrame): Must contain 'start_time' and `factor`.
        factor (str): Name of the factor column to backtest.
        interval (str): Key into annualizer_dict (None is used when missing).
        operation: Accepted for call compatibility; not used in this function.
        model, entry: Forwarded to backtest_cached and echoed in the result.
        window_start, window_end, window_step (int): range() arguments for the
            rolling-window axis (end exclusive).
        threshold_start, threshold_end, threshold_step (float): np.arange()
            arguments for the threshold axis (end exclusive).
        rolling_stats: Precomputed rolling statistics, forwarded as-is.
        preprocess_method (str): Forwarded to backtest_cached.
        date_range: Forwarded to backtest_cached.

    Returns:
        tuple: (model, entry, backtest_df, log_msgs), where backtest_df has one
        row per grid point and log_msgs are the messages of the LAST grid
        point evaluated (an empty list when the grid is empty).
    """
    candle_df_copy = candle_data[['start_time', 'Close']].copy()
    candle_df_copy.columns = ['start_time', 'close']
    factor_df_copy = factor_data[['start_time', factor]].copy()

    candle_df_copy['time'] = pd.to_datetime(candle_df_copy['start_time'], unit='ms')

    annualizer = annualizer_dict.get(interval, None)
    backtest_report = []
    # Bound up front so an empty grid returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    log_msgs = []
    for rolling_window in range(window_start, window_end, window_step):
        for threshold in np.arange(threshold_start, threshold_end, threshold_step):
            result, _, log_msgs = backtest_cached(
                candle_df_copy, factor_df_copy, rolling_window, threshold,
                preprocess_method, entry, annualizer, model, factor, interval,
                date_range, rolling_stats)
            backtest_report.append(result)

    backtest_df = pd.DataFrame(backtest_report)
    return (model, entry, backtest_df, log_msgs)

# Annualizer lookup: number of bars per 365-day year for each interval key.
annualizer_dict = {
    '1m': 525600, '5m': 105120, '15m': 35040,
    '30m': 17520, '1h': 8760, '4h': 2190,
    '1d': 365, '1w': 52, '1M': 12
}

# Load the raw candle and factor data.
raw_candle, raw_factor = load_all_data(c.candle_file, c.factor_file, c.factor2_file, c.factor, c.factor2)
# Training slice length = three years' worth of bars for the configured interval.
# NOTE(review): an interval missing from annualizer_dict yields None * 3 (TypeError).
train_split = annualizer_dict.get(c.interval, None) * 3

candle_train = raw_candle[:train_split].reset_index(drop=True).copy()
factor_train_original = raw_factor[:train_split].reset_index(drop=True).copy()

# Overlapping time span of candle and factor data, and the matching date_range.
start_time = max(candle_train['start_time'].min(), factor_train_original['start_time'].min())
end_time = min(candle_train['start_time'].max(), factor_train_original['start_time'].max())
date_range = pd.date_range(start=pd.to_datetime(start_time, unit='ms'),
                           end=pd.to_datetime(end_time, unit='ms'),
                           freq=c.interval)
candle_train['time'] = pd.to_datetime(candle_train['start_time'], unit='ms')
factor_train_original['time'] = pd.to_datetime(factor_train_original['start_time'], unit='ms')
candle_train.set_index('time', inplace=True)
factor_train_original.set_index('time', inplace=True)

# When operation is not 'none', combine the two factors into one column.
if c.operation != 'none':
    # The DataFrame is passed in directly; combines_data returns the frame and the merged column name.
    factor_train_original, merged_col_name = combines_data(
        factor_train_original,
        c.factor,
        c.factor2,
        c.operation
    )
    factor_used = merged_col_name
else:
    factor_used = c.factor

################################################
# Step 0: c.preprocess may be a single string or a list of methods
################################################
if isinstance(c.preprocess, list):
    all_preprocess_methods = c.preprocess
else:
    all_preprocess_methods = [c.preprocess]

# Run the whole grid search once per preprocess method.
for current_preprocess in all_preprocess_methods:
    print(f"\n===== Processing with preprocess method: {current_preprocess} =====")
    # Start from a fresh copy of the original factor data.
    factor_train = factor_train_original.copy()
    # Preprocess the selected factor column and validate its NaN ratio.
    processed_factor = process_and_validate(factor_train[factor_used], current_preprocess, factor_used)
    if processed_factor is None:
        continue  # Validation failed: move on to the next preprocess method.
    factor_train[factor_used] = processed_factor

    # Model and entry configuration.
    if c.USE_ALL_MODELS:
        models = c.models
        entry_map = {model: c.entrys for model in c.models}
        # window_step = 20
        # threshold_step = 0.2
    else:
        # NOTE(review): called with two arguments here; presumably a
        # parse_manual_selection defined earlier in this cell - confirm.
        entry_map = parse_manual_selection("manual_selected.txt", c.models)
        models = list(entry_map.keys())
        # window_step = 10
        # threshold_step = 0.1

    # Precompute rolling statistics for every window size in the grid
    # (same start of 5 as the window_start passed to main below).
    windows = list(range(5, c.window_end, c.window_step))
    rolling_stats_dict = precompute_rolling_stats(factor_train[factor_used], windows)

    # Reset plot_data so it only holds results for the current preprocess method.
    plot_data = []
    tasks = []
    # One delayed task per (model, entry) pair.
    for model in models:
        for entry in entry_map[model]:
            task = delayed(main)(
                candle_train,
                factor_train,
                factor_used,
                c.interval,
                c.operation,
                model,
                entry,
                window_start=5,
                window_end=c.window_end,
                window_step=c.window_step,
                threshold_start=0,
                threshold_end=c.threshold_end,
                threshold_step=c.threshold_step,
                rolling_stats=rolling_stats_dict,
                preprocess_method=current_preprocess,
                date_range=date_range
            )
            tasks.append(task)

    # Execute all tasks with the 'processes' scheduler while showing a progress bar.
    with ProgressBar():
        results = compute(*tasks, scheduler='processes')
    
    # Print each task's log messages and collect its result for plotting.
    for res in results:
        if res is not None:
            m, e, backtest_df, log_msgs = res
            for msg in log_msgs:
                print(msg)
            plot_data.append((m, e, backtest_df))

    # Draw the heatmaps for the current preprocess method.
    # NOTE(review): plot_data is not passed in, so plot_heatmaps presumably reads
    # the module-level plot_data; the meaning of 1.65 is defined there - confirm.
    plot_heatmaps(1.65, preprocess_method=current_preprocess, highlight_x=c.highlight_threshold, highlight_y=c.highlight_window)

## Split forward, Split Backtest, full_length_backtest

In [None]:
from utilsnumpy import split_data, load_all_data, combines_data, data_processing, backtest_cached, additional_metrics
import matplotlib.pyplot as plt
import config as c
import numpy as np
import pandas as pd
import json
import sys
import os

# Number of bars per 365-day year for each interval key
# (e.g. '1h' -> 365 * 24 = 8760); looked up by interval to get the
# annualizer value that is passed on to backtest_cached.
annualizer_dict = {
    '1m': 525600, '5m': 105120, '15m': 35040,
    '30m': 17520, '1h': 8760, '4h': 2190,
    '1d': 365, '1w': 52, '1M': 12
}

def _common_date_range(candle_df, factor_df, freq):
    """Return the date range spanning the overlap of both frames' start_time (ms)."""
    start_ms = max(candle_df['start_time'].min(), factor_df['start_time'].min())
    end_ms = min(candle_df['start_time'].max(), factor_df['start_time'].max())
    return pd.date_range(
        start=pd.to_datetime(start_ms, unit='ms'),
        end=pd.to_datetime(end_ms, unit='ms'),
        freq=freq,
    )


def load_and_prepare_data():
    """
    Load, split and preprocess the backtest data, with NaN checks.

    Steps:
        1. Load candle/factor data and sort both by 'start_time'.
        2. Split into train / test / full sets via split_data (3 training years).
        3. Optionally combine c.factor and c.factor2 (c.operation != 'none').
        4. Optionally preprocess the factor (c.preprocess != "direct").
        5. Index every frame by 'time' and keep only the needed columns.

    Returns:
        dict or None: {'train'|'test'|'full': {'candle', 'factor',
        'date_range'}, 'factor_name': str}. None when the data is too short
        or when more than c.nan_perc of the rows are lost to NaNs at any stage.
    """
    print("Loading data...")
    raw_candle, raw_factor = load_all_data(c.candle_file, c.factor_file, c.factor2_file, c.factor, c.factor2)

    annualizer = annualizer_dict.get(c.interval, 365)
    print(f"Using {c.interval} data, annualizer: {annualizer}")

    # Sort by timestamp.
    raw_candle = raw_candle.sort_values('start_time').reset_index(drop=True)
    raw_factor = raw_factor.sort_values('start_time').reset_index(drop=True)

    # Minimum-length check. The messages report the threshold actually used;
    # they previously referenced `train_split`, which is not defined here.
    min_points = 300
    if len(raw_candle) <= min_points:
        print(f"Error: Candle data ({len(raw_candle)} points) is insufficient; more than {min_points} points are required")
        return None

    if len(raw_factor) <= min_points:
        print(f"Error: Factor data ({len(raw_factor)} points) is insufficient; more than {min_points} points are required")
        return None

    # Split the data.
    split_result = split_data(raw_candle, raw_factor, years_for_training=3)

    candle_train = split_result['train']['candle']
    factor_train = split_result['train']['factor']
    candle_test = split_result['test']['candle']
    factor_test = split_result['test']['factor']
    candle_full = split_result['full']['candle']
    factor_full = split_result['full']['factor']

    # Remember the original lengths so later stages can measure row loss.
    factor_train_length = len(factor_train)
    factor_test_length = len(factor_test)
    factor_full_length = len(factor_full)

    # Add a datetime column to every frame.
    for df in [candle_train, factor_train, candle_test, factor_test, candle_full, factor_full]:
        df['time'] = pd.to_datetime(df['start_time'], unit='ms')

    # Full date ranges over the candle/factor overlap of each split.
    train_date_range = _common_date_range(candle_train, factor_train, c.interval)
    test_date_range = _common_date_range(candle_test, factor_test, c.interval)
    full_date_range = _common_date_range(candle_full, factor_full, c.interval)

    # Factor column name in use; updated by the steps below.
    factor_name = c.factor

    # 1. Combine the two factors (if requested).
    if c.operation != 'none':
        print(f"\nApplying operation '{c.operation}' to factors...")

        # NaN ratio of each input factor over the full training date range,
        # measured before combining. set_index returns a new frame, so
        # factor_train itself is left untouched.
        train_temp_reindexed = factor_train.set_index('time').reindex(train_date_range)
        nan_train_before = train_temp_reindexed[c.factor].isna().sum() / len(train_date_range)
        nan_train_before2 = train_temp_reindexed[c.factor2].isna().sum() / len(train_date_range)

        if nan_train_before > c.nan_perc or nan_train_before2 > c.nan_perc:
            print(f"Error: Input factors already have too many NaNs before combination: {c.factor}={nan_train_before:.3f}, {c.factor2}={nan_train_before2:.3f}")
            return None

        # combines_data returns the whole DataFrame plus the merged column name.
        factor_train, merged_col_name = combines_data(factor_train, c.factor, c.factor2, c.operation)
        factor_test, _ = combines_data(factor_test, c.factor, c.factor2, c.operation)
        factor_full, _ = combines_data(factor_full, c.factor, c.factor2, c.operation)

        factor_name = merged_col_name

        # Row loss after combining is used as the NaN measure here.
        print(f"Data remaining after combination: Train: {len(factor_train)}/{factor_train_length} ({len(factor_train)/factor_train_length:.1%})")
        print(f"Data remaining after combination: Test: {len(factor_test)}/{factor_test_length} ({len(factor_test)/factor_test_length:.1%})")
        print(f"Data remaining after combination: Full: {len(factor_full)}/{factor_full_length} ({len(factor_full)/factor_full_length:.1%})")

        # Abort when too few rows remain: compares the already-dropped frame
        # against (1 - nan_perc) * original length.
        if len(factor_train) < (1 - c.nan_perc) * factor_train_length:
            print("Train data After droped more than 3% data. Skipping backtest.")
            return None
        if len(factor_test) < (1 - c.nan_perc) * factor_test_length:
            print("Test data After droped more than 3% data. Skipping backtest")
            return None
        if len(factor_full) < (1 - c.nan_perc) * factor_full_length:
            print("Full data After droped more than 3% data. Skipping backtest")
            return None

    # 2. Factor preprocessing.
    if c.preprocess != "direct":
        print(f"\nApplying preprocessing method '{c.preprocess}'...")

        # Re-baseline the lengths (rows may have been dropped by step 1).
        factor_train_length = len(factor_train)
        factor_test_length = len(factor_test)
        factor_full_length = len(factor_full)

        print(f"length of each split: Train:{factor_train_length}, Test:{factor_test_length}, Full:{factor_full_length}")

        factor_train, train_factor_name = data_processing(factor_train, c.preprocess, factor_name)
        train_remaining = len(factor_train) / factor_train_length

        factor_test, _ = data_processing(factor_test, c.preprocess, factor_name)
        test_remaining = len(factor_test) / factor_test_length if factor_test_length > 0 else 1.0

        factor_full, _ = data_processing(factor_full, c.preprocess, factor_name)
        full_remaining = len(factor_full) / factor_full_length

        # Share of rows lost during preprocessing, treated as the NaN ratio.
        nan_train = 1 - train_remaining
        nan_test = 1 - test_remaining
        nan_full = 1 - full_remaining

        print(f"After {c.preprocess} preprocessing data retention: Train: {train_remaining:.3f}, Test: {test_remaining:.3f}, Full: {full_remaining:.3f}")
        print(f"NaN percentages: Train: {nan_train:.3f}, Test: {nan_test:.3f}, Full: {nan_full:.3f}")

        if nan_train > c.nan_perc:
            print(f"Train NaN% = {nan_train}, >3% after {c.preprocess} preprocessing. Skipping backtest.")
            return None

        if nan_test > c.nan_perc:
            print(f"Test NaN% = {nan_test}, >3% after {c.preprocess} preprocessing. Skipping backtest.")
            return None

        if nan_full > c.nan_perc:
            print(f"Full NaN% {nan_full}, >3% after {c.preprocess} preprocessing. Skipping backtest.")
            return None

        factor_name = train_factor_name

    # 3. Use 'time' as the final index on every frame.
    candle_train.set_index('time', inplace=True)
    factor_train.set_index('time', inplace=True)
    candle_test.set_index('time', inplace=True)
    factor_test.set_index('time', inplace=True)
    candle_full.set_index('time', inplace=True)
    factor_full.set_index('time', inplace=True)

    # 4. Keep only the columns the backtest needs.
    candle_train = candle_train[['start_time', 'Close']]
    candle_test = candle_test[['start_time', 'Close']]
    candle_full = candle_full[['start_time', 'Close']]

    factor_train = factor_train[['start_time', factor_name]]
    factor_test = factor_test[['start_time', factor_name]]
    factor_full = factor_full[['start_time', factor_name]]

    # 5. Rename 'Close' to lowercase 'close'.
    candle_train.columns = ['start_time', 'close']
    candle_test.columns = ['start_time', 'close']
    candle_full.columns = ['start_time', 'close']

    # 6. Report the final sizes.
    print(f"Final data sizes after processing:")
    print(f"Train: candle={len(candle_train)}, factor={len(factor_train)}, original={factor_train_length}")
    print(f"Test: candle={len(candle_test)}, factor={len(factor_test)}, original={factor_test_length}")
    print(f"Full: candle={len(candle_full)}, factor={len(factor_full)}, original={factor_full_length}")

    return {
        'train': {'candle': candle_train, 'factor': factor_train, 'date_range': train_date_range},
        'test': {'candle': candle_test, 'factor': factor_test, 'date_range': test_date_range},
        'full': {'candle': candle_full, 'factor': factor_full, 'date_range': full_date_range},
        'factor_name': factor_name
    }

def parse_manual_selection(filepath):
    """
    Parse a manual-selection file into (model, entry, window, threshold) tuples.

    Layout as consumed here: a line equal to one of c.models switches the
    current model; every other non-blank line is an entry name, optionally
    followed by whitespace and a comma-separated list of "window/threshold"
    pairs. An entry without pairs falls back to (c.window, c.threshold).
    Entries that appear before any model line are skipped with a warning, as
    are pairs that cannot be parsed.

    Returns an empty list when the file does not exist or when any error
    occurs while reading or parsing it.
    """
    if not os.path.exists(filepath):
        print(f"Warning: {filepath} not found")
        return []

    try:
        with open(filepath, "r") as handle:
            stripped = (raw.strip() for raw in handle.readlines())
            rows = [text for text in stripped if text]

        selections = []
        active_model = None

        for row in rows:
            # A bare model name starts a new section.
            if row in c.models:
                active_model = row
                continue

            if active_model is None:
                print(f"Warning: Entry '{row}' found without a model specified")
                continue

            # First token is the entry; the remainder (if any) holds the pairs.
            entry, *rest = row.split(maxsplit=1)

            if not rest:
                # No explicit parameters: use the defaults from config.
                selections.append((active_model, entry, c.window, c.threshold))
                continue

            for chunk in rest[0].split(","):
                pair = chunk.strip()
                if not pair:
                    continue
                try:
                    window_text, threshold_text = pair.split("/")
                    window = int(window_text) if "." not in window_text else float(window_text)
                    threshold = float(threshold_text)
                except ValueError:
                    print(f"Warning: Could not parse parameter {pair}")
                else:
                    selections.append((active_model, entry, window, threshold))

        return selections
    except Exception as e:
        print(f"Error parsing manual_selected.txt: {e}")
        return []

def backtest_for_pnl(candle_df, factor_df, factor, factor2, interval, operation, preprocess, model,
                  entry, window, threshold, backtest_style, date_range=None):
    """
    Run one backtest, print its report and plot close price vs. cumulative PnL.

    Parameters:
        candle_df (pd.DataFrame): Candle data with a 'start_time' column (ms).
            Not modified; a copy receives the derived 'time' column.
        factor_df (pd.DataFrame): Factor data forwarded to backtest_cached.
        factor, factor2, operation: Forwarded to additional_metrics; `factor`
            is also forwarded to backtest_cached.
        interval (str): Key into annualizer_dict (365 is used when missing).
        preprocess, model, entry, window, threshold: Backtest parameters.
        backtest_style (str): Label used in messages, the title and file names.
        date_range: Optional date range forwarded to backtest_cached.

    Returns:
        list[dict] or None: A one-element list holding the merged report
        (additional metrics + backtest result), or None when the inputs are
        empty or shorter than `window`, when the backtest returns no rows, or
        when any error occurs (the traceback is printed).

    Side effects:
        Shows the plot; when c.save_plot is set, also writes the PNG, a JSON
        report and a CSV of the result frame to the working directory.
    """
    print(f"\n=== Running {backtest_style} ===")
    print(f"Using model: {model}, entry: {entry}, window: {window}, threshold: {threshold}")

    # Reject empty inputs.
    if candle_df.empty or factor_df.empty:
        print(f"Error: Empty dataframe in {backtest_style}")
        return None

    # Reject inputs shorter than the rolling window.
    if len(candle_df) < window or len(factor_df) < window:
        print(f"Error: Insufficient data points (candle: {len(candle_df)}, factor: {len(factor_df)}) compared to window size ({window}) in {backtest_style}")
        return None

    # Work on a copy: the caller reuses the same frames for every parameter
    # set, so adding the 'time' column in place would mutate shared data.
    candle_df = candle_df.copy()
    candle_df['time'] = pd.to_datetime(candle_df['start_time'], unit='ms')

    annualizer = annualizer_dict.get(interval, 365)

    # Extra metadata merged into the report.
    additional_metric = additional_metrics(c.alpha_id, c.symbol, factor, factor2,
                                          operation, c.shift_candle_minite, backtest_style)

    try:
        backtest_result, df, log_msgs = backtest_cached(
            candle_df, factor_df, window, threshold, preprocess,
            entry, annualizer, model, factor, interval, date_range)

        for msg in log_msgs:
            print(msg)

        if df is None or df.empty:
            print(f"Warning: No results returned from backtest_cached for {backtest_style}")
            return None

        # Determine the covered time span from the index or the start_time column.
        if isinstance(df.index, pd.DatetimeIndex):
            first_ts, last_ts = df.index.min(), df.index.max()
        elif 'start_time' in df.columns:
            first_ts = pd.to_datetime(df['start_time'].min(), unit='ms')
            last_ts = pd.to_datetime(df['start_time'].max(), unit='ms')
        else:
            # Previously this fell through and failed later on unbound names.
            print(f"Error: Cannot determine the time range of the results for {backtest_style}")
            return None

        start_date = first_ts.strftime('%Y-%m-%d')
        end_date = last_ts.strftime('%Y-%m-%d')
        start_time = first_ts.strftime('%Y-%m-%d %H:%M:%S')
        end_time = last_ts.strftime('%Y-%m-%d %H:%M:%S')

        additional_metric.update({"start_time": start_time, "end_time": end_time})
        combined_report = [{**additional_metric, **backtest_result}]

        print(f"{backtest_style} Report:")
        print(json.dumps(combined_report, indent=4))

        fig, ax1 = plt.subplots(figsize=(15, 8))
        try:
            # Left axis: close price.
            ax1.plot(df.index, df['close'], label='Close Price', color='green', linewidth=2)
            ax1.set_xlabel("Date", fontsize=12)
            ax1.set_ylabel("Close Price", fontsize=12, color='green')
            ax1.tick_params(axis='y', labelcolor='green')
            ax1.grid(True, alpha=0.3)

            # Right axis: cumulative PnL.
            ax2 = ax1.twinx()
            ax2.plot(df.index, df['cumu_pnl'], label='Cumulative PnL', color='blue', linewidth=2)
            ax2.set_ylabel("Cumulative PnL", fontsize=12, color='blue')
            ax2.tick_params(axis='y', labelcolor='blue')

            plt.title(f"Close Price and Cumulative PnL Plot (Split {backtest_style})-({start_date} ~ {end_date})", fontsize=16)
            fig.tight_layout()

            if c.save_plot:
                plt.savefig(f"{backtest_style}_Equity_Curve_{start_date}_{end_date}.png", dpi=300, bbox_inches='tight')
                print(f"已儲存 {backtest_style}_Equity_Curve_{start_date}_{end_date}.png")
                output_backtest_data = {f"{backtest_style}": combined_report}
                with open(f"{c.alpha_id}_{backtest_style}.json", "w") as json_file:
                    json.dump(output_backtest_data, json_file, indent=4)
                df.to_csv(f"{c.alpha_id}_{backtest_style}_df.csv", index=True)
                print(f"已儲存 {c.alpha_id}_{backtest_style}_df.csv")
            plt.show()
        finally:
            # Release the figure so repeated runs do not accumulate open figures.
            plt.close(fig)
        return combined_report

    except Exception as e:
        print(f"Error in backtest_for_pnl for {backtest_style}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def run_backtest(data, model, entry, window, threshold):
    """
    Run the forward test and, if it passes the SR gate, the other backtests.

    Parameters:
        data (dict): Result of load_and_prepare_data: 'train'/'test'/'full'
            sub-dicts with 'candle', 'factor', 'date_range', plus 'factor_name'.
        model, entry, window, threshold: Parameter set to evaluate.

    Flow:
        1. Forward test on the 'test' split ("forwardtest").
        2. Stop if it failed or its "SR" is missing, NaN or below the required
           level (1.0 when the entry name contains "short", otherwise 1.7).
        3. Otherwise run "backtest" on 'train' and "full_time_backtest" on 'full'.

    Returns:
        None
    """
    factor_name = data['factor_name']

    # Required Sharpe ratio: relaxed for short entries, 1.7 for everything else.
    required_sr = 1.0 if "short" in entry.lower() else 1.7

    print(f"\n执行测试: model = {model}, entry = {entry}, window = {window}, threshold = {threshold}")

    def _run_split(split, backtest_style):
        """Run backtest_for_pnl on one data split with the shared parameters."""
        return backtest_for_pnl(
            data[split]['candle'],
            data[split]['factor'],
            factor_name,
            c.factor2,
            c.interval,
            c.operation,
            c.preprocess,
            model,
            entry,
            window,
            threshold,
            backtest_style,
            data[split]['date_range'],
        )

    fwd_report = _run_split('test', "forwardtest")

    if fwd_report is None:
        print("Warning: forward test failed, skipping this parameter set.")
        return

    fwd_sr = fwd_report[0].get("SR", 0)

    # `not (sr >= required)` instead of `sr < required`: a NaN Sharpe ratio
    # compares False both ways and must not slip through the gate.
    if fwd_sr is None or not fwd_sr >= required_sr:
        print(f"Forward test SR = {fwd_sr} 未达到要求 (需 > {required_sr})，跳过此组参数的后续测试。")
        return

    # The forward test passed: run the in-sample and full-period backtests.
    _run_split('train', "backtest")
    _run_split('full', "full_time_backtest")

if __name__ == "__main__":
    # Load and preprocess everything once; all parameter sets share the result.
    data = load_and_prepare_data()
    if data is None:
        print("Error preparing data. Aborting backtest.")
        sys.exit(1)

    # Parameter sets listed in manual_selected.txt take precedence.
    params_list = parse_manual_selection("manual_selected.txt")

    if not params_list:
        # File missing or yielding nothing usable: fall back to config.py.
        print("Using parameters from config.py")
        run_backtest(data, c.model, c.entry, c.window, c.threshold)
    else:
        print(f"Found {len(params_list)} parameter sets in manual_selected.txt")
        # One full forward/back/full-period run per parameter set.
        for model, entry, window, threshold in params_list:
            run_backtest(data, model, entry, window, threshold)