In [None]:
# Reload config.py on every run so the values read below come from the current file contents.
from importlib import reload
import config as c
reload(c)

# Echo the main settings that were just loaded.
print(f"已加载配置：factor={c.factor}, factor2={c.factor2}, operation={c.operation}, preprocess={c.preprocess}, USE_ALL_MODELS={c.USE_ALL_MODELS}" )

## Resampling 1min data to wanted timeframe

In [None]:
import os
import pandas as pd
import config as c

def prepare_price_data(
    csv_path: str,  # full path of the input CSV file
    datasource: str = 'bybit_btcusdt',
    factor: str = 'price',
    timeframe: str = '1D',
    delay_minutes: int = 0
):
    """
    Resample 1-minute OHLCV data to ``timeframe`` and save the result as CSV.

    The time index can optionally be shifted before resampling
    (positive ``delay_minutes`` = later, negative = earlier).

    Parameters:
      csv_path      : full path of the 1-minute input CSV; must contain a
                      'Time' column plus Open/High/Low/Close/Volume
                      (and 'Turnover' when ``c.exchange_name == 'bybit'``)
      datasource    : data source name used in the output file name
                      (e.g. bybit_btcusdt)
      factor        : factor name (e.g. price); kept for interface
                      compatibility, currently not used
      timeframe     : pandas resample rule of the target bars, e.g. '1H', '1D'
      delay_minutes : minutes to shift the time index by before resampling

    Returns:
      The resampled pandas DataFrame, with an extra 'start_time' column holding
      the bar time as a millisecond Unix timestamp.

    Side effects:
      When the result is not empty it is written to
      ``<cwd>/data/resample_{datasource}_{timeframe}_-{c.candle_delay}m.csv``.
      Note the file name uses ``c.candle_delay`` from the config, not
      ``delay_minutes``.
    """
    # 1. Read the CSV and parse the 'Time' column as datetimes
    df = pd.read_csv(csv_path, parse_dates=['Time'])

    # 2. Use 'Time' as the index so the frame can be resampled
    df.set_index('Time', inplace=True)

    # 3. Optional time shift (delay / advance)
    if delay_minutes != 0:
        df.index = df.index + pd.Timedelta(minutes=delay_minutes)

    # 4. Per-column aggregation; only bybit files are aggregated on 'Turnover'
    ohlc_dict = {
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum',
    }
    if c.exchange_name == 'bybit':
        ohlc_dict['Turnover'] = 'sum'

    # 5. Resample and drop bars with any missing value (e.g. empty periods)
    df_resampled = df.resample(timeframe).agg(ohlc_dict).dropna(how='any')

    # Millisecond Unix timestamp of each bar.
    # NOTE(review): the division by 10**6 assumes a nanosecond-resolution index — confirm
    # against the pandas version in use.
    df_resampled['start_time'] = df_resampled.index.astype('int64') // 10**6

    if df_resampled.empty:
        print("⚠️ Resampled DataFrame 為空，未產生輸出檔案！")
        return df_resampled

    # 6. Build the output path under ./data of the current working directory
    output_filename = f"./data/resample_{datasource}_{timeframe}_-{c.candle_delay}m.csv"
    output_path = os.path.join(os.getcwd(), output_filename)

    # 7. Make sure the target directory exists, then write the CSV
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df_resampled.to_csv(output_path)
    print(f"✅ 檔案已儲存：{output_path}")

    return df_resampled

# Resample the 1-minute price file of the configured exchange/coin to the configured
# candle timeframe. The negated c.candle_delay is passed as delay_minutes.
df_r = prepare_price_data(
    csv_path=f"./data/{c.exchange_name}_{c.coin}usdt_price_1m.csv",
    datasource=f'{c.exchange_name}_{c.coin}',
    factor='price',
    timeframe=c.candle_timeframe,
    delay_minutes=-c.candle_delay
)

# Preview the first resampled rows.
print(df_r.head()) 

## Data Visualization

In [None]:
from utilsnumpy import load_all_data, combines_data, data_processing
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import config as c

def visualize_factors():
    """
    Load the candle and factor data and draw a three-panel overview figure.

    Panels, top to bottom: close price, raw factor value(s), and the factor
    after the optional combination (``c.operation``) and preprocessing
    (``c.preprocess``) steps.

    Returns:
        dict with keys 'candle' (candle frame), 'factor' (factor frame
        including any derived columns) and 'factor_name' (name of the column
        drawn in the bottom panel).
    """
    print("Loading data...")
    raw_candle, raw_factor = load_all_data(c.candle_file, c.factor_file, c.factor2_file, c.factor, c.factor2)

    # Millisecond timestamps -> datetimes, used as the x axis of every panel
    for frame in (raw_factor, raw_candle):
        frame['time'] = pd.to_datetime(frame['start_time'], unit='ms')

    has_second_factor = c.factor2 in raw_factor.columns

    _, axes = plt.subplots(3, 1, figsize=(15, 18), sharex=True)
    price_ax, raw_ax, processed_ax = axes

    # Panel 1: close price
    price_ax.plot(raw_candle['time'], raw_candle['Close'], color='blue', linewidth=1.5)
    price_ax.set_title(f'Price Data: {c.symbol}', fontsize=16)
    price_ax.set_ylabel('Price', fontsize=14)
    price_ax.grid(True, alpha=0.3)

    # Panel 2: raw factor(s)
    raw_ax.plot(raw_factor['time'], raw_factor[c.factor], color='green', linewidth=1.5, label=c.factor)
    if has_second_factor:
        raw_ax.plot(raw_factor['time'], raw_factor[c.factor2], color='red', linewidth=1.5, label=c.factor2)
    raw_ax.set_title('Raw Factor Data', fontsize=16)
    raw_ax.set_ylabel('Factor Value', fontsize=14)
    raw_ax.grid(True, alpha=0.3)
    raw_ax.legend()

    # Derived columns are added to a copy so the raw frame stays untouched
    factor_data = raw_factor.copy()
    shown_column = c.factor
    shown_label = c.factor

    # Optional step 1: combine the two factors
    if has_second_factor and c.operation != 'none':
        print(f"Combining factors with operation: {c.operation}")
        combined_values, combined_name = combines_data(
            factor_data[c.factor].values,
            factor_data[c.factor2].values,
            c.operation,
            c.factor,
            c.factor2
        )
        factor_data[combined_name] = combined_values
        shown_column = combined_name
        shown_label = f"{c.factor} {c.operation} {c.factor2}"

    # Optional step 2: preprocess whichever column is current
    if c.preprocess != "direct":
        print(f"Applying preprocessing: {c.preprocess}")
        processed_column = f"{shown_column}_{c.preprocess}"
        factor_data[processed_column] = data_processing(factor_data[shown_column], c.preprocess, shown_column)
        shown_column = processed_column
        shown_label = f"{shown_label} ({c.preprocess})"

    # Panel 3: processed factor
    processed_ax.plot(factor_data['time'], factor_data[shown_column], color='purple', linewidth=1.5)
    processed_ax.set_title('Processed Factor Data', fontsize=16)
    processed_ax.set_ylabel('Factor Value', fontsize=14)
    processed_ax.set_xlabel('Date', fontsize=14)
    processed_ax.grid(True, alpha=0.3)

    # Small text box describing which factor the bottom panel shows
    label_box = dict(boxstyle='round', facecolor='white', alpha=0.5)
    processed_ax.text(0.01, 0.95, f"Factor: {shown_label}", transform=processed_ax.transAxes,
                      fontsize=12, verticalalignment='top', bbox=label_box)

    # Date formatting of the x axis: one tick every three months, rotated labels
    for ax in axes:
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    # Figure-level title; leave room for it when tightening the layout
    plt.suptitle(f'Factor Analysis - {c.symbol}', fontsize=20, y=0.98)
    plt.tight_layout(rect=[0, 0, 1, 0.97])

    # Saving the figure is currently disabled; the flag is only inspected
    if getattr(c, 'save_plot', False):
        pass

    plt.show()

    # Hand the prepared data back for the follow-up analyses
    return {
        'candle': raw_candle,
        'factor': factor_data,
        'factor_name': shown_column
    }

def visualize_factor_correlation(data):
    """
    Scatter-plot the factor against the bar-to-bar price change and report
    their correlation coefficient.

    Parameters:
        data: dict returned by visualize_factors (keys 'candle', 'factor',
              'factor_name'); ``None`` is accepted and reported.

    Returns:
        None. Prints a message and returns early when there is no data or
        fewer than two usable rows.
    """
    if data is None:
        print("没有数据可用于相关性分析")
        return

    candle_df = data['candle']
    factor_df = data['factor']
    factor_name = data['factor_name']

    # Attach to every candle the factor observation nearest in time
    merged_df = pd.merge_asof(
        candle_df.sort_values('time'),
        factor_df[['time', factor_name]].sort_values('time'),
        on='time',
        direction='nearest'
    )

    # Fractional change of the close versus the previous bar
    merged_df['price_change'] = merged_df['Close'].pct_change()

    # Only the two analysed columns decide whether a row is usable; NaNs in
    # unrelated candle columns must not shrink the sample.
    merged_df = merged_df.dropna(subset=['price_change', factor_name])

    # A line fit needs at least two points; np.polyfit raises on empty input
    if len(merged_df) < 2:
        print("可用于相关性分析的数据不足")
        return

    factor_values = merged_df[factor_name]
    price_change = merged_df['price_change']
    correlation = price_change.corr(factor_values)

    # Scatter plot with zero reference lines
    plt.figure(figsize=(12, 8))
    plt.scatter(factor_values, price_change, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)
    plt.axvline(x=0, color='r', linestyle='-', alpha=0.3)

    # Least-squares trend line (degree-1 polynomial)
    trend = np.poly1d(np.polyfit(factor_values, price_change, 1))
    plt.plot(factor_values, trend(factor_values), "r--", alpha=0.8)

    plt.title(f'Correlation between Price Change and {factor_name}: {correlation:.4f}', fontsize=16)
    plt.xlabel(factor_name, fontsize=14)
    plt.ylabel('Price Change (%)', fontsize=14)
    plt.grid(True, alpha=0.3)

    # Saving the figure is currently disabled; the flag is only inspected
    if hasattr(c, 'save_plot') and c.save_plot:
        pass

    plt.show()

def visualize_factor_distribution(data):
    """
    Plot a histogram of the factor values with basic summary statistics.

    Parameters:
        data: dict returned by visualize_factors (keys 'factor' and
              'factor_name' are read); ``None`` is accepted and reported.

    Returns:
        None. Prints a message and returns early when there is no data or the
        factor column holds no non-NaN value.
    """
    if data is None:
        print("没有数据可用于分布分析")
        return

    factor_df = data['factor']
    factor_name = data['factor_name']

    # Work on the non-NaN values only
    factor_values = factor_df[factor_name].dropna().values

    # np.min / np.max raise on an empty array, so stop before plotting
    if factor_values.size == 0:
        print("没有数据可用于分布分析")
        return

    plt.figure(figsize=(12, 8))

    # Histogram of the values
    plt.hist(factor_values, bins=50, alpha=0.7, color='blue')

    # Basic statistics
    mean_val = np.mean(factor_values)
    median_val = np.median(factor_values)
    std_val = np.std(factor_values)

    plt.axvline(mean_val, color='r', linestyle='dashed', linewidth=1, label=f'Mean: {mean_val:.4f}')
    plt.axvline(median_val, color='g', linestyle='dashed', linewidth=1, label=f'Median: {median_val:.4f}')

    plt.title(f'Distribution of {factor_name}', fontsize=16)
    plt.xlabel('Value', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Text box with the statistics in the top-left corner of the axes
    stats_text = f"""
    Mean: {mean_val:.4f}
    Median: {median_val:.4f}
    Std Dev: {std_val:.4f}
    Min: {np.min(factor_values):.4f}
    Max: {np.max(factor_values):.4f}
    """
    plt.text(0.01, 0.95, stats_text, transform=plt.gca().transAxes,
             fontsize=12, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

    # Saving the figure is currently disabled; the flag is only inspected
    if hasattr(c, 'save_plot') and c.save_plot:
        pass

    plt.show()

if __name__ == "__main__":
    # Build the overview figure and keep the prepared data for the next two plots
    data = visualize_factors()

    # Scatter plot of factor vs. price change
    visualize_factor_correlation(data)

    # Histogram of the factor values
    visualize_factor_distribution(data)

## Train Split Loop + Heatmap (Including looping Preprocess)

In [None]:
from utilsnumpy import nan_count, load_all_data, combines_data, data_processing, precompute_rolling_stats, backtest_cached
import matplotlib.pyplot as plt
import config as c
import numpy as np
import pandas as pd
import seaborn as sns
import sys
from dask import delayed, compute
from dask.diagnostics import ProgressBar

def parse_manual_selection(filepath, all_models):
    """
    Parse the manually selected models and their entry styles from a text file.

    File format: a line equal to one of ``all_models`` starts a section; every
    following non-empty line belongs to that model until the next model line.
    Only the first whitespace-separated token of such a line is kept as the
    entry name, so lines that also carry ``window/threshold`` parameters
    (``entry 20/1.5, 30/2``) yield just the entry. Lines that appear before
    any model line are ignored.

    Parameters:
        filepath (str): path of the selection file.
        all_models: collection of valid model names.

    Returns:
        dict: model name -> list of unique entry names in file order. A model
        that appears more than once keeps only its last section.
    """
    models_entrys = {}
    current_model = None
    with open(filepath, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line in all_models:
                current_model = line
                models_entrys[current_model] = []
            elif current_model is not None:
                # Drop any trailing "window/threshold" parameters
                entry = line.split(maxsplit=1)[0]
                if entry not in models_entrys[current_model]:
                    models_entrys[current_model].append(entry)
    return models_entrys

def plot_heatmaps(sr_threshold=1.5, preprocess_method="direct"):
    """
    Draw one Sharpe-ratio heatmap (rolling window x threshold) per backtest
    result stored in the module-level ``plot_data`` list.

    Parameters:
        sr_threshold (float): minimum SR that at least one cell must exceed
            for the heatmap to be drawn. Entries whose name contains 'short'
            use a fixed 1.2 instead.
        preprocess_method (str): preprocess method of the current run; shown
            in the title and included in the saved file name so that runs with
            different methods do not overwrite each other.

    Reads ``plot_data``: iterable of ``(model, entry, backtest_df)`` where
    ``backtest_df`` has 'rolling_window', 'threshold' and 'SR' columns.
    """
    for model, entry, backtest_df in plot_data:
        srthreshold = 1.2 if 'short' in entry else sr_threshold

        # Rows = rolling window, columns = threshold, values = mean SR
        sr_pivot_data = backtest_df.groupby(['rolling_window', 'threshold'])['SR'].mean().unstack()
        sr_pivot_data.columns = sr_pivot_data.columns.round(2)

        if sr_pivot_data.isna().all().all():
            print(f"⚠️ Skipping {model}_{entry} heatmap: All SR values are NaN.")
            continue
        if not np.any(sr_pivot_data.to_numpy() > srthreshold):
            print(f"⚠️ Skipping {model}_{entry} heatmap: No SR value exceeds {srthreshold}.")
            continue

        plt.figure(figsize=(18, 14))
        sns.heatmap(sr_pivot_data, annot=True, fmt=".2f", cmap="RdYlGn", linewidths=0.3,
                    cbar_kws={'label': 'Sharpe Ratio'})

        # Heatmap cell i spans [i, i + 1]; put each label at the cell centre
        # instead of on the cell edge.
        plt.xticks(ticks=np.arange(len(sr_pivot_data.columns)) + 0.5,
                   labels=[f"{col:.2f}" for col in sr_pivot_data.columns], rotation=45)
        plt.yticks(ticks=np.arange(len(sr_pivot_data.index)) + 0.5,
                   labels=[f"{row:.2f}" for row in sr_pivot_data.index], rotation=0)
        plt.title(f"{model}_{preprocess_method}_{entry} Train Period BackTest SR Heatmap", fontsize=14)

        if c.save_plot:
            heatmap_file = f"{model}_{preprocess_method}_{entry}_heatmap.png"
            plt.savefig(heatmap_file, dpi=300, bbox_inches='tight')
            print(f"已儲存 {heatmap_file}")
        plt.show()
        plt.close()

def process_and_validate(factor_series, method, factor_name):
    """
    Apply a preprocess method to a factor series and validate its NaN share.

    Parameters:
        factor_series (pd.Series): raw factor values; never modified.
        method (str): preprocess method; "direct" means no transformation.
        factor_name (str): factor name, used in the log messages.

    Returns:
        pd.Series or None: the processed series with NaN rows removed when at
        most 3% of its values are NaN; ``None`` when the NaN share exceeds 3%
        (the caller should skip this method).
    """
    if method != "direct":
        processed = data_processing(factor_series, method, factor_name)
    else:
        processed = factor_series.copy()

    # Share of NaN values; an empty result counts as 0 to avoid dividing by zero
    total = len(processed)
    nan_ratio = processed.isna().sum() / total if total else 0.0

    if nan_ratio > 0.03:
        preprocess_nan_count = nan_count(processed)
        print(f"nan count After {method} Preprocessing: {preprocess_nan_count}")
        print(f"{factor_name} after {method} transformation exceed 3% NaN. Skipping this preprocess method.")
        return None

    # Report as a real percentage (the ratio is a fraction)
    print(f"NaN% after preprocess: {nan_ratio:.2%}.")
    # Return a new series so the object handed back by data_processing is not mutated
    return processed.dropna()

def main(candle_data, factor_data, factor, interval, operation, model, entry,
         window_start, window_end, window_step, threshold_start, threshold_end,
         threshold_step, rolling_stats, preprocess_method, date_range):
    """
    Run the backtest over a grid of rolling windows and thresholds for one
    model/entry combination.

    Parameters:
        candle_data: frame with at least 'start_time' (ms) and 'Close'.
        factor_data: frame with at least 'start_time' and the ``factor`` column.
        factor (str): name of the factor column.
        interval (str): bar interval; key into the module-level annualizer_dict.
        operation: accepted for interface compatibility; not used here.
        model, entry: forwarded to backtest_cached and echoed in the result.
        window_start, window_end, window_step: ``range`` arguments of the
            rolling-window grid.
        threshold_start, threshold_end, threshold_step: ``np.arange``
            arguments of the threshold grid.
        rolling_stats: precomputed rolling statistics, forwarded unchanged.
        preprocess_method (str): forwarded to backtest_cached.
        date_range: forwarded to backtest_cached.

    Returns:
        tuple ``(model, entry, backtest_df, log_msgs)`` where ``backtest_df``
        has one row per grid point and ``log_msgs`` are the messages of the
        last grid point only (an empty list when the grid is empty).
    """
    candle_df_copy = candle_data[['start_time', 'Close']].copy()
    candle_df_copy.columns = ['start_time', 'close']
    factor_df_copy = factor_data[['start_time', factor]].copy()

    candle_df_copy['time'] = pd.to_datetime(candle_df_copy['start_time'], unit='ms')

    # None for an interval missing from annualizer_dict; passed on as-is
    annualizer = annualizer_dict.get(interval, None)

    backtest_report = []
    # Defined up front so an empty grid does not leave the name unbound
    log_msgs = []
    for rolling_window in range(window_start, window_end, window_step):
        for threshold in np.arange(threshold_start, threshold_end, threshold_step):
            result, _, log_msgs = backtest_cached(candle_df_copy, factor_df_copy, rolling_window, threshold,
                                          preprocess_method, entry, annualizer, model, factor, interval, date_range,
                                          rolling_stats)
            backtest_report.append(result)

    backtest_df = pd.DataFrame(backtest_report)
    return (model, entry, backtest_df, log_msgs)

# Annualizer lookup: number of bars per 365-day year for each supported interval
# (e.g. '1m' -> 365 * 24 * 60 = 525600, '1h' -> 365 * 24 = 8760).
annualizer_dict = {
    '1m': 525600, '5m': 105120, '15m': 35040,
    '30m': 17520, '1h': 8760, '4h': 2190,
    '1d': 365, '1w': 52, '1M': 12
}

# Load the raw candle and factor data
raw_candle, raw_factor = load_all_data(c.candle_file, c.factor_file, c.factor2_file, c.factor, c.factor2)

# The training set is the first three years of bars. Fail with a clear message
# for an interval the annualizer table does not know (previously `None * 3`).
if c.interval not in annualizer_dict:
    raise ValueError(f"Unsupported interval '{c.interval}'; expected one of {list(annualizer_dict)}")
train_split = annualizer_dict[c.interval] * 3

candle_train = raw_candle[:train_split].reset_index(drop=True).copy()
factor_train_original = raw_factor[:train_split].reset_index(drop=True).copy()

# Common time span of candles and factor, and the regular time grid over it
start_time = max(candle_train['start_time'].min(), factor_train_original['start_time'].min())
end_time = min(candle_train['start_time'].max(), factor_train_original['start_time'].max())
# pandas reads a trailing 'm' as a month-end frequency, so minute intervals
# such as '15m' must be spelled '15min'.
date_range_freq = f"{c.interval[:-1]}min" if c.interval.endswith('m') else c.interval
date_range = pd.date_range(start=pd.to_datetime(start_time, unit='ms'),
                           end=pd.to_datetime(end_time, unit='ms'),
                           freq=date_range_freq)
candle_train['time'] = pd.to_datetime(candle_train['start_time'], unit='ms')
factor_train_original['time'] = pd.to_datetime(factor_train_original['start_time'], unit='ms')
candle_train.set_index('time', inplace=True)
factor_train_original.set_index('time', inplace=True)

# Combine the two factors unless the operation is 'none'
if c.operation != 'none':
    combined_data, merged_col_name = combines_data(factor_train_original[c.factor].values,
                                                 factor_train_original[c.factor2].values,
                                                 c.operation, c.factor, c.factor2)
    factor_train_original[merged_col_name] = combined_data
    factor_used = merged_col_name
else:
    factor_used = c.factor

################################################
# Step 0: c.preprocess may be a single method name or a list of method names
################################################
if isinstance(c.preprocess, list):
    all_preprocess_methods = c.preprocess
else:
    all_preprocess_methods = [c.preprocess]

# Run the full parameter sweep once per preprocess method
for current_preprocess in all_preprocess_methods:
    print(f"\n===== Processing with preprocess method: {current_preprocess} =====")
    # Start from a fresh copy of the untouched training factor data
    factor_train = factor_train_original.copy()
    # Preprocess the selected factor; None means the method produced too many NaNs
    processed_factor = process_and_validate(factor_train[factor_used], current_preprocess, factor_used)
    if processed_factor is None:
        continue  # validation failed, try the next preprocess method
    factor_train[factor_used] = processed_factor
    # The assignment above aligns on the index, which puts NaN back on every row
    # that process_and_validate dropped; remove those rows from the frame too.
    factor_train = factor_train.dropna(subset=[factor_used])

    # Model / entry configuration and grid resolution
    if c.USE_ALL_MODELS:
        models = c.ALL_MODELS
        entry_map = {model: c.ALL_ENTRYS for model in c.ALL_MODELS}
        window_step = 20
        threshold_step = 0.2
    else:
        entry_map = parse_manual_selection("manual_selected.txt", c.ALL_MODELS)
        models = list(entry_map.keys())
        window_step = 10
        threshold_step = 0.1

    # Precompute the rolling statistics for every window of the grid
    windows = list(range(5, 351, window_step))
    rolling_stats_dict = precompute_rolling_stats(factor_train[factor_used], windows)

    # plot_data is read by plot_heatmaps; reset it for the current preprocess method
    plot_data = []
    tasks = []
    for model in models:
        for entry in entry_map[model]:
            task = delayed(main)(
                candle_train,
                factor_train,
                factor_used,
                c.interval,
                c.operation,
                model,
                entry,
                window_start=5,
                window_end=351,
                window_step=window_step,
                threshold_start=0,
                threshold_end=4.01,
                threshold_step=threshold_step,
                rolling_stats=rolling_stats_dict,
                preprocess_method=current_preprocess,
                date_range=date_range
            )
            tasks.append(task)

    # Execute all model/entry sweeps with dask's process scheduler
    with ProgressBar():
        results = compute(*tasks, scheduler='processes')

    for res in results:
        if res is not None:
            m, e, backtest_df, log_msgs = res
            for msg in log_msgs:
                print(msg)
            plot_data.append((m, e, backtest_df))

    # Heatmaps for the current preprocess method
    plot_heatmaps(1.65, preprocess_method=current_preprocess)

## Split forward, Split Backtest, full_length_backtest

In [None]:
from utilsnumpy import nan_count, load_all_data, combines_data, data_processing, backtest_cached, additional_metrics
import matplotlib.pyplot as plt
import config as c
import numpy as np
import pandas as pd
import json
import sys
import os

# Annualizer lookup: number of bars per 365-day year for each supported interval
# (e.g. '1m' -> 365 * 24 * 60 = 525600, '1h' -> 365 * 24 = 8760).
annualizer_dict = {
    '1m': 525600, '5m': 105120, '15m': 35040,
    '30m': 17520, '1h': 8760, '4h': 2190,
    '1d': 365, '1w': 52, '1M': 12
}

# Interval unit suffix -> pandas frequency alias. A bare 'm' would be read by
# pandas as month-end, so minutes have to be spelled 'min'.
_INTERVAL_UNIT_TO_PANDAS = {'m': 'min', 'h': 'h', 'd': 'D', 'w': 'W'}


def _interval_to_pandas_freq(interval):
    """Translate an interval such as '15m' or '1d' into a pandas frequency string."""
    count, unit = interval[:-1], interval[-1:]
    if unit in _INTERVAL_UNIT_TO_PANDAS and (not count or count.isdigit()):
        return f"{count}{_INTERVAL_UNIT_TO_PANDAS[unit]}"
    # Anything else (e.g. '1M') is handed to pandas unchanged
    return interval


def prepare_backtest_data(candle_data, factor_data, factor, interval):
    """
    Prepare candle and factor data for backtesting.

    Steps:
    1. Copy the needed columns (inputs are not modified) and rename 'Close'
       to 'close'.
    2. Compute the time range covered by both inputs.
    3. Index both frames by a datetime 'time' index derived from the
       millisecond 'start_time' column.
    4. Build a regular date range over the common time range.

    The frames are NOT reindexed or forward-filled onto that date range.

    Parameters:
        candle_data: frame with 'start_time' (ms) and 'Close' columns.
        factor_data: frame with 'start_time' (ms) and the ``factor`` column.
        factor (str): name of the factor column.
        interval (str): bar interval such as '1m', '1h', '1d'.

    Returns:
        dict: 'candle_df' and 'factor_df' (time-indexed copies) and
        'date_range' (pd.DatetimeIndex).
    """
    # Copies so the caller's frames stay untouched
    candle_df_copy = candle_data[['start_time', 'Close']].copy()
    candle_df_copy.columns = ['start_time', 'close']
    factor_df_copy = factor_data[['start_time', factor]].copy()

    # Common time range of both inputs
    start_time = max(candle_df_copy['start_time'].min(), factor_df_copy['start_time'].min())
    end_time = min(candle_df_copy['start_time'].max(), factor_df_copy['start_time'].max())

    # Millisecond timestamps -> datetime index
    candle_df_copy['time'] = pd.to_datetime(candle_df_copy['start_time'], unit='ms')
    factor_df_copy['time'] = pd.to_datetime(factor_df_copy['start_time'], unit='ms')

    candle_df_copy.set_index('time', inplace=True)
    factor_df_copy.set_index('time', inplace=True)

    # Regular grid over the common range, at the bar interval
    full_range = pd.date_range(
        start=pd.to_datetime(start_time, unit='ms'),
        end=pd.to_datetime(end_time, unit='ms'),
        freq=_interval_to_pandas_freq(interval)
    )

    return {
        'candle_df': candle_df_copy,
        'factor_df': factor_df_copy,
        'date_range': full_range
    }

def load_and_prepare_data():
    """
    Load the candle/factor data and prepare the train, test and full splits.

    Steps:
    1. Load and sort both frames by 'start_time'.
    2. Split by row position: the first ``annualizer * 3`` rows (three years
       of bars) are the train split, the rest the test split; the full split
       is all rows.
    3. Optionally combine the two factors (``c.operation``) and preprocess
       the resulting factor (``c.preprocess``), separately per split.
    4. For every split whose factor has at most 3% NaN, drop all rows of the
       factor frame that contain a NaN; otherwise only warn.
    5. Run prepare_backtest_data on every split.

    Returns:
        dict with keys 'train', 'test', 'full' (each a dict with 'candle',
        'factor', 'date_range') and 'factor_name'; ``None`` when there is not
        enough data for the train split.
    """
    print("Loading data...")
    raw_candle, raw_factor = load_all_data(c.candle_file, c.factor_file, c.factor2_file, c.factor, c.factor2)

    # Split point: three years of bars (unknown intervals fall back to 365 bars/year)
    annualizer = annualizer_dict.get(c.interval, 365)
    train_split = annualizer * 3

    print(f"Using {c.interval} data, annualizer: {annualizer}, train_split points: {train_split}")

    # Sort chronologically before splitting by position
    raw_candle = raw_candle.sort_values('start_time').reset_index(drop=True)
    raw_factor = raw_factor.sort_values('start_time').reset_index(drop=True)

    # Both inputs must extend beyond the train split
    if len(raw_candle) <= train_split:
        print(f"Error: Candle data ({len(raw_candle)} points) is insufficient for the specified train_split ({train_split} points)")
        return None

    if len(raw_factor) <= train_split:
        print(f"Error: Factor data ({len(raw_factor)} points) is insufficient for the specified train_split ({train_split} points)")
        return None

    split_time_candle = raw_candle.iloc[train_split]['start_time']
    print(f"Split timestamp: {pd.to_datetime(split_time_candle, unit='ms')}")

    # NOTE(review): candles and factor are both split by row position, which only
    # lines up if the two files cover the same bars — confirm against load_all_data.
    candles = {
        'train': raw_candle.iloc[:train_split].copy(),
        'test': raw_candle.iloc[train_split:].copy(),
        'full': raw_candle.copy(),
    }
    factors = {
        'train': raw_factor.iloc[:train_split].copy(),
        'test': raw_factor.iloc[train_split:].copy(),
        'full': raw_factor.copy(),
    }

    # Optional factor combination, applied to each split on its own
    factor_name = c.factor
    if c.operation != 'none':
        print(f"\nApplying operation '{c.operation}' to factors...")

        merged_col_name = None
        for frame in factors.values():
            combined, combined_name = combines_data(frame[c.factor].values,
                                                    frame[c.factor2].values,
                                                    c.operation, c.factor, c.factor2)
            # The column name reported for the train split is used for all splits
            if merged_col_name is None:
                merged_col_name = combined_name
            frame[merged_col_name] = combined

        factor_name = merged_col_name
        print(f"Created merged factor: {factor_name}")

    # Optional preprocessing, again per split
    if c.preprocess != "direct":
        print(f"\nApplying preprocessing method '{c.preprocess}'...")
        for frame in factors.values():
            frame[factor_name] = data_processing(frame[factor_name], c.preprocess, factor_name)

    # NaN share of the factor column per split (0 for an empty split)
    nan_ratio = {
        key: (frame[factor_name].isna().sum() / len(frame[factor_name]) if len(frame) > 0 else 0)
        for key, frame in factors.items()
    }
    print(f"Train data NaN %: {nan_ratio['train']:.3f}, Test data NaN %: {nan_ratio['test']:.3f}, "
          f"Full data NaN %: {nan_ratio['full']:.3f}")

    for key, frame in factors.items():
        label = key.capitalize()
        if nan_ratio[key] > 0.03:
            print(f"Warning: {label} data {factor_name} after {c.preprocess} transformation exceeds 3% NaN.")
        else:
            # Each split reports its own NaN share
            print(f"Dropna after transformation, {label} nan%: {nan_ratio[key]}")
            frame.dropna(inplace=True)

    # Final per-split preparation (time index + date range)
    result = {}
    for key in factors:
        prepared = prepare_backtest_data(candles[key], factors[key], factor_name, c.interval)
        result[key] = {
            'candle': prepared['candle_df'],
            'factor': prepared['factor_df'],
            'date_range': prepared['date_range'],
        }
    result['factor_name'] = factor_name
    return result

def parse_manual_selection(filepath):
    """
    Parse a manual selection file into ``(model, entry, window, threshold)`` tuples.

    A line equal to one of ``c.ALL_MODELS`` starts a model section. Every other
    line is ``entry`` optionally followed by comma-separated ``window/threshold``
    pairs; an entry without pairs uses ``c.window`` / ``c.threshold``.

    Returns an empty list when the file does not exist or any unexpected error
    occurs while parsing; unparsable pairs are reported and skipped.
    """
    if not os.path.exists(filepath):
        print(f"Warning: {filepath} not found")
        return []

    try:
        with open(filepath, "r") as file:
            # Keep only non-blank lines, already stripped
            stripped_lines = [raw.strip() for raw in file if raw.strip()]

        selections = []
        active_model = None

        for text in stripped_lines:
            # A model name opens a new section
            if text in c.ALL_MODELS:
                active_model = text
                continue

            # Entries are only meaningful inside a model section
            if active_model is None:
                print(f"Warning: Entry '{text}' found without a model specified")
                continue

            # First token is the entry name, the remainder (if any) the parameters
            entry, *remainder = text.split(maxsplit=1)

            if not remainder:
                # No parameters given: fall back to the config defaults
                selections.append((active_model, entry, c.window, c.threshold))
                continue

            for token in (piece.strip() for piece in remainder[0].split(",")):
                if not token:
                    continue
                try:
                    window_text, threshold_text = token.split("/")
                    if "." in window_text:
                        window = float(window_text)
                    else:
                        window = int(window_text)
                    threshold = float(threshold_text)
                    selections.append((active_model, entry, window, threshold))
                except ValueError:
                    print(f"Warning: Could not parse parameter {token}")

        return selections
    except Exception as e:
        print(f"Error parsing manual_selected.txt: {e}")
        return []

def backtest_for_pnl(candle_df, factor_df, factor, factor2, interval, operation, preprocess, model,
                  entry, window, threshold, backtest_style, date_range=None):
    """
    Run one backtest, print its report and plot close price vs. cumulative PnL.

    Parameters:
        candle_df: candle frame with 'start_time' (ms) and 'close'; not modified.
        factor_df: factor frame, forwarded to backtest_cached.
        factor, factor2: factor names (factor2 is only used for the report metadata).
        interval (str): bar interval; key into annualizer_dict (default 365).
        operation, preprocess, model, entry, window, threshold: strategy settings.
        backtest_style (str): label of the run, used in messages, the plot
            title and the output file names.
        date_range: forwarded to backtest_cached.

    Returns:
        list with one dict (metadata merged with the backtest result), or
        ``None`` when the inputs are empty/too short, the backtest returns no
        data, or any error occurs.

    Side effects:
        Shows a matplotlib figure; when ``c.save_plot`` is set also writes the
        figure (PNG), the report (JSON) and the result frame (CSV).
    """
    print(f"\n=== Running {backtest_style} ===")
    print(f"Using model: {model}, entry: {entry}, window: {window}, threshold: {threshold}")

    # Nothing to do on empty input
    if candle_df.empty or factor_df.empty:
        print(f"Error: Empty dataframe in {backtest_style}")
        return None

    # Need at least one full window of data
    if len(candle_df) < window or len(factor_df) < window:
        print(f"Error: Insufficient data points (candle: {len(candle_df)}, factor: {len(factor_df)}) compared to window size ({window}) in {backtest_style}")
        return None

    # Work on a copy: the caller reuses the same frame across parameter sets
    candle_df = candle_df.copy()
    candle_df['time'] = pd.to_datetime(candle_df['start_time'], unit='ms')

    annualizer = annualizer_dict.get(interval, 365)

    # Report metadata
    additional_metric = additional_metrics(c.alpha_id, c.symbol, factor, factor2,
                                          operation, c.candle_delay, backtest_style)

    try:
        backtest_result, df, log_msgs = backtest_cached(
            candle_df, factor_df, window, threshold, preprocess,
            entry, annualizer, model, factor, interval, date_range
        )

        for msg in log_msgs:
            print(msg)

        if df is None or df.empty:
            print(f"Warning: No results returned from backtest_cached for {backtest_style}")
            return None

        # First/last timestamp of the result, from the index or the 'start_time' column
        if isinstance(df.index, pd.DatetimeIndex):
            first_ts, last_ts = df.index.min(), df.index.max()
        elif 'start_time' in df.columns:
            first_ts = pd.to_datetime(df['start_time'].min(), unit='ms')
            last_ts = pd.to_datetime(df['start_time'].max(), unit='ms')
        else:
            print(f"Error: backtest result for {backtest_style} has neither a DatetimeIndex nor a 'start_time' column")
            return None

        start_date = first_ts.strftime('%Y-%m-%d')
        end_date = last_ts.strftime('%Y-%m-%d')
        start_time = first_ts.strftime('%Y-%m-%d %H:%M:%S')
        end_time = last_ts.strftime('%Y-%m-%d %H:%M:%S')

        additional_metric.update({"start_time": start_time, "end_time": end_time})
        combined_report = [{**additional_metric, **backtest_result}]

        print(f"{backtest_style} Report:")
        print(json.dumps(combined_report, indent=4))

        # Left axis: close price
        fig, ax1 = plt.subplots(figsize=(15, 8))
        ax1.plot(df.index, df['close'], label='Close Price', color='green', linewidth=2)
        ax1.set_xlabel("Date", fontsize=12)
        ax1.set_ylabel("Close Price", fontsize=12, color='green')
        ax1.tick_params(axis='y', labelcolor='green')
        ax1.grid(True, alpha=0.3)

        # Right axis: cumulative PnL
        ax2 = ax1.twinx()
        ax2.plot(df.index, df['cumu_pnl'], label='Cumulative PnL', color='blue', linewidth=2)
        ax2.set_ylabel("Cumulative PnL", fontsize=12, color='blue')
        ax2.tick_params(axis='y', labelcolor='blue')

        plt.title(f"Close Price and Cumulative PnL Plot (Split {backtest_style})-({start_date} ~ {end_date})", fontsize=16)
        fig.tight_layout()

        if c.save_plot:
            plt.savefig(f"{backtest_style}_Equity_Curve_{start_date}_{end_date}.png", dpi=300, bbox_inches='tight')
            print(f"已儲存 {backtest_style}_Equity_Curve_{start_date}_{end_date}.png")
            output_backtest_data = {f"{backtest_style}": combined_report}
            with open(f"{c.alpha_id}_{backtest_style}.json", "w") as json_file:
                json.dump(output_backtest_data, json_file, indent=4)
            df.to_csv(f"{c.alpha_id}_{backtest_style}_df.csv", index=True)
            print(f"已儲存 {c.alpha_id}_{backtest_style}_df.csv")
        plt.show()
        # Release the figure; this function is called repeatedly in a loop
        plt.close(fig)
        return combined_report

    except Exception as e:
        # Best effort: report the failure and let the caller move on
        print(f"Error in backtest_for_pnl for {backtest_style}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def run_backtest(data, model, entry, window, threshold):
    """
    Run the forward test and, if it is good enough, the train and full-period backtests.

    Parameters:
        data: dict returned by load_and_prepare_data (keys 'train', 'test',
              'full', 'factor_name').
        model, entry, window, threshold: strategy parameters of this run.

    The forward test (test split) runs first. The train ("backtest") and
    full-period ("full_time_backtest") runs follow only when the forward-test
    SR is at least the required value: 1.0 for entries containing "short",
    1.7 otherwise. A missing or NaN SR counts as not meeting the requirement.
    """
    factor_name = data['factor_name']

    # Required forward-test Sharpe ratio
    required_sr = 1.0 if "short" in entry.lower() else 1.7

    print(f"\n执行测试: model = {model}, entry = {entry}, window = {window}, threshold = {threshold}")

    def _run(split, backtest_style):
        """Run backtest_for_pnl on one data split with the shared parameters."""
        return backtest_for_pnl(
            data[split]['candle'],
            data[split]['factor'],
            factor_name,
            c.factor2,
            c.interval,
            c.operation,
            c.preprocess,
            model,
            entry,
            window,
            threshold,
            backtest_style,
            data[split]['date_range']
        )

    fwd_report = _run('test', "forwardtest")

    if fwd_report is None:
        print("Warning: forward test failed, skipping this parameter set.")
        return

    fwd_sr = fwd_report[0].get("SR", 0)

    # `not (sr >= required)` instead of `sr < required` so that a NaN SR,
    # for which every comparison is False, is rejected as well.
    if fwd_sr is None or not (fwd_sr >= required_sr):
        print(f"Forward test SR = {fwd_sr} 未达到要求 (需 >= {required_sr})，跳过此组参数的后续测试。")
        return

    # Requirement met: run the remaining backtests
    _run('train', "backtest")
    _run('full', "full_time_backtest")

if __name__ == "__main__":
    # Load and prepare the data once; it is shared by every parameter set
    data = load_and_prepare_data()
    if data is None:
        print("Error preparing data. Aborting backtest.")
        sys.exit(1)

    # Try to read the parameter sets from manual_selected.txt
    params_list = parse_manual_selection("manual_selected.txt")

    if params_list:
        print(f"Found {len(params_list)} parameter sets in manual_selected.txt")
        # Run the backtest flow once per parameter set
        for model, entry, window, threshold in params_list:
            run_backtest(data, model, entry, window, threshold)
    else:
        # The parser returned nothing: fall back to the parameters in config
        print("Using parameters from config.py")
        run_backtest(data, c.model, c.entry, c.window, c.threshold)