In [4]:
import pandas as pd
import numpy as np
import gc

print("正在加载数据...")
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# 合并 train 和 test 以便统一处理特征# 我们用一个 'source' 列来区分它们，以便稍后拆分
df_train['source'] = 'train'
df_test['source'] = 'test'# 合并# .concat 会保留 train 和 test 的原始列，对于 test 中没有的 'class_label'，# 它会自动填充为 NaN，这很完美。
df_full = pd.concat([df_train, df_test], ignore_index=True)

# 转换基础类型
df_full['t'] = pd.to_datetime(df_full['t'])
df_full['ticker_id'] = df_full['ticker_id'].astype('category')

# 确保数据按 ticker 和 时间 排序！这是 Lag/Rolling 的前提！
df_full = df_full.sort_values(by=['ticker_id', 't']).reset_index(drop=True)

print(f"完整数据集形状: {df_full.shape}")
print(f"Train 样本: {(df_full['source'] == 'train').sum()}")
print(f"Test 样本: {(df_full['source'] == 'test').sum()}")

del df_train, df_test
gc.collect()

正在加载数据...


  df_train = pd.read_csv('train.csv')


完整数据集形状: (2760, 68509)
Train 样本: 1932
Test 样本: 828


283

In [12]:
import pandas as pd
import numpy as np

def create_features(df):
    """
    彻底修复衍生特征分组引用错误 + 完整流程
    """
    df_feat = df.copy()
    # 确保't'列是datetime类型
    df_feat['t'] = pd.to_datetime(df_feat['t'], errors='coerce')

    # ========================================= 
    # 1. 先创建所有特征（时间+衍生+交互），不涉及填充
    # =========================================
    # 1.1 时间特征
    time_feats = {
        'fe_day_of_week': df_feat['t'].dt.dayofweek,
        'fe_month': df_feat['t'].dt.month,
        'fe_day_of_year': df_feat['t'].dt.dayofyear,
        'fe_is_weekend': df_feat['t'].dt.dayofweek.apply(lambda x: 1 if x >=5 else 0),
        'fe_quarter': df_feat['t'].dt.quarter,
        'fe_days_since_start': (df_feat['t'] - df_feat.groupby('ticker_id', observed=True)['t'].transform('min')).dt.days
    }
    for name, val in time_feats.items():
        df_feat[name] = val

    # 1.2 原始核心特征（仅原始列，不含衍生）
    raw_core_feats = [
        'ratio', 'momentum', 'sm_ratio', 'sm_momentum'
    ] + [col for col in df_feat.columns if any(prefix in col for prefix in ['troughs_', 'peaks_', 'cross_threshold', 'zone_']) and '_1' not in col][:16]
    # 过滤原始数据中存在的列
    raw_core_feats = [col for col in raw_core_feats if col in df_feat.columns]
    if not raw_core_feats:
        raw_core_feats = [col for col in df_feat.columns if col not in ['id', 'ticker_id', 't', 'class_label']][:10]
    print(f"基于 {len(raw_core_feats)} 个原始特征创建衍生特征: {raw_core_feats[:5]}...")

    # 1.3 创建衍生特征（Lag/Rolling/差分）
    derived_dfs = []  # 存储衍生特征DataFrame
    grouped_raw = df_feat.groupby('ticker_id', observed=True)  # 基于原始数据分组

    for col in raw_core_feats:
        temp = pd.DataFrame(index=df_feat.index)
        # Lag特征
        temp[f'fe_lag_1_{col}'] = grouped_raw[col].shift(1)
        temp[f'fe_lag_3_{col}'] = grouped_raw[col].shift(3)
        temp[f'fe_lag_7_{col}'] = grouped_raw[col].shift(7)
        temp[f'fe_lag_14_{col}'] = grouped_raw[col].shift(14)
        # Rolling特征
        roll7 = grouped_raw[col].rolling(window=7, min_periods=1)
        roll14 = grouped_raw[col].rolling(window=14, min_periods=1)
        temp[f'fe_roll_mean_7_{col}'] = roll7.mean().reset_index(level=0, drop=True)
        temp[f'fe_roll_std_7_{col}'] = roll7.std().reset_index(level=0, drop=True)
        temp[f'fe_roll_median_14_{col}'] = roll14.median().reset_index(level=0, drop=True)
        temp[f'fe_roll_max_14_{col}'] = roll14.max().reset_index(level=0, drop=True)
        # 差分特征
        temp[f'fe_diff_1_{col}'] = grouped_raw[col].diff(1)
        temp[f'fe_diff_3_{col}'] = grouped_raw[col].diff(3)
        # 趋势特征
        temp[f'fe_ratio_vs_mean_7_{col}'] = df_feat[col] / (temp[f'fe_roll_mean_7_{col}'] + 1e-8)
        derived_dfs.append(temp)

    # 拼接衍生特征到主数据框
    if derived_dfs:
        derived_df = pd.concat(derived_dfs, axis=1)
        df_feat = pd.concat([df_feat, derived_df], axis=1)
        print(f"创建 {len(derived_df.columns)} 个衍生特征")

    # 1.4 创建交互特征
    interaction_feats = {}
    if 'ratio' in df_feat.columns and 'sm_ratio' in df_feat.columns:
        interaction_feats['fe_interaction_ratio_vs_sm_ratio'] = df_feat['ratio'] - df_feat['sm_ratio']
    if 'momentum' in df_feat.columns and 'sm_momentum' in df_feat.columns:
        interaction_feats['fe_interaction_momentum_vs_sm_momentum'] = df_feat['momentum'] - df_feat['sm_momentum']
    for name, val in interaction_feats.items():
        df_feat[name] = val
    print(f"创建 {len(interaction_feats)} 个交互特征")

    # ========================================= 
    # 2. 缺失值填充（关键：重新分组，确保包含所有新列）
    # =========================================
    # 2.1 重新生成包含所有特征的分组对象（这是修复核心！）
    grouped_all = df_feat.groupby('ticker_id', observed=True)

    # 2.2 筛选需要填充的列（仅数值列，排除元数据）
    fill_cols = []
    for col in df_feat.columns:
        if col in ['id', 'ticker_id', 't', 'class_label']:
            continue
        if pd.api.types.is_numeric_dtype(df_feat[col]) and not isinstance(df_feat[col].dtype, pd.CategoricalDtype):
            fill_cols.append(col)
    print(f"对 {len(fill_cols)} 个数值列填充缺失值")

    # 2.3 填充缺失值（用重新分组的grouped_all，确保能找到所有列）
    for col in fill_cols:
        if df_feat[col].isnull().any():
            # 直接用分组对象处理当前列，无需提前引用
            df_feat[col] = grouped_all[col].transform(
                lambda x: x.fillna(x.median(), downcast=None)
            )

    # 2.4 剩余缺失值用NaN填充（后续建模统一处理）
    df_feat[fill_cols] = df_feat[fill_cols].fillna(np.nan)

    # 消除碎片化
    df_feat = df_feat.copy()

    return df_feat

In [13]:
# ------------------------------ 运行验证 ------------------------------
print("开始执行特征工程...")
# 先验证衍生特征是否存在（调试用）
def check_derived_cols(df):
    derived_cols = [col for col in df.columns if col.startswith('fe_lag_1_')]
    print(f"衍生特征列（fe_lag_1_*）: {derived_cols[:3] if derived_cols else '无'}")
    return derived_cols

# 执行特征工程
df_full_features = create_features(df_full)
# 验证衍生特征是否存在
derived_cols = check_derived_cols(df_full_features)
if not derived_cols:
    print("警告：未生成衍生特征，请检查原始特征是否正确！")
else:
    print("特征工程执行完毕！")
    new_feats = [col for col in df_full_features.columns if col.startswith('fe_')]
    print(f"新特征总数: {len(new_feats)}, 示例: {new_feats[:5]}")
    print(f"数据集最终形状: {df_full_features.shape}")

开始执行特征工程...
基于 20 个原始特征创建衍生特征: ['ratio', 'momentum', 'sm_ratio', 'sm_momentum', 'cross_threshold_from_above_97.0']...
创建 220 个衍生特征
创建 2 个交互特征
对 68636 个数值列填充缺失值


  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcast=None)
  lambda x: x.fillna(x.median(), downcas

衍生特征列（fe_lag_1_*）: ['fe_lag_1_ratio', 'fe_lag_1_momentum', 'fe_lag_1_sm_ratio']
特征工程执行完毕！
新特征总数: 228, 示例: ['fe_day_of_week', 'fe_month', 'fe_day_of_year', 'fe_is_weekend', 'fe_quarter']
数据集最终形状: (2760, 68737)


In [14]:
# 1. 分离回 train 和 test
df_train_feat = df_full_features[df_full_features['source'] == 'train'].drop('source', axis=1)
df_test_feat = df_full_features[df_full_features['source'] == 'test'].drop('source', axis=1)

# 2. 检查形状是否正确# train 应该有 1932 行, test 应该有 828 行
print(f"特征化 Train 形状: {df_train_feat.shape}")
print(f"特征化 Test 形状: {df_test_feat.shape}")

print("正在保存到 CSV 文件...")
# index=False 不保存行索引，数据更干净
df_train_feat.to_csv('train_with_features.csv', index=False)
df_test_feat.to_csv('test_with_features.csv', index=False)

print("特征化数据已保存！")
print(f"Train: train_with_features.csv (形状: {df_train_feat.shape})")
print(f"Test: test_with_features.csv (形状: {df_test_feat.shape})")

特征化 Train 形状: (1932, 68736)
特征化 Test 形状: (828, 68736)
正在保存到 CSV 文件...
特征化数据已保存！
Train: train_with_features.csv (形状: (1932, 68736))
Test: test_with_features.csv (形状: (828, 68736))
