In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score, classification_report
import gc # 垃圾回收
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: use seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set(style="whitegrid")

# Table display: remove the cap on how many columns pandas prints for a frame.
pd.options.display.max_columns = None

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc

# ----------------------------------------------------
# 1-2. Load the *full* training and test feature tables
# ----------------------------------------------------
def _load_feature_csv(path, split_name):
    """Read one feature CSV, printing progress and re-raising if it is missing.

    Args:
        path: Path of the CSV file to read.
        split_name: Label inserted into the success message (e.g. '训练', '测试').

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: Re-raised after the "not found" message is printed.
    """
    print(f"加载 {path}...")
    try:
        # low_memory=False makes pandas infer each column's dtype from the whole
        # file instead of chunk by chunk.
        frame = pd.read_csv(path, low_memory=False)
    except FileNotFoundError:
        print(f"错误：{path} 未找到。")
        raise
    print(f"{split_name}数据加载成功！形状: {frame.shape}")
    return frame


def _restore_dtypes(frame):
    """Re-apply, in place, the column dtypes this notebook relies on.

    't' is parsed as datetime and 'ticker_id' is converted to a pandas
    categorical.
    """
    frame['t'] = pd.to_datetime(frame['t'])
    frame['ticker_id'] = frame['ticker_id'].astype('category')


df_train = _load_feature_csv('train_with_features.csv', '训练')
df_test = _load_feature_csv('test_with_features.csv', '测试')

# ----------------------------------------------------
# 3. Restore dtypes (same treatment for train and test)
# ----------------------------------------------------
print("正在恢复数据类型...")
_restore_dtypes(df_train)
_restore_dtypes(df_test)
print("数据类型恢复完毕。")

加载 train_with_features.csv...
训练数据加载成功！形状: (1932, 68736)
加载 test_with_features.csv...
测试数据加载成功！形状: (828, 68736)
正在恢复数据类型...
数据类型恢复完毕。


In [3]:
# Five-class target: the four named labels take ids 0-3 in this fixed order.
target_map = {
    label: code
    for code, label in enumerate(('HH', 'HL', 'LH', 'LL'))
}

# Any label absent from target_map (including NaN, i.e. "no label") maps to NaN
# first and is then assigned the fifth class id, 4.
df_train['class_label_encoded'] = (
    df_train['class_label']
    .map(target_map)
    .fillna(4)
    .astype(int)
)

print("目标变量编码完成。")

目标变量编码完成。


In [4]:
# 1. Target column.
TARGET = 'class_label_encoded'

# 2. Metadata columns that are excluded from the feature set.
METADATA_COLS = ['class_label', 'class_label_encoded', 't', 'id', 'train_id']

# 3. Every remaining training column is used as a feature.
features_to_drop = [col for col in METADATA_COLS if col in df_train.columns]
FEATURES = df_train.columns.drop(features_to_drop).tolist()

print(f"最终模型将使用全部 {len(FEATURES)} 个特征。")

# Fail early, with a readable message, if the test table lacks any training
# feature; otherwise train and test matrices could not share a column layout.
missing_in_test = [col for col in FEATURES if col not in df_test.columns]
if missing_in_test:
    raise KeyError(
        f"df_test is missing {len(missing_in_test)} feature column(s) "
        f"present in df_train, e.g. {missing_in_test[:5]}"
    )

# 4. Training target.
y_train = df_train[TARGET]

# 5. Coerce every feature except 'ticker_id' to a numeric dtype.
print("正在修复 'object' 数据类型...")
numeric_features = [col for col in FEATURES if col != 'ticker_id']
# Features left untouched by the numeric coercion ('ticker_id', when present).
passthrough_features = [col for col in FEATURES if col == 'ticker_id']


def _coerce_numeric(frame):
    """Return ``frame[numeric_features]`` with every column passed through pd.to_numeric.

    Unparseable values become NaN (errors='coerce') and float columns are
    downcast where possible (downcast='float'). Converting the whole slice in
    one pass replaces one DataFrame column assignment per feature and the two
    intermediate full-frame copies the per-column loop needed.
    """
    return frame[numeric_features].apply(
        pd.to_numeric, errors='coerce', downcast='float'
    )


X_train_numeric = _coerce_numeric(df_train)
X_test_numeric = _coerce_numeric(df_test)
print(f"已将 {len(numeric_features)} 个特征列强制转换为数值类型。")

# 6. Replace remaining NaN in the numeric features with the sentinel -999, then
#    re-attach the untouched columns and restore the FEATURES column order so
#    train and test share exactly the same layout.
print("正在用 -999 填充剩余的 NaN 值...")
X_train = pd.concat(
    [X_train_numeric.fillna(-999), df_train[passthrough_features]], axis=1
)[FEATURES]
X_test = pd.concat(
    [X_test_numeric.fillna(-999), df_test[passthrough_features]], axis=1
)[FEATURES]
print("NaN 填充完毕！")

最终模型将使用全部 68732 个特征。
正在修复 'object' 数据类型...
已将 68731 个特征列强制转换为数值类型。
正在用 -999 填充剩余的 NaN 值...
NaN 填充完毕！


In [5]:
# 1. Hyper-parameters of "Model v2", described by the author as the best model so far.
final_params = dict(
    objective='multiclass',
    num_class=5,
    metric='multi_logloss',
    # V2's 'best_iteration' could be used here instead; 500 is kept as a safe value.
    n_estimators=500,
    learning_rate=0.05,
    n_jobs=-1,
    seed=42,
    # Flagged by the author as the key to V2's success.
    class_weight='balanced',
)

# 2. Build the classifier from those parameters.
final_model = lgb.LGBMClassifier(**final_params)

# 3. Fit on the complete training set. No eval_set or early stopping is passed:
#    the model has already been chosen, so all labelled rows go into the fit.
print(f"开始训练最终模型 (在 {len(X_train)} 个全部训练样本上)...")
final_model.fit(X_train, y_train, categorical_feature=['ticker_id'])

print("最终模型训练完成！")

开始训练最终模型 (在 1932 个全部训练样本上)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.112374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30695
[LightGBM] [Info] Number of data points in the train set: 1932, number of used features: 8743
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
最终模型训练完成！


In [7]:
# 1. Predict an encoded class (0-4) for every test row. The row count in the
#    message is derived from X_test instead of being hard-coded.
print(f"正在对 test.csv ({len(X_test)} 行) 进行预测...")
test_preds_encoded = final_model.predict(X_test)

# The model predicts five classes:
#   0 = 'HH', 1 = 'HL', 2 = 'LH', 3 = 'LL', 4 = no label (NaN in training data).
# The submission uses three labels:
#   'HH' and 'LH' (0, 2) -> 'H';  'HL' and 'LL' (1, 3) -> 'L';  4 -> 'N'.
reverse_target_map = {
    0: 'H',  # 'HH'
    1: 'L',  # 'HL'
    2: 'H',  # 'LH'
    3: 'L',  # 'LL'
    4: 'N',  # no label
}

# Guard against silently writing NaN labels if the model ever emits a class id
# that the mapping above does not cover.
unmapped_codes = set(pd.unique(test_preds_encoded).tolist()) - set(reverse_target_map)
if unmapped_codes:
    raise ValueError(
        f"Predicted class ids without a submission label: {sorted(unmapped_codes)}"
    )

# 2. Convert the encoded predictions (0-4) to submission labels ('H', 'L', 'N').
test_preds_labels = pd.Series(test_preds_encoded).map(reverse_target_map)

# 3. Build the submission frame.
submission_ids = df_test['id']
# The ids were being written as floats (e.g. 313.0). When every id is a whole
# number, store it as an integer so the CSV contains "313" rather than "313.0".
# NOTE(review): assumes the competition expects integer ids - confirm against
# the sample submission.
if (
    pd.api.types.is_float_dtype(submission_ids)
    and submission_ids.notna().all()
    and (submission_ids % 1 == 0).all()
):
    submission_ids = submission_ids.astype('int64')

# Pair ids and predictions by row position. Passing the two Series directly
# would align them on index labels, which only matches when df_test still has
# its default 0..n-1 index.
submission_df = pd.DataFrame({
    'id': submission_ids.to_numpy(),
    'class_label': test_preds_labels.to_numpy(),
})

# 4. Sanity check: preview and label distribution.
print("\n提交文件 (submission.csv) 预览:")
print(submission_df.head())
print("\n标签分布情况:")
print(submission_df['class_label'].value_counts())

# 5. Save.
submission_df.to_csv('submission.csv', index=False)
print(f"\n成功保存 'submission.csv'！文件行数: {len(submission_df)}")
print("您现在可以提交这个文件了。")

正在对 test.csv (828 行) 进行预测...

提交文件 (submission.csv) 预览:
      id class_label
0  313.0           N
1  436.0           N
2  546.0           N
3  332.0           N
4  253.0           N

标签分布情况:
class_label
N    827
L      1
Name: count, dtype: int64

成功保存 'submission.csv'！文件行数: 828
您现在可以提交这个文件了。
