# 05c - 应用 OCEAN 权重公式到所有客户

**目标**: 使用从 Ridge 回归学到的权重公式，为所有客户生成 OCEAN 人格分数

## 工作流程:
1. 加载保存的权重公式和预处理器
2. 加载完整的建模数据
3. 应用相同的特征编码
4. 使用学到的公式计算每个客户的 OCEAN 分数
5. 保存完整的 OCEAN 特征到 CSV 和 JSON
6. 生成统计和可视化

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import warnings
# Ignore every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator with a fixed value.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# pandas display settings: no column limit, 4 digits of float precision.
DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.precision': 4,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

print("库加载成功！")

## Step 1: 加载训练好的模型和权重

In [None]:
# Load the pickled model package (models, preprocessor and feature lists).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load a package that was produced inside this project.
print("加载已训练的 Ridge 回归模型...")

try:
    with open('ocean_ridge_models.pkl', 'rb') as f:
        model_package = pickle.load(f)

    # Unpack everything the later cells read. A missing key raises KeyError,
    # which is reported by the generic handler below.
    ocean_models = model_package['models']
    preprocessor = model_package['preprocessor']
    feature_names = model_package['feature_names']
    numeric_features = model_package['numeric_features']
    categorical_features = model_package['categorical_features']
    alpha = model_package['alpha']

except FileNotFoundError:
    print("❌ 错误: 未找到 ocean_ridge_models.pkl")
    print("请先运行 05b_train_ocean_ridge_weights.ipynb")
    # Re-raise: every later cell needs the variables unpacked above, so
    # carrying on would only fail later with a NameError (or silently reuse
    # stale variables left over from an earlier run of the notebook).
    raise
except Exception as e:
    print(f"❌ 加载失败: {e}")
    raise
else:
    print("✅ 模型加载成功！")
    print(f"\n模型信息:")
    print(f"- OCEAN 维度数: {len(ocean_models)}")
    print(f"- 总特征数: {len(feature_names)}")
    print(f"- 数值特征: {len(numeric_features)}")
    print(f"- 分类特征: {len(categorical_features)}")
    print(f"- 正则化参数 alpha: {alpha}")

## Step 2: 加载完整的建模数据

In [None]:
# Load the full modeling dataset and build the feature matrix X that is fed
# to the fitted preprocessor.
print("加载完整的建模数据...")
df = pd.read_csv('data/loan_clean_for_modeling.csv', low_memory=False)

print(f"数据形状: {df.shape}")
print(f"\n列名: {list(df.columns)}")

# Keep only the columns the preprocessor was fitted on, numeric first.
# list() guards against the feature lists being stored as a non-list sequence.
print("\n准备特征矩阵...")
X = df[list(numeric_features) + list(categorical_features)].copy()

# Impute missing values. The result is assigned back to the column instead of
# calling fillna(inplace=True) on X[col]: that chained in-place form is
# deprecated and leaves X unchanged when pandas Copy-on-Write is active.
# NOTE(review): medians are computed on this dataset, not taken from the
# training sample — confirm this matches how 05b imputed its inputs.
print("处理缺失值...")
for col in numeric_features:
    if X[col].isnull().any():
        X[col] = X[col].fillna(X[col].median())

for col in categorical_features:
    if X[col].isnull().any():
        X[col] = X[col].fillna('unknown')

print(f"\n特征矩阵形状: {X.shape}")

## Step 3: 应用特征编码

In [None]:
# Encode the feature matrix with the preprocessor loaded from the model
# package (transform only, no re-fitting).
print("编码特征...")
X_processed = preprocessor.transform(X)

print(f"编码后的特征形状: {X_processed.shape}")
print(f"预期特征数: {len(feature_names)}")

# Compare the encoded column count with the stored feature-name list and
# report the outcome; a mismatch is only reported, not raised.
dims_match = X_processed.shape[1] == len(feature_names)
if not dims_match:
    print(f"⚠️ 警告: 特征维度不匹配")
    print(f"   预期: {len(feature_names)}, 实际: {X_processed.shape[1]}")
else:
    print("✅ 特征维度匹配")

## Step 4: 应用 Ridge 回归模型生成 OCEAN 分数

In [None]:
# Predict the five OCEAN scores for every row of X_processed.
print("应用 Ridge 回归模型生成 OCEAN 分数...\n")

ocean_cols = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

# Fail early with a clear message if the model package lacks a trait.
# Otherwise the missing trait would keep an empty placeholder and
# pd.DataFrame() would fail below with an unrelated length error.
missing_traits = [col for col in ocean_cols if col not in ocean_models]
if missing_traits:
    raise ValueError(f"ocean_ridge_models.pkl 缺少以下 OCEAN 维度的模型: {missing_traits}")

# Pre-seed the keys so the resulting columns follow ocean_cols order.
ocean_features_dict = dict.fromkeys(ocean_cols)

for ocean_trait, model in ocean_models.items():
    print(f"生成 {ocean_trait} 分数...")

    # Raw model output for this trait.
    scores = model.predict(X_processed)

    # Clamp predictions into the [0, 1] score range.
    scores = np.clip(scores, 0, 1)

    ocean_features_dict[ocean_trait] = scores

    print(f"  统计: 均值={scores.mean():.4f}, 最小={scores.min():.4f}, 最大={scores.max():.4f}")

# One column per trait, one row per customer.
ocean_df = pd.DataFrame(ocean_features_dict)

print(f"\n✅ OCEAN 分数生成完成！")
print(f"\n最终 OCEAN 特征统计:")
print(ocean_df.describe())

## Step 5: 创建最终的 OCEAN 特征数据集

In [None]:
# Build the final dataset: the original columns plus the five OCEAN scores.
print("创建最终的 OCEAN 特征数据集...\n")

# Stop on a row-count mismatch. Merely printing would leave df_with_ocean
# undefined (or stale from an earlier run) for the cells that save it.
if len(ocean_df) != len(df):
    mismatch_msg = f"长度不匹配: OCEAN={len(ocean_df)}, 原始数据={len(df)}"
    print(f"❌ {mismatch_msg}")
    raise ValueError(mismatch_msg)

print("✅ 长度验证通过")

# Copy so the raw frame stays untouched, then attach each score column.
# to_numpy() assigns by position, so the result does not depend on the two
# frames sharing the same index labels.
df_with_ocean = df.copy()
for col in ocean_cols:
    df_with_ocean[col] = ocean_df[col].to_numpy()

print(f"\n最终数据集形状: {df_with_ocean.shape}")
print(f"列名: {list(df_with_ocean.columns)}")

## Step 6: 保存 OCEAN 特征

In [None]:
# Write three outputs: OCEAN-only CSV, full dataset CSV, OCEAN-only JSON.
import os

BYTES_PER_KB = 1024
BYTES_PER_MB = 1024 * 1024

# Output 1: OCEAN scores only (CSV, no index column).
print("保存 OCEAN 特征...")
ocean_only_df = ocean_df.copy()
ocean_only_file = 'ocean_features.csv'
ocean_only_df.to_csv(ocean_only_file, index=False)

file_size = os.path.getsize(ocean_only_file) / BYTES_PER_KB  # reported in KB
print(f"✅ {ocean_only_file} ({file_size:.2f} KB)")
print(f"   形状: {ocean_only_df.shape}")

# Output 2: all original columns plus the OCEAN scores (CSV, no index column).
print("\n保存完整数据集...")
full_file = 'loan_clean_with_ocean.csv'
df_with_ocean.to_csv(full_file, index=False)

file_size = os.path.getsize(full_file) / BYTES_PER_MB  # reported in MB
print(f"✅ {full_file} ({file_size:.2f} MB)")
print(f"   形状: {df_with_ocean.shape}")

# Output 3: OCEAN scores as a JSON array with one object per row.
json_file = 'ocean_features.json'
ocean_only_df.to_json(json_file, orient='records')
print(f"✅ {json_file}")

## Step 7: OCEAN 特征按目标变量分析

In [None]:
# Compare OCEAN score statistics between the two target groups.
section_rule = "=" * 80
print(section_rule)
print("OCEAN 特征按目标变量分析")
print(section_rule)

# Attach the target column by position (.values drops df's index).
ocean_with_target = ocean_df.assign(target=df['target'].values)

# Slice each group once and reuse it for both describe() and mean().
fully_paid_scores = ocean_with_target.loc[ocean_with_target['target'] == 0, ocean_cols]
charged_off_scores = ocean_with_target.loc[ocean_with_target['target'] == 1, ocean_cols]

print("\nFully Paid (target=0) 的 OCEAN 分数:")
print(fully_paid_scores.describe())

print("\nCharged Off (target=1) 的 OCEAN 分数:")
print(charged_off_scores.describe())

# Per-trait difference of group means, Charged Off minus Fully Paid.
print("\n\n两组平均值的差异 (Charged Off - Fully Paid):")
print("-" * 80)
mean_fully_paid = fully_paid_scores.mean()
mean_charged_off = charged_off_scores.mean()
diff = mean_charged_off - mean_fully_paid

for col in ocean_cols:
    print(f"{col:20s}: {diff[col]:+.6f}")

## Step 8: 可视化 OCEAN 特征分布

In [None]:
# Draw a 2x3 figure: one histogram per OCEAN trait plus a grouped box plot.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# Panels 1-5: score histogram per trait, with the mean as a dashed red line.
for idx, col in enumerate(ocean_cols):
    ax = axes[idx]
    trait_mean = ocean_df[col].mean()

    ax.hist(ocean_df[col], bins=50, alpha=0.7, color='steelblue', edgecolor='black')
    ax.axvline(trait_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {trait_mean:.3f}')
    ax.set_xlabel('Score', fontsize=11, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
    ax.set_title(f'{col.upper()} Distribution', fontsize=12, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(alpha=0.3)

# Panel 6: box plot of every trait, split by loan outcome.
ax = axes[5]
ocean_melt = ocean_with_target.melt(id_vars=['target'], value_vars=ocean_cols)
ocean_melt['target_label'] = ocean_melt['target'].apply(lambda x: 'Fully Paid' if x == 0 else 'Charged Off')

# Pin both orders explicitly. Without hue_order the palette follows the order
# in which labels first appear, so green/red could end up on the wrong group.
# order keeps the boxes aligned with the tick labels set further down.
status_order = ['Fully Paid', 'Charged Off']
sns.boxplot(
    data=ocean_melt,
    x='variable',
    y='value',
    hue='target_label',
    order=ocean_cols,
    hue_order=status_order,
    ax=ax,
    palette=['green', 'red'],
)
ax.set_xlabel('OCEAN Trait', fontsize=11, fontweight='bold')
ax.set_ylabel('Score', fontsize=11, fontweight='bold')
ax.set_title('OCEAN Scores by Target Variable', fontsize=12, fontweight='bold')
# Abbreviate each trait to its first three letters.
ax.set_xticklabels([c[:3].upper() for c in ocean_cols])
ax.legend(title='Status', fontsize=10)
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('ocean_features_distribution.png', dpi=300, bbox_inches='tight')
print("✅ 可视化已保存: ocean_features_distribution.png")
plt.show()

## Step 9: 生成总结报告

In [None]:
# Print the final summary report in five sections.
heavy_rule = "=" * 80
light_rule = "-" * 80


def _print_heading(title):
    """Print a section title followed by a dashed rule."""
    print(title)
    print(light_rule)


def _print_lines(lines):
    """Print each entry of *lines* on its own line."""
    for line in lines:
        print(line)


print("\n" + heavy_rule)
print("OCEAN 特征应用总结报告")
print(heavy_rule)

# Section 1: dataset size.
_print_heading("\n1️⃣ 数据规模")
_print_lines((
    f"总样本数: {len(df_with_ocean):,}",
    f"OCEAN 维度: {len(ocean_cols)}",
    f"总特征数: {df_with_ocean.shape[1]}",
    f"  - 原始特征: {df.shape[1]}",
    f"  - OCEAN 特征: {len(ocean_cols)}",
))

# Section 2: mean, standard deviation and range of each trait.
_print_heading("\n2️⃣ OCEAN 分数统计")
for col in ocean_cols:
    mean_val = ocean_df[col].mean()
    std_val = ocean_df[col].std()
    print(f"{col:20s}: 均值={mean_val:.4f}, 标准差={std_val:.4f}, 范围=[{ocean_df[col].min():.4f}, {ocean_df[col].max():.4f}]")

# Section 3: the files listed as generated by this notebook.
_print_heading("\n3️⃣ 生成的文件")
_print_lines((
    f"1. ocean_features.csv ({len(ocean_df):,} 行 × {len(ocean_cols)} 列)",
    "   - 格式: CSV",
    "   - 内容: 所有客户的 OCEAN 分数",
    "",
    f"2. loan_clean_with_ocean.csv ({df_with_ocean.shape[0]:,} 行 × {df_with_ocean.shape[1]} 列)",
    "   - 格式: CSV",
    "   - 内容: 原始特征 + OCEAN 分数",
    "",
    "3. ocean_features.json",
    "   - 格式: JSON",
    "   - 内容: OCEAN 分数（对象数组格式）",
    "",
    "4. ocean_features_distribution.png",
    "   - 6 个子图: 5 个 OCEAN 维度分布 + 按目标变量分组的箱线图",
))

# Section 4: provenance of the weights.
# NOTE(review): the training-sample line is a hard-coded literal, not derived
# from the loaded model package — confirm it still matches the training run.
_print_heading("\n4️⃣ 权重公式应用信息")
_print_lines((
    f"权重来源: Ridge 回归 (alpha={alpha})",
    "训练样本: 500 个 (250 Fully Paid + 250 Charged Off)",
    f"特征集: {len(numeric_features)} 个数值特征 + {len(categorical_features)} 个分类特征",
    "公式形式: score = intercept + Σ(weight × feature)",
))

# Section 5: pointers to the follow-up notebooks.
_print_heading("\n5️⃣ 下一步")
_print_lines((
    "✅ OCEAN 特征已生成完毕",
    "\n现在可以进行 XGBoost 建模:",
    "",
    "1. 04_xgboost_baseline.ipynb (已存在)",
    "   - 训练无 OCEAN 的基线模型",
    "",
    "2. 06_xgboost_with_ocean.ipynb (已存在)",
    "   - 训练包含 OCEAN 特征的完整模型",
    "   - 修改加载数据为: loan_clean_with_ocean.csv",
    "",
    "3. 07_results_analysis.ipynb (已存在)",
    "   - 对比两个模型的性能",
    "   - 评估 OCEAN 特征的价值",
))

print("\n" + heavy_rule)
print("✅ OCEAN 特征应用完成！")
print(heavy_rule)