# 05b - 训练 Ridge 回归学习 OCEAN 权重

**目标**: 使用 500 个带 OCEAN 标签的样本，训练 Ridge 回归模型学习特征对各个 OCEAN 维度的影响

## 工作流程:
1. 加载 Ground Truth OCEAN 标签 (500 样本)
2. 加载干净的建模数据并提取对应的 500 个样本
3. 选择 PRE-LOAN 特征（确保无数据泄漏）
4. 编码特征（数值型标准化 + 分类型 One-hot）
5. 为每个 OCEAN 维度训练 Ridge 回归 (alpha=0.17)
6. 提取并保存权重系数
7. 评估模型性能（注意：指标在同一批训练样本上计算，属于训练集内拟合度，而非留出集上的泛化性能）

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

# Seed NumPy's global random generator with a fixed value.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# pandas display preferences: no cap on printed columns, 4-digit precision.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.precision', 4),
):
    pd.set_option(option_name, option_value)

print("库加载成功！")

## Step 1: 加载 Ground Truth OCEAN 标签

In [None]:
# Names of the five OCEAN score columns read from the ground-truth table.
ocean_cols = [
    'openness',
    'conscientiousness',
    'extraversion',
    'agreeableness',
    'neuroticism',
]

# Read the labelled ground-truth sample from disk.
print("加载 Ground Truth OCEAN 标签...")
df_ground_truth = pd.read_csv('ocean_ground_truth_500.csv', low_memory=False)

# Quick look at what was loaded: shape, columns, target balance, score stats.
print(f"数据形状: {df_ground_truth.shape}")
print(f"\n列名: {df_ground_truth.columns.tolist()}")
print("\n目标变量分布:")
print(df_ground_truth['target'].value_counts())
print("\nOCEAN 分数统计:")
print(df_ground_truth[ocean_cols].describe())

## Step 2: 加载干净的建模数据

In [None]:
# Load the cleaned modeling dataset.
print("加载干净的建模数据...")
df_modeling = pd.read_csv('data/loan_clean_for_modeling.csv', low_memory=False)

print(f"建模数据形状: {df_modeling.shape}")

# The ground-truth rows are aligned to the modeling rows by content, not by
# row position: the `desc` text of each row is hashed and rows whose hashes
# are equal are treated as the same sample.
print("\n对齐 Ground Truth 和建模数据...")

# An MD5 digest of `desc` is used as the match key.
from hashlib import md5

def get_desc_hash(desc):
    """Return the hex MD5 digest of ``str(desc)``, or ``None`` if it is missing.

    A value counts as missing when ``pd.isna(desc)`` is true. The digest is
    used only as a match key between tables.
    """
    return None if pd.isna(desc) else md5(str(desc).encode()).hexdigest()

# Attach the hash match key to both tables.
df_ground_truth['desc_hash'] = df_ground_truth['desc'].apply(get_desc_hash)
df_modeling['desc_hash'] = df_modeling['desc'].apply(get_desc_hash)

# Map every hash to the FIRST modeling row that carries it. Rows whose hash is
# missing are left out, so a missing `desc` never matches anything. Building
# the lookup once avoids rescanning the whole modeling table per sample.
hash_to_index = {}
for row_label, hash_val in zip(df_modeling.index, df_modeling['desc_hash']):
    if pd.notna(hash_val) and hash_val not in hash_to_index:
        hash_to_index[hash_val] = row_label

# Walk the ground-truth rows in order and remember BOTH sides of each match:
# the modeling row label and the position of the ground-truth row it belongs to.
matching_indices = []
matched_gt_positions = []
for position, hash_val in enumerate(df_ground_truth['desc_hash']):
    if pd.isna(hash_val) or hash_val not in hash_to_index:
        continue
    matching_indices.append(hash_to_index[hash_val])
    matched_gt_positions.append(position)

print(f"\n成功匹配的样本数: {len(matching_indices)} / {len(df_ground_truth)}")

# Features are taken in ground-truth order so row i of both tables correspond.
df_features = df_modeling.loc[matching_indices].reset_index(drop=True)

if len(matching_indices) == len(df_ground_truth):
    print("✅ 所有样本成功对齐")
else:
    print(f"⚠️ 警告: 只有 {len(matching_indices)} 个样本成功对齐")
    # Keep exactly the ground-truth rows that matched. Taking the first N rows
    # instead would pair labels with the wrong feature rows whenever an
    # unmatched sample is not at the very end of the table.
    df_ground_truth = df_ground_truth.iloc[matched_gt_positions].reset_index(drop=True)

print(f"\n最终数据形状: {df_features.shape}")

## Step 3: 定义 PRE-LOAN 特征

In [None]:
# Candidate PRE-LOAN predictors: columns intended to be known at application
# time, so that no later information leaks into the model.
# NOTE(review): funded_amnt, int_rate, installment, grade and sub_grade look
# lender-assigned rather than applicant-supplied — confirm they belong here.
numeric_features = [
    'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'annual_inc',
    'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high',
    'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
    'total_acc',
]

categorical_features = [
    'term', 'grade', 'sub_grade', 'purpose', 'home_ownership',
    'emp_length', 'verification_status', 'application_type',
]

print("检查 PRE-LOAN 特征...\n")

# Keep only the candidates that exist in the aligned feature table; columns
# that are absent are dropped without raising.
present_columns = set(df_features.columns)
available_numeric = [name for name in numeric_features if name in present_columns]
available_categorical = [name for name in categorical_features if name in present_columns]

print(f"可用的数值型特征: {len(available_numeric)} 个")
print(available_numeric)
print(f"\n可用的分类特征: {len(available_categorical)} 个")
print(available_categorical)

# From here on the feature lists refer to the available columns only.
numeric_features, categorical_features = available_numeric, available_categorical
all_features = numeric_features + categorical_features

print(f"\n总特征数: {len(all_features)}")

## Step 4: 特征编码

In [None]:
# Build the raw feature matrix from the selected columns (a copy, so the
# imputation below does not touch df_features).
print("准备特征矩阵...\n")

X = df_features[all_features].copy()

# Impute missing values: column median for numeric features, the literal
# category 'unknown' for categorical ones.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `X[col]`: that form is chained assignment, which
# under pandas Copy-on-Write updates a temporary and leaves X unchanged.
print("处理缺失值...")
for col in numeric_features:
    if X[col].isnull().any():
        X[col] = X[col].fillna(X[col].median())

for col in categorical_features:
    if X[col].isnull().any():
        X[col] = X[col].fillna('unknown')

print("缺失值处理完成")

# Preprocessor: standardize numeric columns, one-hot encode categorical ones.
print("\n创建预处理器...")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Fit the preprocessor on X and produce the dense encoded matrix.
print("编码特征...")
X_processed = preprocessor.fit_transform(X)

print(f"\n编码后的特征形状: {X_processed.shape}")

# Recover readable column names for the encoded matrix: numeric names first,
# then the one-hot names, matching the transformer order above.
try:
    cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
    feature_names = list(numeric_features) + list(cat_feature_names)
    print(f"总特征数（编码后）: {len(feature_names)}")
except Exception as exc:
    # Best-effort fallback to positional names. Catching Exception rather than
    # using a bare `except` lets KeyboardInterrupt/SystemExit propagate, and
    # the reason is reported instead of being silently discarded.
    print(f"⚠️ 无法获取编码后的特征名称 ({exc!r})，改用通用名称")
    feature_names = [f'feature_{i}' for i in range(X_processed.shape[1])]
    print(f"总特征数（编码后）: {X_processed.shape[1]}")

## Step 5: 训练 Ridge 回归模型

In [None]:
# Fit one Ridge regressor per OCEAN column and record its fit metrics.
print("为每个 OCEAN 维度训练 Ridge 回归模型...\n")

alpha = 0.17  # regularization strength shared by every model
ocean_models = {}
ocean_scores = {}

for ocean_trait in ocean_cols:
    print(f"\n训练 {ocean_trait.upper()} 模型...")
    print("-" * 60)

    # Target vector for this trait; the feature matrix is the same for all.
    y = df_ground_truth[ocean_trait].values
    model = Ridge(alpha=alpha, random_state=RANDOM_STATE).fit(X_processed, y)
    ocean_models[ocean_trait] = model

    # NOTE: predictions are made on the very rows used for fitting, so the
    # metrics below are in-sample figures.
    y_pred = model.predict(X_processed)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    ocean_scores[ocean_trait] = dict(mse=mse, rmse=rmse, mae=mae, r2=r2)

    print(f"MSE:  {mse:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"MAE:  {mae:.6f}")
    print(f"R²:   {r2:.6f}")

    # Rank features by coefficient magnitude and show the ten largest.
    coef_df = (
        pd.DataFrame({'feature': feature_names, 'coefficient': model.coef_})
        .sort_values(by='coefficient', key=lambda col: col.abs(), ascending=False)
    )

    print(f"\n前 10 个重要特征:")
    print(coef_df.head(10).to_string(index=False))

print("\n" + "=" * 60)
print("所有模型训练完成！")
print("=" * 60)

## Step 6: 性能总结

In [None]:
# Performance summary banner.
# The separator is a blank line followed by an 80-character rule;
# "\n=" * 80 would instead repeat the two-character string 80 times and
# print 80 lines containing a single "=" each.
print("\n" + "=" * 80)
print("Ridge 回归模型性能总结")
print("=" * 80)

# One row per OCEAN trait, one column per metric.
performance_df = pd.DataFrame(ocean_scores).T
print("\n性能指标对比:")
print(performance_df.to_string())

# Unweighted averages across the traits.
print("\n\n平均性能:")
print(f"平均 R²: {performance_df['r2'].mean():.6f}")
print(f"平均 RMSE: {performance_df['rmse'].mean():.6f}")
print(f"平均 MAE: {performance_df['mae'].mean():.6f}")

## Step 7: 提取并保存权重系数

In [None]:
# Collect intercept, per-feature coefficients and alpha for every trait model.
print("提取权重系数...\n")

weights_summary = {
    ocean_trait: {
        'intercept': float(model.intercept_),
        'coefficients': {
            name: float(model.coef_[position])
            for position, name in enumerate(feature_names)
        },
        'alpha': alpha,
    }
    for ocean_trait, model in ocean_models.items()
}

# Write the plain-number weights as JSON.
print("保存权重到 JSON...")
json_file = 'ocean_weights_formula.json'
with open(json_file, 'w') as f:
    json.dump(weights_summary, f, indent=2)

print(f"✅ 权重已保存: {json_file}")

# Also pickle the fitted models together with the preprocessor and the
# feature bookkeeping, for reloading from Python.
print("\n保存模型到 pickle...")
pickle_file = 'ocean_ridge_models.pkl'
with open(pickle_file, 'wb') as f:
    pickle.dump(
        dict(
            models=ocean_models,
            preprocessor=preprocessor,
            feature_names=feature_names,
            numeric_features=numeric_features,
            categorical_features=categorical_features,
            alpha=alpha,
        ),
        f,
    )

print(f"✅ 模型已保存: {pickle_file}")

## Step 8: 创建权重系数 CSV

In [None]:
# Build a long-format table: one row per (trait, feature) pair, with the
# intercept stored as a pseudo-feature named 'INTERCEPT' ahead of the features.
print("创建权重系数表...\n")

coefficients_list = []

for ocean_trait in ocean_cols:
    model = ocean_models[ocean_trait]
    intercept = model.intercept_

    # Intercept first, then every feature in feature_names order.
    labelled_values = [('INTERCEPT', intercept)]
    labelled_values.extend(
        (name, model.coef_[position]) for position, name in enumerate(feature_names)
    )
    coefficients_list.extend(
        {'OCEAN_trait': ocean_trait, 'feature': label, 'coefficient': value}
        for label, value in labelled_values
    )

coefficients_df = pd.DataFrame(coefficients_list)

# Persist the table as CSV and preview it.
csv_file = 'ocean_weights_coefficients.csv'
coefficients_df.to_csv(csv_file, index=False)

print(f"✅ 权重系数表已保存: {csv_file}")
print(f"形状: {coefficients_df.shape}")
print(f"\n前 20 行:")
print(coefficients_df.head(20).to_string(index=False))

## Step 9: 可视化权重

In [None]:
# Draw one horizontal bar chart per OCEAN trait showing its largest weights.
print("生成权重可视化...\n")

fig, axes = plt.subplots(1, 5, figsize=(20, 5))

for ax, ocean_trait in zip(axes, ocean_cols):
    model = ocean_models[ocean_trait]

    # Indices of the 15 coefficients with the largest magnitude, biggest first.
    top_indices = np.argsort(np.abs(model.coef_))[::-1][:15]
    top_features = [feature_names[i] for i in top_indices]
    top_coefs = list(model.coef_[top_indices])

    # Negative weights in red, non-negative in green.
    colors = ['green' if c >= 0 else 'red' for c in top_coefs]
    y_pos = np.arange(len(top_features))
    trait_r2 = ocean_scores[ocean_trait]["r2"]

    ax.barh(y_pos, top_coefs, color=colors, alpha=0.7, edgecolor='black')
    ax.set_yticks(y_pos)
    # Labels are cut to 20 characters.
    ax.set_yticklabels([name[:20] for name in top_features], fontsize=8)
    ax.invert_yaxis()  # largest bar at the top
    ax.set_xlabel('Coefficient', fontsize=10, fontweight='bold')
    ax.set_title(f'{ocean_trait.upper()}\n(R²={trait_r2:.4f})',
                 fontsize=11, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)

plt.tight_layout()
plt.savefig('ocean_weights_visualization.png', dpi=300, bbox_inches='tight')
print("✅ 可视化已保存: ocean_weights_visualization.png")
plt.show()

## Step 10: 总结

In [None]:
def _print_banner(title):
    """Print a blank line, an 80-character '=' rule, the title, and the rule again."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def _print_section(title):
    """Print a section title followed by an 80-character '-' rule."""
    print(title)
    print("-" * 80)


_print_banner("Ridge 回归权重训练总结")

# 1) Training data: sample count and feature counts.
_print_section("\n1️⃣ 训练数据")
print(f"样本数: {len(df_ground_truth):,}")
print(f"特征数: {X_processed.shape[1]}")
print(f"  - 数值型特征: {len(numeric_features)}")
print(f"  - 分类特征 (编码后): {len(feature_names) - len(numeric_features)}")

# 2) Model configuration.
_print_section("\n2️⃣ 模型配置")
print(f"算法: Ridge 回归")
print(f"正则化参数 (alpha): {alpha}")
print(f"OCEAN 维度数: {len(ocean_cols)}")
print(f"每个模型的系数数: {len(feature_names) + 1}")

# 3) Per-trait metrics recorded during training.
_print_section("\n3️⃣ 性能指标")
for ocean_trait in ocean_cols:
    trait_scores = ocean_scores[ocean_trait]
    print(f"{ocean_trait:20s}: R²={trait_scores['r2']:.6f}, RMSE={trait_scores['rmse']:.6f}")

# 4) Files written by the earlier cells.
_print_section("\n4️⃣ 生成的文件")
for line in (
    "1. ocean_weights_formula.json",
    "   - 格式: JSON 对象，包含每个 OCEAN 维度的权重",
    f"   - 结构: {{ ocean_trait: {{ 'intercept': ..., 'coefficients': {{...}}, 'alpha': ... }} }}",
    "",
    "2. ocean_ridge_models.pkl",
    "   - 格式: Python pickle 文件",
    "   - 内容: 5 个已训练的 Ridge 模型 + 预处理器",
    "",
    "3. ocean_weights_coefficients.csv",
    f"   - 格式: CSV 文件，{coefficients_df.shape[0]} 行",
    "   - 列: OCEAN_trait, feature, coefficient",
    "",
    "4. ocean_weights_visualization.png",
    "   - 5 个 OCEAN 维度的权重可视化",
):
    print(line)

# 5) Pointers to the follow-up notebooks.
_print_section("\n5️⃣ 下一步")
for line in (
    "运行 05c_apply_ocean_to_all.ipynb:",
    "1. 加载保存的权重公式",
    "2. 为所有客户应用公式",
    "3. 生成完整的 OCEAN 特征 CSV",
    "",
    "然后进行后续的 XGBoost 建模:",
    "- 04_xgboost_baseline.ipynb",
    "- 06_xgboost_with_ocean.ipynb",
    "- 07_results_analysis.ipynb",
):
    print(line)

_print_banner("✅ Ridge 回归权重训练完成！")