# 05 - OCEAN 人格特征提取

**目标**: 从 desc 文本字段中提取 OCEAN 五大人格特征

## OCEAN 五大人格维度:
- **O**penness (开放性): 想象力、好奇心、创造性
- **C**onscientiousness (尽责性): 责任感、计划性、自律
- **E**xtraversion (外向性): 社交性、活力、积极情绪
- **A**greeableness (宜人性): 合作性、信任、利他
- **N**euroticism (神经质): 情绪不稳定、焦虑、脆弱

## 关键要求:
1. 使用与 baseline 模型相同的 train/test split
2. 避免数据泄漏
3. 为每个样本生成 5 个 OCEAN 分数 (0-1 范围)
4. 保存特征以便后续模型使用

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
import re
from collections import Counter

# One seed drives both NumPy's global RNG and the train/test split further
# down, so reruns are reproducible (intended to match the baseline notebook).
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# pandas display preferences: never truncate columns, 4 decimals for floats.
for option_name, option_value in {
    'display.max_columns': None,
    'display.precision': 4,
}.items():
    pd.set_option(option_name, option_value)

print("库加载成功！")

## Step 1: 加载数据

In [None]:
# Read the cleaned modeling table; low_memory=False makes pandas parse the
# file in one pass instead of inferring dtypes chunk by chunk.
print("加载干净的建模数据...")
df = pd.read_csv('../../data/loan_clean_for_modeling.csv', low_memory=False)

print(f"数据形状: {df.shape[0]:,} 行 × {df.shape[1]} 列")

# Report how much free-text `desc` data is available and preview a few rows.
if 'desc' not in df.columns:
    print("\n警告: 未找到 desc 列！")
else:
    desc_column = df['desc']
    non_null_desc = desc_column.notna().sum()
    print(f"\ndesc 非空样本: {non_null_desc:,} ({non_null_desc/len(df)*100:.2f}%)")
    print(f"\n示例 desc 文本 (前3个):")
    preview_texts = desc_column.dropna().head(3)
    for i, text in enumerate(preview_texts, 1):
        # Truncate long descriptions so the preview stays readable.
        text_str = str(text)[:150]
        print(f"{i}. {text_str}...")

## Step 2: Train/Test Split（与 baseline 保持一致）

In [None]:
# Label vector and feature frame (everything except the label column).
y = df['target']
X = df.drop(columns=['target'], errors='ignore')

# Stratified 80/20 split driven by the shared seed, so the partition is
# reproducible (intended to match the baseline notebook's split).
print("执行 Train/Test 分割（与 baseline 一致）...\n")

split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"训练集大小: {X_train.shape[0]:,}")
print(f"测试集大小: {X_test.shape[0]:,}")

# Independent copies of the free-text column for each partition.
desc_train, desc_test = X_train['desc'].copy(), X_test['desc'].copy()

print(f"\n训练集 desc 非空: {desc_train.notna().sum():,}")
print(f"测试集 desc 非空: {desc_test.notna().sum():,}")

## Step 3: 定义 OCEAN 词典（Lexicon-based Approach）

使用心理语言学词典来识别文本中与各个人格维度相关的词汇

In [None]:
# OCEAN lexicon: for each Big Five trait, a list of lowercase English keywords
# plus a human-readable description (printed below).
#
# Every keyword appears at most once within a trait, so the keyword counts
# reported by this notebook are accurate. Some words are intentionally shared
# between traits ('family', 'community', 'together' and 'relationship' appear
# under both extraversion and agreeableness).
ocean_lexicon = {
    'openness': {
        'keywords': [
            'creative', 'imaginative', 'innovative', 'artistic', 'curious',
            'explore', 'adventure', 'new', 'different', 'unique', 'original',
            'idea', 'dream', 'vision', 'experience', 'discover', 'learn',
            'travel', 'culture', 'art', 'education', 'knowledge', 'grow',
            'opportunity', 'future', 'possibility', 'potential', 'change'
        ],
        'description': '开放性 - 好奇心、想象力、创造性'
    },
    'conscientiousness': {
        'keywords': [
            'responsible', 'organized', 'plan', 'goal', 'achieve', 'complete',
            'careful', 'thorough', 'diligent', 'disciplined', 'reliable',
            'manage', 'budget', 'save', 'financial', 'stability', 'secure',
            'pay', 'repay', 'debt', 'consolidate', 'improve', 'credit',
            'commit', 'promise', 'obligation', 'duty'
        ],
        'description': '尽责性 - 责任感、自律、计划性'
    },
    'extraversion': {
        'keywords': [
            'social', 'friend', 'family', 'people', 'celebrate', 'party',
            'wedding', 'event', 'community', 'together', 'share', 'join',
            'active', 'energy', 'enthusiasm', 'excited', 'positive',
            'happy', 'love', 'enjoy', 'fun', 'life', 'relationship'
        ],
        'description': '外向性 - 社交性、活力、积极情绪'
    },
    'agreeableness': {
        'keywords': [
            'help', 'support', 'care', 'assist', 'generous', 'kind',
            'family', 'children', 'parents', 'trust', 'honest', 'fair',
            'cooperate', 'understand', 'compassion', 'empathy',
            'community', 'charity', 'donate', 'volunteer', 'serve',
            'together', 'team', 'partnership', 'relationship'
        ],
        'description': '宜人性 - 合作性、信任、利他'
    },
    'neuroticism': {
        'keywords': [
            'worry', 'stress', 'anxiety', 'concern', 'fear', 'nervous',
            'difficult', 'struggle', 'problem', 'issue', 'challenge',
            'emergency', 'unexpected', 'crisis', 'urgent', 'need',
            'pressure', 'burden', 'overwhelm', 'frustrated', 'unfortunate',
            'hardship', 'setback', 'obstacle', 'tough'
        ],
        'description': '神经质 - 情绪不稳定、焦虑'
    }
}

# Print a short overview of each trait: description, size and a few examples.
print("OCEAN 词典定义完成！\n")
for trait, info in ocean_lexicon.items():
    print(f"{trait.upper()}: {info['description']}")
    print(f"  关键词数量: {len(info['keywords'])}")
    print(f"  示例: {', '.join(info['keywords'][:5])}...\n")

## Step 4: OCEAN 特征提取函数

In [None]:
# Score given to every trait when a text contains no usable words.
_NEUTRAL_SCORE = 0.5
# Slope and midpoint of the logistic squashing applied to the raw score.
_SIGMOID_STEEPNESS = 5
_SIGMOID_MIDPOINT = 0.3
# Everything except lowercase ASCII letters and whitespace acts as a separator.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def extract_ocean_features(text, lexicon=None):
    """
    Score a text against each trait of a keyword lexicon.

    The text is lowercased, every character other than a-z / whitespace is
    replaced by a space, and the result is split into words. For each trait
    the number of words that exactly equal one of the trait's keywords is
    divided by sqrt(word count) and passed through a logistic function:

        score = 1 / (1 + exp(-5 * (matches / sqrt(n_words) - 0.3)))

    Matching is on whole tokens only, so inflected forms (e.g. "pays") do not
    match their base keyword ("pay").

    Args:
        text: Input text. NaN/None, the empty string, and texts with no
            ASCII letters are treated as "no text".
        lexicon: Optional mapping ``trait -> {'keywords': iterable of str}``.
            Defaults to the module-level ``ocean_lexicon``.

    Returns:
        dict: ``{'ocean_<trait>': score}`` with one entry per lexicon trait,
        in lexicon order. Scores lie in (0, 1). "No text" yields 0.5 for
        every trait; note that a text with words but zero keyword matches
        yields about 0.18, i.e. lower than the missing-text value.
    """
    if lexicon is None:
        lexicon = ocean_lexicon

    if pd.isna(text) or text == '':
        words = []
    else:
        words = _NON_LETTER_RE.sub(' ', str(text).lower()).split()

    if not words:
        return {f'ocean_{trait}': _NEUTRAL_SCORE for trait in lexicon}

    # Square-root length normalisation: longer texts are damped, but less
    # aggressively than dividing by the full word count would.
    length_scale = np.sqrt(len(words))

    ocean_scores = {}
    for trait, info in lexicon.items():
        # A set gives O(1) membership tests and makes duplicate keywords
        # harmless; each matching word occurrence still counts once.
        keywords = frozenset(info['keywords'])
        matches = sum(1 for word in words if word in keywords)
        raw_score = matches / length_scale

        # The logistic function is already bounded to (0, 1), so no clipping
        # is required afterwards.
        ocean_scores[f'ocean_{trait}'] = 1 / (
            1 + np.exp(-_SIGMOID_STEEPNESS * (raw_score - _SIGMOID_MIDPOINT))
        )

    return ocean_scores

# Smoke test: run the extractor on one hand-written loan description and
# print the resulting score for each trait.
print("测试 OCEAN 提取功能:\n")

test_text = "I need this loan to consolidate my debt and improve my financial stability. I am a responsible person who always pays bills on time."
print(f"输入文本: {test_text}\n")

test_scores = extract_ocean_features(test_text)
print("OCEAN 分数:")
for trait, score in test_scores.items():
    print(f"  {trait}: {score:.4f}")

## Step 5: 为训练集提取 OCEAN 特征

In [None]:
print("为训练集提取 OCEAN 特征...\n")
print(f"总样本数: {len(desc_train):,}")

# Score every training description in order, reporting progress every
# 10,000 rows.
progress_every = 10000
ocean_features_train = []
for i, text in enumerate(desc_train):
    if (i + 1) % progress_every == 0:
        print(f"  已处理: {i+1:,} / {len(desc_train):,}")
    ocean_features_train.append(extract_ocean_features(text))

# One row per sample, keyed by the original row labels so the features can
# later be joined back onto the source data.
ocean_train_df = pd.DataFrame(ocean_features_train, index=desc_train.index)

print(f"\n✅ 训练集 OCEAN 特征提取完成！")
print(f"特征形状: {ocean_train_df.shape}")
print(f"\n特征统计:\n")
print(ocean_train_df.describe())

## Step 6: 为测试集提取 OCEAN 特征

In [None]:
print("为测试集提取 OCEAN 特征...\n")
print(f"总样本数: {len(desc_test):,}")

# Score every test description in order, reporting progress every 5,000 rows.
# The extractor is a fixed lexicon lookup, so nothing is fitted on this data.
progress_every = 5000
ocean_features_test = []
for i, text in enumerate(desc_test):
    if (i + 1) % progress_every == 0:
        print(f"  已处理: {i+1:,} / {len(desc_test):,}")
    ocean_features_test.append(extract_ocean_features(text))

# One row per sample, keyed by the original row labels.
ocean_test_df = pd.DataFrame(ocean_features_test, index=desc_test.index)

print(f"\n✅ 测试集 OCEAN 特征提取完成！")
print(f"特征形状: {ocean_test_df.shape}")
print(f"\n特征统计:\n")
print(ocean_test_df.describe())

## Step 7: 合并并保存 OCEAN 特征

In [None]:
# Stack the train and test feature frames and restore the original row order.
ocean_all_df = pd.concat([ocean_train_df, ocean_test_df]).sort_index()

print(f"合并后的 OCEAN 特征形状: {ocean_all_df.shape}")
print(f"原始数据形状: {df.shape}")

# Validate alignment with the source data. Equal row counts alone would not
# catch duplicated or mismatched labels, so the (sorted) index labels are
# compared as well.
index_matches = (
    len(ocean_all_df) == len(df)
    and ocean_all_df.index.equals(df.index.sort_values())
)
if index_matches:
    print("\n✅ 索引验证通过！")
else:
    print(f"\n⚠️ 警告: 索引不匹配！")
    print(f"OCEAN 特征: {len(ocean_all_df)}")
    print(f"原始数据: {len(df)}")

# Persist the combined features; the row labels are written as the first
# column so they can be used to join back onto the source data.
ocean_all_df.to_csv('../../ocean_features.csv', index=True)
print("\nOCEAN 特征已保存: ocean_features.csv")

# Also persist the two partitions separately for the downstream modeling
# notebooks.
ocean_train_df.to_csv('../../ocean_features_train.csv', index=True)
ocean_test_df.to_csv('../../ocean_features_test.csv', index=True)
print("训练集 OCEAN 特征已保存: ocean_features_train.csv")
print("测试集 OCEAN 特征已保存: ocean_features_test.csv")

## Step 8: OCEAN 特征分析

In [None]:
print("=" * 80)
print("OCEAN 特征分析")
print("=" * 80)

# Part 1: per-trait summary statistics over train and test combined.
print("\n1️⃣ OCEAN 特征分布统计")
print("-" * 80)
for col in ocean_all_df.columns:
    trait_scores = ocean_all_df[col]
    mean_val, std_val = trait_scores.mean(), trait_scores.std()
    min_val, max_val = trait_scores.min(), trait_scores.max()
    print(f"{col}:")
    print(f"  均值: {mean_val:.4f}, 标准差: {std_val:.4f}")
    print(f"  范围: [{min_val:.4f}, {max_val:.4f}]")

# Part 2: pairwise correlation between the five trait scores.
print("\n2️⃣ OCEAN 特征相关性")
print("-" * 80)
correlation_matrix = ocean_all_df.corr()
print(correlation_matrix)

# Part 3: relationship between each trait score and the target.
print("\n3️⃣ OCEAN 特征与违约率的关系")
print("-" * 80)

# New frame with the label attached (aligned on the row index); the feature
# frame itself is left untouched.
ocean_with_target = ocean_all_df.assign(target=y)

# Split each trait at its median and compare the mean target of both halves.
# NOTE(review): reads the mean of `target` as a default rate, i.e. assumes a
# 0/1 label with 1 = default - confirm against the data preparation step.
for col in ocean_all_df.columns:
    trait_scores = ocean_with_target[col]
    median_val = trait_scores.median()

    high_default_rate = ocean_with_target.loc[trait_scores > median_val, 'target'].mean()
    low_default_rate = ocean_with_target.loc[trait_scores <= median_val, 'target'].mean()
    diff = high_default_rate - low_default_rate

    print(f"{col}:")
    print(f"  高分组违约率: {high_default_rate*100:.2f}%")
    print(f"  低分组违约率: {low_default_rate*100:.2f}%")
    print(f"  差异: {diff*100:+.2f}%")

## Step 9: 可视化 OCEAN 特征

In [None]:
# Build a 2x3 figure: five per-trait histograms plus one correlation heatmap.
# Relies on `ocean_all_df`, `ocean_with_target` and `correlation_matrix`
# created by earlier cells.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Panels 1-5: score distribution of each OCEAN trait.
for i, col in enumerate(ocean_all_df.columns):
    ax = axes[i]
    
    # Overlay the two target classes; target == 1 is drawn as "Charged Off"
    # and target == 0 as "Fully Paid".
    default_scores = ocean_with_target[ocean_with_target['target'] == 1][col]
    non_default_scores = ocean_with_target[ocean_with_target['target'] == 0][col]
    
    ax.hist(non_default_scores, bins=30, alpha=0.5, label='Fully Paid', color='green', edgecolor='black')
    ax.hist(default_scores, bins=30, alpha=0.5, label='Charged Off', color='red', edgecolor='black')
    
    # 'ocean_openness' -> 'Openness' for the panel title.
    trait_name = col.replace('ocean_', '').title()
    ax.set_xlabel('Score', fontsize=11, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
    ax.set_title(f'{trait_name} Distribution', fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)

# Panel 6: heatmap of the trait correlation matrix, colour scale fixed to
# [-1, 1]; ticks are labelled with the first three letters of each trait.
ax = axes[5]
im = ax.imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
ax.set_xticks(range(len(ocean_all_df.columns)))
ax.set_yticks(range(len(ocean_all_df.columns)))
ax.set_xticklabels([col.replace('ocean_', '')[:3].upper() for col in ocean_all_df.columns], 
                    rotation=45, fontsize=10)
ax.set_yticklabels([col.replace('ocean_', '')[:3].upper() for col in ocean_all_df.columns], 
                    fontsize=10)
ax.set_title('OCEAN Feature Correlations', fontsize=12, fontweight='bold')

# Annotate every heatmap cell with its correlation value (x = column j,
# y = row i).
for i in range(len(ocean_all_df.columns)):
    for j in range(len(ocean_all_df.columns)):
        text = ax.text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}',
                      ha='center', va='center', color='black', fontsize=9, fontweight='bold')

plt.colorbar(im, ax=ax)

# Save the figure to disk before showing it.
plt.tight_layout()
plt.savefig('../../ocean_features_analysis.png', dpi=300, bbox_inches='tight')
print("\n可视化已保存: ocean_features_analysis.png")
plt.show()

## Step 10: 总结

In [None]:
# Final report: recap of the method, sample counts, data-quality notes,
# output files and suggested follow-up notebooks.
heavy_rule = "=" * 80
light_rule = "-" * 80

print(heavy_rule)
print("OCEAN 特征提取总结")
print(heavy_rule)

# Section 1: extraction method.
print("\n1️⃣ 提取方法")
print(light_rule)
print("方法: Lexicon-based (词典匹配法)")
print("词典来源: 心理语言学研究（LIWC 风格）")
print(f"总关键词数: {sum(len(info['keywords']) for info in ocean_lexicon.values())}")
print("归一化: Sigmoid 函数 + 文本长度调整")

# Section 2: sample and feature counts.
print("\n2️⃣ 特征统计")
print(light_rule)
print(f"总样本数: {len(ocean_all_df):,}")
print(f"训练集样本: {len(ocean_train_df):,}")
print(f"测试集样本: {len(ocean_test_df):,}")
print(f"OCEAN 特征数: {len(ocean_all_df.columns)}")

# Section 3: data-quality notes.
print("\n3️⃣ 数据质量")
print(light_rule)
print("缺失值处理: 填充为中性值 0.5")
print("分数范围: [0, 1]")
print(f"数据泄漏检查: ✅ 训练集和测试集分别提取")

# Section 4: files written by this notebook.
print("\n4️⃣ 保存的文件")
print(light_rule)
for line in (
    "1. ocean_features.csv - 完整 OCEAN 特征",
    "2. ocean_features_train.csv - 训练集 OCEAN 特征",
    "3. ocean_features_test.csv - 测试集 OCEAN 特征",
    "4. ocean_features_analysis.png - 可视化分析",
):
    print(line)

# Section 5: suggested next steps.
print("\n5️⃣ 下一步")
print(light_rule)
for line in (
    "✅ OCEAN 特征提取完成，现在可以进行:",
    "",
    "1. 06_xgboost_with_ocean.ipynb",
    "   - 合并 OCEAN 特征与原始特征",
    "   - 训练包含 OCEAN 的完整 XGBoost 模型",
    "   - 对比 baseline 性能",
    "",
    "2. 07_results_analysis.ipynb",
    "   - 详细对比 Baseline vs Full Model",
    "   - 分析 OCEAN 特征的预测价值",
    "   - 生成最终报告",
    "",
):
    print(line)
print(heavy_rule)

print("\n✅ OCEAN 特征提取完成！")