# 03 - 创建干净的建模数据集

**目标**: 创建一个干净的、无数据泄漏的建模数据集

## 处理步骤:
1. 加载有 desc 的数据
2. 删除所有 POST-LOAN 特征（防止数据泄漏）
3. 删除覆盖率 < 80% 的 PRE-LOAN 特征（质量控制；desc 和 loan_status 不受此阈值限制）
4. 删除 METADATA 特征（除了 desc - 保留用于 OCEAN 提取）
5. 创建二分类目标变量（Fully Paid vs Charged Off）
6. 保存干净数据集
7. 生成质量报告

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Configure pandas display: no limit on printed columns, at most 100 rows.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("库加载成功！")

## Step 1: 加载数据

In [None]:
# Read the raw loan CSV and report its size.
print("加载原始数据 loan.csv...")
df = pd.read_csv('../../data/loan.csv', low_memory=False)
raw_rows, raw_cols = df.shape
print(f"原始数据: {raw_rows:,} 行 × {raw_cols} 列")

# Keep only rows whose desc is non-null and, once surrounding whitespace is
# stripped, is longer than one character.
print("\n筛选有 desc 的数据...")
desc_present = df['desc'].notna()
desc_long_enough = df['desc'].astype(str).str.strip().str.len() > 1
df_with_desc = df[desc_present & desc_long_enough].copy()

kept_rows, kept_cols = df_with_desc.shape
print(f"有 desc 的数据: {kept_rows:,} 行 × {kept_cols} 列")
print(f"覆盖率: {kept_rows/raw_rows*100:.2f}%")

# Drop the reference to the full frame; only the filtered copy is used below.
del df

print("\n数据加载完成！")

## Step 2: 定义特征分类

基于 02_feature_selection_and_leakage_check.ipynb 的分析结果

In [None]:
# POST-LOAN features (must be dropped — they would leak the outcome).
# The list is assembled from three named groups; concatenation order matches
# the order in which the names were originally listed.

# Payment related (generated after the loan is issued).
_payment_related = [
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_d',
    'last_pymnt_amnt',
    'next_pymnt_d',
    'last_credit_pull_d',
    'last_fico_range_high',
    'last_fico_range_low',
]

# Hardship and debt settlement (post-loan events).
_hardship_and_settlement = [
    'hardship_flag',
    'hardship_type',
    'hardship_reason',
    'hardship_status',
    'hardship_start_date',
    'hardship_end_date',
    'hardship_loan_status',
    'hardship_dpd',
    'hardship_length',
    'hardship_amount',
    'hardship_payoff_balance_amount',
    'deferral_term',
    'payment_plan_start_date',
    'debt_settlement_flag',
    'debt_settlement_flag_date',
    'settlement_status',
    'settlement_date',
    'settlement_amount',
    'settlement_percentage',
    'settlement_term',
]

# Other fields the notebook treats as post-loan / internal policy info.
_other_post_loan = [
    'pymnt_plan',
    'initial_list_status',
    'policy_code',
]

post_loan_features = _payment_related + _hardship_and_settlement + _other_post_loan

# METADATA features (no predictive value — dropped).
# 'desc' is deliberately NOT listed: it is kept for OCEAN extraction.
metadata_features = [
    'id',               # ID field
    'member_id',        # ID field
    'url',              # ID field
    'funded_amnt_inv',  # investor-side duplicate of funded_amnt
]

# OUTCOME features (handled specially): loan_status is the source of the target.
outcome_features = ['loan_status']

for group_label, group_members in (
    ("POST-LOAN", post_loan_features),
    ("METADATA", metadata_features),
    ("OUTCOME", outcome_features),
):
    print(f"{group_label} 特征数量: {len(group_members)}")
print("\n保留 'desc' 字段用于后续 OCEAN 特征提取")

## Step 3: 分析特征覆盖率

In [None]:
# Compute the coverage (percentage of non-null values) of every column in
# df_with_desc, classify each column, record a keep/delete decision, print
# summary counts and save the per-column report as CSV.
print("计算特征覆盖率...\n")

coverage_stats = []

for col in df_with_desc.columns:
    non_null = df_with_desc[col].notna().sum()
    coverage = (non_null / len(df_with_desc)) * 100

    # Classify the column. Order matters: the explicit lists win over the
    # generic PRE-LOAN fallback, and only PRE-LOAN columns are subject to
    # the 80% coverage threshold.
    if col in post_loan_features:
        feature_type = 'POST-LOAN'
        keep_status = '❌ DELETE (Leakage)'
    elif col in metadata_features:
        feature_type = 'METADATA'
        keep_status = '❌ DELETE (No value)'
    elif col in outcome_features:
        feature_type = 'OUTCOME'
        keep_status = '⚠️ SPECIAL (Target)'
    elif col == 'desc':
        feature_type = 'TEXT'
        keep_status = '✅ KEEP (For OCEAN)'
    else:
        feature_type = 'PRE-LOAN'
        if coverage >= 80:
            keep_status = '✅ KEEP (Good quality)'
        else:
            keep_status = f'❌ DELETE (Coverage {coverage:.1f}% < 80%)'

    coverage_stats.append({
        '特征名': col,
        '类型': feature_type,
        '覆盖率%': f"{coverage:.2f}",
        '非空数量': f"{non_null:,}",
        '处理决策': keep_status,
        'dtype': str(df_with_desc[col].dtype)
    })

coverage_df = pd.DataFrame(coverage_stats)
# The coverage column is stored as formatted text, so sort on a numeric copy.
coverage_df['coverage_numeric'] = coverage_df['覆盖率%'].astype(float)
coverage_df = coverage_df.sort_values('coverage_numeric', ascending=False)

print("特征覆盖率分析完成！")
print(f"\n总特征数: {len(coverage_df)}")

# Counts per feature type.
# NOTE: "\n=" * 80 would repeat the two-character string "\n=" eighty times
# (80 lines holding a single "="); the banner needs one newline followed by
# an 80-character rule.
print("\n" + "=" * 80)
print("特征类型统计")
print("=" * 80)
print(coverage_df['类型'].value_counts())

print("\n" + "=" * 80)
print("处理决策统计")
print("=" * 80)
keep_count = coverage_df['处理决策'].str.contains('KEEP').sum()
delete_count = coverage_df['处理决策'].str.contains('DELETE').sum()
special_count = coverage_df['处理决策'].str.contains('SPECIAL').sum()

print(f"保留特征: {keep_count}")
print(f"删除特征: {delete_count}")
print(f"特殊处理: {special_count}")

# Persist the coverage report.
coverage_df.to_csv('../../feature_coverage_report.csv', index=False)
print("\n覆盖率报告已保存: feature_coverage_report.csv")

## Step 4: 查看需要删除的特征

In [None]:
# List every feature whose decision contains "DELETE" and break the list
# down by the reason for deletion.
delete_features = coverage_df[coverage_df['处理决策'].str.contains('DELETE')]

print("=" * 80)
print(f"需要删除的特征 (共 {len(delete_features)} 个)")
print("=" * 80)
print(delete_features[['特征名', '类型', '覆盖率%', '处理决策']].to_string(index=False))

# Group by deletion reason.
# NOTE: "\n=" * 80 repeats "\n=" eighty times; a blank line followed by an
# 80-character rule is what the banner needs.
print("\n" + "=" * 80)
print("删除原因分析")
print("=" * 80)

leakage_delete = delete_features[delete_features['类型'] == 'POST-LOAN']
metadata_delete = delete_features[delete_features['类型'] == 'METADATA']
# A PRE-LOAN column only gets a DELETE decision when its coverage is below
# 80%, so the type alone identifies the low-quality group. Re-testing the
# '覆盖率%' text (rounded to 2 decimals) against 80 would miss a column whose
# coverage lies in [79.995, 80): it is marked DELETE but displays as "80.00".
quality_delete = delete_features[delete_features['类型'] == 'PRE-LOAN']

print(f"因数据泄漏删除 (POST-LOAN): {len(leakage_delete)} 个")
print(f"因无预测价值删除 (METADATA): {len(metadata_delete)} 个")
print(f"因质量不达标删除 (覆盖率<80%): {len(quality_delete)} 个")

## Step 5: 删除特征并创建干净数据集

In [None]:
# Drop every column flagged for deletion and report the shape before/after.
features_to_delete = delete_features['特征名'].tolist()
n_to_delete = len(features_to_delete)
shape_before = df_with_desc.shape

print(f"开始删除 {n_to_delete} 个特征...\n")
print(f"删除前数据形状: {shape_before}")

# errors='ignore' tolerates names that are not present in the frame.
df_clean = df_with_desc.drop(columns=features_to_delete, errors='ignore')
n_kept_cols = df_clean.shape[1]

print(f"删除后数据形状: {df_clean.shape}")
print(f"保留特征数: {n_kept_cols}")
print("\n保留的特征包括:")
print("- desc (用于 OCEAN 提取)")
print("- loan_status (用于创建目标变量)")
# The "- 2" discounts desc and loan_status, which are listed separately above.
print(f"- {n_kept_cols - 2} 个高质量 PRE-LOAN 特征")

## Step 6: 创建目标变量

In [None]:
# Show the loan_status distribution, keep only the two unambiguous outcomes,
# and derive the binary target from them.
print("=" * 80)
print("loan_status 分布")
print("=" * 80)
print(df_clean['loan_status'].value_counts())
print(f"\n总计: {len(df_clean):,} 条记录")

# Binary target: 1 = Charged Off (default), 0 = Fully Paid.
print("\n创建目标变量...")

# Broader status groupings. They are defined here but NOT applied by the
# filter below, which keeps only the exact 'Fully Paid' / 'Charged Off' rows.
fully_paid_statuses = ['Fully Paid', 'Current', 'In Grace Period']
charged_off_statuses = ['Charged Off', 'Default', 'Late (31-120 days)', 'Late (16-30 days)']

# Keep only rows with a definite final outcome.
df_clean_binary = df_clean[
    df_clean['loan_status'].isin(['Fully Paid', 'Charged Off'])
].copy()

print(f"\n筛选后数据量: {len(df_clean_binary):,} 条")
print("\nloan_status 分布 (筛选后):")
print(df_clean_binary['loan_status'].value_counts())

# Encode the target.
df_clean_binary['target'] = (df_clean_binary['loan_status'] == 'Charged Off').astype(int)

# NOTE: "\n=" * 80 repeats "\n=" eighty times; the banner needs a single
# newline followed by an 80-character rule.
print("\n" + "=" * 80)
print("目标变量分布")
print("=" * 80)
print(f"target = 0 (Fully Paid): {(df_clean_binary['target']==0).sum():,}")
print(f"target = 1 (Charged Off): {(df_clean_binary['target']==1).sum():,}")
print(f"\n违约率: {df_clean_binary['target'].mean()*100:.2f}%")

# loan_status is now fully captured by target, so it must not remain as a feature.
df_clean_binary = df_clean_binary.drop(columns=['loan_status'])

print(f"\n最终数据形状: {df_clean_binary.shape}")

## Step 7: 数据类型分析

In [None]:
# Split the retained columns into numeric and categorical feature lists
# (excluding the target and the free-text desc column) and print both lists.
print("=" * 80)
print("保留特征的数据类型分布")
print("=" * 80)

numeric_features = df_clean_binary.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df_clean_binary.select_dtypes(include=['object']).columns.tolist()

# target and desc are not model input features of these two kinds.
if 'target' in numeric_features:
    numeric_features.remove('target')
if 'desc' in categorical_features:
    categorical_features.remove('desc')

print(f"\n数值型特征: {len(numeric_features)} 个")
print(f"分类特征: {len(categorical_features)} 个")
print("文本特征: 1 个 (desc)")
print("目标变量: 1 个 (target)")

# NOTE: "\n=" * 80 repeats "\n=" eighty times; each banner below needs one
# newline followed by an 80-character rule.
print("\n" + "=" * 80)
print("数值型特征列表")
print("=" * 80)
for i, feat in enumerate(numeric_features, 1):
    print(f"{i:3d}. {feat}")

print("\n" + "=" * 80)
print("分类特征列表")
print("=" * 80)
for i, feat in enumerate(categorical_features, 1):
    print(f"{i:2d}. {feat}")

## Step 8: 保存干净数据集

In [None]:
import json
import os

# Write the cleaned modelling dataset to CSV and report its size on disk.
output_file = '../../data/loan_clean_for_modeling.csv'

print(f"保存干净数据集到: {output_file}")
df_clean_binary.to_csv(output_file, index=False)

file_size = os.path.getsize(output_file) / (1024 * 1024)  # bytes -> MB
saved_rows, saved_cols = df_clean_binary.shape
print(f"\n文件大小: {file_size:.2f} MB")
print(f"数据形状: {saved_rows:,} 行 × {saved_cols} 列")

# Store the feature-name lists next to the dataset as JSON.
feature_lists = dict(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    text_feature=['desc'],
    target=['target'],
)

with open('../../feature_lists_clean.json', 'w') as json_file:
    json.dump(feature_lists, json_file, indent=2)

print("\n特征列表已保存: feature_lists_clean.json")
print("\n✅ 数据清洗完成！")

## Step 9: 数据质量总结报告

In [None]:
# Final text report: dataset size, feature removals, target balance,
# remaining missing values, and the suggested follow-up notebooks.
heavy_rule = "=" * 80
light_rule = "-" * 80

print(heavy_rule)
print("数据清洗总结报告")
print(heavy_rule)

# --- 1. Dataset size ---------------------------------------------------
# The raw dataset size (2,260,668 x 145) is hard-coded in this cell.
n_final_rows, n_final_cols = df_clean_binary.shape
print("\n1️⃣ 数据规模变化")
print(light_rule)
print("原始数据集: 2,260,668 行 × 145 列")
print(f"有 desc 的数据: {len(df_with_desc):,} 行 × {len(df_with_desc.columns)} 列")
print(f"最终建模数据: {n_final_rows:,} 行 × {n_final_cols} 列")
print(f"数据保留率: {n_final_rows/2260668*100:.2f}%")

# --- 2. Feature removal ------------------------------------------------
print("\n2️⃣ 特征删除统计")
print(light_rule)
print("原始特征数: 145")
print(f"删除的特征数: {len(features_to_delete)}")
print(f"  - POST-LOAN (防止泄漏): {len(leakage_delete)}")
print(f"  - METADATA (无价值): {len(metadata_delete)}")
print(f"  - 低质量 (覆盖率<80%): {len(quality_delete)}")
print(f"保留的特征数: {n_final_cols}")
print(f"  - 数值型: {len(numeric_features)}")
print(f"  - 分类型: {len(categorical_features)}")
print("  - 文本型: 1 (desc)")
print("  - 目标变量: 1 (target)")

# --- 3. Target balance -------------------------------------------------
print("\n3️⃣ 目标变量分布")
print(light_rule)
n_paid = (df_clean_binary['target'] == 0).sum()
n_charged_off = (df_clean_binary['target'] == 1).sum()
print(f"Fully Paid (target=0): {n_paid:,} ({n_paid/n_final_rows*100:.2f}%)")
print(f"Charged Off (target=1): {n_charged_off:,} ({n_charged_off/n_final_rows*100:.2f}%)")
print(f"违约率: {df_clean_binary['target'].mean()*100:.2f}%")

# --- 4. Remaining missing values ---------------------------------------
print("\n4️⃣ 数据质量检查")
print(light_rule)
# Collect the missing-value percentage of every feature that still has gaps.
missing_report = []
for col in df_clean_binary.columns:
    if col in ('target', 'desc'):
        continue
    missing_pct = (df_clean_binary[col].isna().sum() / n_final_rows) * 100
    if missing_pct > 0:
        missing_report.append({'特征': col, '缺失率%': f"{missing_pct:.2f}"})

if not missing_report:
    print("所有特征都没有缺失值 ✅")
else:
    # Percentages are stored as text, so order by a temporary numeric column.
    missing_df = (
        pd.DataFrame(missing_report)
        .assign(missing_numeric=lambda frame: frame['缺失率%'].astype(float))
        .sort_values('missing_numeric', ascending=False)
        .drop(columns='missing_numeric')
    )

    print(f"有缺失值的特征: {len(missing_report)} 个")
    print("\n缺失率最高的前10个特征:")
    print(missing_df.head(10).to_string(index=False))

# --- 5. Next steps -----------------------------------------------------
print("\n5️⃣ 下一步行动")
print(light_rule)
next_step_lines = (
    "✅ 数据清洗完成，可以进行以下步骤:",
    "",
    "1. 04_xgboost_baseline.ipynb",
    "   - 使用干净数据训练 XGBoost 基线模型",
    "   - 不包含 OCEAN 特征",
    "   - 建立性能基准",
    "",
    "2. 05_ocean_feature_extraction.ipynb",
    "   - 从 desc 字段提取 OCEAN 人格特征",
    "   - 注意避免 train/test leakage",
    "",
    "3. 06_xgboost_with_ocean.ipynb",
    "   - 训练包含 OCEAN 特征的完整模型",
    "   - 对比基线模型性能",
    "",
)
for report_line in next_step_lines:
    print(report_line)
print(heavy_rule)

## Step 10: 可视化数据概览

In [None]:
# Draw a 2x2 overview figure (target balance, feature types, feature
# retention, dataset size) and save it as a PNG.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Target distribution.
ax1 = axes[0, 0]
# value_counts() orders by frequency, not by class label. Pin the order to
# [0, 1] so the counts always line up with the hard-coded bar labels, and
# fill a missing class with 0 so both bars are always drawn.
target_counts = (
    df_clean_binary['target']
    .value_counts()
    .reindex([0, 1], fill_value=0)
)
colors = ['#2ecc71', '#e74c3c']
ax1.bar(['Fully Paid\n(target=0)', 'Charged Off\n(target=1)'],
        target_counts.values, color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('Count', fontsize=12, fontweight='bold')
ax1.set_title('Target Variable Distribution', fontsize=14, fontweight='bold')
ax1.grid(axis='y', alpha=0.3)
# Annotate each bar with its count and share of all rows.
for i, v in enumerate(target_counts.values):
    ax1.text(i, v, f'{v:,}\n({v/len(df_clean_binary)*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')

# 2. Feature type distribution (desc counts as the single text feature).
ax2 = axes[0, 1]
feature_type_counts = [len(numeric_features), len(categorical_features), 1]
feature_types = ['Numeric', 'Categorical', 'Text']
colors2 = ['#3498db', '#9b59b6', '#f39c12']
wedges, texts, autotexts = ax2.pie(feature_type_counts, labels=feature_types,
                                     autopct='%1.1f%%', colors=colors2,
                                     startangle=90, textprops={'fontweight': 'bold'})
ax2.set_title('Feature Type Distribution', fontsize=14, fontweight='bold')

# 3. Why features were removed, next to the number of columns kept.
ax3 = axes[1, 0]
delete_reasons = ['POST-LOAN\n(Leakage)', 'METADATA\n(No value)',
                  'Low Quality\n(Coverage<80%)', 'Kept']
delete_counts = [len(leakage_delete), len(metadata_delete),
                 len(quality_delete), df_clean_binary.shape[1]]
colors3 = ['#e74c3c', '#95a5a6', '#e67e22', '#2ecc71']
bars = ax3.barh(delete_reasons, delete_counts, color=colors3, alpha=0.7, edgecolor='black')
ax3.set_xlabel('Count', fontsize=12, fontweight='bold')
ax3.set_title('Feature Retention Analysis', fontsize=14, fontweight='bold')
ax3.grid(axis='x', alpha=0.3)
for i, (bar, count) in enumerate(zip(bars, delete_counts)):
    ax3.text(count, bar.get_y() + bar.get_height()/2,
             f' {count}', va='center', fontweight='bold')

# 4. Row counts at each stage (the raw dataset size is hard-coded here).
ax4 = axes[1, 1]
data_stages = ['Original\nDataset', 'With\ndesc', 'Clean\nBinary']
data_counts = [2260668, len(df_with_desc), len(df_clean_binary)]
colors4 = ['#34495e', '#3498db', '#2ecc71']
bars = ax4.bar(data_stages, data_counts, color=colors4, alpha=0.7, edgecolor='black')
ax4.set_ylabel('Number of Rows', fontsize=12, fontweight='bold')
ax4.set_title('Dataset Size Changes', fontsize=14, fontweight='bold')
ax4.grid(axis='y', alpha=0.3)
for bar, count in zip(bars, data_counts):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height,
             f'{count:,}',
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('../../data_cleaning_summary.png', dpi=300, bbox_inches='tight')
print("\n可视化已保存: data_cleaning_summary.png")
plt.show()