# 05a - 使用 LLM 生成 OCEAN Ground Truth Labels

**目标**: 使用 Llama 3 (via Hugging Face) 为 500 个样本的贷款描述生成 OCEAN 人格评分

## 工作流程:
1. 从干净数据集中选择 500 个样本（平衡选择：250 违约 + 250 正常）
2. 使用 Hugging Face Inference API + Llama 3 模型分析每个 `desc` 字段
3. 为每个样本生成 5 个 OCEAN 人格分数 (0-1 范围)
4. 保存为 CSV 文件用于后续 Ridge 回归权重学习

In [None]:
import pandas as pd
import numpy as np
import json
import re
import warnings
warnings.filterwarnings('ignore')

from huggingface_hub import InferenceClient
import time

# Configure pandas display: no column limit, 4-digit precision, untruncated cell text
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.precision', 4),
    ('display.max_colwidth', None),
):
    pd.set_option(_option_name, _option_value)

print("库加载成功！")

## Step 1: 加载干净的建模数据

In [None]:
# Load the cleaned modeling dataset from CSV into `df`.
# low_memory=False asks pandas not to parse the file in chunks, which avoids
# mixed-dtype inference on wide files.
print("加载干净的建模数据...")
df = pd.read_csv('data/loan_clean_for_modeling.csv', low_memory=False)

# Report the dataset shape, the class counts of `target`, and how many rows
# carry a non-null `desc` (the text that is later sent to the LLM).
print(f"数据形状: {df.shape[0]:,} 行 × {df.shape[1]} 列")
print(f"\n目标变量分布:")
print(df['target'].value_counts())
print(f"\ndesc 字段非空样本: {df['desc'].notna().sum():,}")

## Step 2: 平衡选择 500 个样本

In [None]:
# Build a class-balanced sample: up to 250 defaulted + up to 250 fully-paid loans
print("选择 500 个平衡样本...\n")

# Split the data by outcome (target == 1: defaulted, target == 0: fully paid)
df_charged_off = df[df['target'] == 1].copy()
df_fully_paid = df[df['target'] == 0].copy()

print(f"违约样本总数: {len(df_charged_off):,}")
print(f"正常样本总数: {len(df_fully_paid):,}")

# Draw up to 250 rows per class; min() keeps this working when a class has
# fewer than 250 rows. random_state already makes both draws reproducible;
# the global seed is kept so any later np.random use stays seeded as before.
np.random.seed(42)
sample_charged_off = df_charged_off.sample(n=min(250, len(df_charged_off)), random_state=42)
sample_fully_paid = df_fully_paid.sample(n=min(250, len(df_fully_paid)), random_state=42)

# Stack both classes, shuffle so they are interleaved, then renumber rows 0..n-1
df_sample_500 = pd.concat([sample_charged_off, sample_fully_paid], ignore_index=False)
df_sample_500 = df_sample_500.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\n选中样本总数: {len(df_sample_500):,}")
print(f"目标变量分布:")
print(df_sample_500['target'].value_counts())

# How many of the sampled rows actually have a description to analyse
print(f"\ndesc 字段非空样本: {df_sample_500['desc'].notna().sum():,}")

# Preview up to three descriptions. head(3) (rather than iloc[0..2]) cannot
# raise IndexError when the sample ends up with fewer than three rows.
print("\n示例 desc 文本:")
for i, desc in enumerate(df_sample_500['desc'].head(3)):
    if pd.notna(desc):
        desc_str = str(desc)[:200]
        print(f"\n样本 {i+1}: {desc_str}...")

## Step 3: 初始化 Hugging Face Inference Client

In [None]:
# Create the Hugging Face Inference client.
# The access token is taken from the HF_TOKEN environment variable; it can
# alternatively be assigned by hand further down.

print("初始化 Hugging Face Inference Client...\n")

# Option 1 (recommended): read the token from the environment
import os
hf_token = os.environ.get('HF_TOKEN')

if not hf_token:
    # No token found: tell the user how to provide one
    for _hint in (
        "⚠️ 警告: 未找到 HF_TOKEN 环境变量",
        "请设置环境变量: export HF_TOKEN='your_hugging_face_token'",
        "\n你也可以直接在下面的代码中设置 token:",
        "hf_token = 'your_token_here'",
    ):
        print(_hint)

    # Option 2: if the environment variable is not set, enter the token here
    # hf_token = 'your_hugging_face_token_here'

# Build the client; on failure print a troubleshooting checklist instead of raising
try:
    client = InferenceClient(token=hf_token)
    print("✅ Hugging Face Client 初始化成功！")
except Exception as e:
    print(f"❌ 初始化失败: {e}")
    for _hint in (
        "\n请确保:",
        "1. 已安装 huggingface_hub: pip install huggingface_hub",
        "2. 已设置有效的 HF token",
        "3. LLama 3 模型访问权限已获得",
    ):
        print(_hint)

## Step 4: 定义 OCEAN 提取函数

In [None]:
# Trait keys the LLM must return; also fixes the key order of every result dict
_OCEAN_TRAITS = ('openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism')

# Matches a flat JSON object (no nested braces) embedded in surrounding chatter
_FLAT_JSON_OBJECT = re.compile(r'\{[^{}]*\}')


def _default_ocean_scores(status):
    """Return neutral 0.5 scores for every trait, tagged with the given status."""
    scores = dict.fromkeys(_OCEAN_TRAITS, 0.5)
    scores['status'] = status
    return scores


def _build_ocean_prompt(desc_text):
    """Return the instruction prompt asking the model to rate desc_text on OCEAN."""
    return f"""Analyze the following loan application description and rate the borrower's personality on the OCEAN traits (Big Five Personality Model) on a scale of 0 to 1.

Description:
{desc_text}


---

Output ONLY a JSON object (no markdown, no extra text) with the following format:
{{
  "openness": 0.X,
  "conscientiousness": 0.X,
  "extraversion": 0.X,
  "agreeableness": 0.X,
  "neuroticism": 0.X
}}

Where:
- Openness (0-1): Imagination, curiosity, creativity, willingness to explore new ideas
- Conscientiousness (0-1): Responsibility, discipline, organization, reliability
- Extraversion (0-1): Sociability, energy, assertiveness, positive emotions
- Agreeableness (0-1): Cooperation, trust, altruism, empathy
- Neuroticism (0-1): Emotional instability, anxiety, vulnerability

Respond with ONLY the JSON object."""


def _parse_ocean_response(response_text):
    """Turn the raw model reply into a dict of the five trait scores.

    Returns a dict holding exactly the five traits as floats clipped to [0, 1],
    or None when the reply is valid JSON but does not contain five finite
    numeric scores. Raises json.JSONDecodeError when no JSON can be parsed.
    """
    text = response_text.strip()

    # Unwrap a markdown code fence if the model added one despite the instructions
    if '```json' in text:
        text = text.split('```json')[1].split('```')[0]
    elif '```' in text:
        text = text.split('```')[1].split('```')[0]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The model often wraps the JSON in prose; fall back to the first
        # brace-delimited object found in the reply.
        match = _FLAT_JSON_OBJECT.search(text)
        if match is None:
            raise
        parsed = json.loads(match.group(0))

    if not isinstance(parsed, dict):
        return None

    # Tolerate "Openness" / " openness " style keys
    normalized = {str(key).strip().lower(): value for key, value in parsed.items()}

    scores = {}
    for trait in _OCEAN_TRAITS:
        try:
            value = float(normalized[trait])
        except (KeyError, TypeError, ValueError):
            return None
        if not np.isfinite(value):
            return None
        # Clamp into [0, 1]; any extra keys the model produced are dropped
        scores[trait] = min(max(value, 0.0), 1.0)
    return scores


def extract_ocean_from_llm(desc_text, client, max_retries=3,
                           model="meta-llama/Meta-Llama-3-8B-Instruct"):
    """
    Extract OCEAN personality scores from a loan description using an LLM.

    Parameters:
        desc_text: loan application description text (may be NaN/None/empty)
        client: object exposing text_generation(prompt, model=..., ...),
            e.g. a Hugging Face InferenceClient
        max_retries: maximum number of attempts
        model: model identifier passed to client.text_generation

    Returns:
        dict: the five OCEAN scores (floats in 0-1) plus a 'status' key, one of
            'success'              - scores parsed from the model reply
            'empty_text'           - desc_text was missing or blank (no API call)
            'json_parse_error'     - last attempt returned no parseable JSON
            'api_error'            - last attempt's API call raised
            'max_retries_exceeded' - replies were JSON but never held five
                                     numeric trait scores
        Every non-success status carries neutral 0.5 scores.
    """

    # Missing or blank text: nothing to analyse, skip the API call
    if pd.isna(desc_text) or str(desc_text).strip() == '':
        return _default_ocean_scores('empty_text')

    prompt = _build_ocean_prompt(desc_text)

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        # Only the remote call is guarded by the broad handler, so parsing
        # problems are never misreported as API failures.
        try:
            response = client.text_generation(
                prompt,
                model=model,
                max_new_tokens=200,
                temperature=0.3,
                top_p=0.9
            )
        except Exception as e:
            if is_last_attempt:
                print(f"\n❌ API 调用失败: {e}")
                return _default_ocean_scores('api_error')
            # Exponential back-off before retrying
            time.sleep(2 ** attempt)
            continue

        try:
            ocean_dict = _parse_ocean_response(str(response))
        except json.JSONDecodeError:
            if is_last_attempt:
                print(f"\n❌ JSON 解析失败，返回默认值")
                return _default_ocean_scores('json_parse_error')
            continue

        if ocean_dict is not None:
            ocean_dict['status'] = 'success'
            return ocean_dict
        # Otherwise: JSON without usable scores, so ask the model again

    # Every attempt produced JSON that lacked valid scores (or max_retries <= 0)
    return _default_ocean_scores('max_retries_exceeded')

print("OCEAN 提取函数定义完成！")

## Step 5: 为 500 个样本生成 OCEAN 标签

In [None]:
# Generate OCEAN labels for every sampled description
print("开始为 500 个样本生成 OCEAN 标签...\n")
print("预计时间: 取决于 API 速率限制和网络")
print("\n进度:")

ocean_labels = []   # one dict of five trait scores per sample, in sample order
status_counts = {}  # how many samples ended in each extraction status

# Iterate the `desc` column directly and count positions with enumerate, so the
# progress and throttling logic does not depend on the DataFrame's index labels.
for idx, desc in enumerate(df_sample_500['desc']):
    # Ask the LLM for the scores; the status is split off so only the five
    # trait columns end up in the label table.
    ocean_scores = extract_ocean_from_llm(desc, client)
    status = ocean_scores.pop('status', 'unknown')

    # Tally outcomes. NOTE: any status other than 'success' means the row
    # carries the neutral 0.5 defaults rather than model-derived scores.
    status_counts[status] = status_counts.get(status, 0) + 1

    ocean_labels.append(ocean_scores)

    # Progress report every 50 samples
    if (idx + 1) % 50 == 0:
        print(f"已处理: {idx + 1} / {len(df_sample_500)}")

    # Rate-limit protection: pause for 1 second after every 10 requests
    if (idx + 1) % 10 == 0:
        time.sleep(1)

# One row per sample, one column per trait
ocean_labels_df = pd.DataFrame(ocean_labels)

print(f"\n✅ 标签生成完成！")
print(f"\n状态统计:")
for status, count in status_counts.items():
    print(f"  {status}: {count}")

## Step 6: 创建最终的 Ground Truth 数据集

In [None]:
# Put the sampled text/target columns and the generated OCEAN scores side by side
df_ground_truth = pd.concat(
    [df_sample_500[['desc', 'target']].reset_index(drop=True), ocean_labels_df],
    axis=1,
)

print("Ground Truth 数据集:")
print(f"形状: {df_ground_truth.shape}")
print(f"\n列名: {list(df_ground_truth.columns)}")

# The five score columns, shared by both summaries below
_trait_columns = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

# Overall descriptive statistics of the scores
print("\nOCEAN 分数统计:")
print(df_ground_truth[_trait_columns].describe())

# Mean score per trait, split by loan outcome
print("\n按目标变量分组的 OCEAN 分数均值:")
print(df_ground_truth.groupby('target')[_trait_columns].mean())


def _preview_text(value):
    """Return value as text, cut to 100 characters plus '...' when longer."""
    text = str(value)
    if len(text) > 100:
        return text[:100] + '...'
    return text


# Show the first rows with shortened descriptions (on a copy, so the data is untouched)
print("\n示例数据 (前 5 行):")
display_df = df_ground_truth.copy()
display_df['desc'] = display_df['desc'].apply(_preview_text)
print(display_df.head())

## Step 7: 保存 Ground Truth 数据

In [None]:
# Write the ground-truth dataset to CSV (without the index column)
output_file = 'ocean_ground_truth_500.csv'

print(f"保存 Ground Truth 数据到: {output_file}\n")
df_ground_truth.to_csv(output_file, index=False)

# Report the size of the written file and the dimensions of the saved table
import os
file_size = os.path.getsize(output_file) / (1024)  # bytes -> KB
print(f"✅ 文件大小: {file_size:.2f} KB")
print(f"✅ 数据行数: {len(df_ground_truth):,}")
print(f"✅ 数据列数: {len(df_ground_truth.columns)}")

# Also save the same data as JSON (one object per row), which is easier to inspect
json_file = 'ocean_ground_truth_500.json'
df_ground_truth.to_json(json_file, orient='records', indent=2)
print(f"\n✅ 也保存为 JSON 格式: {json_file}")

## Step 8: 数据质量验证

In [None]:
# Data-quality report for the ground-truth table: missing values, score ranges,
# class balance and description coverage.
print("=" * 80)
print("Ground Truth 数据质量验证")
print("=" * 80)

# 1) Missing values per column
print("\n1️⃣ 缺失值检查")
print("-" * 80)
missing_counts = df_ground_truth.isnull().sum()
if missing_counts.sum() != 0:
    print(missing_counts)
else:
    print("✅ 没有缺失值")

# 2) Every OCEAN score column must lie within [0, 1]
print("\n2️⃣ OCEAN 分数范围验证")
print("-" * 80)
ocean_cols = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
valid_range = True
for col in ocean_cols:
    _scores = df_ground_truth[col]
    min_val, max_val = _scores.min(), _scores.max()
    in_range = (min_val >= 0) and (max_val <= 1)
    if in_range:
        status = "✅"
    else:
        status = "❌"
    print(f"{status} {col}: [{min_val:.4f}, {max_val:.4f}]")
    valid_range = valid_range and in_range

# 3) Class balance of the target variable
print("\n3️⃣ 目标变量平衡检查")
print("-" * 80)
target_dist = df_ground_truth['target'].value_counts().sort_index()
for target_val, count in target_dist.items():
    if target_val == 0:
        label = "Fully Paid"
    else:
        label = "Charged Off"
    pct = count / len(df_ground_truth) * 100
    print(f"{label} (target={target_val}): {count:,} ({pct:.1f}%)")

# 4) Description coverage and average length
print("\n4️⃣ 文本字段验证")
print("-" * 80)
_desc_column = df_ground_truth['desc']
non_empty_desc = _desc_column.notna().sum()
print(f"非空 desc 字段: {non_empty_desc:,} / {len(df_ground_truth):,}")
_desc_lengths = _desc_column.dropna().astype(str).str.len()
avg_desc_len = _desc_lengths.mean()
print(f"平均 desc 长度: {avg_desc_len:.0f} 字符")

# Overall verdict: scores in range and every row has a description
print("\n" + "=" * 80)
if valid_range and non_empty_desc == len(df_ground_truth):
    print("✅ 数据质量检查通过！可以进行下一步（Ridge 回归权重训练）")
else:
    print("⚠️ 注意: 发现一些数据质量问题，请检查上述输出")
print("=" * 80)

## Step 9: 总结

In [None]:
# Final summary of the label-generation run: sample counts, score description,
# output files and the follow-up notebook.
print("=" * 80)
print("LLM OCEAN Ground Truth 标签生成总结")
print("=" * 80)

# Sample counts are computed from df_ground_truth
print("\n1️⃣ 样本信息")
print("-" * 80)
print(f"总样本数: {len(df_ground_truth):,}")
print(f"违约样本 (target=1): {(df_ground_truth['target']==1).sum():,}")
print(f"正常样本 (target=0): {(df_ground_truth['target']==0).sum():,}")

# Static description of the generated features
print("\n2️⃣ OCEAN 分数信息")
print("-" * 80)
print(f"特征维度: 5 (Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism)")
print(f"分数范围: [0, 1]")
print(f"生成方法: LLama 3 via Hugging Face Inference API")

# NOTE(review): the row/column counts and column list printed below are
# hard-coded text, not derived from df_ground_truth; they can disagree with
# the files actually written if the sample size or columns differ.
print("\n3️⃣ 生成的文件")
print("-" * 80)
print(f"1. ocean_ground_truth_500.csv (500 行 × 7 列)")
print(f"   - 列: desc, target, openness, conscientiousness, extraversion, agreeableness, neuroticism")
print(f"2. ocean_ground_truth_500.json (相同数据，JSON 格式)")

# Pointer to the next notebook in the workflow (static text)
print("\n4️⃣ 下一步")
print("-" * 80)
print("运行 05b_train_ocean_ridge_weights.ipynb:")
print("1. 加载 ocean_ground_truth_500.csv")
print("2. 编码 PRE-LOAN 特征")
print("3. 训练 Ridge 回归模型 (alpha=0.17)")
print("4. 提取并保存权重系数")

print("\n" + "=" * 80)
print("✅ Ground Truth 标签生成完成！")
print("=" * 80)