# 05g - Sample 2K and Generate OCEAN Features for All 5 LLMs

**Purpose**: Draw a 2,000-row stratified sample from the 34K dataset, extract BGE embeddings for it, and generate OCEAN predictions with each of the 5 LLM-trained ElasticNet models

## Workflow:
1. Load 34K dataset
2. Stratified sampling (2000 samples, preserving the dataset's Fully Paid/Charged Off ratio)
3. Extract BGE embeddings (2000 samples, ~20 minutes at the configured 0.6 s delay between requests)
4. Load 5 ElasticNet models (Llama, GPT, Qwen, Gemma, DeepSeek)
5. Predict OCEAN for each model (5 LLMs x 5 dimensions = 25 OCEAN columns)
6. Save combined dataset

## Input Files:
- loan_final_desc50plus_with_ocean_bge.csv (34,530 rows)
- elasticnet_models_llama.pkl
- elasticnet_models_gpt.pkl
- elasticnet_models_qwen.pkl
- elasticnet_models_gemma.pkl
- elasticnet_models_deepseek.pkl

## Output Files:
- loan_2k_with_all_ocean.csv (2000 rows x 62 columns)
  - 36 base features
  - 25 OCEAN features (5 LLMs x 5 dimensions)
  - 1 target column
- bge_embeddings_2k.npy (2000 x 1024)
- 05g_generation_report.json

**Estimated Time**: 20-25 minutes

In [None]:
import pandas as pd
import numpy as np
import pickle
import json
import os
import time
import warnings
from datetime import datetime
from tqdm import tqdm
warnings.filterwarnings('ignore')

# For BGE embeddings
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

print("Libraries loaded successfully")
print(f"Timestamp: {datetime.now()}")

## Step 1: Configuration

In [None]:
# Load environment variables from .env file
def load_env_file(filepath='../.env'):
    """Load KEY=VALUE pairs from a .env file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Keys and values are whitespace-stripped. A leading ``export ``
    on the key and one pair of matching quotes around the value are removed,
    so ``HF_TOKEN="abc"`` yields ``abc`` rather than ``"abc"``.

    Args:
        filepath: Path to the .env file.

    Returns:
        dict: The parsed variables; empty if the file does not exist.
    """
    env_vars = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#') or '=' not in line:
                    continue

                # Split on the first '=' only so values may contain '='
                key, value = line.split('=', 1)
                key = key.strip()
                if key.startswith('export '):
                    key = key[len('export '):].strip()
                if not key:
                    # os.environ rejects empty names; ignore such lines
                    continue

                value = value.strip()
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                    value = value[1:-1]

                env_vars[key] = value
                os.environ[key] = value
    except FileNotFoundError:
        print(f"Warning: .env file not found at {filepath}")
        return {}
    return env_vars

# Load .env so the HuggingFace token is available through os.environ
env_vars = load_env_file('../.env')

# Central configuration read by the later cells of this notebook
CONFIG = {
    # Input files
    'input_data': '../loan_final_desc50plus_with_ocean_bge.csv',

    # ElasticNet models (5 LLMs); the key becomes the output column prefix
    'elasticnet_models': {
        'llama': '../elasticnet_models_llama.pkl',
        'gpt': '../elasticnet_models_gpt.pkl',
        'qwen': '../elasticnet_models_qwen.pkl',
        'gemma': '../elasticnet_models_gemma.pkl',
        'deepseek': '../elasticnet_models_deepseek.pkl'
    },

    # Output files
    'output_data': '../loan_2k_with_all_ocean.csv',
    'output_embeddings': '../bge_embeddings_2k.npy',
    'output_report': '../05g_generation_report.json',

    # Sampling parameters
    'sample_size': 2000,
    'random_state': 42,

    # BGE model configuration
    'bge_model': 'BAAI/bge-large-en-v1.5',
    'embedding_dim': 1024,

    # API configuration (adjusted to reduce 500 errors)
    # batch_size only feeds the batch counts printed during extraction;
    # each text is still sent as its own request.
    'batch_size': 10,
    # Slept after every request (passed to the extraction loop as `delay`)
    'delay_between_batches': 0.6,  # Increased from 0.5 to 0.6
    # Used for the retry policy of the requests session
    'max_retries': 5,  # Increased from 3 to 5

    # OCEAN dimensions
    'ocean_dims': ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
}

# Load HuggingFace API token (HF_TOKEN takes precedence over HUGGINGFACE_API_KEY)
HF_TOKEN = os.environ.get('HF_TOKEN') or os.environ.get('HUGGINGFACE_API_KEY')
if not HF_TOKEN:
    raise ValueError("HuggingFace API token not found! Please set HF_TOKEN in .env file")

# Do not echo any part of the secret: cell output is stored with the notebook
print(f"HuggingFace API token loaded ({len(HF_TOKEN)} characters)")

print("\n" + "="*80)
print("Configuration")
print("="*80)
print(f"  Sample size: {CONFIG['sample_size']}")
print(f"  LLM models: {list(CONFIG['elasticnet_models'].keys())}")
print(f"  OCEAN dimensions: {CONFIG['ocean_dims']}")
print(f"  Total OCEAN columns: {len(CONFIG['elasticnet_models'])} LLMs x {len(CONFIG['ocean_dims'])} dims = {len(CONFIG['elasticnet_models']) * len(CONFIG['ocean_dims'])}")
print(f"\n  API Settings:")
print(f"    Batch size: {CONFIG['batch_size']}")
print(f"    Delay: {CONFIG['delay_between_batches']}s")
print(f"    Max retries: {CONFIG['max_retries']}")
print(f"    Checkpoint: Every 50 samples")
print("="*80)

## Step 2: Load 34K Dataset

In [None]:
# Read the full dataset into memory
print("Loading 34K dataset...")
df_full = pd.read_csv(CONFIG['input_data'], low_memory=False)

print(f"\nDataset loaded:")
print(f"  Rows: {df_full.shape[0]:,}")
print(f"  Columns: {df_full.shape[1]}")

# Both the text to embed and the label must be present
required_cols = ['desc', 'target']
missing_cols = [name for name in required_cols if name not in df_full.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

# Keep only rows that have a description to embed
df_valid = df_full.dropna(subset=['desc']).copy()
print(f"\nRows with valid descriptions: {df_valid.shape[0]:,}")

# Class counts and the mean of the target column, shown as a percentage
print(f"\nTarget distribution:")
print(df_valid['target'].value_counts())
print(f"Default rate: {100 * df_valid['target'].mean():.2f}%")

## Step 3: Stratified Sampling (2000 samples)

In [None]:
from sklearn.model_selection import train_test_split

print(f"Performing stratified sampling ({CONFIG['sample_size']} samples)...")

# Use the "train" side of a stratified split as the sample; stratifying on
# the target keeps the class proportions of df_valid in the sample.
df_sample, _ = train_test_split(
    df_valid,
    train_size=CONFIG['sample_size'],
    stratify=df_valid['target'],
    random_state=CONFIG['random_state'],
)

# Give the sample a clean 0..n-1 index so rows line up with the embeddings
df_sample = df_sample.reset_index(drop=True)

print(f"\nSampled data:")
print(f"  Rows: {df_sample.shape[0]:,}")
print(f"  Target distribution:")
print(df_sample['target'].value_counts())
print(f"  Default rate: {100 * df_sample['target'].mean():.2f}%")

# Texts to embed, in the same order as the rows of df_sample
descriptions = df_sample['desc'].tolist()
print(f"\nExtracted {len(descriptions)} descriptions for BGE embedding")

## Step 4: Extract BGE Embeddings (2000 samples)

This step extracts BGE embeddings using HuggingFace Inference API.

Estimated time: 2000 samples x 0.6 s delay = ~20 minutes, plus API response time

In [None]:
def create_session_with_retries():
    """Build a requests session whose HTTP and HTTPS adapters share one retry policy.

    The policy allows CONFIG['max_retries'] retries with a backoff factor of 1
    and lists 429/500/502/503/504 as retryable statuses.

    NOTE(review): urllib3's Retry applies status-based retries only to its
    allowed methods, which by default may not include POST - confirm whether
    the POST requests sent through this session are actually covered.

    Returns:
        requests.Session: session with the retrying adapter mounted.
    """
    retry_policy = Retry(
        total=CONFIG['max_retries'],
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    for scheme in ("http://", "https://"):
        http_session.mount(scheme, retrying_adapter)
    return http_session


def extract_bge_embedding(text, session, hf_token, max_retries=5, base_delay=3,
                          embedding_dim=1024):
    """
    Extract the BGE embedding of a single text using the HF Inference API.

    Transient failures are retried, up to ``max_retries`` attempts in total:
      * transport errors, unreadable response bodies and HTTP 408/429/other
        5xx wait ``base_delay * attempt_number`` seconds,
      * HTTP 500 backs off exponentially (``base_delay * 2 ** attempt``),
      * HTTP 503 (model loading) waits ``base_delay * 2`` seconds.
    Any other HTTP status, and a 200 response whose payload does not have the
    expected shape, fail immediately because repeating the same request
    cannot fix them.

    Args:
        text: Input text
        session: Requests session (anything with a compatible ``post``)
        hf_token: HuggingFace API token
        max_retries: Maximum number of attempts (must be >= 1)
        base_delay: Base delay in seconds for the backoff schedules above
        embedding_dim: Expected embedding length

    Returns:
        numpy array: embedding vector of shape (embedding_dim,)

    Raises:
        ValueError: if ``max_retries`` is smaller than 1.
        Exception: if no valid embedding could be obtained.
    """
    if max_retries < 1:
        # range(0) would otherwise skip the loop and silently return None
        raise ValueError("max_retries must be at least 1")

    api_url = f"https://api-inference.huggingface.co/models/{CONFIG['bge_model']}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    request_body = {"inputs": text, "options": {"wait_for_model": True}}

    for attempt in range(max_retries):
        can_retry = attempt < max_retries - 1

        # Only the network call and body decoding are guarded; deliberate
        # failures raised further down must not be swallowed and retried.
        try:
            response = session.post(
                api_url,
                headers=headers,
                json=request_body,
                timeout=60
            )
            status = response.status_code
            payload = response.json() if status == 200 else None
        except Exception as e:
            if not can_retry:
                raise Exception(f"Error extracting embedding: {e}") from e
            delay = base_delay * (attempt + 1)
            print(f"      [Error] {str(e)[:50]}, retry {attempt+1}/{max_retries}, waiting {delay}s...")
            time.sleep(delay)
            continue

        if status == 200:
            try:
                embedding = np.array(payload, dtype=float)
            except (TypeError, ValueError) as e:
                raise Exception(f"Error extracting embedding: {e}") from e
            if embedding.shape == (embedding_dim,):
                return embedding
            if embedding.shape == (1, embedding_dim):
                return embedding[0]
            raise Exception(
                f"Error extracting embedding: Unexpected embedding shape: {embedding.shape}"
            )

        if status == 500:
            # Internal server error - exponential backoff
            if not can_retry:
                raise Exception(
                    f"Error extracting embedding: API Error 500 after {max_retries} retries"
                )
            delay = base_delay * (2 ** attempt)
            print(f"      [500 error] Retry {attempt+1}/{max_retries}, waiting {delay}s...")
        elif status == 503:
            # Model loading
            if not can_retry:
                raise Exception(
                    f"Error extracting embedding: API Error 503 after {max_retries} retries"
                )
            delay = base_delay * 2
            print(f"      [Model loading] Retry {attempt+1}/{max_retries}, waiting {delay}s...")
        elif status in (408, 429) or status >= 500:
            # Timeouts, rate limiting and other server-side errors are transient
            error = f"API error {status}: {response.text}"
            if not can_retry:
                raise Exception(f"Error extracting embedding: {error}")
            delay = base_delay * (attempt + 1)
            print(f"      [Error] {error[:50]}, retry {attempt+1}/{max_retries}, waiting {delay}s...")
        else:
            # Remaining statuses (e.g. 400/401/403/404) will not succeed on retry
            raise Exception(
                f"Error extracting embedding: API error {status}: {response.text}"
            )

        time.sleep(delay)


def extract_bge_embeddings_batch(texts, session, hf_token, batch_size=10, delay=0.5,
                                  resume_from=0, existing_embeddings=None,
                                  embedding_dim=1024, checkpoint_every=50):
    """
    Extract BGE embeddings for multiple texts with checkpointing and resume support.

    Texts are sent one request at a time through ``extract_bge_embedding``;
    ``batch_size`` only determines the batch counts shown in the progress
    summary. The partially filled array is written to the ``.temp.npy``
    sibling of CONFIG['output_embeddings'] every ``checkpoint_every`` texts,
    on interruption, on error, and once more at the end.

    Args:
        texts: List of texts
        session: Requests session
        hf_token: HuggingFace API token
        batch_size: Batch size used for progress reporting
        delay: Delay in seconds after each request
        resume_from: Index to resume from (for checkpoint recovery); values
            outside [0, len(texts)] are clamped into that range
        existing_embeddings: Existing embeddings array to continue from; it is
            ignored (fresh start) unless it has one row per text
        embedding_dim: Width of a freshly allocated embeddings array
        checkpoint_every: Save a checkpoint after this many texts (0 disables
            the periodic saves)

    Returns:
        numpy array: (n_texts, embedding_dim) embeddings
    """
    # Initialize embeddings array
    if existing_embeddings is not None and len(existing_embeddings) == len(texts):
        embeddings = existing_embeddings.copy()
        # Guard against indices that would make `remaining` negative
        resume_from = min(max(int(resume_from), 0), len(texts))
        print(f"✓ Loaded existing embeddings, resuming from index {resume_from}")
    else:
        embeddings = np.zeros((len(texts), embedding_dim))
        resume_from = 0
        print(f"✓ Starting fresh extraction")

    # Calculate remaining work
    remaining = len(texts) - resume_from
    if remaining == 0:
        print("✓ All embeddings already extracted!")
        return embeddings

    n_batches_total = (len(texts) + batch_size - 1) // batch_size
    n_batches_remaining = (remaining + batch_size - 1) // batch_size

    print(f"\nExtracting BGE embeddings for {len(texts)} texts...")
    print(f"  Batch size: {batch_size}")
    print(f"  Resume from: {resume_from} / {len(texts)} ({resume_from/len(texts)*100:.1f}%)")
    print(f"  Remaining: {remaining} samples")
    print(f"  Batches remaining: {n_batches_remaining} / {n_batches_total}")
    # The delay is applied per text, not per batch, so estimate from `remaining`
    print(f"  Estimated time: {remaining * delay / 60:.1f} minutes\n")

    start_time = time.time()

    # Temp file for checkpointing
    temp_file = CONFIG['output_embeddings'].replace('.npy', '.temp.npy')

    with tqdm(total=len(texts), initial=resume_from, desc="Extracting embeddings") as pbar:
        for i in range(resume_from, len(texts)):
            try:
                embeddings[i] = extract_bge_embedding(texts[i], session, hf_token)
                pbar.update(1)

                # Save checkpoint periodically
                if checkpoint_every and (i + 1) % checkpoint_every == 0:
                    np.save(temp_file, embeddings)
                    pbar.set_postfix({'saved': f'checkpoint@{i+1}'})

                # Delay between requests (not needed after the last one)
                if i < len(texts) - 1:
                    time.sleep(delay)

            except KeyboardInterrupt:
                print(f"\n\n⚠️  Interrupted by user at index {i}")
                print(f"   Saving checkpoint to {temp_file}...")
                np.save(temp_file, embeddings)
                print(f"   ✓ Checkpoint saved! {i}/{len(texts)} completed ({i/len(texts)*100:.1f}%)")
                print(f"   To resume, run the cell again (it will auto-resume from {i})")
                raise

            except Exception as e:
                print(f"\n⚠️  Error at index {i}: {e}")
                print(f"   Saving checkpoint to {temp_file}...")
                np.save(temp_file, embeddings)
                print(f"   ✓ Checkpoint saved! To resume, run the cell again")
                raise

    elapsed_time = time.time() - start_time

    # Save final checkpoint
    np.save(temp_file, embeddings)

    print(f"\n✓ Embedding extraction complete!")
    print(f"  Total time: {elapsed_time / 60:.1f} minutes")
    print(f"  Average time per sample: {elapsed_time / remaining:.2f} seconds")
    print(f"  Checkpoint saved: {temp_file}")

    return embeddings

print("✓ Embedding extraction functions defined (with checkpoint support)")

In [None]:
# Check for existing checkpoint
temp_file = CONFIG['output_embeddings'].replace('.npy', '.temp.npy')
existing_embeddings = None
resume_from = 0

if os.path.exists(temp_file):
    print("="*80)
    print("✓ Found checkpoint file!")
    print("="*80)

    try:
        existing_embeddings = np.load(temp_file)
        print(f"  File: {temp_file}")
        print(f"  Shape: {existing_embeddings.shape}")
        print(f"  Size: {os.path.getsize(temp_file) / 1024 / 1024:.2f} MB")

        # Verify shape matches
        # NOTE(review): only the row count is compared; a checkpoint written for a
        # different sample of the same size would be reused - confirm before resuming.
        if existing_embeddings.shape[0] != len(descriptions):
            print(f"\n⚠️  Warning: Checkpoint size mismatch!")
            print(f"    Checkpoint: {existing_embeddings.shape[0]} samples")
            print(f"    Current: {len(descriptions)} samples")
            print(f"    Starting fresh extraction...")
            existing_embeddings = None
            resume_from = 0
        else:
            # Find where to resume: the first row that is still a zero vector.
            # Rows are filled in order, so everything before it is complete.
            vector_norms = np.linalg.norm(existing_embeddings, axis=1)
            unfilled = np.flatnonzero(~(vector_norms > 0.01))
            completed = int(unfilled[0]) if unfilled.size else len(descriptions)
            resume_from = completed

            print(f"\n  Progress analysis:")
            print(f"    Completed: {completed} / {len(descriptions)} ({completed/len(descriptions)*100:.1f}%)")
            print(f"    Remaining: {len(descriptions) - completed}")

            if resume_from >= len(descriptions):
                print(f"\n✓ All embeddings already extracted!")
                embeddings = existing_embeddings
            else:
                print(f"\n  Will resume from index {resume_from}")

    except Exception as e:
        print(f"\n⚠️  Error loading checkpoint: {e}")
        print(f"    Starting fresh extraction...")
        existing_embeddings = None
        resume_from = 0
else:
    print("="*80)
    print("Starting fresh extraction (no checkpoint found)")
    print("="*80)

# Extract embeddings (with resume support)
if resume_from < len(descriptions):
    # Create session
    session = create_session_with_retries()

    # Extract embeddings
    embeddings = extract_bge_embeddings_batch(
        texts=descriptions,
        session=session,
        hf_token=HF_TOKEN,
        batch_size=CONFIG['batch_size'],
        delay=CONFIG['delay_between_batches'],
        resume_from=resume_from,
        existing_embeddings=existing_embeddings
    )

    print(f"\n" + "="*80)
    print("Embeddings Extraction Summary")
    print("="*80)
    print(f"  Shape: {embeddings.shape}")
    print(f"  Expected: ({len(df_sample)}, {CONFIG['embedding_dim']})")

    # Verify all embeddings are valid
    vector_norms = np.linalg.norm(embeddings, axis=1)
    valid_count = np.sum(vector_norms > 0.01)
    print(f"  Valid embeddings: {valid_count} / {len(embeddings)} ({valid_count/len(embeddings)*100:.1f}%)")

    # Explicit check instead of assert so it still runs under `python -O`
    if embeddings.shape != (len(df_sample), CONFIG['embedding_dim']):
        raise ValueError("Embedding shape mismatch!")

    # Save final embeddings
    final_output = CONFIG['output_embeddings']
    np.save(final_output, embeddings)
    print(f"\n✓ Final embeddings saved: {final_output}")
    print(f"  File size: {os.path.getsize(final_output) / 1024 / 1024:.1f} MB")

    # Keep temp file for safety
    print(f"\n  Temp file kept for safety: {temp_file}")
    print(f"  (You can delete it manually if extraction is successful)")
else:
    print("\n✓ Using existing complete embeddings")
    final_output = CONFIG['output_embeddings']
    np.save(final_output, embeddings)
    print(f"  Saved to: {final_output}")

## Step 5: Load ElasticNet Models and Predict OCEAN

In [None]:
print("="*80)
print("Loading ElasticNet Models and Predicting OCEAN")
print("="*80)

# Maps "{llm}_{dimension}" to the clipped prediction vector for that column
all_ocean_predictions = {}

for llm_key, model_path in CONFIG['elasticnet_models'].items():
    print("\n" + "="*80)
    print(f"Processing {llm_key.upper()} model")
    print("="*80)

    # Unpickle the bundle for this LLM.
    # NOTE(review): pickle.load executes code from the file - only load
    # model files from a trusted source.
    print(f"Loading model: {model_path}")
    with open(model_path, 'rb') as model_file:
        model_data = pickle.load(model_file)

    # The bundle holds the per-dimension models and the scaler to apply first
    models, scaler = model_data['models'], model_data['scaler']

    print(f"  Models loaded: {list(models.keys())}")
    print(f"  Scaler: {type(scaler).__name__}")

    # Put the embeddings through this bundle's scaler before predicting
    embeddings_scaled = scaler.transform(embeddings)
    print(f"  Embeddings standardized: mean={embeddings_scaled.mean():.6f}, std={embeddings_scaled.std():.6f}")

    # One prediction vector per OCEAN dimension
    print(f"\n  Predicting OCEAN dimensions:")
    for dim in CONFIG['ocean_dims']:
        model = models[dim]

        # Predict, then clip to the [0, 1] range
        predictions = np.clip(model.predict(embeddings_scaled), 0, 1)

        # Column naming convention: {llm}_{dimension}
        col_name = f"{llm_key}_{dim}"
        all_ocean_predictions[col_name] = predictions

        print(f"    {dim}: mean={predictions.mean():.3f}, std={predictions.std():.3f}, range=[{predictions.min():.3f}, {predictions.max():.3f}]")

print("\n" + "="*80)
print(f"All OCEAN predictions complete!")
print(f"Total columns generated: {len(all_ocean_predictions)}")
print("="*80)

## Step 6: Combine Features and Save

In [None]:
print("\nCombining all features...")

# Copy the sampled rows and attach every prediction column in one step
# (assign returns a new DataFrame, so df_sample itself is left untouched)
df_output = df_sample.assign(**all_ocean_predictions)

print(f"\nOutput dataframe:")
print(f"  Rows: {len(df_output):,}")
print(f"  Columns: {len(df_output.columns)}")

# Show OCEAN columns.
# Use the exact names generated above instead of substring matching on the
# LLM keys, which would also pick up any unrelated column whose name
# happens to contain e.g. "gpt".
ocean_cols = list(all_ocean_predictions)
print(f"\nOCEAN columns ({len(ocean_cols)}):")
for i, col in enumerate(sorted(ocean_cols), 1):
    print(f"  {i:2d}. {col}")

# Show sample statistics
print(f"\nSample OCEAN statistics:")
print(df_output[ocean_cols].describe())

# Save to CSV
print(f"\nSaving to CSV...")
df_output.to_csv(CONFIG['output_data'], index=False)
print(f"Saved: {CONFIG['output_data']}")
print(f"File size: {os.path.getsize(CONFIG['output_data']) / 1024 / 1024:.1f} MB")

## Step 7: Generate Summary Report

In [None]:
# Assemble the run summary: data sizes, method settings, per-column OCEAN
# statistics and the class split of the sampled data
report = {
    'phase': '05g - Sample 2K and Generate All OCEAN Features',
    'timestamp': datetime.now().isoformat(),

    'data': {
        'input_file': CONFIG['input_data'],
        'input_size': int(df_full.shape[0]),
        'sample_size': int(df_output.shape[0]),
        'output_file': CONFIG['output_data'],
        'n_features': int(df_output.shape[1])
    },

    'method': {
        'embedding_model': CONFIG['bge_model'],
        'embedding_dim': CONFIG['embedding_dim'],
        'llm_models': list(CONFIG['elasticnet_models']),
        'n_llm_models': len(CONFIG['elasticnet_models']),
        'ocean_dimensions': CONFIG['ocean_dims'],
        'n_ocean_dimensions': len(CONFIG['ocean_dims'])
    },

    # Descriptive statistics for every generated OCEAN column, cast to plain
    # floats so they can be written as JSON
    'ocean_features': {
        col: {
            'mean': float(df_output[col].mean()),
            'std': float(df_output[col].std()),
            'min': float(df_output[col].min()),
            'max': float(df_output[col].max()),
            'median': float(df_output[col].median())
        }
        for col in ocean_cols
    },

    'target_distribution': {
        'fully_paid': int(df_output['target'].eq(0).sum()),
        'charged_off': int(df_output['target'].eq(1).sum()),
        'default_rate': float(df_output['target'].mean())
    }
}

# Write the report to disk as indented JSON
with open(CONFIG['output_report'], 'w') as f:
    json.dump(report, f, indent=2)

# Echo the key figures to the notebook output
print("\n" + "="*80)
print("SUMMARY REPORT")
print("="*80)

data_info = report['data']
print(f"\n1. Data Processing")
print(f"   Input: {data_info['input_file']}")
print(f"   Input size: {data_info['input_size']:,}")
print(f"   Sample size: {data_info['sample_size']:,}")
print(f"   Output: {data_info['output_file']}")

method_info = report['method']
print(f"\n2. LLM Models Processed")
for llm in method_info['llm_models']:
    print(f"   - {llm}")

print(f"\n3. OCEAN Features Generated")
print(f"   Total: {len(ocean_cols)} columns")
print(f"   ({method_info['n_llm_models']} LLMs × {method_info['n_ocean_dimensions']} dimensions)")

target_info = report['target_distribution']
print(f"\n4. Target Distribution")
print(f"   Fully Paid: {target_info['fully_paid']:,}")
print(f"   Charged Off: {target_info['charged_off']:,}")
print(f"   Default rate: {target_info['default_rate']*100:.2f}%")

print(f"\n5. Output Files")
for output_key in ('output_data', 'output_embeddings', 'output_report'):
    print(f"   - {CONFIG[output_key]}")

print(f"\n6. Next Steps")
print(f"   Run 07_xgboost_all_llm_comparison_2k.ipynb to:")
print(f"   - Train 6 XGBoost models (1 baseline + 5 LLM OCEAN)")
print(f"   - Compare performance across all LLMs")
print(f"   - Identify best LLM ground truth for loan default prediction")

print("\n" + "="*80)
print("05g Complete!")
print("="*80)