# 05e - Extract BGE-Large Embeddings (500 Test Samples)

**Purpose**: Extract BGE-Large embeddings from 500 test samples as input features for Ridge regression model training

**Input Files**:
- test_samples_500.csv - 500 samples
- ocean_ground_truth/ - OCEAN ground truth (uses deepseek_v3.1_ocean_500.csv by default; falls back to another *_ocean_500.csv file if it is missing)

**Output Files**:
- bge_embeddings_500.npy - BGE embeddings matrix (500x1024)
- ocean_targets_500.csv - Corresponding OCEAN scores (500x5)
- 05e_extraction_summary.json - Extraction statistics report

**Estimated Time**: Approximately 15-20 minutes (500 API calls, 0.5 second delay each)

## Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import requests
import json
import os
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully")

## Step 2: Load Test Data and OCEAN Ground Truth

In [None]:
# Load the 500 test samples (paths are relative to this notebook's directory)
print("Loading test data...")
df_samples = pd.read_csv('../test_samples_500.csv')
print(f"Loaded {len(df_samples)} samples")
print(f"\nSample info:")
print(df_samples.head())
print(f"\nColumns: {df_samples.columns.tolist()}")

# Load OCEAN ground truth, preferring the DeepSeek v3.1 score file
print("\nLoading OCEAN ground truth...")
ocean_gt_file = '../ocean_ground_truth/deepseek_v3.1_ocean_500.csv'

if not os.path.exists(ocean_gt_file):
    # Preferred file is missing: fall back to another model's score file
    print(f"WARNING: {ocean_gt_file} does not exist, searching for other models...")
    ocean_dir = '../ocean_ground_truth'
    if not os.path.isdir(ocean_dir):
        raise FileNotFoundError(f"Cannot find {ocean_dir} directory")

    # os.listdir returns entries in arbitrary order; sort so the fallback
    # choice is the same on every run and every machine.
    files = sorted(f for f in os.listdir(ocean_dir) if f.endswith('_ocean_500.csv'))
    if not files:
        raise FileNotFoundError("Cannot find OCEAN ground truth file")
    ocean_gt_file = os.path.join(ocean_dir, files[0])
    print(f"Using: {files[0]}")

df_ocean = pd.read_csv(ocean_gt_file)
print(f"Loaded {len(df_ocean)} OCEAN scores")

# Surface a row-count mismatch now, before the long-running extraction step,
# because embeddings and targets are later paired by row position.
if len(df_ocean) != len(df_samples):
    print(f"WARNING: {len(df_samples)} samples but {len(df_ocean)} OCEAN rows")

print(f"\nOCEAN features:")
print(df_ocean.head())
print(f"\nStatistics:")
print(df_ocean.describe())

## Step 3: Define BGE Embedding Extraction Function

In [None]:
# Load HF Token
def load_hf_token(env_path: str = '../.env') -> str:
    """Return the Hugging Face API token.

    Looks for an ``HF_TOKEN=...`` entry in the dotenv-style file at
    *env_path*. If the file is missing or unreadable, or holds no non-empty
    ``HF_TOKEN`` entry, falls back to the ``HF_TOKEN`` environment variable.

    Args:
        env_path: Path of the dotenv file to read. Defaults to ``'../.env'``.

    Returns:
        The token string, or ``''`` when no token is configured anywhere.
    """
    try:
        with open(env_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                # Skip blanks, comments (even indented ones) and lines that
                # are not KEY=VALUE pairs instead of aborting the whole parse.
                if not line or line.startswith('#') or '=' not in line:
                    continue
                key, value = line.split('=', 1)
                if key.strip() != 'HF_TOKEN':
                    continue
                # Tolerate padding and surrounding quotes around the value.
                token = value.strip().strip('"\'')
                if token:
                    return token
    except (OSError, UnicodeDecodeError):
        # Best effort: an absent or unreadable file just means "use the env".
        pass
    return os.getenv('HF_TOKEN', '')

# Resolve the token once; report only whether one was found, never its value
hf_token = load_hf_token()
token_status = 'yes' if hf_token else 'no'
print(f"HF Token loaded: {token_status}")

# Define BGE embedding extraction function with enhanced retry logic
def _pool_embedding(features) -> np.ndarray:
    """Reduce a decoded API response body to one 1-D embedding vector.

    A flat list is returned as-is (as a float array). Nested lists are
    averaged over every leading axis, so both a 2-D response and a 3-D
    (batch, tokens, dim) response collapse to a single vector whose length
    is the size of the last axis.

    Raises:
        ValueError: If the body is not a list, is empty, or is ragged.
    """
    if not isinstance(features, list):
        # e.g. a JSON object carrying an error message with a 200 status
        raise ValueError(f"Unexpected response format: {str(features)[:100]}")

    arr = np.asarray(features, dtype=float)
    if arr.size == 0:
        raise ValueError("Empty features list")
    if arr.ndim == 1:
        return arr
    # Flatten all leading axes into rows, then mean-pool the rows.
    return arr.reshape(-1, arr.shape[-1]).mean(axis=0)


def extract_bge_embedding(text: str, max_retries: int = 5, base_delay: int = 3) -> np.ndarray:
    """
    Call HF Inference API to extract a BGE-Large embedding, retrying on failure.

    The bearer token is read from the module-level ``hf_token`` variable.
    Wait time before the next attempt depends on the failure:
      - HTTP 500: exponential, ``base_delay * 2**attempt``
      - HTTP 503: fixed, ``base_delay * 2``
      - HTTP 429: linear, ``base_delay * (attempt + 2)``
      - any other status, timeouts, network and parsing errors:
        linear, ``base_delay * (attempt + 1)``

    Args:
        text: Input text
        max_retries: Maximum number of attempts
        base_delay: Base retry delay in seconds

    Returns:
        1-D embedding vector (mean-pooled if the API returns nested lists)

    Raises:
        RuntimeError: If the last attempt ends in a non-200 status or a timeout,
            or if ``max_retries`` is not positive.
        requests.exceptions.RequestException: If the last attempt hits a network error.
        ValueError: If the last attempt returns a body that cannot be pooled.
    """
    api_url = "https://api-inference.huggingface.co/models/BAAI/bge-large-en-v1.5"
    headers = {
        "Authorization": f"Bearer {hf_token}",
        "Content-Type": "application/json"
    }

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            response = requests.post(
                api_url,
                headers=headers,
                json={"inputs": text},
                timeout=60
            )
            status = response.status_code

            if status == 200:
                return _pool_embedding(response.json())

            if status == 500:
                # Internal server error - exponential backoff
                if is_last_attempt:
                    raise RuntimeError(f"API Error 500 after {max_retries} retries")
                delay = base_delay * (2 ** attempt)
                print(f"    API 500 error (attempt {attempt+1}/{max_retries}), waiting {delay}s...")
            elif status == 503:
                # Model still loading - fixed wait
                if is_last_attempt:
                    raise RuntimeError(f"API Error 503 after {max_retries} retries")
                delay = base_delay * 2
                print(f"    Model loading... waiting {delay}s (attempt {attempt+1}/{max_retries})")
            elif status == 429:
                # Rate limited - linear increase starting at 2 * base_delay
                if is_last_attempt:
                    raise RuntimeError(f"Rate limited after {max_retries} retries")
                delay = base_delay * (attempt + 2)
                print(f"    Rate limited, waiting {delay}s...")
            else:
                if is_last_attempt:
                    raise RuntimeError(f"API Error {status}: {response.text[:200]}")
                delay = base_delay * (attempt + 1)
                print(f"    API Error {status}, retrying...")

            time.sleep(delay)

        except requests.exceptions.Timeout as exc:
            if is_last_attempt:
                raise RuntimeError("Request timeout after all retries") from exc
            delay = base_delay * (attempt + 1)
            print(f"    Request timeout, waiting {delay}s...")
            time.sleep(delay)

        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                raise
            delay = base_delay * (attempt + 1)
            print(f"    Network error: {str(e)[:50]}, waiting {delay}s...")
            time.sleep(delay)

        except Exception as e:
            # Also receives the RuntimeErrors raised above on the last attempt
            # and re-raises them unchanged.
            if is_last_attempt:
                raise
            delay = base_delay * (attempt + 1)
            print(f"    Error: {str(e)[:50]}, waiting {delay}s...")
            time.sleep(delay)

    # Only reachable when the loop body never ran (max_retries <= 0)
    raise RuntimeError("Failed to extract embedding after all retries")

# Report the retry settings that extract_bge_embedding is configured with
for banner_line in (
    "\n✓ BGE embedding extraction function defined (with enhanced retry)",
    "  - Max retries: 5",
    "  - Exponential backoff for 500 errors",
    "  - Timeout: 60s",
):
    print(banner_line)

## Step 4: Batch Extract Embeddings

In [None]:
print("="*80)
print("Starting BGE Embeddings extraction (500 samples)")
print("="*80)

EMBEDDING_DIM = 1024  # Expected embedding length; failed rows are stored as zero vectors of this size

embeddings = []
success_count = 0
error_count = 0
error_indices = []  # 0-based row positions whose embedding is a zero placeholder

start_time = time.time()
total_samples = len(df_samples)

# Pause after each API call to avoid rate limits and 500 errors
DELAY_BETWEEN_REQUESTS = 0.5

for idx, (_, row) in enumerate(df_samples.iterrows(), 1):
    text = row.get('desc', '')
    # An empty CSV cell is read as NaN (a float); treat it as empty text so
    # it is skipped below rather than crashing on .strip().
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    failure_reason = None
    called_api = False

    if len(text.strip()) < 10:
        failure_reason = "Skipping: text too short"
    else:
        called_api = True
        try:
            emb = extract_bge_embedding(text)
            # Require exactly a 1-D vector of the expected length
            if emb is not None and np.shape(emb) == (EMBEDDING_DIM,):
                embeddings.append(emb)
                success_count += 1
            else:
                failure_reason = "Error: Invalid embedding dimension"
        except Exception as e:
            failure_reason = f"Error: {str(e)[:100]}"

    if failure_reason is not None:
        # Keep row alignment with the input by storing a zero placeholder
        embeddings.append(np.zeros(EMBEDDING_DIM))
        error_count += 1
        error_indices.append(idx - 1)
        print(f"  [{idx}] {failure_reason}")

    # Show progress (also for skipped rows, so the final line is never lost)
    if idx % 25 == 0 or idx == total_samples:
        elapsed = time.time() - start_time
        rate = idx / elapsed if elapsed > 0 else 0
        eta = (total_samples - idx) / rate if rate > 0 else 0

        progress = idx / total_samples * 100
        success_rate = success_count / idx * 100 if idx > 0 else 0
        print(f"[{idx:3d}/{total_samples}] {progress:5.1f}% | ✓{success_count} ✗{error_count} ({success_rate:.1f}% success) | {rate:.2f} samples/s | ETA: {eta/60:.1f}min")

    # Only throttle when a request was actually sent
    if called_api:
        time.sleep(DELAY_BETWEEN_REQUESTS)

elapsed_total = time.time() - start_time
sample_divisor = max(total_samples, 1)  # avoid ZeroDivisionError on an empty input file

print(f"\n" + "="*80)
print(f"Embedding extraction complete")
print(f"="*80)
print(f"\nTime elapsed: {elapsed_total/60:.1f} minutes ({elapsed_total:.1f} seconds)")
print(f"Success: {success_count}/{total_samples} ({success_count/sample_divisor*100:.1f}%)")
print(f"Failed: {error_count}/{total_samples} ({error_count/sample_divisor*100:.1f}%)")

if error_count > 0:
    print(f"\nFailed indices (first 20): {error_indices[:20]}")

# Convert to numpy array
X = np.array(embeddings)
print(f"\nEmbedding matrix shape: {X.shape}")
print(f"Data type: {X.dtype}")
print(f"Memory usage: {X.nbytes / 1024 / 1024:.1f} MB")
print(f"Average time per sample: {elapsed_total/sample_divisor:.2f}s")

## Step 5: Save Embeddings and Target Variables

In [None]:
print("Saving results...\n")

# Write the embedding matrix to disk and report its on-disk size
embedding_file = '../bge_embeddings_500.npy'
np.save(embedding_file, X)
embedding_size_mb = os.path.getsize(embedding_file) / 1024 / 1024
print(f"Embeddings saved: {embedding_file}")
print(f"  Shape: {X.shape}")
print(f"  Size: {embedding_size_mb:.1f} MB")

# Write the OCEAN scores alongside, without the DataFrame index column
ocean_target_file = '../ocean_targets_500.csv'
df_ocean.to_csv(ocean_target_file, index=False)
print(f"\nOCEAN targets saved: {ocean_target_file}")
print(f"  Shape: {df_ocean.shape}")
print(f"  Columns: {df_ocean.columns.tolist()}")

# Compare row counts of features and targets; a mismatch is only reported
n_embeddings, n_targets = len(X), len(df_ocean)
if n_embeddings != n_targets:
    print(f"\nWARNING: Data inconsistency")
    print(f"   Embeddings: {n_embeddings}")
    print(f"   OCEAN targets: {n_targets}")
else:
    print(f"\nData consistency check passed")
    print(f"   Embeddings count: {n_embeddings}")
    print(f"   OCEAN targets count: {n_targets}")

## Step 6: Generate Statistics Report

In [None]:
# Generate summary report describing this extraction run
summary = {
    'phase': '05e - Extract BGE Embeddings',
    'timestamp': datetime.now().isoformat(),
    'total_samples': int(total_samples),
    'success_count': int(success_count),
    'error_count': int(error_count),
    # 0-based row positions stored as zero vectors, so later steps can drop them
    'error_indices': [int(i) for i in error_indices],
    'success_rate': f"{(success_count/total_samples*100 if total_samples else 0):.2f}%",
    'embedding_model': 'BAAI/bge-large-en-v1.5',
    'embedding_dimension': 1024,
    'embedding_file': embedding_file,
    'ocean_target_file': ocean_target_file,
    'ocean_features': df_ocean.columns.tolist(),
    'ocean_statistics': {},
    'processing_time_seconds': elapsed_total,
    'samples_per_second': success_count / elapsed_total if elapsed_total > 0 else 0
}

# Add OCEAN statistics. Only numeric columns are summarized, so a text or
# identifier column in the ground-truth file cannot break the report.
numeric_ocean = df_ocean.select_dtypes(include='number')
for col in numeric_ocean.columns:
    column = numeric_ocean[col]
    summary['ocean_statistics'][col] = {
        'mean': float(column.mean()),
        'std': float(column.std()),
        'min': float(column.min()),
        'max': float(column.max())
    }

# Save summary
summary_file = '../05e_extraction_summary.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"Statistics report saved: {summary_file}")
print(f"\n" + "="*80)
print("Summary")
print("="*80)
print(json.dumps(summary, indent=2, default=str))

## Summary

Step 05e Complete

**Output Files**:
- `bge_embeddings_500.npy` - 500x1024 embeddings matrix
- `ocean_targets_500.csv` - 500x5 OCEAN scores
- `05e_extraction_summary.json` - Extraction report

**Next Step**:
Run `05f_train_ridge_models.ipynb` to train Ridge regression models