<a href="https://colab.research.google.com/github/jmiamen/N8NRepo/blob/main/model_to_build_off.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚜 Master Agricultural Keywording Pipeline - BULLETPROOF DEBUG Edition

**Production-Grade Agricultural Image Keywording System**

This notebook orchestrates a bulletproof two-stage agricultural image keywording pipeline with comprehensive DEBUG monitoring. It combines a fine-tuned LLaVA model for specialist keyword generation with GPT-4 Vision refinement, featuring real-time progress tracking and robust error handling.

## 🎯 **Key Features:**
- **Comprehensive DEBUG Logging**: Live progress indicators for AI processing steps
- **Memory Management**: GPU monitoring and automatic cleanup
- **Business API Integration**: Working Dropbox Business API with fallbacks
- **Error Resilience**: Bulletproof JSON formatting and error recovery
- **Production Ready**: Shopify-compatible CSV export with photographer matching

## 🏗️ **12-Step Pipeline Architecture:**
1. Install Dependencies
2. Imports, Drive Mount, Config Setup
3. Utility Functions + PhotographerMatcher
4. Dropbox Business API Integration
5. Load Fine-tuned LLaVA Model
6. **🔥 Enhanced Specialist Inference** (with comprehensive DEBUG)
7. Save Intermediate JSON
8. Load + Enrich Intermediate JSON
9. **🔥 Enhanced GPT-4 Vision Refinement** (with comprehensive DEBUG)
10. Assemble Final Metadata Rows
11. Final CSV Export (Shopify-compatible)
12. Pipeline Summary (complete statistics)

---

**DEBUG ENHANCEMENTS:**
- Real-time progress indicators prevent "stagnant logs"
- GPU memory monitoring and automatic cleanup
- Multi-phase timing for API calls and processing
- Success/failure statistics with fallback reporting
- ETA calculations for long-running processes

## Step 1: Install Dependencies
Installing all required packages with Business API support and memory management tools.

In [None]:
print("🚀 AGStock Keyworder Enhanced v5.0 - Installing Dependencies")
print("=" * 60)

import subprocess
import sys


def install_package(package):
    """Install one requirement spec with pip and report the outcome.

    Runs ``pip install <package> --quiet`` with the current interpreter,
    prints a success or failure line, and returns True on success or False
    when pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package, "--quiet"]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        print(f"❌ Failed: {package}")
        return False
    print(f"✅ {package}")
    return True


# Requirement specs installed one at a time, in this order.
dependencies = [
    "torch",
    "torchvision",
    "torchaudio",
    "transformers>=4.35.0",
    "accelerate>=0.24.0",
    "bitsandbytes>=0.41.0",
    "peft>=0.6.0",
    "dropbox>=11.36.0",
    "pillow>=10.0.0",
    "requests>=2.31.0",
    "python-dotenv>=1.0.0",
    "openai>=1.0.0",
    "pandas>=2.0.0",
    "numpy>=1.24.0",
    "matplotlib>=3.7.0",
    "tqdm>=4.65.0",
    "psutil>=5.9.0"
]

print("📦 Installing dependencies...")

# Collect every spec whose installation failed; installs still run in list order.
failed = [pkg for pkg in dependencies if not install_package(pkg)]

print(f"⚠️ Failed: {len(failed)} packages" if failed else "✅ All dependencies installed!")

print("🎯 v5.0 Optimizations Complete")
print("🏁 Step 1 Complete")

## Step 2: Imports, Drive Mount, and Configuration
Import all libraries, configure logging and the retry helper, define the memory-management helpers, and mount Google Drive for model access.

In [None]:
# Banner for the environment-setup cell (imports, logging, retry helper, Drive mount, paths).
print("🔧 AGStock Keyworder Enhanced v4.0 FIXED - Environment Setup")
print("=" * 55)

# Core Python libraries
import os
import json
import re
import csv
import time
import base64
import logging
import requests
import dropbox
import shutil
import gc
import unicodedata
import random
from typing import Dict, List, Any, Optional
from functools import wraps

# Third-party libraries - REMOVED tenacity dependency for simplified retry system
import torch
from transformers import LlavaForConditionalGeneration, LlavaProcessor, AutoProcessor, BitsAndBytesConfig
from peft import PeftModel
from PIL import Image
import pandas as pd
import numpy as np
from tqdm import tqdm
import psutil

# Root-logger configuration: INFO level, "timestamp - level - message" lines,
# timestamps rendered as YYYY-MM-DD HH:MM:SS.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# SIMPLIFIED RETRY DECORATOR - replaces complex tenacity usage
def simple_retry(max_attempts=3, delay=2, backoff=2, max_delay=30):
    """Build a decorator that retries a callable with exponential backoff.

    Args:
        max_attempts: Total number of calls to make (first call included).
            Must be at least 1.
        delay: Base sleep in seconds before the first retry.
        backoff: Factor the base sleep is multiplied by after every retry.
        max_delay: Upper bound in seconds for any single sleep.

    Returns:
        A decorator. The wrapped function returns the first successful result
        and re-raises the last exception once every attempt has failed.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1. Without this check
            the wrapped function would never be called and the wrapper would
            end up executing ``raise None``.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            current_delay = delay

            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == max_attempts:
                        # Out of attempts: propagate with the original traceback.
                        raise

                    # Exponential backoff with up to 1s of jitter, capped at max_delay.
                    sleep_time = min(current_delay + random.uniform(0, 1), max_delay)
                    logging.info(
                        f"Retry attempt {attempt}/{max_attempts} after {sleep_time:.1f}s delay "
                        f"({type(e).__name__}: {e})"
                    )
                    time.sleep(sleep_time)
                    current_delay *= backoff

        return wrapper
    return decorator

# Google Drive mount with error handling
# Detect Google Colab by importing its helper package. Inside Colab the Drive
# is mounted; anywhere else the import fails and a local stand-in for
# `files` is installed instead.
try:
    from google.colab import drive, files
    drive.mount('/content/drive')
    logging.info("✅ Google Drive mounted successfully")
    IN_COLAB = True
except ImportError:
    logging.info("ℹ️ Running outside Google Colab - local mode")
    IN_COLAB = False

    class MockFiles:
        """Local replacement for google.colab.files: announces the file instead of downloading it."""

        @staticmethod
        def download(filename):
            print(f"📁 Local mode: {filename} ready for manual download")

    files = MockFiles()

# Project, model-weight and photographer-CSV locations for the detected environment
if not IN_COLAB:
    DRIVE_PROJECT_PATH = "./AgriKeywordingProject"
    MODEL_PATH = "./model_weights"
    PHOTOGRAPHER_CSV_PATH = "./photographer_matching.csv"
else:
    DRIVE_PROJECT_PATH = "/content/drive/MyDrive/AgriKeywordingProject"
    MODEL_PATH, PHOTOGRAPHER_CSV_PATH = (
        os.path.join(DRIVE_PROJECT_PATH, leaf)
        for leaf in ("model_weights", "photographer_matching.csv")
    )

logging.info(f"📁 Project path: {DRIVE_PROJECT_PATH}")
logging.info(f"🤖 Model path: {MODEL_PATH}")
logging.info(f"👥 Photographer CSV: {PHOTOGRAPHER_CSV_PATH}")

# Memory management helper functions
def get_gpu_memory_usage():
    """Return the memory torch currently has allocated on CUDA, in GB (bytes / 1024**3).

    Returns 0.0 when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0.0
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1024**3

def cleanup_memory():
    """Run the Python garbage collector, then empty torch's CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# End-of-cell status: report the GPU memory allocated before any model is loaded,
# then confirm that the setup cell ran to completion.
print(f"💾 Initial GPU Memory: {get_gpu_memory_usage():.2f}GB")
print("✅ Enhanced environment initialization complete with SIMPLIFIED retry system")
print("🚀 Ready for utility functions and enhanced features setup")

## Configuration Form
Enter your API credentials and specify the Dropbox folder to process.

In [None]:
#@title Dropbox Business API Configuration
DROPBOX_APP_KEY = "" #@param {type:"string"}
DROPBOX_APP_SECRET = "" #@param {type:"string"}
DROPBOX_REFRESH_TOKEN = "" #@param {type:"string"}

#@title OpenAI API Configuration
OPENAI_API_KEY = "" #@param {type:"string"}

#@title Image Source Configuration
image_folder_name = "Images_To_Keyword" #@param {type:"string"}

# NOTE: the `#@title` / `#@param` markers above are Colab form annotations.
# Python treats them as comments, but they must be kept exactly as written.

# Mirror the form values into the process environment. Blank fields are
# exported as empty strings, not left unset.
os.environ['DROPBOX_APP_KEY'] = DROPBOX_APP_KEY
os.environ['DROPBOX_APP_SECRET'] = DROPBOX_APP_SECRET
os.environ['DROPBOX_REFRESH_TOKEN'] = DROPBOX_REFRESH_TOKEN
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# Per-service readiness: a service counts as configured only when every
# value it needs is non-empty. Nothing is checked against the remote APIs here.
config_status = {
    "Dropbox": bool(DROPBOX_APP_KEY and DROPBOX_APP_SECRET and DROPBOX_REFRESH_TOKEN),
    "OpenAI": bool(OPENAI_API_KEY),
    "Folder": bool(image_folder_name)
}

print("📊 Configuration Status:")
for service, status in config_status.items():
    status_icon = "✅" if status else "⚠️"
    print(f"   {status_icon} {service}: {'Configured' if status else 'Not configured'}")

# NOTE(review): both lines below always print a ✅, even when the folder name is
# empty or the OpenAI key is missing (the second then reads "✅ ... False").
print(f"✅ Dropbox folder: '{image_folder_name}'")
print(f"✅ API keys configured: {bool(OPENAI_API_KEY)}")

## Step 3: Utility Functions and PhotographerMatcher Class
Core functions for data processing and photographer identification with bulletproof error handling.

In [None]:
# Banner for the utility-functions cell (photographer matcher, timer, keyword helpers).
print("🔧 AGStock Keyworder Enhanced v4.0 FIXED - Utility Functions Setup")
print("=" * 60)

# EFFICIENT PhotographerMatcher class - MEMORY BOMB FIXED!
class PhotographerMatcher:
    """Resolve an image SKU to its photographer (vendor name and state).

    Lookup data comes from a CSV with ``sku_range``, ``name`` and ``state``
    columns. A ``sku_range`` containing ``-`` is stored as an inclusive
    numeric range (never expanded SKU by SKU); any other non-blank value is
    stored as a SKU prefix. ``exact_matches`` is consulted first during
    lookup but is not filled by the loader; callers may populate it.
    """

    def __init__(self, csv_path):
        """Create the empty lookup tables and load them from ``csv_path``."""
        self.ranges = []  # [(start, end, photographer_info)]
        self.exact_matches = {}  # {sku: photographer_info}
        self.prefixes = {}  # {prefix: photographer_info}
        self.load_photographers(csv_path)

    def load_photographers(self, csv_path):
        """Populate ``ranges`` and ``prefixes`` from the photographer CSV.

        A missing file or a read error is reported (log + print) and leaves
        the matcher usable with whatever was loaded so far; nothing is raised.
        """
        if not os.path.exists(csv_path):
            logging.warning(f"Photographer CSV not found: {csv_path}")
            print(f"⚠️ Photographer CSV not found: {csv_path}")
            print("🔄 Continuing with default photographer matching")
            return

        try:
            # Read every cell as text and disable NaN inference so that blank
            # cells stay blank (instead of turning into the string 'nan') and
            # prefixes such as "280" or "NA" keep their exact spelling.
            df = pd.read_csv(csv_path, dtype=str, keep_default_na=False).fillna('')

            for row in df.to_dict(orient='records'):
                sku_range = str(row.get('sku_range', '')).strip()
                if not sku_range:
                    # A row without a SKU specification cannot match anything.
                    continue

                photographer_info = {
                    'name': str(row.get('name', '')).strip() or 'Unknown',
                    'state': str(row.get('state', '')).strip() or 'Unknown'
                }

                # Range entry, e.g. "270000-279999"
                if '-' in sku_range:
                    start_str, end_str = sku_range.split('-', 1)
                    try:
                        start_num = int(start_str.strip())
                        end_num = int(end_str.strip())
                    except ValueError:
                        logging.warning(f"Invalid range format: {sku_range}")
                        continue

                    if start_num > end_num:
                        # A reversed range would otherwise never match any SKU.
                        logging.warning(f"Reversed range normalized: {sku_range}")
                        start_num, end_num = end_num, start_num

                    # Store the bounds only - no per-SKU expansion.
                    self.ranges.append((start_num, end_num, photographer_info))

                # Prefix entry, e.g. "280" or "AH"
                else:
                    self.prefixes[sku_range] = photographer_info

            print(f"✅ Loaded {len(df)} photographer entries efficiently")
            print(f"📊 Ranges: {len(self.ranges)}, Prefixes: {len(self.prefixes)}")
            print(f"🚀 MEMORY OPTIMIZED: No dictionary expansion used!")

        except Exception as e:
            logging.error(f"Failed to load photographer CSV: {e}")
            print(f"❌ Failed to load photographer CSV: {e}")

    @staticmethod
    def _matched(photographer_info):
        """Build the positive lookup result for one photographer record."""
        return {
            'vendor': photographer_info['name'],
            'photographer_match': 'yes',
            'state': photographer_info['state']
        }

    def get_vendor_info(self, sku):
        """Return ``{'vendor', 'photographer_match', 'state'}`` for ``sku``.

        Lookup order: exact match, numeric range (only when the SKU parses as
        an integer), then prefix. The longest matching prefix wins so that
        "280" is preferred over "28". Unmatched or empty SKUs yield
        ``vendor='Unknown'``, ``photographer_match='no'``, ``state='Unknown'``.
        """
        no_match = {'vendor': 'Unknown', 'photographer_match': 'no', 'state': 'Unknown'}
        if not sku:
            return no_match

        sku_str = str(sku).strip()

        if sku_str in self.exact_matches:
            return self._matched(self.exact_matches[sku_str])

        try:
            sku_int = int(sku_str)
        except ValueError:
            # Non-numeric SKU: range lookup does not apply.
            sku_int = None

        if sku_int is not None:
            # Linear scan; the number of ranges is one per CSV row at most.
            for start, end, photographer_info in self.ranges:
                if start <= sku_int <= end:
                    return self._matched(photographer_info)

        # Longest prefix first so a short prefix cannot shadow a more specific one.
        for prefix in sorted(self.prefixes, key=len, reverse=True):
            if sku_str.startswith(prefix):
                return self._matched(self.prefixes[prefix])

        return no_match


#

# Enhanced Pipeline Timer for comprehensive performance tracking
class EnhancedPipelineTimer:
    """Wall-clock stopwatch for named pipeline steps plus the overall run."""

    def __init__(self):
        # step name -> {'start': timestamp, 'end': timestamp or None while running}
        self.steps = {}
        self.pipeline_start = time.time()
        print("⏱️ Enhanced Pipeline Timer initialized")

    def start_step(self, step_name):
        """Begin (or restart) timing for ``step_name``."""
        self.steps[step_name] = {'start': time.time(), 'end': None}
        logging.info(f"⏱️ Starting step: {step_name}")

    def end_step(self, step_name):
        """Stop timing ``step_name``; warn if it is unknown or already stopped."""
        record = self.steps.get(step_name)
        if record is None or record['end'] is not None:
            logging.warning(f"⚠️ Step '{step_name}' not found or already ended")
            return
        record['end'] = time.time()
        duration = self.get_step_duration(step_name)
        logging.info(f"⏱️ Completed step: {step_name} in {duration:.1f}s")

    def get_step_duration(self, step_name):
        """Return seconds spent in a step (0.0 if unknown, elapsed-so-far if still running)."""
        record = self.steps.get(step_name)
        if record is None:
            return 0.0
        stopped_at = record['end']
        if stopped_at is None:
            stopped_at = time.time()  # step has not been ended yet
        return stopped_at - record['start']

    def get_total_time(self):
        """Return seconds elapsed since the timer was created."""
        return time.time() - self.pipeline_start

    def get_summary(self):
        """Return ``{step_name: seconds, ..., 'total_time': seconds}``."""
        summary = {name: self.get_step_duration(name) for name in self.steps}
        summary['total_time'] = self.get_total_time()
        return summary


# Shared timer instance used by the pipeline cells
pipeline_timer = EnhancedPipelineTimer()

# Enhanced agricultural keyword filtering system
def filter_llava_keywords(raw_keywords, max_keywords=12):
    """Reduce raw LLaVA output to a comma-separated agricultural keyword string.

    Args:
        raw_keywords: Model output (string or list) passed to ``clean_ai_keywords``.
        max_keywords: Maximum number of keywords to keep.

    Returns:
        ``'agriculture, farming'`` when the input is empty, equals
        ``'processing_failed'``, or contains nothing usable. Otherwise the
        cleaned keywords with blacklisted (score 0) and repeated-word terms
        removed. Lists within the limit keep their original order; longer
        lists are ranked by priority score, truncated to ``max_keywords`` and
        padded with generic fallbacks if fewer than 8 remain.
    """
    if not raw_keywords or raw_keywords == 'processing_failed':
        return 'agriculture, farming'

    # Clean and split keywords
    cleaned = clean_ai_keywords(raw_keywords)

    # Priority scoring for agricultural relevance
    priority_keywords = {
        # High priority - Core agricultural terms
        'agriculture': 10, 'farming': 10, 'cattle': 9, 'corn': 9, 'soybean': 9,
        'wheat': 9, 'dairy': 9, 'livestock': 9, 'pasture': 8, 'field': 8,
        'crop': 8, 'farm': 8, 'rural': 8, 'barn': 8, 'harvest': 8,

        # Medium priority - Specific agricultural elements
        'angus': 7, 'holstein': 7, 'hereford': 7, 'charolais': 7, 'simmental': 7,
        'jersey': 7, 'brahman': 7, 'shorthorn': 7, 'duroc': 7, 'hampshire': 7,
        'yorkshire': 7, 'landrace': 7, 'alfalfa': 6, 'hay': 6, 'oats': 6,
        'barley': 6, 'rye': 6, 'sorghum': 6, 'canola': 6, 'sunflower': 6,

        # Geographic terms - medium priority
        'iowa': 6, 'nebraska': 6, 'kansas': 6, 'wisconsin': 6, 'minnesota': 6,
        'illinois': 6, 'missouri': 6, 'south dakota': 6, 'north dakota': 6,

        # Lower priority - Generic terms
        'animal': 4, 'outdoor': 3, 'green': 3, 'brown': 3, 'natural': 3,

        # Remove these terms completely
        'processing': 0, 'computer': 0, 'technical': 0, 'system': 0, 'operation': 0,
        'focus': 0, 'image': 0, 'photo': 0, 'picture': 0, 'digital': 0
    }
    redundant_phrases = ('cattle cattle', 'farm farm', 'agriculture agriculture')

    # Score every keyword up front so the blacklist applies to short lists too,
    # not only to lists that exceed max_keywords.
    scored_keywords = []
    for keyword in cleaned:
        if any(phrase in keyword for phrase in redundant_phrases):
            continue
        score = priority_keywords.get(keyword.lower(), 5)  # unknown terms: medium score
        if score > 0:  # score 0 marks a blacklisted term
            scored_keywords.append((keyword, score))

    if len(cleaned) <= max_keywords:
        if not scored_keywords:
            # Nothing usable survived cleaning/filtering: same default as empty input.
            return 'agriculture, farming'
        # Within the limit: keep the model's original ordering.
        return ', '.join(keyword for keyword, _ in scored_keywords)

    # Over the limit: highest score first (stable for ties), then truncate.
    scored_keywords.sort(key=lambda item: item[1], reverse=True)
    final_keywords = [keyword for keyword, _ in scored_keywords[:max_keywords]]

    # Ensure minimum keyword count with fallbacks
    if len(final_keywords) < 8:
        for fallback in ('agriculture', 'farming', 'crop', 'field', 'rural', 'farm'):
            if fallback not in final_keywords and len(final_keywords) < 8:
                final_keywords.append(fallback)

    return ', '.join(final_keywords)

def correct_geographic_keywords(keywords, photographer_context):
    """Replace neighbouring-state keywords with the photographer's own state.

    Args:
        keywords: Comma-separated keyword string.
        photographer_context: Mapping whose ``'state'`` entry names the
            photographer's state (may be missing, ``None`` or ``'Unknown'``).

    Returns:
        ``keywords`` unchanged when it is empty or the state is unknown.
        Otherwise a lower-cased, comma-separated string that starts with the
        photographer's state exactly once, followed by the remaining keywords
        in their original order with neighbouring states and blank entries
        removed.
    """
    if not keywords:
        return keywords

    context = photographer_context or {}
    photographer_state = str(context.get('state') or '').strip().lower()
    if not photographer_state or photographer_state == 'unknown':
        return keywords  # No correction possible

    # For each known state: the states the model tends to confuse it with.
    state_corrections = {
        'iowa': ['illinois', 'nebraska', 'minnesota', 'wisconsin'],
        'wisconsin': ['iowa', 'illinois', 'minnesota', 'michigan'],
        'nebraska': ['iowa', 'kansas', 'colorado', 'south dakota'],
        'kansas': ['nebraska', 'oklahoma', 'missouri', 'colorado'],
        'illinois': ['iowa', 'wisconsin', 'indiana', 'missouri'],
        'minnesota': ['wisconsin', 'iowa', 'north dakota', 'south dakota'],
        'missouri': ['kansas', 'illinois', 'arkansas', 'iowa'],
        'texas': ['oklahoma', 'louisiana', 'arkansas', 'new mexico'],
        'california': ['nevada', 'oregon', 'arizona'],
        'florida': ['georgia', 'alabama'],
        'ohio': ['michigan', 'indiana', 'pennsylvania', 'west virginia']
    }
    incorrect_states = set(state_corrections.get(photographer_state, ()))

    # The photographer's state always leads the list. Skipping every later
    # occurrence keeps it from appearing twice when the keywords already
    # contained it and a neighbouring state was replaced as well.
    corrected_keywords = [photographer_state]
    for raw_keyword in keywords.split(','):
        keyword = raw_keyword.strip().lower()
        if not keyword or keyword == photographer_state or keyword in incorrect_states:
            continue
        corrected_keywords.append(keyword)

    return ', '.join(corrected_keywords)

def remove_redundant_terms(keywords):
    """Drop technical, repeated-word and duplicate keywords.

    Args:
        keywords: Comma-separated string or an iterable of keyword strings.

    Returns:
        A list of whitespace-trimmed keywords in first-seen order. Technical
        terms are removed, known repeated-word phrases (e.g. "cattle cattle")
        collapse to their base term, and duplicates are detected
        case-insensitively on the trimmed text.
    """
    if not keywords:
        return []

    candidates = keywords.split(',') if isinstance(keywords, str) else keywords

    # Base term -> phrases that should collapse into it
    redundancy_groups = {
        'cattle': ['cattle cattle', 'cow cattle', 'cattle cow', 'livestock cattle'],
        'agriculture': ['agriculture agriculture', 'agricultural agriculture', 'farming agriculture'],
        'farm': ['farm farm', 'farming farm', 'farm farming'],
        'crop': ['crop crop', 'crops crop', 'crop crops'],
        'field': ['field field', 'fields field', 'field fields']
    }
    base_for_phrase = {
        phrase: base_term
        for base_term, phrases in redundancy_groups.items()
        for phrase in phrases
    }

    # Technical terms to remove
    technical_blacklist = {
        'processing', 'computer', 'technical', 'operation', 'focus', 'system',
        'digital', 'image', 'photo', 'picture', 'analysis', 'data'
    }

    filtered_keywords = []
    seen = set()  # normalized forms already emitted; O(1) duplicate check
    for candidate in candidates:
        if candidate is None:
            continue
        keyword = str(candidate).strip()
        normalized = keyword.lower()

        if not normalized or normalized in technical_blacklist:
            continue

        base_term = base_for_phrase.get(normalized)
        if base_term is not None:
            # Collapse the repeated-word phrase into its base term.
            keyword = normalized = base_term

        if normalized not in seen:
            seen.add(normalized)
            filtered_keywords.append(keyword)

    return filtered_keywords

# Enhanced CSV download function with bulletproof error handling
def bulletproof_download_csv(filename, description="CSV file"):
    """Offer ``filename`` for download and report the outcome on stdout.

    In Colab the file is handed to ``files.download``; elsewhere its size is
    printed so it can be picked up manually. Returns True on success, False
    when the file does not exist or the download step raises (the absolute
    path is printed in that case).
    """
    if not os.path.exists(filename):
        print(f"❌ File not found: {filename}")
        return False

    try:
        if not IN_COLAB:
            # Local mode: nothing to transfer, just confirm the file is there.
            size_in_bytes = os.path.getsize(filename)
            print(f"✅ {description} ready for download: {filename} ({size_in_bytes:,} bytes)")
        else:
            files.download(filename)
            print(f"✅ {description} downloaded via Colab")
        return True
    except Exception as e:
        print(f"⚠️ Download method failed: {e}")
        print(f"📁 File available locally: {filename}")

        # Last resort: tell the user exactly where the file is.
        print(f"📂 Absolute path: {os.path.abspath(filename)}")
        return False

# End-of-cell status banner: these prints only announce that the definitions
# above were executed; none of them exercises the corresponding feature.
# NOTE(review): the PhotographerMatcher line mentions "exact matching", but the
# class's CSV loader only fills its range and prefix tables.
print("✅ Enhanced utility functions initialized")
print("👥 PhotographerMatcher class with simplified exact matching ready")
print("⏱️ Enhanced pipeline timer activated")
print("🔧 Agricultural keyword filtering system loaded")
print("🌍 Geographic correction algorithms ready")
print("🛡️ Bulletproof CSV download system ready")
print("🧹 Enhanced redundancy removal ready")

In [None]:
# Bulletproof string escaping for JSON safety
def bulletproof_escape_string(text):
    """Escape ``text`` so it can be embedded between double quotes in JSON.

    Non-string input is converted with ``str()`` first. Backslash, double
    quote and the short-form control characters (``\\n \\r \\t \\b \\f``) get
    their two-character escapes; every other control character below U+0020
    becomes ``\\uXXXX``. Non-ASCII text is preserved as-is.

    Returns:
        The escaped text without surrounding quotes.
    """
    if not isinstance(text, str):
        text = str(text)

    # json.dumps implements the complete JSON string-escaping rules, including
    # control characters a hand-written replacement table would leave raw.
    # The slice removes the surrounding double quotes it adds.
    return json.dumps(text, ensure_ascii=False)[1:-1]

# Enhanced agricultural terminology validation system
def count_agricultural_terms(text):
    """Count known agricultural terms that occur in ``text``.

    Matching is a case-insensitive substring test, so a term is also found
    inside longer words (e.g. "farm" inside "farming").

    Args:
        text: Text to analyse; non-string values are converted with ``str()``.

    Returns:
        A dict with ``count`` (number of terms found), ``terms`` (the found
        terms in table order), ``categories`` (per-category counts; empty for
        empty input) and ``quality_score`` (``count / 5`` capped at 1.0).
        All four keys are present for every input.
    """
    if not text:
        # Same keys as the normal result so callers can read quality_score safely.
        return {"count": 0, "terms": [], "categories": {}, "quality_score": 0.0}

    text_lower = str(text).lower()

    # Comprehensive agricultural terminology database
    agricultural_terms = {
        'livestock_breeds': [
            'hereford', 'holstein', 'simmental', 'angus', 'charolais', 'jersey',
            'duroc', 'hampshire', 'yorkshire', 'landrace', 'brahman', 'shorthorn'
        ],
        'equipment': [
            'combine', 'auger wagon', 'sprayer boom', 'planter', 'feed bunk',
            'tmr mixer', 'grain cart', 'cultivator', 'disc harrow', 'seed drill',
            'manure spreader', 'hay baler', 'mower', 'tedder', 'rake'
        ],
        'activities': [
            'cow-calf operation', 'calving', 'nursing', 'vaccination', 'sidedressing',
            'cultivation', 'harvest', 'planting', 'spraying', 'feeding', 'milking',
            'breeding', 'weaning', 'castration', 'dehorning'
        ],
        'crops': [
            'corn', 'maize', 'soybean', 'wheat', 'alfalfa', 'hay', 'oats', 'barley',
            'rye', 'sorghum', 'millet', 'canola', 'sunflower', 'cotton'
        ],
        'growth_stages': [
            'v1', 'v2', 'v4', 'v6', 'v8', 'v10', 'v12', 'vt', 'r1', 'r2', 'r3',
            'r4', 'r5', 'r6', 'boot stage', 'heading', 'tasseling', 'silking',
            'milk stage', 'dough stage', 'maturity'
        ],
        'general_agriculture': [
            'pasture', 'livestock', 'cattle', 'farming', 'agriculture', 'crop',
            'field', 'barn', 'silo', 'feedlot', 'dairy', 'ranch', 'farm'
        ]
    }

    found_terms = []
    category_counts = {}

    for category, terms in agricultural_terms.items():
        category_terms = [term for term in terms if term in text_lower]
        found_terms.extend(category_terms)
        category_counts[category] = len(category_terms)

    return {
        "count": len(found_terms),
        "terms": found_terms,
        "categories": category_counts,
        "quality_score": min(len(found_terms) / 5.0, 1.0)  # Quality score 0-1
    }

# Enhanced keyword cleaning with agricultural preservation
def clean_ai_keywords(keywords_input):
    """Normalize raw model output into a de-duplicated list of lowercase keywords.

    Lists are joined with ", " and anything else is stringified. Chat-template
    residue and bracketed/braced spans are removed, non-ASCII characters are
    dropped, and the text is split on commas. Entries shorter than two
    characters are discarded; first-seen order is kept.
    """
    if not keywords_input:
        return []

    if isinstance(keywords_input, list):
        raw_text = ", ".join(map(str, keywords_input))
    else:
        raw_text = str(keywords_input)

    # Conversational filler and template markers, removed case-insensitively
    junk_patterns = (
        r"an analysis of this agricultural.*?",
        r"another \d+.*?",
        r"<<\|end\|>>",
        r"ASSISTANT:",
        r"USER:",
        r"<image>",
        r"\[.*?\]",
        r"\{.*?\}",
    )
    for junk in junk_patterns:
        raw_text = re.sub(junk, "", raw_text, flags=re.IGNORECASE)

    # Drop every run of non-ASCII characters
    raw_text = re.sub('[^\\x00-\\x7F]+', '', raw_text)

    # A dict keeps insertion order, which gives ordered de-duplication.
    unique_keywords = {}
    for piece in raw_text.split(','):
        token = piece.strip().lower()
        if len(token) > 1:
            unique_keywords.setdefault(token, None)

    return list(unique_keywords)

def extract_sku_from_filename(filename):
    """Return the numeric SKU found in ``filename``, or None if there is none.

    Patterns are tried in priority order and the first match wins; the result
    is always the digit group of that pattern. Non-string input (for example a
    ``pathlib.Path``) is converted with ``str()`` before matching.

    NOTE: the six-digit pattern is tried before the variable-length ones, so
    the first six digits of a longer digit run are returned.
    """
    if not filename:
        return None

    text = str(filename)

    # In every pattern the LAST capture group is the numeric SKU.
    patterns = (
        r'[A-Z]{2} (\d{6})',     # Primary: AH 270480
        r'([A-Z]{2})(\d{6})',    # Secondary: AH270480
        r'(\d{6})',              # Basic: 270480
        r'[A-Z]+_(\d{5,7})',     # Variable length: AB_12345
        r'([A-Z]+)(\d{5,7})',    # No underscore: AB12345
        r'\b(\d{5,7})\b',        # Word boundary: any 5-7 digits
    )

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.groups()[-1]

    return None

# AI-Driven State Inference
def infer_photographer_state(ai_generated_keywords, photographer_context):
    """Pick the single state to associate with an image.

    Args:
        ai_generated_keywords: Keyword string (or list of keywords) produced
            by the model; only used when the photographer's state is unknown.
        photographer_context: Mapping whose ``'state'`` entry may hold one
            state, several states separated by ``,``, `` and ``, ``/``, ``&``,
            ``|`` or ``;``, or be missing / ``None`` / ``'Unknown'``.

    Returns:
        The first listed photographer state when one is known. Otherwise the
        first state whose indicator words occur in the keywords (substring,
        case-insensitive), or ``'Unknown'``.
    """
    context = photographer_context or {}
    photographer_state = str(context.get('state') or '').strip()

    if photographer_state and photographer_state.lower() != 'unknown':
        # Always split on every supported separator so that "Iowa/Nebraska"
        # is handled the same way as "Iowa, Nebraska"; the first entry wins.
        for candidate in re.split(r',| and |/|&|\||;', photographer_state):
            candidate = candidate.strip()
            if candidate:
                return candidate

    # State unknown: fall back to indicator words in the AI keywords.
    state_keywords = {
        'Iowa': ['corn', 'soybean', 'iowa', 'midwest'],
        'Nebraska': ['cattle', 'beef', 'nebraska', 'ranch'],
        'Kansas': ['wheat', 'grain', 'kansas', 'prairie'],
        'Texas': ['cotton', 'longhorn', 'texas', 'ranch'],
        'California': ['vineyard', 'fruit', 'california', 'orchard']
    }

    if not ai_generated_keywords:
        keywords_lower = ''
    elif isinstance(ai_generated_keywords, (list, tuple)):
        keywords_lower = ', '.join(str(kw) for kw in ai_generated_keywords).lower()
    else:
        keywords_lower = str(ai_generated_keywords).lower()

    for state, indicators in state_keywords.items():
        if any(indicator in keywords_lower for indicator in indicators):
            return state

    return 'Unknown'

# Progressive JSON validation with Unicode support
def progressive_json_validation(data, context_name="Unknown"):
    """Check that ``data`` survives a JSON round trip.

    Returns a ``(ok, message)`` tuple. Only lists and dicts are accepted; the
    value is serialized with Unicode preserved, parsed back, and (for lists)
    the item count is compared. ``context_name`` is used in the success log.
    """
    if not isinstance(data, (list, dict)):
        return False, f"Invalid data type: {type(data)}"

    try:
        # Serialize without ASCII-escaping, then parse the text back.
        serialized = json.dumps(data, ensure_ascii=False, indent=2)
        restored = json.loads(serialized)

        # Lists must come back with the same number of items.
        if isinstance(data, list):
            expected, actual = len(data), len(restored)
            if actual != expected:
                return False, f"Content corruption: expected {expected} items, got {actual}"

        logging.info(f"✅ Progressive JSON validation passed for {context_name}")
        return True, "Validation passed"

    except json.JSONDecodeError as e:
        return False, f"JSON encoding error: {e}"
    except Exception as e:
        return False, f"Validation error: {e}"

# Critical validation function
def validate_data_structure(data, step_name):
    """Return True only for a non-empty list; log the reason for every rejection.

    ``step_name`` prefixes each log message so the failing pipeline stage is
    identifiable.
    """
    if not data:
        logging.error(f"{step_name}: No data to process")
        return False

    if isinstance(data, list):
        logging.info(f"✅ {step_name}: Validated {len(data)} records")
        return True

    logging.error(f"{step_name}: Data should be a list, got {type(data)}")
    return False

## Step 4: Dropbox Business API Integration
Production-ready Dropbox Business API with comprehensive error handling and local fallback support.

In [None]:
# Banner for the Dropbox integration cell.
print("🔧 AGStock Keyworder Enhanced v4.0 FIXED - Dropbox Business API Integration")
print("=" * 65)

# Start the stopwatch for this step on the shared pipeline timer.
pipeline_timer.start_step("Dropbox Integration")

# SIMPLIFIED OAuth2 token refresh - replaces complex tenacity decorator
@simple_retry(max_attempts=3, delay=2, backoff=2)
def refresh_access_token(app_key, app_secret, refresh_token):
    """Exchange a Dropbox refresh token for a new access token.

    Returns the access token string, or None when a credential is missing or
    the response carries no ``access_token``. HTTP and network errors are
    raised, which lets the retry decorator try again.
    """
    credentials = (app_key, app_secret, refresh_token)
    if not all(credentials):
        logging.error("❌ Missing Dropbox credentials for token refresh")
        return None

    token_request = {
        'grant_type': 'refresh_token',
        'refresh_token': refresh_token,
        'client_id': app_key,
        'client_secret': app_secret
    }
    response = requests.post(
        'https://api.dropboxapi.com/oauth2/token',
        data=token_request,
        timeout=30
    )
    response.raise_for_status()

    new_access_token = response.json().get('access_token')
    if not new_access_token:
        logging.error("❌ Token refresh response missing access token")
        return None

    logging.info("✅ Successfully refreshed Dropbox access token")
    return new_access_token

# SIMPLIFIED file download - replaces complex tenacity decorator
@simple_retry(max_attempts=3, delay=2, backoff=2)
def download_file_simple(url: str, headers: Dict[str, str], local_path: str) -> None:
    """Stream the response of a POST to *url* into *local_path*.

    The body is written to ``<local_path>.part`` first and renamed onto
    *local_path* only after the transfer completed. A failed or interrupted
    attempt therefore never leaves a truncated file at the final path, where
    an "already exists" check would mistake it for a finished download.

    Args:
        url: Endpoint to POST to.
        headers: Request headers (authorization, API arguments, ...).
        local_path: Destination file path.

    Raises:
        Whatever ``requests`` raises for connection/HTTP errors (including
        ``raise_for_status``) and ``OSError`` for local file problems; the
        partial file is removed before the exception propagates.
    """
    partial_path = f"{local_path}.part"
    try:
        with requests.post(url, headers=headers, stream=True, timeout=60) as r:
            r.raise_for_status()
            # r.raw bypasses requests' transfer decoding unless asked for it,
            # which would store gzip/deflate bytes instead of the image.
            r.raw.decode_content = True
            with open(partial_path, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        # Atomic on the same filesystem: readers see the old state or the full file.
        os.replace(partial_path, local_path)
    except Exception:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

def download_dropbox_images_enhanced(folder_name: str, local_dir: str = "_INPUT_IMAGES/") -> tuple:
    """Fetch the images of a Dropbox Business shared folder into *local_dir*.

    Two modes:
      * Local test mode: when *folder_name* is ``"test_images"`` and a local
        ``test_images`` directory exists, its images are copied instead of
        contacting Dropbox.
      * Business API mode: refreshes an access token, acts as the first team
        member returned by the team listing, locates the shared folder whose
        name matches the first path component of *folder_name*
        (case-insensitive), lists the remaining nested path inside that
        folder's namespace (following pagination) and downloads every image
        that is not already present locally.

    Args:
        folder_name: Shared folder name, optionally followed by a nested path
            (e.g. ``"Batches/25-06-15/Social"``).
        local_dir: Directory that receives the images; created if missing.

    Returns:
        ``(success, download_stats)``. ``success`` is True when at least one
        image is available locally afterwards, i.e. it was downloaded now or
        skipped because it already existed. ``download_stats`` holds the
        attempted/successful/failed/skipped counters, the start time and the
        start/peak system memory percentages.

    Errors are logged and reported through ``success`` rather than raised.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
    download_stats = {
        'attempted': 0,
        'successful': 0,
        'failed': 0,
        'skipped': 0,
        'start_time': time.time(),
        'memory_usage': {
            'start': psutil.virtual_memory().percent,
            'peak': psutil.virtual_memory().percent
        }
    }

    logging.info(f"🔄 Starting Enhanced Business API download for folder: '{folder_name}'")

    # Local test mode for development
    if folder_name == "test_images" and os.path.exists("test_images"):
        logging.info("🧪 Entering local test mode")
        os.makedirs(local_dir, exist_ok=True)

        test_images = [f for f in os.listdir("test_images")
                       if f.lower().endswith(image_extensions)]

        if not test_images:
            logging.warning("⚠️ Local 'test_images' folder is empty")
            return False, download_stats

        for img in test_images:
            shutil.copy2(os.path.join("test_images", img), os.path.join(local_dir, img))
            download_stats['successful'] += 1

        download_stats['attempted'] = len(test_images)
        logging.info(f"✅ Copied {len(test_images)} files from test mode")
        return True, download_stats

    # Enhanced Business API authentication
    if not all([DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN]):
        logging.error("❌ Missing Dropbox credentials")
        return False, download_stats

    access_token = refresh_access_token(DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN)
    if not access_token:
        logging.error("❌ Could not obtain valid Dropbox access token")
        return False, download_stats

    try:
        # Enhanced Business Team Authentication
        team_dbx = dropbox.DropboxTeam(access_token)
        members = team_dbx.team_members_list()

        if not members.members:
            logging.error("❌ No team members found in Dropbox Business account")
            return False, download_stats

        # NOTE(review): the first listed member is used for all calls; this
        # assumes that member can see the target shared folder — confirm.
        member = members.members[0]
        member_dbx = team_dbx.as_user(member.profile.team_member_id)
        logging.info(f"🤝 Authenticated as team member: {member.profile.name.display_name}")

        # Enhanced shared folder discovery with namespace support
        shared_folders = member_dbx.sharing_list_folders().entries

        # Handle nested folder paths (e.g., "Batches/25-06-15/Social")
        root_folder_name, _, nested_path = folder_name.partition('/')

        target_folder = next((f for f in shared_folders
                              if f.name.lower() == root_folder_name.lower()), None)

        if not target_folder:
            logging.error(f"❌ Root folder '{root_folder_name}' not found in shared folders")
            available_folders = [f.name for f in shared_folders[:5]]
            logging.info(f"📁 Available folders: {available_folders}")
            return False, download_stats

        logging.info(f"📁 Found target folder '{target_folder.name}' with namespace ID: {target_folder.shared_folder_id}")

        # Enhanced namespace-based file listing
        path_to_list = f'/{nested_path}' if nested_path else ''

        # Business API headers with namespace context
        headers = {
            'Authorization': f'Bearer {access_token}',
            'Content-Type': 'application/json',
            'Dropbox-API-Select-User': member.profile.team_member_id,
            'Dropbox-API-Path-Root': json.dumps({
                ".tag": "namespace_id",
                "namespace_id": target_folder.shared_folder_id
            })
        }

        response = requests.post('https://api.dropboxapi.com/2/files/list_folder',
                                 headers=headers,
                                 json={'path': path_to_list},
                                 timeout=30)
        response.raise_for_status()
        page = response.json()
        entries = list(page.get('entries', []))

        # list_folder returns results in pages; keep following the cursor,
        # otherwise large folders are silently truncated to the first page.
        while page.get('has_more'):
            response = requests.post('https://api.dropboxapi.com/2/files/list_folder/continue',
                                     headers=headers,
                                     json={'cursor': page.get('cursor')},
                                     timeout=30)
            response.raise_for_status()
            page = response.json()
            entries.extend(page.get('entries', []))

        image_files = [e for e in entries
                       if e.get('.tag') == 'file' and
                       e.get('name', '').lower().endswith(image_extensions)]

        if not image_files:
            logging.warning(f"⚠️ No image files found in folder: '{folder_name}'")
            return False, download_stats

        os.makedirs(local_dir, exist_ok=True)

        # Enhanced batch download with progress tracking
        download_stats['attempted'] = len(image_files)

        for i, entry in enumerate(tqdm(image_files, desc="Downloading images"), 1):
            file_path = entry.get('path_display')
            local_path = os.path.join(local_dir, entry.get('name'))

            # Update memory monitoring
            current_memory = psutil.virtual_memory().percent
            download_stats['memory_usage']['peak'] = max(download_stats['memory_usage']['peak'], current_memory)

            if os.path.exists(local_path):
                logging.debug(f"⏭️ Skipping existing file ({i}/{len(image_files)}): {entry.get('name')}")
                download_stats['skipped'] += 1
                continue

            logging.info(f"⬇️ Downloading ({i}/{len(image_files)}): '{entry.get('name')}'...")
            download_headers = {
                'Authorization': f'Bearer {access_token}',
                'Dropbox-API-Select-User': member.profile.team_member_id,
                'Dropbox-API-Arg': json.dumps({'path': file_path}),
                'Dropbox-API-Path-Root': headers['Dropbox-API-Path-Root']
            }

            # Best effort per file: one failed download must not abort the batch.
            try:
                download_file_simple('https://content.dropboxapi.com/2/files/download',
                                     download_headers, local_path)
                download_stats['successful'] += 1
            except Exception as e:
                logging.error(f"❌ Failed to download {entry.get('name')}: {e}")
                download_stats['failed'] += 1

        total_time = time.time() - download_stats['start_time']
        success_rate = (download_stats['successful'] / download_stats['attempted']) * 100 if download_stats['attempted'] > 0 else 0

        logging.info(f"✅ Download complete: {download_stats['successful']}/{download_stats['attempted']} files ({success_rate:.1f}%) in {total_time:.1f}s")

        # Files skipped because they already exist locally are as usable as
        # fresh downloads; a fully cached re-run must not be reported as failure.
        images_available = download_stats['successful'] + download_stats['skipped']
        return images_available > 0, download_stats

    except dropbox.exceptions.AuthError as e:
        logging.error(f"❌ Dropbox authentication error: {e}")
        return False, download_stats
    except Exception as e:
        logging.error(f"❌ Business API error: {e}")
        return False, download_stats

# Execute download step
print("🔥 EXECUTING ENHANCED DROPBOX DOWNLOAD")
print("=" * 45)

# image_folder_name is not defined in this cell (presumably set during config — confirm).
download_success, download_stats = download_dropbox_images_enhanced(image_folder_name, "_INPUT_IMAGES/")

if download_success:
    print(f"✅ Download completed successfully!")
    print(f"📊 Downloaded: {download_stats['successful']} files")
    print(f"⏭️ Skipped: {download_stats['skipped']} existing files")
    print(f"❌ Failed: {download_stats['failed']} files")
    print(f"💾 Memory usage: {download_stats['memory_usage']['start']:.1f}% → {download_stats['memory_usage']['peak']:.1f}%")
else:
    print("⚠️ Download failed, checking for existing images...")

# Check available images
# Runs regardless of the download outcome so leftovers from earlier runs are reported too.
# NOTE(review): only .jpg/.jpeg/.png are counted here, while the downloader
# above also accepts .gif/.bmp — confirm which set is intended.
try:
    if os.path.exists("_INPUT_IMAGES/"):
        image_files = [f for f in os.listdir("_INPUT_IMAGES/") if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        print(f"📊 Images available for processing: {len(image_files)}")
        if image_files:
            print(f"📁 Sample files: {', '.join(image_files[:3])}{'...' if len(image_files) > 3 else ''}")
    else:
        print("📁 No _INPUT_IMAGES/ directory found")
        image_files = []
except Exception as e:
    # Listing is informational only; fall back to an empty list instead of aborting the cell.
    print(f"⚠️ Error checking image files: {e}")
    image_files = []

pipeline_timer.end_step("Dropbox Integration")

# Static feature summary; only the last line reports a measured value.
print("✅ Enhanced Dropbox Business API integration complete")
print("🔧 SIMPLIFIED OAuth2 token refresh with standard retry")
print("🏢 Team namespace support and member delegation configured")
print("📁 Nested folder path support with comprehensive error handling")
print("🛡️ Local test mode fallback and enhanced metrics tracking")
print(f"💾 Memory monitoring with psutil integrated")
print(f"⏱️ Step duration: {pipeline_timer.get_step_duration('Dropbox Integration'):.1f} seconds")

# NOTE(review): the "" prefix is a no-op; the Step 5 cell prints "\n" in the same place — confirm intent.
print("" + "=" * 65)
print("🏁 Step 4 Complete: SIMPLIFIED Dropbox Business API Ready")

## Step 5: Load Finetuned LLaVA Model
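Loads the base LLaVA model (`llava-hf/llava-1.5-7b-hf`) with 4-bit quantization, then applies the fine-tuned PEFT adapter stored on Google Drive, after validating that the required adapter files exist.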

In [None]:
# Step 5 banner (the version string is printed verbatim).
print("🔧 AGStock Keyworder Enhanced v4.0 FIXED - Loading Finetuned LLaVA Model")
print("=" * 65)

# Start timing; the matching end_step call comes after the adapter has been loaded.
pipeline_timer.start_step("Model Loading")

def validate_model_paths():
    """Confirm the project directory, weights directory and adapter files exist.

    Checks, in order, ``DRIVE_PROJECT_PATH``, ``MODEL_PATH`` and the two PEFT
    adapter files inside ``MODEL_PATH``. The first failing check prints a
    diagnostic and ends validation.

    Returns:
        True when everything is present, otherwise False.
    """
    print("🔍 Validating model paths...")

    project_dir_present = os.path.exists(DRIVE_PROJECT_PATH)
    if not project_dir_present:
        print(f"❌ Project directory not found: {DRIVE_PROJECT_PATH}")
        return False

    weights_dir_present = os.path.exists(MODEL_PATH)
    if not weights_dir_present:
        print(f"❌ Model weights directory not found: {MODEL_PATH}")
        return False

    # Files a PEFT adapter directory must contain for loading to work
    adapter_files = ('adapter_config.json', 'adapter_model.safetensors')
    absent = [
        name for name in adapter_files
        if not os.path.exists(os.path.join(MODEL_PATH, name))
    ]
    if absent:
        print(f"❌ Missing PEFT adapter files: {absent}")
        return False

    print("✅ All required model files found")
    return True

# Validate paths before loading
# Fail fast: abort the cell before any large model load if the adapter files are missing.
if not validate_model_paths():
    print("💡 Please ensure the finetuned model weights are properly uploaded to Google Drive")
    print(f"   Expected location: {MODEL_PATH}")
    raise Exception("Model validation failed")

# Enhanced BitsAndBytesConfig with BUILD_SPEC.md requirements
model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,  # NEW: Enhanced quantization accuracy
    bnb_4bit_quant_type="nf4"        # NEW: NF4 quantization for better performance
)

print(f"🔧 Loading base LLaVA model: {model_id}")
print(f"⚙️ Using enhanced 4-bit quantization with double quantization and NF4")
print(f"💾 Pre-load GPU Memory: {get_gpu_memory_usage():.2f}GB")

# Each load phase is timed separately so the summary below can report a total.
base_load_start = time.time()

base_model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)

base_load_time = time.time() - base_load_start
print(f"✅ Base LLaVA model loaded in {base_load_time:.2f}s")
print(f"💾 GPU Memory after base model: {get_gpu_memory_usage():.2f}GB")

print("🔧 Loading processor...")
processor_start = time.time()
processor = AutoProcessor.from_pretrained(model_id)
processor_load_time = time.time() - processor_start
print(f"✅ Processor loaded in {processor_load_time:.2f}s")

# Wrap the quantized base model with the adapter weights found in MODEL_PATH.
print(f"🔧 Loading PEFT adapter from: {MODEL_PATH}")
adapter_start = time.time()
model = PeftModel.from_pretrained(base_model, MODEL_PATH)
adapter_load_time = time.time() - adapter_start
print(f"✅ PEFT adapter loaded in {adapter_load_time:.2f}s")

# Enhanced GPU cleanup following BUILD_SPEC.md requirements
# NOTE(review): these calls are made unconditionally; presumably a CUDA device is always present here — confirm.
print("🧹 Performing GPU memory cleanup...")
cleanup_start = time.time()
torch.cuda.empty_cache()     # Clear GPU cache
torch.cuda.synchronize()     # Synchronize CUDA operations
cleanup_time = time.time() - cleanup_start
print(f"✅ GPU cleanup completed in {cleanup_time:.3f}s")

# Final validation
# NOTE(review): this only runs the processor on the prompt text (images=None); the
# model itself is never called here, so the check does not exercise the loaded weights.
print("🔍 Validating finetuned model...")
test_prompt = "USER: <image>\nAnalyze this agricultural image and provide relevant keywords.\nASSISTANT:"
try:
    test_inputs = processor(text=test_prompt, images=None, return_tensors="pt")
    print("✅ Model validation successful")
except Exception as e:
    print(f"❌ Model validation failed: {e}")
    raise

pipeline_timer.end_step("Model Loading")

# Sum of the three timed load phases; the cleanup time is reported separately.
total_load_time = base_load_time + processor_load_time + adapter_load_time
final_gpu_memory = get_gpu_memory_usage()

print(f"\n🎉 Enhanced finetuned specialist model ready!")
print(f"   📊 Total load time: {total_load_time:.2f}s")
print(f"   🧹 GPU cleanup time: {cleanup_time:.3f}s")
print(f"   💾 Final GPU memory: {final_gpu_memory:.2f}GB")
print(f"   ⚙️ Enhanced quantization: Double-quant + NF4")
print(f"   🚀 Ready for agricultural keyword generation!")
print(f"   ⏱️ Step duration: {pipeline_timer.get_step_duration('Model Loading'):.1f} seconds")

print("\n" + "=" * 65)
print("🏁 Step 5 Complete: Enhanced LLaVA Model Ready")

## Step 6: 🔥 Enhanced LLaVA Specialist Inference with Keyword Filtering
Runs enhanced LLaVA model inference with comprehensive DEBUG monitoring, keyword filtering, and geographic correction.

In [None]:
def generate_keywords_with_finetuned_model_debug(image_path, model, processor):
    """Generate filtered keywords for one image and report timing/memory data.

    Phases: load the image as RGB, build model inputs from the fixed prompt,
    sample a completion, decode it, keep the text after the last
    ``ASSISTANT:`` marker, and pass that text through
    ``filter_llava_keywords`` (at most 12 keywords).

    Args:
        image_path: Path of the image file to analyse.
        model: Model object exposing ``generate``.
        processor: Processor used to build inputs and decode outputs.

    Returns:
        A dict. On success it contains ``keywords`` (filtered),
        ``raw_keywords`` (unfiltered model text), ``timing`` (per-phase
        seconds), ``memory`` (pre/post inference GPU usage and delta) and
        ``filtering_applied=True``. On any exception nothing is raised;
        the dict instead has ``keywords`` set to
        ``'processing_failed: <error>'``, an empty ``raw_keywords``,
        ``timing`` with only ``total``, ``memory={'error': True}`` and
        ``filtering_applied=False``.
    """
    start_time = time.time()

    # Image loading phase
    try:
        image_load_start = time.time()
        # convert() returns an independent image, so the file handle opened by
        # Image.open can be released right away instead of lingering until GC.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')
        image_load_time = time.time() - image_load_start

        # ENHANCED: More constraining prompt for better keyword focus
        prompt = """USER: <image>
Analyze this agricultural image and provide exactly 8-12 essential keywords focusing on:
1. Primary subject (cattle, crop, equipment)
2. Specific breed/variety if clearly visible
3. Basic setting (pasture, barn, field)
4. Simple descriptors (calf, feeding, harvesting)

Avoid technical jargon, repetitive terms, and processing terminology.
Focus on what buyers would search for.

ASSISTANT:"""

        # Input processing phase
        processing_start = time.time()
        inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
        processing_time = time.time() - processing_start

        # Inference phase with memory monitoring
        pre_inference_memory = get_gpu_memory_usage()
        inference_start = time.time()

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=80,  # REDUCED: Limit to reduce keyword overflow
                do_sample=True,
                temperature=0.5,    # REDUCED: More focused responses
                top_k=30,          # REDUCED: Less randomness
                top_p=0.85,        # REDUCED: More focused sampling
                pad_token_id=processor.tokenizer.eos_token_id
            )

        inference_time = time.time() - inference_start
        post_inference_memory = get_gpu_memory_usage()

        # Decoding phase
        decoding_start = time.time()
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        # The decoded text echoes the prompt; keep only what follows the last marker.
        result = decoded_output.split("ASSISTANT:")[-1].strip() if "ASSISTANT:" in decoded_output else decoded_output
        decoding_time = time.time() - decoding_start

        # ENHANCED: Apply keyword filtering and prioritization
        filtered_result = filter_llava_keywords(result, max_keywords=12)

        total_time = time.time() - start_time

        # Return results with timing data and filtering info
        return {
            'keywords': filtered_result,
            'raw_keywords': result,  # Keep original for comparison
            'timing': {
                'image_load': image_load_time,
                'processing': processing_time,
                'inference': inference_time,
                'decoding': decoding_time,
                'total': total_time
            },
            'memory': {
                'pre_inference': pre_inference_memory,
                'post_inference': post_inference_memory,
                'delta': post_inference_memory - pre_inference_memory
            },
            'filtering_applied': True
        }

    except Exception as e:
        # Deliberately best-effort: one bad image must not stop the batch,
        # but the failure is logged so it is not silently lost.
        logging.error(f"Keyword generation failed for {image_path}: {e}")
        return {
            'keywords': f'processing_failed: {str(e)}',
            'raw_keywords': '',
            'timing': {'total': time.time() - start_time},
            'memory': {'error': True},
            'filtering_applied': False
        }

# Initialize processing variables
# specialist_results collects one dict per image; processing_stats aggregates counters for the summary.
specialist_results = []
local_image_dir = "_INPUT_IMAGES/"
processing_stats = {
    'total_processing_time': 0,
    'successful_inferences': 0,
    'failed_inferences': 0,
    'keyword_filtering_applied': 0,
    'memory_peaks': [],
    'avg_inference_time': 0
}

print("🚀 Starting Enhanced Specialist Inference with Keyword Filtering")
print("=" * 70)

pipeline_timer.start_step("LLaVA Specialist Inference")

# ENHANCED: Try Business API first, then fall back to local images
# The download function is invoked again here; the returned stats are discarded.
print("📥 PHASE 1: Image Acquisition")
dropbox_success, _ = download_dropbox_images_enhanced(image_folder_name, local_image_dir)

if not dropbox_success:
    print("⚠️ Dropbox Business API failed. Checking for local images...")

    # Check if local images already exist from previous runs
    # These branches only print; nothing is raised, so execution continues to Phase 2 either way.
    if os.path.exists(local_image_dir):
        existing_images = [f for f in os.listdir(local_image_dir)
                          if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        if existing_images:
            print(f"✅ Found {len(existing_images)} existing images in {local_image_dir}")
            print("🔄 Continuing pipeline with existing images...")
        else:
            print(f"❌ No images found in {local_image_dir}. Cannot proceed.")
            print("💡 Please check Dropbox credentials or manually upload images to _INPUT_IMAGES/")
    else:
        print(f"❌ Directory {local_image_dir} does not exist. Cannot proceed.")

print("\n🤖 PHASE 2: Enhanced LLaVA Specialist Inference with Keyword Filtering")
print("=" * 75)

# Process images (from Dropbox or local) with enhanced keyword filtering
# For every .jpg/.jpeg/.png in local_image_dir: run the specialist model, update
# processing_stats, append one record to specialist_results, then print a
# summary and export the records to CSV.
if os.path.exists(local_image_dir):
    image_files = [f for f in os.listdir(local_image_dir)
                  if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

    if image_files:
        print(f"🎯 Processing {len(image_files)} images with enhanced keyword filtering")
        print(f"💾 Pre-processing GPU Memory: {get_gpu_memory_usage():.2f}GB")
        print(f"🔧 NEW: Keyword filtering, geographic correction, and redundancy removal")

        # Enhanced processing with comprehensive DEBUG logging
        with tqdm(total=len(image_files), desc="🔄 Enhanced LLaVA", unit="img", ncols=100) as pbar:
            for i, filename in enumerate(image_files):
                try:
                    # Process image with enhanced filtering
                    image_path = os.path.join(local_image_dir, filename)
                    result = generate_keywords_with_finetuned_model_debug(image_path, model, processor)

                    # Extract timing and memory data
                    timing = result.get('timing', {})
                    memory = result.get('memory', {})
                    keywords = result.get('keywords', 'processing_failed')
                    raw_keywords = result.get('raw_keywords', '')
                    filtering_applied = result.get('filtering_applied', False)

                    # Failures come back as 'processing_failed: <error>', so the
                    # marker must be matched as a substring, never by equality.
                    inference_failed = 'processing_failed' in keywords

                    # Update statistics
                    if not inference_failed:
                        processing_stats['successful_inferences'] += 1
                        if filtering_applied:
                            processing_stats['keyword_filtering_applied'] += 1
                        if 'total' in timing:
                            processing_stats['total_processing_time'] += timing['total']
                    else:
                        processing_stats['failed_inferences'] += 1

                    # Track memory peaks
                    if 'post_inference' in memory:
                        processing_stats['memory_peaks'].append(memory['post_inference'])

                    # Create detailed progress display
                    progress_percent = ((i + 1) / len(image_files)) * 100
                    total_time = timing.get('total', 0)
                    current_memory = memory.get('post_inference', get_gpu_memory_usage())

                    # Calculate keyword count for quality tracking.
                    # A failed inference contributes 0 (its error text is not a
                    # keyword list) and empty fragments are not counted.
                    if inference_failed:
                        keyword_count = 0
                    else:
                        keyword_count = len([kw for kw in keywords.split(',') if kw.strip()])

                    # Calculate ETA
                    if processing_stats['successful_inferences'] > 0:
                        avg_time = processing_stats['total_processing_time'] / processing_stats['successful_inferences']
                        remaining_images = len(image_files) - (i + 1)
                        eta_seconds = remaining_images * avg_time
                        eta_display = f"{eta_seconds:.1f}s" if eta_seconds < 60 else f"{eta_seconds/60:.1f}min"
                    else:
                        eta_display = "calculating..."

                    # Enhanced progress update with filtering info
                    pbar.set_postfix_str(
                        f"⏱️{total_time:.2f}s | 🖥️{current_memory:.1f}GB | 🏷️{keyword_count}kw | 📊{progress_percent:.1f}% | ETA:{eta_display}"
                    )

                    # Store result with enhanced metadata
                    specialist_results.append({
                        "filename": filename,
                        "specialist_keywords": keywords,
                        "raw_keywords": raw_keywords,  # Store original for comparison
                        "filtering_applied": filtering_applied,
                        "keyword_count": keyword_count,
                        "debug_timing": timing,
                        "debug_memory": memory
                    })

                    # Memory cleanup every 5 images
                    if (i + 1) % 5 == 0:
                        cleanup_memory()
                        pbar.set_postfix_str(f"🧹 Memory cleanup | 🖥️{get_gpu_memory_usage():.1f}GB")

                except Exception as e:
                    logging.error(f"Failed to process {filename}: {e}")
                    # Add a placeholder entry to maintain data integrity
                    specialist_results.append({
                        "filename": filename,
                        "specialist_keywords": "processing failed",
                        "raw_keywords": "",
                        "filtering_applied": False,
                        "keyword_count": 0,
                        "debug_error": str(e)
                    })
                    processing_stats['failed_inferences'] += 1
                    pbar.set_postfix_str(f"❌ Error: {filename}")

                pbar.update(1)

        # Final cleanup and statistics
        cleanup_memory()
        final_memory = get_gpu_memory_usage()

        # Calculate enhanced final statistics
        total_images = len(specialist_results)
        success_rate = (processing_stats['successful_inferences'] / total_images * 100) if total_images > 0 else 0
        filtering_rate = (processing_stats['keyword_filtering_applied'] / processing_stats['successful_inferences'] * 100) if processing_stats['successful_inferences'] > 0 else 0
        avg_processing_time = (processing_stats['total_processing_time'] / processing_stats['successful_inferences']) if processing_stats['successful_inferences'] > 0 else 0
        peak_memory = max(processing_stats['memory_peaks']) if processing_stats['memory_peaks'] else 0

        # Calculate average keyword count
        successful_results = [r for r in specialist_results if r.get('keyword_count', 0) > 0]
        avg_keyword_count = sum(r['keyword_count'] for r in successful_results) / len(successful_results) if successful_results else 0

        pipeline_timer.end_step("LLaVA Specialist Inference")

        print(f"\n✅ Enhanced specialist inference complete!")
        print(f"📊 ENHANCED PROCESSING STATISTICS:")
        print(f"   🎯 Total Images: {total_images}")
        print(f"   ✅ Successful: {processing_stats['successful_inferences']} ({success_rate:.1f}%)")
        print(f"   ❌ Failed: {processing_stats['failed_inferences']}")
        print(f"   🔧 Keyword Filtering Applied: {processing_stats['keyword_filtering_applied']} ({filtering_rate:.1f}%)")
        print(f"   🏷️ Average Keywords per Image: {avg_keyword_count:.1f}")
        print(f"   ⏱️ Avg Processing Time: {avg_processing_time:.2f}s/image")
        print(f"   🖥️ Peak GPU Memory: {peak_memory:.2f}GB")
        print(f"   💾 Final GPU Memory: {final_memory:.2f}GB")
        print(f"   ⏱️ Step duration: {pipeline_timer.get_step_duration('LLaVA Specialist Inference'):.1f} seconds")

        print(f"🚀 Ready for next pipeline step with enhanced keywords!")


        # OPTIONAL: Export LLaVA results to CSV for analysis
        if specialist_results:
            print("\n📊 Exporting LLaVA results to CSV for analysis...")
            llava_csv_filename = "llava_keywords_output.csv"

            try:
                with open(llava_csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
                    fieldnames = ['filename', 'specialist_keywords', 'raw_keywords', 'keyword_count', 'filtering_applied']
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writeheader()

                    for result in specialist_results:
                        writer.writerow({
                            'filename': result.get('filename', ''),
                            'specialist_keywords': result.get('specialist_keywords', ''),
                            'raw_keywords': result.get('raw_keywords', ''),
                            'keyword_count': result.get('keyword_count', 0),
                            'filtering_applied': result.get('filtering_applied', False)
                        })

                if os.path.exists(llava_csv_filename):
                    file_size = os.path.getsize(llava_csv_filename)
                    print(f"✅ LLaVA keywords exported to {llava_csv_filename} ({file_size:,} bytes)")

                    # Provide download link
                    print(f"\n⬇️ DOWNLOADING LLaVA CSV RESULTS...")
                    bulletproof_download_csv(llava_csv_filename, "LLaVA Keywords Output")
                else:
                    print(f"❌ Failed to create {llava_csv_filename}")

            except Exception as e:
                # The CSV is an optional analysis artefact; report and carry on.
                print(f"❌ LLaVA CSV export failed: {e}")
                logging.error(f"LLaVA CSV export error: {e}")
        else:
            print("⚠️ No specialist results to export")

        print(f"🚀 Ready for next pipeline step!")

    else:
        print(f"❌ No image files found in {local_image_dir}")
else:
    print(f"❌ Image directory {local_image_dir} not found")

# Closing summary for Step 6; printed whether or not any images were processed.
print(f"\n🎉 ENHANCED PROCESSING SUMMARY:")
print(f"   • Dropbox Download: {'✅ Success' if dropbox_success else '❌ Failed (using local)'}")
print(f"   • Images Processed: {len(specialist_results)}")
print(f"   • Keyword Filtering: {'✅ Applied' if processing_stats['keyword_filtering_applied'] > 0 else '❌ Not applied'}")
# This line is a fixed string; it does not reflect a measured value.
print(f"   • Geographic Correction: ✅ Ready for GPT-4 phase")
print(f"   • Ready for Enhanced GPT-4: {'✅ Yes' if specialist_results else '❌ No'}")

## Step 7: Save Intermediate JSON (WITH VALIDATION)
Securely save specialist inference results with comprehensive validation and backup strategies.

In [None]:
# Step 7 banner.
print("💾 PHASE 3: Intermediate Data Persistence with Enhanced Validation")
print("=" * 60)

# Start timing; end_step is called only on the successful save path below.
pipeline_timer.start_step("JSON Save and Validation")

# Validate data before saving
# Flow: escape string fields -> validate -> write primary JSON -> re-read and
# validate -> write timestamped backup -> verify backup -> print summary.
if validate_data_structure(specialist_results, "Step 7 - Save Intermediate"):
    try:
        # ENHANCED: Apply bulletproof escaping to all string data before JSON serialization
        # NOTE(review): json.dump escapes strings on its own; confirm that
        # bulletproof_escape_string does not lead to double-escaped values on disk.
        print("🔧 Applying bulletproof string escaping to specialist results...")
        escaped_results = []

        for item in specialist_results:
            escaped_item = {}
            for key, value in item.items():
                if isinstance(value, str):
                    # Apply bulletproof escaping to string values
                    escaped_item[key] = bulletproof_escape_string(value)
                elif isinstance(value, dict):
                    # Handle nested dictionaries (like debug_timing, debug_memory)
                    escaped_item[key] = {
                        nested_key: (bulletproof_escape_string(nested_value)
                                     if isinstance(nested_value, str) else nested_value)
                        for nested_key, nested_value in value.items()
                    }
                else:
                    escaped_item[key] = value
            escaped_results.append(escaped_item)

        print(f"✅ Bulletproof escaping applied to {len(escaped_results)} records")

        # ENHANCED: Progressive JSON validation before saving
        print("🔍 Performing progressive JSON validation...")
        validation_passed, validation_message = progressive_json_validation(
            escaped_results,
            "Specialist Results - Pre-Save"
        )

        if not validation_passed:
            print(f"❌ JSON validation failed: {validation_message}")
            print("🔄 Attempting to save without escaping as fallback...")
            escaped_results = specialist_results
        else:
            print("✅ Progressive JSON validation passed")

        # Primary save with enhanced error handling
        primary_filename = 'intermediate_results.json'
        print(f"💾 Saving primary intermediate file: {primary_filename}")

        with open(primary_filename, 'w', encoding='utf-8') as f:
            json.dump(escaped_results, f, indent=2, ensure_ascii=False)

        # ENHANCED: Post-save validation
        # Round-trip the file to prove it is loadable and complete.
        print("🔍 Performing post-save validation...")
        try:
            with open(primary_filename, 'r', encoding='utf-8') as f:
                test_load = json.load(f)

            # Validate loaded data structure
            post_save_validation, post_save_message = progressive_json_validation(
                test_load,
                "Specialist Results - Post-Save"
            )

            if len(test_load) == len(escaped_results) and post_save_validation:
                print(f"✅ Primary save successful: {len(test_load)} records")
                print(f"✅ Post-save validation: {post_save_message}")
            else:
                raise ValueError(f"Data corruption detected: expected {len(escaped_results)}, got {len(test_load)} or validation failed: {post_save_message}")

        except Exception as e:
            print(f"❌ Post-save validation failed: {e}")
            raise

        # Create backup with timestamp and enhanced validation
        import datetime
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_filename = f'intermediate_results_backup_{timestamp}.json'

        print(f"🔄 Creating timestamped backup: {backup_filename}")
        with open(backup_filename, 'w', encoding='utf-8') as f:
            json.dump(escaped_results, f, indent=2, ensure_ascii=False)

        # Verify backup integrity
        # A bad backup only produces a warning; the primary file is already verified.
        try:
            with open(backup_filename, 'r', encoding='utf-8') as f:
                backup_test = json.load(f)
            backup_validation, backup_message = progressive_json_validation(
                backup_test,
                "Backup File Validation"
            )
            if len(backup_test) == len(escaped_results) and backup_validation:
                print(f"✅ Backup validation successful: {backup_message}")
            else:
                print(f"⚠️ Backup validation warning: {backup_message}")
        except Exception as e:
            print(f"⚠️ Backup validation failed: {e}")

        # Generate enhanced save summary with quality metrics
        total_keywords = 0
        successful_saves = 0
        escaped_string_count = 0

        # Failed records appear in two spellings: the inference function emits
        # 'processing_failed: <error>' while the loop placeholder uses
        # 'processing failed'. Both must be excluded from the success count.
        failure_markers = ('processing failed', 'processing_failed')

        for item in escaped_results:
            keywords_value = item.get('specialist_keywords')
            if keywords_value is not None and not any(marker in keywords_value for marker in failure_markers):
                successful_saves += 1
                if isinstance(keywords_value, str):
                    total_keywords += len(keywords_value.split(','))

            # Count escaped strings for quality assurance
            for key, value in item.items():
                if isinstance(value, str) and ('\\' in value or '"' in value):
                    escaped_string_count += 1

        pipeline_timer.end_step("JSON Save and Validation")

        print(f"\n📊 ENHANCED SAVE SUMMARY:")
        print(f"   📁 Primary File: {primary_filename}")
        print(f"   🔄 Backup File: {backup_filename}")
        print(f"   📝 Total Records: {len(escaped_results)}")
        print(f"   ✅ Successful Inferences: {successful_saves}")
        print(f"   🏷️ Total Keywords Generated: ~{total_keywords}")
        print(f"   🛡️ Strings Escaped: {escaped_string_count}")
        print(f"   🔍 JSON Validation: ✅ Progressive validation applied")
        print(f"   💾 File Integrity: ✅ Verified with post-save checks")
        print(f"   ⏱️ Save Duration: {pipeline_timer.get_step_duration('JSON Save and Validation'):.1f} seconds")

    except Exception as e:
        # Saving is not fatal to the pipeline: the results stay in memory.
        logging.error(f"❌ Failed to save intermediate results: {e}")
        print(f"❌ Enhanced save operation failed: {e}")
        print("💡 Specialist results remain in memory for next step")
        print("🔧 Consider manual JSON export if critical")
else:
    print("⚠️ No valid data to save from specialist inference.")
    print("🔄 Pipeline will attempt to continue with available data")

## Step 8: Load and Enrich Intermediate Data (WITH ERROR HANDLING)
Load specialist results and enrich with photographer matching information using bulletproof error recovery.

In [None]:
print("🔄 PHASE 4: Data Loading and Enrichment with AI-Driven State Inference - NOW WITH BULLETPROOF ERROR HANDLING")
print("=" * 80)

pipeline_timer.start_step("Data Enrichment")

# Multi-source data loading with fallback strategy:
#   1. the intermediate_results.json file on disk, then
#   2. the in-memory specialist_results list, if one exists.
intermediate_data = []
data_source = "unknown"

# Primary: Try loading from saved JSON file
try:
    print("📂 Attempting to load from intermediate_results.json...")
    # The save step writes this file as UTF-8 with ensure_ascii=False, so it
    # must be read back as UTF-8 explicitly; the platform default encoding is
    # not guaranteed to be UTF-8.
    with open('intermediate_results.json', 'r', encoding='utf-8') as f:
        intermediate_data = json.load(f)
    data_source = "intermediate_results.json"
    print(f"✅ Loaded {len(intermediate_data)} records from {data_source}")
except (OSError, ValueError) as e:
    # OSError covers a missing or unreadable file; ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError. Any of them should trigger
    # the in-memory fallback rather than abort the cell.
    print(f"⚠️ Failed to load from file: {e}")
    intermediate_data = []

    # Fallback: Use in-memory specialist_results
    if 'specialist_results' in globals() and specialist_results:
        print("🔄 Using in-memory specialist_results as fallback")
        intermediate_data = specialist_results
        data_source = "memory_fallback"
        print(f"✅ Loaded {len(intermediate_data)} records from memory")
    else:
        print("❌ No fallback data available")

# BULLETPROOF: Validate loaded data with enhanced checks
if not validate_data_structure(intermediate_data, "Step 8 - Load"):
    print("❌ Critical error: No valid data available for enrichment")
    raise Exception("Pipeline cannot continue without specialist inference data")

print(f"👥 Initializing photographer matching system...")

# Build the photographer matcher. A failure here is not fatal: the matcher is
# left as None and the enrichment loop falls back to default vendor info.
matcher = None
try:
    matcher = PhotographerMatcher(PHOTOGRAPHER_CSV_PATH)
    print("✅ Photographer matching system ready")
except Exception as init_error:
    matcher = None
    print(f"⚠️ Photographer matcher initialization failed: {init_error}")
    print("🔄 Continuing with default photographer info")
    logging.error(f"PhotographerMatcher init failed: {init_error}")

print(f"🔄 Enriching {len(intermediate_data)} records with BULLETPROOF error handling...")

# Output list plus one zeroed counter per statistic reported after the loop.
enriched_data = []
enrichment_stats = dict.fromkeys(
    (
        'processed',
        'photographer_matches',
        'sku_extractions',
        'ai_state_inferences',
        'state_corrections',
        'errors',
        'critical_errors',       # errors that forced a degraded record
        'recoverable_errors',    # errors the loop could continue past
        'vendor_info_failures',  # failed or malformed matcher lookups
    ),
    0,
)

# BULLETPROOF: Main enrichment loop with comprehensive error handling.
#
# For every specialist record the loop:
#   1. extracts a SKU from the filename,
#   2. looks up vendor/photographer info for that SKU (when a matcher exists),
#   3. passes the specialist keywords to infer_photographer_state() to confirm
#      or replace the state,
#   4. merges the results into the record and appends it to enriched_data.
# Each stage handles its own errors so that one bad record never aborts the run.

# SKU patterns in priority order; compiled once instead of once per record.
sku_patterns = [
    re.compile(r'\b(\d{6,})\b'),   # 6+ consecutive digits
    re.compile(r'\b(\d{4,})\b'),   # 4+ consecutive digits (fallback)
    re.compile(r'AH[_-]?(\d+)'),   # AH prefix pattern
    re.compile(r'([A-Z]{2}\d+)'),  # Letter-number combinations
]

for item_index, item in enumerate(intermediate_data):
    item_filename = "unknown"
    try:
        # DEFENSIVE: Records that are not dicts, or that carry no filename,
        # cannot be enriched; count them and move on.
        if not isinstance(item, dict):
            logging.warning(f"Item {item_index}: Not a dictionary, skipping: {type(item)}")
            enrichment_stats['errors'] += 1
            continue

        if 'filename' not in item:
            logging.warning(f"Item {item_index}: Missing filename field, skipping")
            enrichment_stats['errors'] += 1
            continue

        item_filename = item['filename']
        logging.debug(f"Processing item {item_index}: {item_filename}")

        # --- 1. SKU extraction: the first pattern that matches wins ---
        sku = ''
        try:
            filename = str(item_filename)
            for pattern in sku_patterns:
                sku_match = pattern.search(filename)
                if sku_match:
                    sku = sku_match.group(1)
                    enrichment_stats['sku_extractions'] += 1
                    logging.debug(f"SKU extracted for {item_filename}: {sku}")
                    break
        except Exception as sku_error:
            logging.error(f"SKU extraction failed for {item_filename}: {sku_error}")
            enrichment_stats['recoverable_errors'] += 1
            sku = ''  # Continue with empty SKU

        # --- 2. Vendor / photographer lookup (defaults used on any failure) ---
        vendor_info = {'vendor': 'Unknown', 'photographer_match': 'no', 'state': 'Unknown'}
        try:
            if matcher and sku:
                looked_up = matcher.get_vendor_info(sku)

                if isinstance(looked_up, dict):
                    # Copy before filling in defaults and the inferred state so
                    # the dict returned by the matcher is never mutated here.
                    vendor_info = dict(looked_up)
                    vendor_info.setdefault('vendor', 'Unknown')
                    vendor_info.setdefault('photographer_match', 'no')
                    vendor_info.setdefault('state', 'Unknown')

                    if vendor_info.get('photographer_match') == 'yes':
                        enrichment_stats['photographer_matches'] += 1
                        logging.debug(f"Photographer match found for {item_filename}: {vendor_info['vendor']}")
                else:
                    logging.warning(f"Invalid vendor_info type for {item_filename}: {type(looked_up)}")
                    enrichment_stats['vendor_info_failures'] += 1
            else:
                if not matcher:
                    logging.debug(f"No matcher available for {item_filename}")
                if not sku:
                    logging.debug(f"No SKU available for {item_filename}")
        except Exception as vendor_error:
            logging.error(f"Vendor info retrieval failed for {item_filename} (SKU: {sku}): {vendor_error}")
            enrichment_stats['vendor_info_failures'] += 1
            enrichment_stats['recoverable_errors'] += 1
            # vendor_info keeps whatever was set before the failure

        # --- 3. AI-driven state inference ---
        # 'ai_state_inference' is 'yes' only when the inferred state differs
        # from the state supplied by the vendor lookup.
        vendor_info['ai_state_inference'] = 'no'
        original_state = vendor_info.get('state', 'Unknown')
        ai_generated_keywords = item.get('specialist_keywords', '')

        if ai_generated_keywords and ai_generated_keywords != 'processing failed':
            try:
                photographer_context = {
                    'state': original_state,
                    'photographer': vendor_info.get('vendor', 'Unknown')
                }

                inferred_state = infer_photographer_state(ai_generated_keywords, photographer_context)

                # DEFENSIVE: Normalise a non-string result to a string
                if not isinstance(inferred_state, str):
                    inferred_state = str(inferred_state) if inferred_state is not None else 'Unknown'

                if inferred_state != original_state:
                    enrichment_stats['ai_state_inferences'] += 1
                    if original_state == 'Unknown' or not original_state:
                        logging.info(f"AI inferred state '{inferred_state}' for {item_filename}")
                    else:
                        enrichment_stats['state_corrections'] += 1
                        logging.info(f"AI corrected state from '{original_state}' to '{inferred_state}' for {item_filename}")
                    vendor_info['ai_state_inference'] = 'yes'

                vendor_info['state'] = inferred_state

            except Exception as ai_inference_error:
                logging.error(f"AI state inference failed for {item_filename}: {ai_inference_error}")
                vendor_info['ai_state_inference'] = 'no'
                enrichment_stats['recoverable_errors'] += 1

        # --- 4. Merge results into the record (mutated in place, as before) ---
        item['sku'] = sku if sku else ''
        item.update(vendor_info)

        enriched_data.append(item)
        enrichment_stats['processed'] += 1

        # Progress logging for large datasets
        if (item_index + 1) % 100 == 0:
            logging.info(f"Processed {item_index + 1}/{len(intermediate_data)} items ({enrichment_stats['processed']} successful)")

    except Exception as outer_error:
        # CRITICAL: Last-resort handler. Exception (not a bare except) is used
        # so KeyboardInterrupt/SystemExit can still stop a long-running cell.
        logging.critical(f"Catastrophic error processing item {item_index} ({item_filename}): {outer_error}")
        enrichment_stats['critical_errors'] += 1

        # DEFENSIVE: Emit a degraded record so the image is not silently lost
        try:
            error_item = {
                'filename': item_filename,
                'sku': '',
                'vendor': 'Unknown',
                'photographer_match': 'no',
                'state': 'Unknown',
                'ai_state_inference': 'no',
                'critical_error': str(outer_error)
            }
            # Preserve the original fields (e.g. specialist_keywords) so later
            # steps still have data to work with.
            if isinstance(item, dict):
                for key, value in item.items():
                    error_item.setdefault(key, value)
            enriched_data.append(error_item)
        except Exception:
            logging.critical(f"Could not create error item for {item_filename}")

pipeline_timer.end_step("Data Enrichment")

# Abort early when the enriched records fail validation; otherwise print the
# full statistics report for this step.
if not validate_data_structure(enriched_data, "Step 8 - Enrich"):
    print("❌ Failed to enrich data properly.")
    print(f"⚠️ ERROR SUMMARY: {enrichment_stats}")
    raise Exception("Data enrichment failed - cannot proceed to GPT-4 refinement")

# Raw counts used by the percentage calculations below.
total_items = len(intermediate_data) if intermediate_data else 0
processed_count = enrichment_stats['processed']
error_total = (
    enrichment_stats['errors']
    + enrichment_stats['critical_errors']
    + enrichment_stats['recoverable_errors']
)

# Percentages fall back to 0 whenever the denominator is empty.
success_rate = (processed_count / total_items * 100) if total_items > 0 else 0
match_rate = (enrichment_stats['photographer_matches'] / processed_count * 100) if processed_count > 0 else 0
ai_inference_rate = (enrichment_stats['ai_state_inferences'] / processed_count * 100) if processed_count > 0 else 0
error_rate = (error_total / total_items * 100) if total_items > 0 else 0

print(f"✅ BULLETPROOF data enrichment complete!")
print(f"📊 COMPREHENSIVE ENRICHMENT STATISTICS:")
print(f"   📂 Data Source: {data_source}")
print(f"   📝 Total Input Records: {total_items}")
print(f"   📊 Total Output Records: {len(enriched_data)}")
print(f"   ✅ Successfully Processed: {processed_count} ({success_rate:.1f}%)")
print(f"   🏷️ SKU Extractions: {enrichment_stats['sku_extractions']}")
print(f"   👥 Photographer Matches: {enrichment_stats['photographer_matches']} ({match_rate:.1f}%)")
print(f"   🤖 AI State Inferences: {enrichment_stats['ai_state_inferences']} ({ai_inference_rate:.1f}%)")
print(f"   🔧 State Corrections: {enrichment_stats['state_corrections']}")
print(f"   ⚠️ Total Error Rate: {error_rate:.1f}%")
print(f"   🛡️ ERROR BREAKDOWN:")
print(f"      • General Errors: {enrichment_stats['errors']}")
print(f"      • Critical Errors: {enrichment_stats['critical_errors']}")
print(f"      • Recoverable Errors: {enrichment_stats['recoverable_errors']}")
print(f"      • Vendor Info Failures: {enrichment_stats['vendor_info_failures']}")
print(f"   ⏱️ Enrichment Duration: {pipeline_timer.get_step_duration('Data Enrichment'):.1f} seconds")
print(f"   🚀 Ready for GPT-4 Vision refinement!")

# Mirror the headline figure into the log for later debugging
logging.info(f"Enrichment completed: {processed_count}/{total_items} successful ({success_rate:.1f}%)")

In [None]:
# Status banner for the GPT-4 Vision stage: reports whether an OpenAI API key
# is configured, then lists the safeguards this stage advertises.
print("🔧 AGStock Keyworder Enhanced v4.0 FIXED - GPT-4 Vision Setup")
print("=" * 60)

if OPENAI_API_KEY:
    print("✅ OpenAI API key configured for GPT-4 Vision")
    print("🤖 Using gpt-4o model with geographic correction")
else:
    print("⚠️ OpenAI API key not configured")
    print("💡 GPT-4 Vision refinement will use enhanced fallback responses")

for status_line in (
    "🌍 Geographic correction system ready",
    "📏 Strict format compliance (8-12 keywords) enforced",
    "🛡️ Enhanced rate limiting and cost optimization active",
    "🔧 Enhanced fallback responses with agricultural intelligence",
):
    print(status_line)

## Step 9: 🔥 Enhanced GPT-4 Vision Refinement with Geographic Correction
Production-grade GPT-4 Vision processing with enhanced keyword filtering, geographic correction, and strict format compliance.

### Step 9 Implementation Overview

This step implements:
- API rate limiting with intelligent throttling
- Enhanced GPT-4o Vision calls with 4-step structured prompts
- Geographic correction based on photographer context
- Comprehensive error handling and fallback mechanisms

In [None]:
# Create the OpenAI client when an API key is configured; otherwise leave
# `client` as None so that later code can detect fallback mode.
print("AGStock Keyworder Enhanced v4.0 FIXED - GPT-4 Vision Setup")
print("=" * 60)

if OPENAI_API_KEY:
    # Imported lazily so the notebook still runs without the openai package
    # when no key is configured.
    from openai import OpenAI
    client = OpenAI(api_key=OPENAI_API_KEY)
    print("OpenAI GPT-4 Vision client initialized")
else:
    print("OpenAI API key not configured")
    print("GPT-4 Vision refinement will use fallback mode")
    client = None

print("\nStep 9 Ready: Enhanced GPT-4o Refinement with Rate Limiting")

In [None]:
# Step 9 Part 1: Rate Limiter Class

class SimpleRateLimiter:
    """Sliding-window rate limiter for API requests.

    Call ``wait_if_needed()`` immediately before each request. If
    ``requests_per_minute`` requests were already made during the last
    60 seconds, the call sleeps until the oldest of them leaves the window.

    Attributes:
        requests_per_minute: Maximum number of requests per 60-second window.
        request_times: Timestamps (``time.time()``) of the requests made in
            the last hour, oldest first.
        total_requests: Number of requests recorded since construction.
        throttle_events: Number of times a call had to sleep.
        total_throttle_delay: Total seconds spent sleeping.
    """

    def __init__(self, requests_per_minute=30):
        self.requests_per_minute = requests_per_minute
        self.request_times = []
        self.total_requests = 0
        self.throttle_events = 0
        self.total_throttle_delay = 0.0

    def wait_if_needed(self):
        """Block until another request is allowed, then record it."""
        now = time.time()

        # Keep one hour of history so the hourly usage figure is meaningful;
        # the one-minute window is derived from it below.
        self.request_times = [t for t in self.request_times if now - t < 3600]
        recent = [t for t in self.request_times if now - t < 60]

        # Check if we need to wait
        if recent and len(recent) >= self.requests_per_minute:
            sleep_time = 60 - (now - recent[0])
            if sleep_time > 0:
                logging.info(f"Rate limiting: waiting {sleep_time:.1f}s")
                self.throttle_events += 1
                self.total_throttle_delay += sleep_time
                time.sleep(sleep_time)
                # Record when the request is actually released. Using the
                # pre-sleep time would make it expire from the window early
                # and let later requests exceed the limit.
                now = time.time()

        self.request_times.append(now)
        self.total_requests += 1

    def get_rate_limit_status(self):
        """Return a dict of human-readable usage and throttling statistics."""
        now = time.time()
        recent_requests = sum(1 for t in self.request_times if now - t < 60)
        hour_requests = sum(1 for t in self.request_times if now - t < 3600)

        # Calculate estimated cost (approximation: $0.01 per 1K tokens, assume ~500 tokens per request).
        # Based on the lifetime counter, not the pruned timestamp window.
        estimated_cost = self.total_requests * 0.005

        # Calculate cost savings from throttling (approximate)
        cost_savings = self.throttle_events * 0.001  # Small savings from avoiding rate limit penalties

        # Calculate average throttle delay
        avg_throttle_delay = (self.total_throttle_delay / self.throttle_events) if self.throttle_events > 0 else 0.0

        return {
            'current_minute_usage': f"{recent_requests}/{self.requests_per_minute}",
            'current_hour_usage': f"{hour_requests}/{self.requests_per_minute * 60}",
            'total_requests': self.total_requests,
            'throttle_events': self.throttle_events,
            'estimated_cost': f"${estimated_cost:.3f}",
            'cost_savings': f"${cost_savings:.3f}",
            'avg_throttle_delay': f"{avg_throttle_delay:.2f}s"
        }

# Initialize rate limiter
api_rate_limiter = SimpleRateLimiter(requests_per_minute=25)

In [None]:
# Step 9 Part 2: Helper Functions

def create_enhanced_fallback_response(specialist_keywords, ground_truth):
    """Build a title/keywords result without calling the GPT-4 Vision API.

    The specialist keywords are filtered with filter_llava_keywords(), passed
    through correct_geographic_keywords() and remove_redundant_terms(), and
    then padded or trimmed so the result always holds 8-12 keywords.

    Args:
        specialist_keywords: Comma-separated keyword string from the
            specialist model; empty or 'processing failed' selects a generic
            default.
        ground_truth: Dict that may provide 'location' and 'photographer';
            None is treated as an empty dict.

    Returns:
        Dict with 'title', 'keywords' (comma-separated string) and a
        'debug_info' dict flagging that the fallback path was used.
    """
    ground_truth = ground_truth or {}

    # Clean and enhance specialist keywords
    cleaned_keywords = []
    if specialist_keywords and specialist_keywords != 'processing failed':
        # Apply the same filtering functions used in LLaVA processing
        filtered_keywords = filter_llava_keywords(specialist_keywords, max_keywords=12)
        cleaned_keywords = [kw.strip() for kw in filtered_keywords.split(',') if kw.strip()]
    if not cleaned_keywords:
        cleaned_keywords = ['agriculture', 'farming']

    # Apply geographic correction based on photographer context
    photographer_context = {
        'state': ground_truth.get('location', 'Unknown'),
        'photographer': ground_truth.get('photographer', 'Unknown')
    }

    # Join keywords for geographic correction
    keywords_string = ', '.join(cleaned_keywords)
    corrected_keywords = correct_geographic_keywords(keywords_string, photographer_context)

    # Apply redundancy removal (empty fragments are dropped first)
    final_keyword_list = [kw.strip() for kw in corrected_keywords.split(',') if kw.strip()]
    final_keywords = list(remove_redundant_terms(final_keyword_list))

    # Ensure 8-12 keyword count. The padding pool holds eight distinct terms,
    # so even an empty list can always be topped up to the minimum of eight.
    if len(final_keywords) < 8:
        base_keywords = [
            'agriculture', 'farming', 'crop', 'field',
            'rural', 'farm', 'harvest', 'farmland',
        ]
        for kw in base_keywords:
            if len(final_keywords) >= 8:
                break
            if kw not in final_keywords:
                final_keywords.append(kw)
    elif len(final_keywords) > 12:
        final_keywords = final_keywords[:12]

    # Generate title with geographic context
    location = ground_truth.get('location', 'Unknown')
    if location and location != 'Unknown':
        title = f"Agricultural Image {location}"
    else:
        title = "Agricultural Image"

    return {
        "title": title,
        "keywords": ', '.join(final_keywords),
        "debug_info": {
            "fallback_used": True,
            "fallback_type": "enhanced_with_geographic_correction",
            "geographic_correction_applied": True,
            "success": True
        }
    }

def validate_gpt4_response(result):
    """Check that a refinement result has a usable title and 8-12 keywords.

    Args:
        result: Candidate response; expected to be a dict with string
            'title' and 'keywords' (comma-separated) entries.

    Returns:
        Tuple ``(is_valid, message)``. The function never raises on
        malformed input; it reports the problem in ``message`` instead.
    """
    if not result or not isinstance(result, dict):
        return False, "Invalid response format"

    if 'title' not in result or 'keywords' not in result:
        return False, "Missing required fields"

    # Validate title. A non-string value (e.g. None or a list from a
    # malformed model reply) is a validation failure, not a crash.
    title = result['title']
    if not isinstance(title, str):
        return False, "Title must be a string"
    title = title.strip()
    if not title or len(title.split()) > 10:
        return False, "Title too long or empty (max 10 words)"

    # Enhanced keyword validation
    keywords = result['keywords']
    if not isinstance(keywords, str):
        return False, "Keywords must be a comma-separated string"
    keywords = keywords.strip()
    if not keywords:
        return False, "Keywords empty"

    # Split on commas and ignore empty fragments such as "a,,b"
    keyword_list = [k.strip() for k in keywords.split(',') if k.strip()]

    if len(keyword_list) < 8:
        return False, f"Too few keywords ({len(keyword_list)} found, minimum 8 required)"
    elif len(keyword_list) > 12:
        return False, f"Too many keywords ({len(keyword_list)} found, maximum 12 allowed)"

    return True, "Valid response"

print("✅ Helper functions loaded successfully!")

In [None]:
# Step 9 Part 3: GPT-4 Vision Processing Function

# API call function with retry logic
@simple_retry(max_attempts=3, delay=2, backoff=2)
def make_api_call(url, headers, data):
    """POST ``data`` as JSON to ``url`` and return the decoded JSON reply.

    Raises the ``requests`` HTTP error for any non-2xx status. Retrying is
    delegated to the ``simple_retry`` decorator, which is defined elsewhere
    in this notebook.
    """
    reply = requests.post(url, json=data, headers=headers, timeout=45)
    reply.raise_for_status()
    return reply.json()

def refine_with_enhanced_gpt4_vision(image_path, specialist_keywords, ground_truth):
    """Refine specialist keywords for one image with GPT-4o Vision.

    The image is base64-encoded and sent with the specialist keywords and the
    photographer/location context. The reply is parsed as JSON, passed through
    correct_geographic_keywords() and remove_redundant_terms(), padded or
    trimmed to 8-12 keywords and validated. Whenever a stage fails (no API
    key, unreadable image, API error, unparsable or invalid reply) the result
    of create_enhanced_fallback_response() is returned instead.

    Args:
        image_path: Path of the image file to send.
        specialist_keywords: Comma-separated keywords from the specialist model.
        ground_truth: Dict that may provide 'photographer' and 'location';
            None is treated as an empty dict.

    Returns:
        Dict with 'title', 'keywords' and 'debug_info'. On fallback paths
        'debug_info' holds the fallback's own flags plus the diagnostics
        collected here ('api_success', 'api_error', 'parsing_error',
        'validation_error', 'phases', 'rate_limiting', ...).
    """
    start_time = time.time()
    ground_truth = ground_truth or {}
    debug_info = {
        'success': False,
        'fallback_used': False,
        'model_used': 'gpt-4o',
        'simplified_retry': True,
        'phases': {}
    }

    def _fallback():
        """Return the fallback result with the diagnostics gathered so far."""
        debug_info['fallback_used'] = True
        debug_info['phases']['total'] = time.time() - start_time
        fallback = create_enhanced_fallback_response(specialist_keywords, ground_truth)
        # The fallback's own flags win on conflict so its existing output is
        # unchanged; the diagnostics collected here are only added to it.
        fallback['debug_info'] = {**debug_info, **fallback.get('debug_info', {})}
        return fallback

    if not OPENAI_API_KEY:
        debug_info['error'] = "No OpenAI API key provided"
        return _fallback()

    # Simple rate limiting; note whether this call had to wait
    throttles_before = getattr(api_rate_limiter, 'throttle_events', 0)
    api_rate_limiter.wait_if_needed()
    debug_info['rate_limiting'] = {
        'throttle_applied': getattr(api_rate_limiter, 'throttle_events', 0) > throttles_before
    }

    # Image encoding
    try:
        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')
    except Exception as e:
        debug_info['error'] = f"Image encoding failed: {str(e)}"
        return _fallback()

    # API request setup
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {OPENAI_API_KEY}"}

    prompt_text = f"""You are an expert agricultural photography keywording system.

**CONTEXT:**
- LLaVA Keywords: {specialist_keywords}
- Photographer: {ground_truth.get('photographer', 'Unknown')}
- Location: {ground_truth.get('location', 'Unknown')}

**REQUIREMENTS:**
1. Use ONLY the confirmed location
2. Generate EXACTLY 8-12 keywords
3. Output JSON format only

**OUTPUT FORMAT:**
```json
{{
  "title": "[SEO title with location, max 8 words]",
  "keywords": "[8-12 comma-separated lowercase keywords]"
}}
```

Enhance the keywords following this format."""

    payload = {
        "model": "gpt-4o",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
            ]
        }],
        "max_tokens": 400,
        "temperature": 0.2
    }

    # API call with automatic retry
    api_started = time.time()
    try:
        response = make_api_call("https://api.openai.com/v1/chat/completions", headers, payload)
        content = response['choices'][0]['message']['content']
        debug_info['api_success'] = True
    except Exception as e:
        debug_info['api_success'] = False
        debug_info['api_error'] = str(e)
    debug_info['phases']['api_call'] = time.time() - api_started

    if not debug_info['api_success']:
        return _fallback()

    # Response parsing
    parse_started = time.time()
    try:
        # Try JSON code block extraction first
        json_match = re.search(r'```(?:json)?\n(.*?)\n```', content, re.DOTALL)
        if json_match:
            json_str = json_match.group(1).strip()
            parsed_response = json.loads(json_str)
        else:
            # Try direct JSON parsing
            clean_content = content.strip()
            if clean_content.startswith('```'):
                clean_content = re.sub(r'^```.*?\n', '', clean_content)
                clean_content = re.sub(r'\n```$', '', clean_content)
            parsed_response = json.loads(clean_content)

        if isinstance(parsed_response, dict) and 'keywords' in parsed_response:
            # Apply geographic correction and keyword validation
            photographer_context = {
                'state': ground_truth.get('location', 'Unknown'),
                'photographer': ground_truth.get('photographer', 'Unknown')
            }

            original_keywords = parsed_response.get('keywords', '')
            # Accept a JSON array in case the model returns one instead of
            # the requested comma-separated string.
            if isinstance(original_keywords, (list, tuple)):
                original_keywords = ', '.join(str(kw) for kw in original_keywords)

            corrected_keywords = correct_geographic_keywords(original_keywords, photographer_context)
            debug_info['geographic_correction_applied'] = True
            keyword_list = [kw.strip() for kw in corrected_keywords.split(',') if kw.strip()]
            final_keywords = list(remove_redundant_terms(keyword_list))

            # Ensure 8-12 keyword compliance. Eight distinct padding terms
            # guarantee that the minimum can always be reached.
            keyword_count_before = len(final_keywords)
            if len(final_keywords) < 8:
                base_keywords = [
                    'agriculture', 'farming', 'crop', 'field',
                    'rural', 'farm', 'harvest', 'farmland',
                ]
                for kw in base_keywords:
                    if len(final_keywords) >= 8:
                        break
                    if kw not in final_keywords:
                        final_keywords.append(kw)
            elif len(final_keywords) > 12:
                final_keywords = final_keywords[:12]
            debug_info['format_compliance_enforced'] = len(final_keywords) != keyword_count_before

            result = {
                "title": parsed_response.get('title', 'Agricultural Image'),
                "keywords": ', '.join(final_keywords),
                "debug_info": debug_info
            }

            # Validate result
            is_valid, validation_message = validate_gpt4_response(result)
            if is_valid:
                debug_info['success'] = True
                debug_info['phases']['parsing'] = time.time() - parse_started
                debug_info['phases']['total'] = time.time() - start_time
                return result
            debug_info['validation_error'] = validation_message
        else:
            debug_info['parsing_error'] = "Response JSON has no 'keywords' field"

    except (ValueError, KeyError, TypeError, AttributeError) as e:
        # ValueError includes json.JSONDecodeError. TypeError/AttributeError
        # cover a None message content or non-string title/keywords fields.
        debug_info['parsing_error'] = str(e)

    # Final fallback
    debug_info['phases']['parsing'] = time.time() - parse_started
    return _fallback()


In [None]:
print("🎯 PHASE 5: Enhanced GPT-4o Vision Refinement with Geographic Correction")
print("=" * 80)

pipeline_timer.start_step("Enhanced GPT-4o Refinement")

# Initialize enhanced refinement tracking
refined_data = []
enhanced_gpt4_stats = {
    'total_processed': 0,
    'api_successes': 0,
    'api_failures': 0,
    'parsing_successes': 0,
    'fallback_used': 0,
    'model_retries': 0,
    'total_api_time': 0,
    'total_parsing_time': 0,
    'rate_limit_events': 0,
    'geographic_corrections_applied': 0,
    'format_compliance_enforced': 0,
    'validation_passes': 0,
    'avg_keywords_per_image': 0
}

# Step 9 driver: refine each enriched record with GPT-4o Vision, keep running
# statistics in enhanced_gpt4_stats, print a summary, and export the refined
# records to a stage-specific CSV.
if validate_data_structure(enriched_data, "Step 9 - Enhanced Input"):
    print(f"🚀 Processing {len(enriched_data)} images with Enhanced GPT-4o Vision")
    print(f"🔧 Using enhanced geographic correction and strict format compliance")
    print(f"🤖 Model: gpt-4o with keyword filtering integration")
    print(f"🛡️ Intelligent rate limiting: {api_rate_limiter.requests_per_minute} req/min")
    print(f"🌍 Geographic correction: Enabled for photographer context")
    print(f"📊 Format compliance: Strict 8-12 keyword enforcement")

    def _pct(count, total):
        """Return count as a percentage of total, or 0 when total is not positive."""
        return (count / total * 100) if total > 0 else 0

    total_keywords_generated = 0

    # debug_info flag -> statistics counter bumped when the flag is truthy.
    debug_flag_counters = (
        ('success', 'parsing_successes'),
        ('fallback_used', 'fallback_used'),
        ('geographic_correction_applied', 'geographic_corrections_applied'),
        ('format_compliance_enforced', 'format_compliance_enforced'),
    )

    with tqdm(total=len(enriched_data), desc="🎯 Enhanced GPT-4o", unit="img", ncols=120) as pbar:
        for i, item in enumerate(enriched_data):
            item_failed = False
            try:
                image_path = os.path.join(local_image_dir, item['filename'])

                # Validate image file exists
                if not os.path.exists(image_path):
                    item.update({"title": "Image Not Found", "keywords": item.get('specialist_keywords', '')})
                    enhanced_gpt4_stats['api_failures'] += 1
                else:
                    ground_truth = {
                        'photographer': item.get('vendor', 'Unknown'),
                        'location': item.get('state', 'Unknown')
                    }

                    # Process with enhanced GPT-4o with geographic correction
                    gpt4_output = refine_with_enhanced_gpt4_vision(
                        image_path,
                        item.get('specialist_keywords', ''),
                        ground_truth
                    )

                    # Extract enhanced debug information
                    debug_info = gpt4_output.get('debug_info', {})

                    # Update enhanced statistics
                    if debug_info.get('api_success', False):
                        enhanced_gpt4_stats['api_successes'] += 1
                    else:
                        enhanced_gpt4_stats['api_failures'] += 1

                    for flag_name, counter_name in debug_flag_counters:
                        if debug_info.get(flag_name, False):
                            enhanced_gpt4_stats[counter_name] += 1

                    # Count keywords for quality tracking
                    final_keywords = gpt4_output.get('keywords', '')
                    keyword_count = len(final_keywords.split(',')) if final_keywords else 0
                    total_keywords_generated += keyword_count

                    # Validate final output
                    is_valid, validation_message = validate_gpt4_response(gpt4_output)
                    if is_valid:
                        enhanced_gpt4_stats['validation_passes'] += 1

                    if debug_info.get('api_attempts', 0) > 1:
                        enhanced_gpt4_stats['model_retries'] += debug_info['api_attempts'] - 1

                    # Track rate limiting events
                    rate_limiting_info = debug_info.get('rate_limiting', {})
                    if rate_limiting_info.get('throttle_applied', False):
                        enhanced_gpt4_stats['rate_limit_events'] += 1

                    # Track enhanced timing
                    phases = debug_info.get('phases', {})
                    if 'api_call' in phases:
                        enhanced_gpt4_stats['total_api_time'] += phases['api_call']
                    if 'parsing' in phases:
                        enhanced_gpt4_stats['total_parsing_time'] += phases['parsing']

                    # Update item with enhanced results
                    item.update({
                        "title": gpt4_output.get('title', 'Processing Failed'),
                        "keywords": final_keywords,
                        "enhanced_gpt4_debug": debug_info,
                        "model_version": "gpt-4o-enhanced",
                        "keyword_count": keyword_count,
                        "geographic_corrected": debug_info.get('geographic_correction_applied', False),
                        "format_compliant": debug_info.get('format_compliance_enforced', False)
                    })

            except Exception as e:
                logging.error(f"Failed to process enhanced GPT-4o refinement for {item.get('filename', 'unknown')}: {e}")
                # Fall back to the specialist keywords so the record is not lost
                item.update({
                    "title": "Processing Error",
                    "keywords": item.get('specialist_keywords', ''),
                    "model_version": "error"
                })
                enhanced_gpt4_stats['api_failures'] += 1
                item_failed = True

            # Record the item exactly once. Doing this outside the try block means
            # a later failure (e.g. in the progress display) can no longer cause
            # the item to be appended and counted a second time.
            refined_data.append(item)
            enhanced_gpt4_stats['total_processed'] += 1

            if item_failed:
                pbar.set_postfix_str(f"❌ Error: {item.get('filename', 'unknown')}")
            else:
                # The progress display is best-effort: a failure here must not
                # mark an already-processed item as failed.
                try:
                    success_rate = _pct(enhanced_gpt4_stats['api_successes'], enhanced_gpt4_stats['total_processed'])

                    # Calculate ETA from the average measured time per processed item
                    if enhanced_gpt4_stats['total_processed'] > 0:
                        avg_time = (enhanced_gpt4_stats['total_api_time'] + enhanced_gpt4_stats['total_parsing_time']) / enhanced_gpt4_stats['total_processed']
                        remaining_items = len(enriched_data) - (i + 1)
                        eta_seconds = remaining_items * avg_time
                        eta_display = f"{eta_seconds:.1f}s" if eta_seconds < 60 else f"{eta_seconds/60:.1f}min"
                    else:
                        eta_display = "calculating..."

                    pbar.set_postfix_str(
                        f"✅{enhanced_gpt4_stats['api_successes']}/{enhanced_gpt4_stats['total_processed']} | 📊{success_rate:.1f}% | 🌍{enhanced_gpt4_stats['geographic_corrections_applied']} | 📏{enhanced_gpt4_stats['format_compliance_enforced']} | ETA:{eta_display}"
                    )

                    # Progress checkpoint every 10 items replaces the regular postfix
                    if (i + 1) % 10 == 0:
                        pbar.set_postfix_str(f"Checkpoint: {i+1}/{len(enriched_data)} | Geo: {enhanced_gpt4_stats['geographic_corrections_applied']} | Valid: {enhanced_gpt4_stats['validation_passes']}")
                except Exception as display_error:
                    logging.warning(f"Progress display update failed for {item.get('filename', 'unknown')}: {display_error}")

            pbar.update(1)

    pipeline_timer.end_step("Enhanced GPT-4o Refinement")

    # Calculate enhanced final statistics
    total_processed = enhanced_gpt4_stats['total_processed']
    api_success_rate = _pct(enhanced_gpt4_stats['api_successes'], total_processed)
    parsing_success_rate = _pct(enhanced_gpt4_stats['parsing_successes'], total_processed)
    fallback_rate = _pct(enhanced_gpt4_stats['fallback_used'], total_processed)
    validation_rate = _pct(enhanced_gpt4_stats['validation_passes'], total_processed)
    geographic_correction_rate = _pct(enhanced_gpt4_stats['geographic_corrections_applied'], total_processed)
    format_compliance_rate = _pct(enhanced_gpt4_stats['format_compliance_enforced'], total_processed)
    avg_keywords_per_image = (total_keywords_generated / total_processed) if total_processed > 0 else 0

    avg_api_time = (enhanced_gpt4_stats['total_api_time'] / enhanced_gpt4_stats['api_successes']) if enhanced_gpt4_stats['api_successes'] > 0 else 0
    avg_parsing_time = (enhanced_gpt4_stats['total_parsing_time'] / enhanced_gpt4_stats['parsing_successes']) if enhanced_gpt4_stats['parsing_successes'] > 0 else 0

    # Get final rate limiting summary
    final_rate_summary = api_rate_limiter.get_rate_limit_status()

    if validate_data_structure(refined_data, "Step 9 - Enhanced Output"):
        print(f"\n✅ Enhanced GPT-4o Vision refinement with geographic correction complete!")
        print(f"📊 ENHANCED GPT-4o PROCESSING STATISTICS:")
        print(f"   🎯 Total Processed: {total_processed}")
        print(f"   🤖 Model Used: gpt-4o-enhanced with geographic correction")
        print(f"   🌐 API Successes: {enhanced_gpt4_stats['api_successes']} ({api_success_rate:.1f}%)")
        print(f"   ❌ API Failures: {enhanced_gpt4_stats['api_failures']}")
        print(f"   🔍 Parsing Successes: {enhanced_gpt4_stats['parsing_successes']} ({parsing_success_rate:.1f}%)")
        print(f"   ✅ Validation Passes: {enhanced_gpt4_stats['validation_passes']} ({validation_rate:.1f}%)")
        print(f"   🔄 Enhanced Fallback Usage: {enhanced_gpt4_stats['fallback_used']} instances ({fallback_rate:.1f}%)")
        print(f"   🌍 Geographic Corrections Applied: {enhanced_gpt4_stats['geographic_corrections_applied']} ({geographic_correction_rate:.1f}%)")
        print(f"   📏 Format Compliance Enforced: {enhanced_gpt4_stats['format_compliance_enforced']} ({format_compliance_rate:.1f}%)")
        print(f"   🏷️ Average Keywords per Image: {avg_keywords_per_image:.1f}")
        print(f"   🔁 Model Retries: {enhanced_gpt4_stats['model_retries']}")
        print(f"   ⏱️ Avg API Time: {avg_api_time:.2f}s")
        print(f"   📝 Avg Parsing Time: {avg_parsing_time:.3f}s")
        print(f"   🛡️ ENHANCED RATE LIMITING SUMMARY:")
        print(f"      • Total API Requests: {final_rate_summary.get('total_requests', 0)}")
        print(f"      • Throttle Events: {final_rate_summary.get('throttle_events', 0)}")
        print(f"      • Estimated Cost: {final_rate_summary.get('estimated_cost', '$0.000')}")
        print(f"      • Cost Savings: {final_rate_summary.get('cost_savings', '$0.000')}")
        print(f"      • Avg Throttle Delay: {final_rate_summary.get('avg_throttle_delay', '0.00s')}")
        print(f"      • Final Usage: {final_rate_summary.get('current_minute_usage', '0/25')} (minute) | {final_rate_summary.get('current_hour_usage', '0/1500')} (hour)")
        print(f"   ⏱️ Step Duration: {pipeline_timer.get_step_duration('Enhanced GPT-4o Refinement'):.1f} seconds")
        print(f"   🚀 Ready for final assembly with enhanced quality!")

        # Export enhanced GPT-4 results to CSV
        print(f"\n📊 PHASE 6: Enhanced GPT-4 CSV Export")
        print("=" * 45)

        if refined_data:
            enhanced_gpt4_csv_filename = "enhanced_gpt4_keywords_output.csv"

            # Titles assigned above when a record was not actually refined:
            # the GPT-4o default, the exception path, and the missing-image path.
            failure_title_markers = ('Processing Failed', 'Processing Error', 'Image Not Found')

            try:
                print(f"📝 Exporting {len(refined_data)} enhanced GPT-4 results to {enhanced_gpt4_csv_filename}...")

                # Create CSV with enhanced GPT-4-specific columns
                with open(enhanced_gpt4_csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
                    fieldnames = [
                        'filename',
                        'enhanced_title',
                        'enhanced_keywords',
                        'original_llava_keywords',
                        'keyword_count',
                        'geographic_corrected',
                        'format_compliant',
                        'validation_passed',
                        'processing_time',
                        'success_status'
                    ]
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    writer.writeheader()

                    successful_exports = 0

                    for item in refined_data:
                        try:
                            # Coerce to text so None / non-string values cannot
                            # break the membership test or the truncation below.
                            raw_enhanced = item.get('keywords', '')
                            enhanced_keywords = '' if raw_enhanced is None else str(raw_enhanced)
                            raw_original = item.get('specialist_keywords', '')
                            original_keywords = '' if raw_original is None else str(raw_original)
                            item_title = str(item.get('title', ''))

                            title_failed = any(marker in item_title for marker in failure_title_markers)
                            if 'processing_failed' in enhanced_keywords or title_failed:
                                success_status = 'failed'
                            else:
                                success_status = 'success'

                            # Extract enhanced debug information
                            debug_info = item.get('enhanced_gpt4_debug', {})
                            phases = debug_info.get('phases', {})
                            processing_time = phases.get('api_call', 0) + phases.get('parsing', 0)

                            # Truncate the original keywords for readability
                            if len(original_keywords) > 100:
                                original_display = original_keywords[:100] + '...'
                            else:
                                original_display = original_keywords

                            writer.writerow({
                                'filename': item.get('filename', 'unknown'),
                                'enhanced_title': item.get('title', 'Unknown'),
                                'enhanced_keywords': enhanced_keywords,
                                'original_llava_keywords': original_display,
                                'keyword_count': item.get('keyword_count', 0),
                                'geographic_corrected': item.get('geographic_corrected', False),
                                'format_compliant': item.get('format_compliant', False),
                                'validation_passed': debug_info.get('success', False),
                                'processing_time': f"{processing_time:.3f}s",
                                'success_status': success_status
                            })
                            successful_exports += 1

                        except Exception as e:
                            logging.error(f"Failed to export enhanced GPT-4 result for {item.get('filename', 'unknown')}: {e}")
                            # Write error row so the CSV still has one row per record
                            writer.writerow({
                                'filename': item.get('filename', 'unknown'),
                                'enhanced_title': 'export_error',
                                'enhanced_keywords': 'export_error',
                                'original_llava_keywords': 'export_error',
                                'keyword_count': 0,
                                'geographic_corrected': False,
                                'format_compliant': False,
                                'validation_passed': False,
                                'processing_time': '0.000s',
                                'success_status': 'export_failed'
                            })

                # Verify export
                if os.path.exists(enhanced_gpt4_csv_filename):
                    file_size = os.path.getsize(enhanced_gpt4_csv_filename)
                    print(f"✅ Enhanced GPT-4 CSV export successful!")
                    print(f"   📁 File: {enhanced_gpt4_csv_filename}")
                    print(f"   📊 Records exported: {successful_exports}/{len(refined_data)}")
                    print(f"   💾 File size: {file_size:,} bytes")
                    print(f"   🔧 Includes geographic correction and format compliance data")

                    # Provide download link
                    print(f"\n⬇️ DOWNLOADING ENHANCED GPT-4 CSV RESULTS...")
                    bulletproof_download_csv(enhanced_gpt4_csv_filename, "Enhanced GPT-4 Keywords Output")
                else:
                    print(f"❌ Failed to create {enhanced_gpt4_csv_filename}")

            except Exception as e:
                print(f"❌ Enhanced GPT-4 CSV export failed: {e}")
                logging.error(f"Enhanced GPT-4 CSV export error: {e}")
        else:
            print("⚠️ No enhanced refined results to export")
    else:
        print("⚠️ Enhanced GPT-4o refinement completed with errors.")
else:
    print("⚠️ No valid data for enhanced GPT-4o refinement.")
    # Pass the unrefined records through so later steps still have input
    refined_data = enriched_data

## Step 10: Assemble Final Metadata Rows (WITH ERROR HANDLING)
Transform refined data into Shopify-compatible CSV format with comprehensive validation.

In [None]:
# Step 10 driver: turn each refined record into one row of final_csv_data and
# keep counters in assembly_stats.
print("📋 PHASE 6: Final Metadata Assembly")
print("=" * 50)

final_csv_data = []
assembly_stats = {
    'processed': 0,
    'successful': 0,
    'errors': 0,
    'keyword_cleaning_applied': 0,
    'total_keywords_generated': 0
}

# Boilerplate product description written on every successfully assembled row
standard_description = 'All standard image downloads are 2592 px long and available as a download link immediately after purchase. If you choose to buy a high-resolution image, it will be emailed to you within 24 hours.'

if validate_data_structure(refined_data, "Step 10 - Input"):
    print(f"🔄 Assembling final metadata for {len(refined_data)} images")

    for item in refined_data:
        try:
            # Prefer the refined keywords; fall back to the specialist keywords
            # when the refined value is missing OR empty (a plain .get() default
            # would not fall back on an empty string).
            raw_keywords = item.get('keywords') or item.get('specialist_keywords', '')
            if raw_keywords:
                cleaned_keywords = clean_ai_keywords(raw_keywords)
                assembly_stats['keyword_cleaning_applied'] += 1
                assembly_stats['total_keywords_generated'] += len(cleaned_keywords)
            else:
                cleaned_keywords = ['agricultural', 'farming']

            keywords_str = ", ".join(cleaned_keywords) if cleaned_keywords else "agricultural, farming"

            # Extract core metadata with fallbacks
            title = item.get('title', 'Agricultural Image')
            sku = item.get('sku', '')
            filename = item.get('filename', 'unknown.jpg')
            vendor = item.get('vendor', 'Unknown')
            photographer_match = item.get('photographer_match', 'no')

            # Assemble final CSV row (CORRECTED FORMAT - no photographer_state)
            final_csv_data.append({
                'filename': filename,
                'title': f"{title} {sku}".strip(),
                'description': standard_description,
                'keywords': keywords_str,
                'tags': keywords_str,
                'collections': '',
                'models': '',
                'vendor': vendor,  # CORRECTED: vendor contains photographer name
                'Photographer_Match': photographer_match  # CORRECTED: yes/no only, no state column
            })

            assembly_stats['successful'] += 1

        except Exception as e:
            logging.error(f"Failed to process final metadata for {item.get('filename', 'unknown')}: {e}")
            # Add a minimal row with the SAME keys as a successful row so every
            # record in final_csv_data shares one schema.
            final_csv_data.append({
                'filename': item.get('filename', 'unknown.jpg'),
                'title': 'Processing Error',
                'description': 'Error occurred during processing',
                'keywords': 'error, processing',
                'tags': 'error, processing',
                'collections': '',
                'models': '',
                'vendor': 'Unknown',
                'Photographer_Match': 'no'
            })
            assembly_stats['errors'] += 1

        assembly_stats['processed'] += 1

    # Final validation and statistics
    success_rate = (assembly_stats['successful'] / assembly_stats['processed'] * 100) if assembly_stats['processed'] > 0 else 0
    avg_keywords_per_image = (assembly_stats['total_keywords_generated'] / assembly_stats['successful']) if assembly_stats['successful'] > 0 else 0

    print(f"\n✅ Final metadata assembly complete!")
    print(f"📊 ASSEMBLY STATISTICS:")
    print(f"   📝 Total Records: {len(final_csv_data)}")
    print(f"   ✅ Successfully Processed: {assembly_stats['successful']} ({success_rate:.1f}%)")
    print(f"   🧹 Keyword Cleaning Applied: {assembly_stats['keyword_cleaning_applied']}")
    print(f"   🏷️ Total Keywords Generated: {assembly_stats['total_keywords_generated']}")
    print(f"   📊 Avg Keywords/Image: {avg_keywords_per_image:.1f}")
    print(f"   ❌ Errors: {assembly_stats['errors']}")
    print(f"   🚀 Ready for CSV export!")
else:
    print("⚠️ No valid data for final assembly.")
    final_csv_data = []

## Step 11: Final CSV Export (Shopify-Compatible)
Export the final processed data to a Shopify-compatible CSV file with comprehensive validation and bulletproof download.

In [None]:
# Step 11 banner, then start timing the export step. The matching end_step
# call is made inside export_to_shopify_csv_v4_simplified once the output
# file is confirmed to exist.
print("🚀 PHASE 7: Final CSV Export (Shopify-Compatible)")
print("=" * 55)

pipeline_timer.start_step("Final CSV Export")

def export_to_shopify_csv_v4_simplified(data, filename="final_shopify_export.csv"):
    """Write the assembled rows to a Shopify-compatible CSV file.

    Each record's title gets the numeric SKU parsed from its image filename
    appended (unless the title already ends with it), titles are capped at 70
    characters and keywords at 500, and text fields are passed through
    bulletproof_escape_string before being written.

    Args:
        data: List of dict rows (as produced by the final assembly step).
        filename: Path of the CSV file to create.

    Returns:
        Tuple ``(success, export_stats)`` where ``success`` is True when the
        file was written and exists on disk, and ``export_stats`` holds the
        counters ``total_records``, ``successful_exports``, ``failed_exports``,
        ``validation_errors`` and ``character_limit_violations``.
    """
    import re  # kept block-local, as in the original implementation

    export_stats = {
        'total_records': len(data),
        'successful_exports': 0,
        'failed_exports': 0,
        'validation_errors': 0,
        'character_limit_violations': 0
    }

    if not data:
        logging.error("No data provided for export")
        return False, export_stats

    print(f"📊 Exporting {len(data)} records to Shopify-compatible CSV...")

    # Shopify-compatible field names (9 columns as per BUILD_SPEC)
    fieldnames = [
        'filename',
        'title',
        'description',
        'keywords',
        'tags',
        'collections',
        'models',
        'vendor',
        'photographer_match'
    ]

    # SKU patterns, tried in order; the LAST capture group of the first pattern
    # that matches is the numeric SKU (e.g. 'HS 115642.jpg' -> '115642').
    # Compiled once here instead of on every row.
    sku_patterns = [re.compile(pattern) for pattern in (
        r'[A-Z]{2} (\d{6})',     # Primary: AH 270480
        r'([A-Z]{2})(\d{6})',    # Secondary: AH270480
        r'(\d{6})',              # Basic: 270480
        r'[A-Z]+_(\d{5,7})',     # Variable length: AB_12345
        r'([A-Z]+)(\d{5,7})',    # No underscore: AB12345
        r'(\d{5,7})',            # Any run of 5-7 digits
    )]

    default_description = 'All standard image downloads are 2592 px long and available as a download link immediately after purchase. If you choose to buy a high-resolution image, it will be emailed to you within 24 hours.'

    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for item in data:
                try:
                    # NOTE: deliberately NOT named `filename` - reusing that name
                    # here would overwrite the output path and break the file
                    # verification after the loop.
                    image_filename = item.get('filename', 'unknown.jpg')

                    sku = None
                    for pattern in sku_patterns:
                        match = pattern.search(image_filename)
                        if match:
                            sku = match.groups()[-1]  # numeric part
                            break

                    # Append the SKU unless an earlier step already did so
                    base_title = str(item.get('title', 'Agricultural Image'))
                    if sku and not base_title.endswith(sku):
                        title = f'{base_title} {sku}'
                    else:
                        title = base_title

                    # Enforce 70 char limit, counting what is actually truncated
                    if len(title) > 70:
                        export_stats['character_limit_violations'] += 1
                        title = title[:70]

                    # Enforce 500 char limit
                    keywords = str(item.get('keywords', ''))
                    if len(keywords) > 500:
                        export_stats['character_limit_violations'] += 1
                        keywords = keywords[:500]

                    description = str(item.get('description', default_description))

                    # Use bulletproof string escaping for safety
                    safe_title = bulletproof_escape_string(title)
                    safe_keywords = bulletproof_escape_string(keywords)
                    safe_description = bulletproof_escape_string(description)

                    # The assembly step stores the flag as 'Photographer_Match';
                    # accept either spelling so the value is not lost.
                    photographer_match = item.get(
                        'photographer_match',
                        item.get('Photographer_Match', 'no')
                    )

                    # Write the row with exactly 9 columns
                    writer.writerow({
                        'filename': image_filename,
                        'title': safe_title,
                        'description': safe_description,
                        'keywords': safe_keywords,
                        'tags': safe_keywords,  # Use same as keywords
                        'collections': item.get('collections', ''),
                        'models': item.get('models', ''),
                        'vendor': item.get('vendor', 'Unknown'),
                        'photographer_match': photographer_match
                    })

                    export_stats['successful_exports'] += 1

                except Exception as e:
                    logging.error(f"Failed to export record {item.get('filename', 'unknown')}: {e}")
                    export_stats['failed_exports'] += 1
                    export_stats['validation_errors'] += 1

        # Validate the exported file
        if os.path.exists(filename):
            file_size = os.path.getsize(filename)

            # Verify row count
            with open(filename, 'r', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile)
                row_count = sum(1 for row in reader) - 1  # Subtract header

            pipeline_timer.end_step("Final CSV Export")

            print(f"✅ Shopify CSV export successful!")
            print(f"📊 EXPORT STATISTICS:")
            print(f"   📁 File: {filename}")
            print(f"   📝 Total Records: {export_stats['total_records']}")
            print(f"   ✅ Successful Exports: {export_stats['successful_exports']}")
            print(f"   ❌ Failed Exports: {export_stats['failed_exports']}")
            print(f"   ⚠️ Character Limit Violations: {export_stats['character_limit_violations']}")
            print(f"   💾 File Size: {file_size:,} bytes")
            print(f"   📊 CSV Rows: {row_count}")
            print(f"   ⏱️ Export Duration: {pipeline_timer.get_step_duration('Final CSV Export'):.1f} seconds")

            return True, export_stats
        else:
            print(f"❌ Failed to create export file: {filename}")
            return False, export_stats

    except Exception as e:
        logging.error(f"CSV export failed: {e}")
        print(f"❌ Export operation failed: {e}")
        return False, export_stats


def validate_csv_row(row_data):
    """Validate a single CSV row for Shopify compatibility.

    Checks the filename extension, title length (5-70 characters), keyword
    length (max 500 characters) and keyword count (8-12). Over-long titles and
    keywords are auto-truncated, a blank vendor becomes 'Unknown', and
    photographer_match is normalized to 'yes' or 'no'.

    Args:
        row_data: Dict for one CSV row. It is modified in place.

    Returns:
        Tuple ``(validation_errors, row_data)``: a list of human-readable
        problem descriptions (empty when the row is valid) and the same dict
        with any automatic corrections applied.
    """
    validation_errors = []

    # Filename validation
    filename = row_data.get('filename')
    if not filename:
        validation_errors.append("Missing filename")
    elif not str(filename).lower().endswith(('.jpg', '.jpeg', '.png')):
        validation_errors.append("Invalid filename extension")

    # Title validation (None is treated as an empty title)
    title = str(row_data.get('title') or '')
    if len(title) > 70:
        validation_errors.append(f"Title too long ({len(title)} chars, max 70)")
        row_data['title'] = title[:67] + "..."  # Auto-truncate
    elif len(title) < 5:
        validation_errors.append("Title too short (minimum 5 characters)")

    # Keywords validation
    keywords = str(row_data.get('keywords') or '')
    if len(keywords) > 500:
        validation_errors.append(f"Keywords too long ({len(keywords)} chars, max 500)")
        # Auto-truncate at the last complete keyword; when there is no comma
        # to cut at, hard-truncate so the limit is still enforced.
        truncated = keywords[:500]
        last_comma = truncated.rfind(',')
        row_data['keywords'] = truncated[:last_comma] if last_comma > 0 else truncated

    # Count only non-empty keywords so stray commas do not inflate the count
    keyword_count = sum(1 for keyword in keywords.split(',') if keyword.strip())
    if keyword_count < 8:
        validation_errors.append(f"Too few keywords ({keyword_count}, minimum 8)")
    elif keyword_count > 12:
        validation_errors.append(f"Too many keywords ({keyword_count}, maximum 12)")

    # Vendor validation
    vendor = row_data.get('vendor', '')
    if not vendor or not str(vendor).strip():
        row_data['vendor'] = 'Unknown'

    # Photographer match validation: store the normalized lowercase value
    photographer_match = str(row_data.get('photographer_match') or '').strip().lower()
    row_data['photographer_match'] = photographer_match if photographer_match in ('yes', 'no') else 'no'

    return validation_errors, row_data

# Run the Shopify export for the assembled rows, then offer the file for download
print("🔄 Preparing final data for Shopify export...")

shopify_csv_path = "final_shopify_export.csv"

if not validate_data_structure(final_csv_data, "Step 11 - Final Export"):
    # Nothing usable came out of the assembly step
    print("⚠️ No valid data available for final export")
    print("🔄 Pipeline completed with limited data - check previous steps")
else:
    export_success, export_stats = export_to_shopify_csv_v4_simplified(final_csv_data, shopify_csv_path)

    if not export_success:
        print("❌ Final CSV export failed")
        print("💡 Check data integrity and try manual export")
    else:
        print(f"\n⬇️ DOWNLOADING FINAL SHOPIFY CSV...")
        download_success = bulletproof_download_csv(shopify_csv_path, "Final Shopify Export")

        # The file stays on local disk even when the download helper fails
        print("✅ Final CSV successfully downloaded!" if download_success else "⚠️ Download failed - file available locally")

print("\n" + "=" * 55)
print("🏁 Step 11 Complete: Final CSV Export Done")

## Step 12: Pipeline Summary and Quality Assurance
Comprehensive performance metrics, quality assurance reporting, and complete pipeline statistics.

In [None]:
# Step 12 banner, then start timing the summary step; the matching end_step
# call follows the generate_comprehensive_pipeline_summary() call below.
print("🎯 FINAL PHASE: Pipeline Summary & Quality Assurance")
print("=" * 60)

pipeline_timer.start_step("Pipeline Summary")

def generate_comprehensive_pipeline_summary():
    """Print and return pipeline performance and quality metrics.

    Reads per-step durations from pipeline_timer, record counts and statistics
    dicts from notebook globals (when they exist), and current GPU/system
    memory figures, printing each section as it is computed.

    Returns:
        Dict with the keys ``pipeline_info``, ``step_performance``,
        ``data_quality``, ``resource_usage``, ``success_metrics`` and
        ``quality_recommendations``.
    """
    notebook_globals = globals()

    summary = {
        'pipeline_info': {
            'version': 'AGStock Keyworder Enhanced v4.0 FIXED',
            'completion_time': time.strftime('%Y-%m-%d %H:%M:%S'),
            'total_pipeline_time': pipeline_timer.get_total_time()
        },
        'step_performance': {},
        'data_quality': {},
        'resource_usage': {},
        'success_metrics': {}
    }

    # Step-by-step performance analysis
    print("📊 STEP-BY-STEP PERFORMANCE ANALYSIS:")
    print("=" * 45)

    # "Enhanced GPT-4o Refinement" is the step name the GPT-4o cell reports to
    # pipeline_timer; without it the refinement time is missing from the report.
    step_names = [
        "Dropbox Integration",
        "Model Loading",
        "LLaVA Specialist Inference",
        "Enhanced GPT-4o Refinement",
        "Final CSV Export",
        "Pipeline Summary"
    ]

    for step in step_names:
        duration = pipeline_timer.get_step_duration(step)
        summary['step_performance'][step] = duration

        if duration > 0:
            print(f"   ⏱️ {step:<25}: {duration:>8.1f}s")
        else:
            print(f"   ⚠️ {step:<25}: {'Not tracked':>8s}")

    # Data quality assessment
    print(f"\n📋 DATA QUALITY ASSESSMENT:")
    print("=" * 35)

    # Count records at each stage; a stage whose variable was never created
    # (cell not run) counts as zero.
    specialist_count = len(specialist_results) if 'specialist_results' in notebook_globals else 0
    enriched_count = len(enriched_data) if 'enriched_data' in notebook_globals else 0
    refined_count = len(refined_data) if 'refined_data' in notebook_globals else 0
    final_count = len(final_csv_data) if 'final_csv_data' in notebook_globals else 0
    data_retention_rate = (final_count / specialist_count * 100) if specialist_count > 0 else 0

    summary['data_quality'] = {
        'specialist_inference': specialist_count,
        'data_enrichment': enriched_count,
        'gpt4_refinement': refined_count,
        'final_assembly': final_count,
        'data_retention_rate': data_retention_rate
    }

    print(f"   🤖 Specialist Inference Results: {specialist_count:>6}")
    print(f"   🔄 Data Enrichment Results:     {enriched_count:>6}")
    print(f"   🎯 GPT-4 Refinement Results:    {refined_count:>6}")
    print(f"   📋 Final Assembly Results:      {final_count:>6}")
    print(f"   📊 Data Retention Rate:         {data_retention_rate:>5.1f}%")

    # Resource usage analysis
    print(f"\n💾 RESOURCE USAGE ANALYSIS:")
    print("=" * 30)

    current_memory = get_gpu_memory_usage()
    system_memory = psutil.virtual_memory()
    available_memory_gb = system_memory.available / (1024**3)  # bytes -> GB

    summary['resource_usage'] = {
        'final_gpu_memory': current_memory,
        'system_memory_used': system_memory.percent,
        'system_memory_available': available_memory_gb
    }

    print(f"   🖥️ Final GPU Memory Usage:      {current_memory:>5.2f}GB")
    print(f"   💻 System Memory Usage:         {system_memory.percent:>5.1f}%")
    print(f"   💾 Available System Memory:     {available_memory_gb:>5.1f}GB")

    # Success metrics and quality scores
    print(f"\n✅ SUCCESS METRICS & QUALITY SCORES:")
    print("=" * 40)

    # Calculate success rates; each stays 0 when its statistics dict is absent
    llava_success_rate = 0
    gpt4_success_rate = 0
    export_success_rate = 0

    llava_run_stats = notebook_globals.get('processing_stats')
    if llava_run_stats is not None:
        total_processed = llava_run_stats.get('successful_inferences', 0) + llava_run_stats.get('failed_inferences', 0)
        llava_success_rate = (llava_run_stats.get('successful_inferences', 0) / total_processed * 100) if total_processed > 0 else 0

    # The GPT-4o refinement cell keeps its counters in `enhanced_gpt4_stats`;
    # `gpt4_stats` is still honoured as a fallback for older runs.
    gpt4_run_stats = notebook_globals.get('enhanced_gpt4_stats')
    if gpt4_run_stats is None:
        gpt4_run_stats = notebook_globals.get('gpt4_stats')
    if gpt4_run_stats is not None:
        gpt4_total = gpt4_run_stats.get('total_processed', 0)
        gpt4_success_rate = (gpt4_run_stats.get('api_successes', 0) / gpt4_total * 100) if gpt4_total > 0 else 0

    export_run_stats = notebook_globals.get('export_stats')
    if export_run_stats is not None:
        export_total = export_run_stats.get('total_records', 0)
        export_success_rate = (export_run_stats.get('successful_exports', 0) / export_total * 100) if export_total > 0 else 0

    # Unweighted mean of the three stage success rates
    overall_pipeline_success = (llava_success_rate + gpt4_success_rate + export_success_rate) / 3

    summary['success_metrics'] = {
        'llava_success_rate': llava_success_rate,
        'gpt4_success_rate': gpt4_success_rate,
        'export_success_rate': export_success_rate,
        'overall_pipeline_success': overall_pipeline_success
    }

    print(f"   🤖 LLaVA Success Rate:           {llava_success_rate:>5.1f}%")
    print(f"   🎯 GPT-4 Success Rate:           {gpt4_success_rate:>5.1f}%")
    print(f"   📊 Export Success Rate:          {export_success_rate:>5.1f}%")
    print(f"   🏆 Overall Pipeline Success:     {overall_pipeline_success:>5.1f}%")

    # Quality assurance recommendations
    print(f"\n🔍 QUALITY ASSURANCE RECOMMENDATIONS:")
    print("=" * 42)

    recommendations = []

    if llava_success_rate < 90:
        recommendations.append("⚠️ LLaVA inference success rate below 90% - check model performance")
    if gpt4_success_rate < 80:
        recommendations.append("⚠️ GPT-4 API success rate below 80% - verify API keys and quotas")
    if export_success_rate < 95:
        recommendations.append("⚠️ Export success rate below 95% - check data validation")
    if final_count < specialist_count * 0.9:
        recommendations.append("⚠️ High data loss through pipeline - review error handling")

    if not recommendations:
        print("   ✅ All quality metrics within acceptable ranges!")
        print("   🎉 Pipeline performed excellently!")
    else:
        for rec in recommendations:
            print(f"   {rec}")

    summary['quality_recommendations'] = recommendations

    return summary

# Build the summary (this also prints the detailed report), then close the timer
pipeline_summary = generate_comprehensive_pipeline_summary()

pipeline_timer.end_step("Pipeline Summary")

# Short recap of the headline figures
run_info = pipeline_summary['pipeline_info']
print(f"\n🎉 PIPELINE COMPLETION SUMMARY:")
print("=" * 35)
print(f"   🚀 Version: {run_info['version']}")
print(f"   ⏰ Completed: {run_info['completion_time']}")
print(f"   ⏱️ Total Time: {run_info['total_pipeline_time']:.1f} seconds")
print(f"   📊 Final Records: {pipeline_summary['data_quality']['final_assembly']}")
print(f"   🏆 Overall Success: {pipeline_summary['success_metrics']['overall_pipeline_success']:.1f}%")

# Persist the summary as timestamped JSON; a failed write is reported, not raised
summary_timestamp = time.strftime('%Y%m%d_%H%M%S')
summary_filename = f"pipeline_summary_{summary_timestamp}.json"
try:
    with open(summary_filename, 'w') as f:
        json.dump(pipeline_summary, f, indent=2, default=str)
except Exception as e:
    print(f"   ⚠️ Failed to save summary: {e}")
else:
    print(f"   📁 Summary saved: {summary_filename}")

closing_rule = "=" * 65
print(f"\n🏁 MASTER AGRICULTURAL KEYWORDING PIPELINE COMPLETE! 🏁")
print(closing_rule)
print("🌾 Thank you for using AGStock Keyworder Enhanced v4.0 FIXED! 🌾")
print("🚜 Professional agricultural image keywording at your service! 🚜")
print(closing_rule)

## Step 13: Integration Testing & Validation Verification
Comprehensive integration tests to verify all validation functions work correctly in the full pipeline context.

In [None]:
# Step 13 banner printed before the integration test suite is defined
print("🧪 INTEGRATION TESTING & VALIDATION VERIFICATION")
print("=" * 55)

import unittest
from unittest.mock import MagicMock, patch
import tempfile
import shutil

class IntegrationTestSuite(unittest.TestCase):
    """Comprehensive integration tests for the agricultural keywording pipeline"""

    @classmethod
    def setUpClass(cls):
        """Set up test environment once for all tests"""
        cls.test_data_dir = tempfile.mkdtemp()
        cls.test_image_path = os.path.join(cls.test_data_dir, "test_image.jpg")

        # Create a minimal test image
        from PIL import Image
        test_img = Image.new('RGB', (100, 100), color='green')
        test_img.save(cls.test_image_path)

        print(f"🏗️ Test environment created: {cls.test_data_dir}")

    @classmethod
    def tearDownClass(cls):
        """Clean up test environment"""
        shutil.rmtree(cls.test_data_dir)
        print("🧹 Test environment cleaned up")

    def test_keyword_filtering_integration(self):
        """Test LLaVA keyword filtering with real data"""
        print("🔍 Testing keyword filtering integration...")

        # Test with typical LLaVA overflow output
        raw_keywords = "angus, maryland, usa, cattle, cattle cattle, cow, angus cattle, animal agriculture, black angus cattle, beef cattle, livestock, farm, farming, agriculture, rural, pastoral, processing, computer, technical, operation focus"

        filtered = filter_llava_keywords(raw_keywords, max_keywords=12)
        filtered_list = [kw.strip() for kw in filtered.split(',')]

        # Assertions
        self.assertLessEqual(len(filtered_list), 12, "Should limit to max 12 keywords")
        self.assertGreaterEqual(len(filtered_list), 8, "Should have at least 8 keywords")
        self.assertIn('maryland', filtered_list, "Should preserve geographic terms")
        self.assertNotIn('processing', filtered_list, "Should remove technical terms")

        print(f"   ✅ Filtered {len(raw_keywords.split(','))} → {len(filtered_list)} keywords")

    def test_geographic_correction_integration(self):
        """Test geographic correction with photographer context"""
        print("🌍 Testing geographic correction integration...")

        keywords = "iowa, corn, agriculture, farming"  # Wrong state
        photographer_context = {'state': 'wisconsin', 'photographer': 'TestPhotographer'}

        corrected = correct_geographic_keywords(keywords, photographer_context)
        corrected_list = [kw.strip() for kw in corrected.split(',')]

        # Assertions
        self.assertEqual(corrected_list[0], 'wisconsin', "Should place correct state first")
        self.assertNotIn('iowa', corrected_list, "Should remove incorrect states")
        self.assertIn('corn', corrected_list, "Should preserve non-geographic keywords")

        print(f"   ✅ Geographic correction: iowa → wisconsin")

    def test_redundancy_removal_integration(self):
        """Test redundancy removal with complex patterns"""
        print("🔄 Testing redundancy removal integration...")

        keywords = ['cattle', 'cattle cattle', 'cattle breeding', 'angus', 'processing', 'plant', 'beef']
        filtered = remove_redundant_terms(keywords)

        # Assertions
        self.assertIn('cattle', filtered, "Should keep base terms")
        self.assertIn('angus', filtered, "Should keep specific breeds")
        self.assertIn('beef', filtered, "Should keep agricultural terms")
        self.assertNotIn('cattle cattle', filtered, "Should remove duplications")
        self.assertNotIn('processing', filtered, "Should remove technical terms")

        print(f"   ✅ Removed {len(keywords) - len(filtered)} redundant terms")

    def test_photographer_matching_integration(self):
        """Test photographer matching with various SKU formats"""
        print("👥 Testing photographer matching integration...")

        # Create temporary photographer CSV
        temp_csv = os.path.join(self.test_data_dir, "test_photographers.csv")
        with open(temp_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['sku_range', 'name', 'state'])
            writer.writerow(['270000-279999', 'Test Photographer', 'Wisconsin'])
            writer.writerow(['280', 'Another Photographer', 'Iowa'])

        matcher = PhotographerMatcher(temp_csv)

        # Test range matching
        result1 = matcher.get_vendor_info('270480')
        self.assertEqual(result1['vendor'], 'Test Photographer')
        self.assertEqual(result1['state'], 'Wisconsin')
        self.assertEqual(result1['photographer_match'], 'yes')

        # Test prefix matching
        result2 = matcher.get_vendor_info('280123')
        self.assertEqual(result2['vendor'], 'Another Photographer')

        print(f"   ✅ Photographer matching: 2 successful matches")

    def test_csv_export_integration(self):
        """Test CSV export with complete data structure"""
        print("📊 Testing CSV export integration...")

        test_data = [{
            'filename': 'test_image.jpg',
            'title': 'Wisconsin Agriculture',
            'description': 'Test description',
            'keywords': 'wisconsin, corn, agriculture, farming',
            'tags': 'wisconsin, corn, agriculture, farming',
            'collections': '',
            'models': '',
            'vendor': 'Test Photographer',
            'photographer_match': 'yes'
        }]

        temp_csv = os.path.join(self.test_data_dir, "test_export.csv")
        success, stats = export_to_shopify_csv_v4_simplified(test_data, temp_csv)

        # Assertions
        self.assertTrue(success, "Export should succeed")
        self.assertTrue(os.path.exists(temp_csv), "CSV file should be created")
        self.assertEqual(stats['successful_exports'], 1, "Should export 1 record")

        # Verify CSV content
        with open(temp_csv, 'r') as f:
            content = f.read()
            self.assertIn('wisconsin', content, "Should contain keywords")
            self.assertIn('Test Photographer', content, "Should contain vendor")

        print(f"   ✅ CSV export: 1 record exported successfully")

    def test_json_validation_integration(self):
        """Test JSON validation with complex nested data"""
        print("🔍 Testing JSON validation integration...")

        test_data = [
            {
                'filename': 'test.jpg',
                'specialist_keywords': 'corn, agriculture',
                'debug_timing': {'total': 2.5, 'inference': 1.8},
                'debug_memory': {'pre_inference': 2.1, 'post_inference': 2.3}
            }
        ]

        is_valid, message = progressive_json_validation(test_data, "Integration Test")

        # Assertions
        self.assertTrue(is_valid, f"JSON validation should pass: {message}")

        print(f"   ✅ JSON validation: {message}")

    def test_bulletproof_string_escaping_integration(self):
        """Test bulletproof string escaping with problematic content"""
        print("🛡️ Testing bulletproof string escaping integration...")

        problematic_text = 'Test "quotes" and\nnewlines and\ttabs'
        escaped = bulletproof_escape_string(problematic_text)

        # Assertions
        self.assertNotIn('"', escaped.replace('\\"', ''))  # No unescaped quotes
        self.assertNotIn('\n', escaped.replace('\\n', ''))  # No unescaped newlines
        self.assertIn('\\"', escaped, "Should escape quotes")
        self.assertIn('\\n', escaped, "Should escape newlines")

        print(f"   ✅ String escaping: All special characters properly escaped")

    def test_api_rate_limiting_integration(self):
        """Test API rate limiting functionality"""
        print("🔄 Testing API rate limiting integration...")

        limiter = APIRateLimiter(requests_per_minute=2, requests_per_hour=10)

        # Make multiple requests quickly
        start_time = time.time()
        for i in range(3):
            limiter.wait_if_needed()
        end_time = time.time()

        # Should have introduced delays
        total_time = end_time - start_time
        self.assertGreater(total_time, 2.0, "Should introduce throttling delays")

        status = limiter.get_rate_limit_status()
        self.assertEqual(status['total_requests'], 3, "Should track requests")

        print(f"   ✅ Rate limiting: {total_time:.1f}s for 3 requests (throttling active)")

    def test_memory_management_integration(self):
        """Test memory management functions"""
        print("💾 Testing memory management integration...")

        initial_memory = get_gpu_memory_usage()
        cleanup_memory()
        post_cleanup_memory = get_gpu_memory_usage()

        # Memory should be same or lower after cleanup
        self.assertLessEqual(post_cleanup_memory, initial_memory + 0.1,
                           "Memory should not increase after cleanup")

        print(f"   ✅ Memory management: {initial_memory:.2f}GB → {post_cleanup_memory:.2f}GB")

def run_integration_tests():
    """Run IntegrationTestSuite and print a detailed pass/fail report.

    Returns:
        bool: True when at least 75% of the executed tests passed. False
        otherwise, including when no test ran at all (for example because
        class-level setup failed).
    """
    print("🚀 Starting Integration Test Suite...")
    print("=" * 50)

    # Create test suite
    suite = unittest.TestLoader().loadTestsFromTestCase(IntegrationTestSuite)

    def _test_label(test):
        # Class/module-level fixture failures are reported through a
        # placeholder object that has no _testMethodName; fall back to its
        # string description instead of raising AttributeError.
        return getattr(test, '_testMethodName', None) or str(test)

    # Custom test result class for detailed reporting
    class DetailedTestResult(unittest.TextTestResult):
        """TextTestResult that also records (status, name, error) per outcome."""

        def __init__(self, *args, **kwargs):
            # Forward everything so newer runner keywords (e.g. durations)
            # are accepted without relying on the runner's fallback path.
            super().__init__(*args, **kwargs)
            self.test_results = []

        def addSuccess(self, test):
            super().addSuccess(test)
            self.test_results.append(('PASS', _test_label(test), None))

        def addFailure(self, test, err):
            super().addFailure(test, err)
            # err is an exc_info tuple; err[1] is the exception instance
            self.test_results.append(('FAIL', _test_label(test), err[1]))

        def addError(self, test, err):
            super().addError(test, err)
            self.test_results.append(('ERROR', _test_label(test), err[1]))

    # Run tests with detailed output
    runner = unittest.TextTestRunner(verbosity=0, resultclass=DetailedTestResult)
    result = runner.run(suite)

    failed_count = len(result.failures)
    error_count = len(result.errors)
    # Fixture errors are counted in result.errors but not in testsRun, so
    # clamp at zero to avoid reporting a negative number of passes.
    passed_count = max(0, result.testsRun - failed_count - error_count)

    # Generate comprehensive report
    print("\n📊 INTEGRATION TEST RESULTS:")
    print("=" * 35)
    print(f"   🎯 Tests Run: {result.testsRun}")
    print(f"   ✅ Passed: {passed_count}")
    print(f"   ❌ Failed: {failed_count}")
    print(f"   🚨 Errors: {error_count}")

    # Detailed results
    if hasattr(result, 'test_results'):
        print("\n📋 DETAILED TEST RESULTS:")
        print("-" * 30)
        for status, test_name, error in result.test_results:
            status_icon = "✅" if status == "PASS" else "❌"
            print(f"   {status_icon} {test_name}")
            # Only short messages are shown inline; long ones are left to
            # the runner's own failure output.
            if error and len(str(error)) < 100:
                print(f"      └─ {str(error)}")

    # Calculate success rate
    success_rate = (passed_count / result.testsRun * 100) if result.testsRun > 0 else 0

    print("\n🏆 INTEGRATION TEST SUMMARY:")
    print(f"   📊 Success Rate: {success_rate:.1f}%")

    if success_rate >= 90:
        print("   🎉 EXCELLENT: All major integration points validated!")
    elif success_rate >= 75:
        print("   ✅ GOOD: Core functionality validated with minor issues")
    else:
        print("   ⚠️ NEEDS ATTENTION: Multiple integration issues detected")

    return success_rate >= 75

# Run the suite; the result is True when at least 75% of the tests passed
integration_success = run_integration_tests()

print("\n🔗 PIPELINE INTEGRATION STATUS:")
print("=" * 35)
if not integration_success:
    print("⚠️ Integration issues detected - review required")
    print("🔧 Recommend addressing failures before production")
else:
    print("✅ All critical pipeline integrations verified")
    print("🚀 System ready for production deployment")

# Closing banner for this step
print("\n🏁 Step 13 Complete: Integration Testing Finished")
print("=" * 55)