<a href="https://colab.research.google.com/github/laurencoetzee001/Beads_Co-detect/blob/main/prompt_optimisation_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Bead Trade Coding - Test Version for Munashe_Cleaned.xlsx
==========================================================

Tests:
- Output format and structure
- JSON parsing and validation
- Excel output mapping
- Cost estimation for 27k rows
- Error handling
- Processing speed

Single session, all 1,453 rows
"""

import subprocess
import sys
import os
import json
import time
from datetime import datetime

# === INSTALL DEPENDENCIES ===
for pkg in ["anthropic", "openpyxl", "pandas", "tenacity"]:
    try:
        __import__(pkg if pkg != "openpyxl" else "openpyxl")
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

import pandas as pd
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_exponential

# === API KEY SETUP ===
# Option 1: Paste your API key directly here (between the quotes)
API_KEY_DIRECT = ""  # Paste like: "sk-ant-api03-YOUR_KEY_HERE"

# Option 2: Or set via environment variable before running
# export ANTHROPIC_API_KEY="sk-ant-api03-YOUR_KEY_HERE"

# Get API key from either source
ANTHROPIC_API_KEY = API_KEY_DIRECT or os.getenv("ANTHROPIC_API_KEY", "")

# If still not set, prompt user
if not ANTHROPIC_API_KEY:
    print("\n" + "="*80)
    print("API KEY REQUIRED")
    print("="*80)
    print("\nYou need an Anthropic API key to run this script.")
    print("Get one at: https://console.anthropic.com")
    print("\nOptions:")
    print("  1. Paste it directly in the script (line 33: API_KEY_DIRECT)")
    print("  2. Set environment variable: export ANTHROPIC_API_KEY='your-key'")
    print("  3. Enter it now (will only be used for this run)")
    print()

    user_input = input("Enter your API key (or press Enter to exit): ").strip()
    if user_input:
        ANTHROPIC_API_KEY = user_input
        print("✓ API key accepted for this session")
    else:
        print("\nExiting. Please set your API key and try again.")
        sys.exit(1)

# === CONFIGURATION ===
INPUT_FILE = "Munashe_Cleaned.xlsx"
OUTPUT_DIR = "./test_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "claude-3-5-haiku-20241022"
TEXT_COLUMN = "text_page_gp"
MAX_ROWS = 1453  # All rows for test

# === EFFICIENT SYSTEM PROMPT ===
SYSTEM_PROMPT = """You are a historian analyzing pre-colonial African bead trade records.

TASK: Extract structured data about bead exchanges. Apply conservative decision rules - require explicit evidence, do NOT infer.

RESPONSE SCHEMA (RETURN VALID JSON ONLY - no markdown):
{
  "4a_exchange": "xo or no",
  "4b_beads_exchanged": "description or null",
  "4c_exchanged_item": "what traded for or null",
  "6_bead_ethnic_group": "groups or null",
  "8_location_name": "location or null",
  "9_place_of_manufacture": "where made or null",
  "10_beads_observed": "physical description or null",
  "12_local_name": "indigenous names or null",
  "13_notes": "research context or null"
}

CRITICAL RULE FOR 4a_exchange:
- "xo" = ONLY explicit trade/barter transactions (e.g., "traded 20 beads for goats")
- "no" = manufacturing, adornment, displays, ceremonies, value statements, general demand, payment systems

PROCESSING RULES:
1. Apply rules individually to each row (don't infer from context)
2. Use null for missing information (not "unknown")
3. Preserve descriptive detail for research value
4. Return VALID JSON ONLY - no extra text
5. Base answers ONLY on the text provided
"""

USER_PROMPT_TEMPLATE = """Analyze this historical excerpt:

TEXT:
{text}

Extract data into the JSON schema. Return valid JSON only."""

# === UTILITY FUNCTIONS ===

def validate_json_response(json_obj):
    """Validate required fields."""
    if "4a_exchange" not in json_obj:
        return False, "Missing required field: 4a_exchange"

    if json_obj["4a_exchange"] not in ["xo", "no"]:
        return False, f"Invalid 4a_exchange: {json_obj['4a_exchange']}"

    return True, None

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    reraise=True
)
def call_claude(client, entry_text):
    """Call Claude API with retry logic."""
    response = client.messages.create(
        model=MODEL_NAME,
        max_tokens=1200,
        temperature=0,
        system=SYSTEM_PROMPT,
        messages=[{
            "role": "user",
            "content": USER_PROMPT_TEMPLATE.format(text=entry_text)
        }]
    )
    return response

def calculate_cost(input_tokens, output_tokens):
    """Calculate API cost for Haiku."""
    # Haiku pricing: $0.80 per million input, $0.24 per million output
    input_cost = (input_tokens / 1_000_000) * 0.80
    output_cost = (output_tokens / 1_000_000) * 0.24
    return input_cost + output_cost

def extrapolate_27k_cost(cost_per_row):
    """Estimate cost for 27,000 rows based on test."""
    return cost_per_row * 27000

def print_section(title):
    """Print formatted section header."""
    print(f"\n{'='*80}")
    print(f"{title}")
    print(f"{'='*80}")

# === MAIN TEST ===

def main():
    print_section("BEAD TRADE CODING - TEST VERSION")
    print(f"Input file: {INPUT_FILE}")
    print(f"Rows to test: {MAX_ROWS}")
    print(f"Model: {MODEL_NAME}")

    # Check API key
    if not ANTHROPIC_API_KEY:
        print("ERROR: ANTHROPIC_API_KEY not set (this shouldn't happen)")
        return

    # Verify API key format
    if not ANTHROPIC_API_KEY.startswith("sk-ant-"):
        print(f"WARNING: API key doesn't look right. Should start with 'sk-ant-'")
        print(f"Current value starts with: {ANTHROPIC_API_KEY[:10]}")
        confirm = input("\nContinue anyway? (y/n): ").strip().lower()
        if confirm != 'y':
            return

    # Load data
    print(f"\nLoading data file...")
    try:
        df = pd.read_excel(INPUT_FILE)
        print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
    except Exception as e:
        print(f"ERROR: Could not load file: {e}")
        return

    # Verify text column
    if TEXT_COLUMN not in df.columns:
        print(f"ERROR: Column '{TEXT_COLUMN}' not found")
        print(f"Available columns: {list(df.columns)}")
        return

    # Initialize
    client = Anthropic(api_key=ANTHROPIC_API_KEY)
    responses = []
    total_input_tokens = 0
    total_output_tokens = 0
    session_start_time = time.time()
    success_count = 0
    error_count = 0
    skipped_count = 0

    # Sample tracking for quality check
    sample_responses = []
    exchange_values = {"xo": 0, "no": 0}

    # Process rows
    print_section("PROCESSING")
    print(f"Starting test on {MAX_ROWS} rows...")
    print(f"Progress updates every 100 rows\n")

    for row_idx in range(MAX_ROWS):
        try:
            row = df.iloc[row_idx]
            entry_text = row.get(TEXT_COLUMN)

            # Skip empty
            if pd.isna(entry_text) or not str(entry_text).strip():
                responses.append(None)
                skipped_count += 1
                continue

            entry_text = str(entry_text).strip()

            # Call Claude
            response = call_claude(client, entry_text)
            total_input_tokens += response.usage.input_tokens
            total_output_tokens += response.usage.output_tokens
            response_text = response.content[0].text.strip()

            # Parse JSON
            try:
                parsed_json = json.loads(response_text)

                # Validate
                is_valid, error = validate_json_response(parsed_json)
                if is_valid:
                    responses.append(parsed_json)
                    success_count += 1

                    # Track exchange values
                    exchange_val = parsed_json.get("4a_exchange")
                    if exchange_val in ["xo", "no"]:
                        exchange_values[exchange_val] += 1

                    # Save first 5 for inspection
                    if len(sample_responses) < 5:
                        sample_responses.append({
                            "row": row_idx,
                            "response": parsed_json,
                            "text_preview": entry_text[:100]
                        })
                else:
                    responses.append({"error": error})
                    error_count += 1

            except json.JSONDecodeError as e:
                responses.append({"error": f"JSON parse error"})
                error_count += 1

            # Progress
            if (row_idx + 1) % 100 == 0:
                elapsed = (time.time() - session_start_time) / 60
                cost_so_far = calculate_cost(total_input_tokens, total_output_tokens)
                rows_done = row_idx + 1
                rate = rows_done / elapsed if elapsed > 0 else 0
                eta_total = MAX_ROWS / rate if rate > 0 else 0

                print(f"Row {row_idx+1}/{MAX_ROWS} | Success: {success_count} | "
                      f"Cost: ${cost_so_far:.2f} | Rate: {rate:.1f} rows/min | ETA: {eta_total:.0f}m")

        except Exception as e:
            responses.append({"error": str(e)[:100]})
            error_count += 1

    # === ANALYSIS ===

    elapsed_time = (time.time() - session_start_time) / 60
    final_cost = calculate_cost(total_input_tokens, total_output_tokens)
    cost_per_row = final_cost / (MAX_ROWS - skipped_count) if (MAX_ROWS - skipped_count) > 0 else 0
    extrapolated_27k = extrapolate_27k_cost(cost_per_row)

    print_section("TEST RESULTS")

    print(f"\nProcessing Summary:")
    print(f"  Total rows: {MAX_ROWS}")
    print(f"  Successfully coded: {success_count}")
    print(f"  Errors: {error_count}")
    print(f"  Skipped (empty): {skipped_count}")
    print(f"  Success rate: {(success_count/(MAX_ROWS-skipped_count)*100):.1f}%")

    print(f"\nToken Usage:")
    print(f"  Input tokens: {total_input_tokens:,}")
    print(f"  Output tokens: {total_output_tokens:,}")
    print(f"  Total tokens: {total_input_tokens + total_output_tokens:,}")
    print(f"  Avg tokens per row: {(total_input_tokens + total_output_tokens)/(MAX_ROWS-skipped_count):.0f}")

    print(f"\nCost Analysis:")
    print(f"  Test cost: ${final_cost:.2f}")
    print(f"  Cost per row: ${cost_per_row:.4f}")
    print(f"  EXTRAPOLATED 27,000 rows: ${extrapolated_27k:.2f}")

    print(f"\nTiming:")
    print(f"  Total time: {elapsed_time:.1f} minutes")
    print(f"  Rows per minute: {(MAX_ROWS-skipped_count)/elapsed_time:.1f}")
    print(f"  Time per row: {(elapsed_time/(MAX_ROWS-skipped_count))*1000:.1f}ms")
    print(f"  Extrapolated for 27,000 rows: {(elapsed_time/(MAX_ROWS-skipped_count))*27000/60:.1f} hours")

    print(f"\nCoding Distribution (4a_exchange):")
    print(f"  'xo' (transactions): {exchange_values['xo']} ({(exchange_values['xo']/success_count*100):.1f}%)")
    print(f"  'no' (other mentions): {exchange_values['no']} ({(exchange_values['no']/success_count*100):.1f}%)")

    # === OUTPUT FORMAT CHECK ===

    print_section("OUTPUT FORMAT VALIDATION")

    # Check what fields are being extracted
    all_fields = set()
    for resp in responses:
        if resp and isinstance(resp, dict) and "error" not in resp:
            all_fields.update(resp.keys())

    print(f"\nFields extracted: {len(all_fields)}")
    for field in sorted(all_fields):
        count = sum(1 for r in responses if r and isinstance(r, dict) and field in r and r[field] is not None)
        pct = (count / success_count * 100) if success_count > 0 else 0
        print(f"  {field}: {count} rows ({pct:.1f}%)")

    # === SAMPLE INSPECTION ===

    print_section("SAMPLE RESPONSES (First 5)")

    for i, sample in enumerate(sample_responses, 1):
        print(f"\nRow {sample['row']}:")
        print(f"  Text: {sample['text_preview']}...")
        print(f"  Response:")
        for key, value in sample['response'].items():
            if value:
                preview = str(value)[:60]
                print(f"    {key}: {preview}")

    # === CREATE TEST OUTPUT ===

    print_section("SAVING OUTPUT")

    # Create output dataframe
    output_df = df.iloc[:len(responses)].copy()

    # Add response columns
    for i, resp in enumerate(responses):
        if resp and isinstance(resp, dict) and "error" not in resp:
            for key, value in resp.items():
                if key not in output_df.columns:
                    output_df[key] = None
                output_df.at[i, key] = value

    # Save output
    output_file = os.path.join(OUTPUT_DIR, "test_output_1453_rows.xlsx")
    output_df.to_excel(output_file, index=False)
    print(f"\nOutput saved: {output_file}")
    print(f"  Rows: {len(output_df)}")
    print(f"  Columns: {len(output_df.columns)} (original: {len(df.columns)}, new: {len(output_df.columns) - len(df.columns)})")

    # Save detailed report
    report_file = os.path.join(OUTPUT_DIR, "test_report.txt")
    with open(report_file, 'w') as f:
        f.write("BEAD TRADE CODING - TEST REPORT\n")
        f.write("="*80 + "\n\n")
        f.write(f"Test date: {datetime.now().isoformat()}\n")
        f.write(f"Input file: {INPUT_FILE}\n")
        f.write(f"Rows processed: {MAX_ROWS}\n\n")
        f.write(f"RESULTS:\n")
        f.write(f"  Success rate: {(success_count/(MAX_ROWS-skipped_count)*100):.1f}%\n")
        f.write(f"  Test cost: ${final_cost:.2f}\n")
        f.write(f"  Extrapolated 27k cost: ${extrapolated_27k:.2f}\n")
        f.write(f"  Processing time: {elapsed_time:.1f} minutes\n")
        f.write(f"  Extrapolated 27k time: {(elapsed_time/(MAX_ROWS-skipped_count))*27000/60:.1f} hours\n")
        f.write(f"  Cost per row: ${cost_per_row:.4f}\n")

    print(f"\nReport saved: {report_file}")

    # === FINAL ASSESSMENT ===

    print_section("READY FOR PRODUCTION?")

    checks = [
        ("Success rate > 85%", success_count/(MAX_ROWS-skipped_count)*100 > 85),
        ("Fields being extracted", len(all_fields) >= 5),
        ("Reasonable cost", extrapolated_27k < 50),
        ("Acceptable speed", (elapsed_time/(MAX_ROWS-skipped_count))*27000/60 < 40),
        ("Output file created", os.path.exists(output_file))
    ]

    all_pass = True
    for check_name, result in checks:
        status = "PASS" if result else "FAIL"
        print(f"  [{status}] {check_name}")
        if not result:
            all_pass = False

    print()
    if all_pass:
        print("✓ All checks passed. Script is ready for 27,000 row production run.")
        print(f"\nExpected for full 27,000 rows:")
        print(f"  Processing time: ~{(elapsed_time/(MAX_ROWS-skipped_count))*27000/60:.0f} hours (across {((elapsed_time/(MAX_ROWS-skipped_count))*27000/60)/11:.0f} sessions)")
        print(f"  Total cost: ~${extrapolated_27k:.2f}")
        print(f"  Success rate: ~{(success_count/(MAX_ROWS-skipped_count)*100):.0f}%")
    else:
        print("✗ Some checks failed. Review issues before running production.")

    print()

if __name__ == "__main__":
    main()


BEAD TRADE CODING - TEST VERSION
Input file: Munashe_Cleaned.xlsx
Rows to test: 1453
Model: claude-3-5-haiku-20241022

Loading data file...
Loaded 1453 rows, 23 columns

PROCESSING
Starting test on 1453 rows...
Progress updates every 100 rows

Row 100/1453 | Success: 100 | Cost: $0.07 | Rate: 19.6 rows/min | ETA: 74m
Row 200/1453 | Success: 200 | Cost: $0.14 | Rate: 21.0 rows/min | ETA: 69m
Row 300/1453 | Success: 300 | Cost: $0.21 | Rate: 20.8 rows/min | ETA: 70m
Row 400/1453 | Success: 400 | Cost: $0.29 | Rate: 21.0 rows/min | ETA: 69m
Row 500/1453 | Success: 500 | Cost: $0.40 | Rate: 20.3 rows/min | ETA: 72m
Row 600/1453 | Success: 600 | Cost: $0.47 | Rate: 20.5 rows/min | ETA: 71m
Row 700/1453 | Success: 699 | Cost: $0.55 | Rate: 20.6 rows/min | ETA: 71m
Row 800/1453 | Success: 799 | Cost: $0.68 | Rate: 20.1 rows/min | ETA: 72m
Row 900/1453 | Success: 899 | Cost: $0.75 | Rate: 19.9 rows/min | ETA: 73m
Row 1000/1453 | Success: 999 | Cost: $0.82 | Rate: 20.1 rows/min | ETA: 72m
Row 

In [None]:
"""
Bead Trade Coding - COMPLETE VERSION with ALL 13 Fields
========================================================

Codes ALL 13 fields from the codebook:
1. 1_price_HUMAN (price/exchange info)
2. 2_size_HUMAN (size classification)
3. 3_colour_HUMAN (color codes)
4. 4_location_HUMAN (location type)
5. 5_function_HUMAN (function in society)
6. 6_origin_of_bead (geographic origin)
7. 7_shape_HUMAN (shape)
8. 8_type_bead_HUMAN (material)
9. 9_local_name_HUMAN (ethnic/language name)
10. 10_relationship_ (other exchange forms)
11. 11_units_of_measure (measurement units)
12. 12_bead_ethnic_ (ethnic group)
13. 13_nature_of_exchange (consensual/conflictual)

Single session, all 1,453 rows
"""

import subprocess
import sys
import os
import json
import time
from datetime import datetime

# === INSTALL DEPENDENCIES ===
for pkg in ["anthropic", "openpyxl", "pandas", "tenacity"]:
    try:
        __import__(pkg if pkg != "openpyxl" else "openpyxl")
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

import pandas as pd
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_exponential

# === API KEY SETUP ===
API_KEY_DIRECT = ""  # Paste like: "sk-ant-api03-YOUR_KEY_HERE"
ANTHROPIC_API_KEY = API_KEY_DIRECT or os.getenv("ANTHROPIC_API_KEY", "")

if not ANTHROPIC_API_KEY:
    print("\n" + "="*80)
    print("API KEY REQUIRED")
    print("="*80)
    user_input = input("Enter your API key (or press Enter to exit): ").strip()
    if user_input:
        ANTHROPIC_API_KEY = user_input
        print("✓ API key accepted for this session")
    else:
        print("\nExiting. Please set your API key and try again.")
        sys.exit(1)

# === CONFIGURATION ===
INPUT_FILE = "Munashe_Cleaned.xlsx"
OUTPUT_DIR = "./test_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "claude-haiku-4-5-20251001"
TEXT_COLUMN = "text_page_gp"
MAX_ROWS = 1453

# === COMPREHENSIVE SYSTEM PROMPT WITH ALL 13 FIELDS ===
SYSTEM_PROMPT = """You are a historian analyzing pre-colonial African bead trade records.

TASK: Extract ALL 13 structured data fields about bead exchanges. Apply conservative rules - require explicit evidence.

RESPONSE SCHEMA (RETURN VALID JSON ONLY):
{
  "1_price_HUMAN": {
    "status": "yes|no|xo",
    "amount": "number or measurement if mentioned, else null",
    "currency": "currency/commodity name if mentioned, else null",
    "description": "full price/exchange description from text"
  },
  "2_size_HUMAN": {
    "code": [1, 2, 3, 4, 5, or 6] or null,
    "description": "exact size description from text"
  },
  "3_colour_HUMAN": {
    "codes": [array of color codes],
    "description": "exact color description from text (especially if code=14/other)"
  },
  "4_location_HUMAN": {
    "codes": [array of location type codes],
    "names": "actual location names mentioned"
  },
  "5_function_HUMAN": {
    "codes": [array of function codes],
    "description": "detailed function description from text"
  },
  "6_origin_of_bead": "geographic origin text or null",
  "7_shape_HUMAN": {
    "codes": [array of shape codes],
    "description": "exact shape description from text (include if unusual)"
  },
  "8_type_bead_HUMAN": {
    "codes": [array of material codes],
    "description": "exact material description from text (especially if not standard)"
  },
  "9_local_name_HUMAN": {
    "exists": "1|2",
    "names": ["list of names"] or null
  },
  "10_relationship_": {
    "codes": [array of relationship codes],
    "description": "detailed description of exchange items from text"
  },
  "11_units_of_measure": {
    "type": code or null,
    "description": "exact measurement description from text"
  },
  "12_bead_ethnic_": ["array of ethnic group names"] or null,
  "13_nature_of_exchange": {
    "code": code or null,
    "description": "description of exchange nature from text"
  },
  "notes": "additional research context not captured above"
}

FIELD DEFINITIONS:

1_price_HUMAN (Field 28):
- status: "yes" = price mentioned, "no" = no mention, "xo" = exchanged
- amount: number of beads (5, 10, 20, 50) or body measurement (thumb to elbow)
- currency: what was exchanged
- description: ALWAYS include the full price/exchange text from the source

2_size_HUMAN (Field 29): Classify size
- code: 1=large, 2=medium, 3=small, 4=various, 5=thin, 6=thick
- description: ALWAYS include exact size description from text (e.g., "thumbnail-sized", "as big as a pigeon's egg")

3_colour_HUMAN (Field 30): Array of color codes + description
- codes: 1=red, 2=blue, 3=white, 4=pink, 5=coral, 6=amber, 7=copper, 8=green
  9=yellow, 10=transparent, 11=seed(glass), 12=black, 13=multicoloured, 14=other
- description: ALWAYS include exact color text from source. CRITICAL for code 14 (other) or unusual colors

4_location_HUMAN (Field 31): Array of location types + names
- codes: 1=mountain/hill/peak, 2=lake, 3=river/waterfall, 4=populated place
- names: ALWAYS include actual location names from text (e.g., "Kumasi", "Niger River")

5_function_HUMAN (Field 32): Array of functions + description
- codes: 1=jewellery/adornment/clothing, 2=currency/exchange/tax
  3=ceremonial/religious/tribute, 4=class/status symbol/gift/social exchange
- description: ALWAYS include detailed function text from source

6_origin_of_bead (Field 33):
- ALWAYS include full geographic origin text (e.g., "Venetian glass beads from Murano")

7_shape_HUMAN (Field 34): Array of shape codes + description
- codes: 1=round, 2=tubular, 3=square, 4=oval, 5=oblong, 6=punched
  7=wound, 8=pressed, 9=decorative, 10=faceted, 11=bugle, 12=chevron
- description: ALWAYS include exact shape text, especially for unusual shapes not in codes

8_type_bead_HUMAN (Field 35): Array of material codes + description
- codes: 1=glass/seed beads, 2=clay, 3=metal(brass/copper/silver/gold/iron)
  4=stone(quartz/agate/carnelian/jasper/amethyst/lapis/turquoise/malachite)
  5=coral, 6=amber, 7=bone, 8=ivory, 9=dried seed, 10=ceramic
  11=wooden, 12=porcelain, 13=shell(seashells), 14=eggshell(ostrich)
- description: ALWAYS include exact material text, CRITICAL for materials not in standard list

9_local_name_HUMAN (Field 36):
- exists: "1" = yes (provide names), "2" = unspecified
- names: ALWAYS include all ethnic/language names mentioned exactly as written

10_relationship_ (Field 37): Array of connected exchange forms + description
- codes: 1=wire, 2=cloth, 3=shells(cowries), 4=coins, 5=livestock, 6=iron bars/rods
  7=scarabs, 8=precious stones/agates, 9=antiquities/furniture, 10=ostrich feathers
  11=ebony/ivory, 12=salt, 13=rubber/gum, 14=medicines/remedies/herbal plants
  15=spices/essences/fragrances/perfumes/oil, 16=wax/stamps/seals
  17=skin/leather/hides/horns/animal products, 18=indigenous weapons/spears/shields
  19=dried food/fruits/consumables, 20=prints/artwork/books/paper/scrolls
  21=guns/gunpowder, 22=jewellery, 23=textiles/clothing, 24=gold/silver/gold dust
  25=slaves, 26=glass objects, 27=hardware/manufactures, 28=tobacco/snuff
  29=musical instruments, 30=water, 31=alcohol
- description: ALWAYS include detailed description of ALL exchange items from text

11_units_of_measure (Field 38):
- type: 1=string, 2=plaited/woven string, 3=necklace/anklet/bracelet/waist beads/headwear, 4=other
- description: ALWAYS include exact measurement description, CRITICAL when type=4

12_bead_ethnic_ (Field 39):
- ALWAYS include all ethnic group names mentioned exactly as written

13_nature_of_exchange (Field 40): Code + description
- code: 1=consensual, 2=conflictual, 3=unspecified, 4=competitive/bartering/haggling
  5=social(gifts/tributes), 6=uncommercial
- description: ALWAYS include text describing the exchange nature

CRITICAL RULES:
1. Return ONLY valid JSON (no markdown, no extra text)
2. Use null for missing information (not "unknown" or "not mentioned")
3. For arrays, return empty array [] if no data, or null if not applicable
4. **ALWAYS include description fields - preserve the original text verbatim**
5. **When using "other" categories or unusual values, description field is MANDATORY**
6. Base answers ONLY on provided text
7. Apply each field's rules independently
8. Descriptions preserve research value - never skip them
"""

USER_PROMPT_TEMPLATE = """Analyze this historical excerpt and extract ALL 13 fields:

TEXT:
{text}

Return valid JSON with all 13 fields."""

# === UTILITY FUNCTIONS ===

def validate_json_response(json_obj):
    """Validate required fields exist."""
    required_fields = [
        '1_price_HUMAN', '2_size_HUMAN', '3_colour_HUMAN', '4_location_HUMAN',
        '5_function_HUMAN', '6_origin_of_bead', '7_shape_HUMAN', '8_type_bead_HUMAN',
        '9_local_name_HUMAN', '10_relationship_', '11_units_of_measure',
        '12_bead_ethnic_', '13_nature_of_exchange'
    ]

    missing = [f for f in required_fields if f not in json_obj]
    if missing:
        return False, f"Missing fields: {', '.join(missing)}"

    return True, None

def flatten_json_for_excel(json_obj):
    """Flatten nested JSON structure for Excel output with BOTH codes and descriptions."""
    flat = {}

    # 1_price_HUMAN - flatten nested object
    if json_obj.get('1_price_HUMAN'):
        price = json_obj['1_price_HUMAN']
        if isinstance(price, dict):
            flat['1_price_status'] = price.get('status')
            flat['1_price_amount'] = price.get('amount')
            flat['1_price_currency'] = price.get('currency')
            flat['1_price_description'] = price.get('description')
        else:
            flat['1_price_status'] = str(price) if price else None
            flat['1_price_amount'] = None
            flat['1_price_currency'] = None
            flat['1_price_description'] = None
    else:
        flat['1_price_status'] = None
        flat['1_price_amount'] = None
        flat['1_price_currency'] = None
        flat['1_price_description'] = None

    # 2_size_HUMAN - code + description
    size = json_obj.get('2_size_HUMAN')
    if isinstance(size, dict):
        flat['2_size_code'] = size.get('code')
        flat['2_size_description'] = size.get('description')
    else:
        flat['2_size_code'] = size  # Handle old format
        flat['2_size_description'] = None

    # 3_colour_HUMAN - codes + description
    color = json_obj.get('3_colour_HUMAN')
    if isinstance(color, dict):
        codes = color.get('codes')
        flat['3_colour_codes'] = ','.join(map(str, codes)) if codes else None
        flat['3_colour_description'] = color.get('description')
    else:
        flat['3_colour_codes'] = ','.join(map(str, color)) if color else None
        flat['3_colour_description'] = None

    # 4_location_HUMAN - codes + names
    location = json_obj.get('4_location_HUMAN')
    if isinstance(location, dict):
        codes = location.get('codes')
        flat['4_location_codes'] = ','.join(map(str, codes)) if codes else None
        flat['4_location_names'] = location.get('names')
    else:
        flat['4_location_codes'] = ','.join(map(str, location)) if location else None
        flat['4_location_names'] = None

    # 5_function_HUMAN - codes + description
    function = json_obj.get('5_function_HUMAN')
    if isinstance(function, dict):
        codes = function.get('codes')
        flat['5_function_codes'] = ','.join(map(str, codes)) if codes else None
        flat['5_function_description'] = function.get('description')
    else:
        flat['5_function_codes'] = ','.join(map(str, function)) if function else None
        flat['5_function_description'] = None

    # 6_origin_of_bead - text
    flat['6_origin_of_bead'] = json_obj.get('6_origin_of_bead')

    # 7_shape_HUMAN - codes + description
    shape = json_obj.get('7_shape_HUMAN')
    if isinstance(shape, dict):
        codes = shape.get('codes')
        flat['7_shape_codes'] = ','.join(map(str, codes)) if codes else None
        flat['7_shape_description'] = shape.get('description')
    else:
        flat['7_shape_codes'] = ','.join(map(str, shape)) if shape else None
        flat['7_shape_description'] = None

    # 8_type_bead_HUMAN - codes + description
    bead_type = json_obj.get('8_type_bead_HUMAN')
    if isinstance(bead_type, dict):
        codes = bead_type.get('codes')
        flat['8_type_codes'] = ','.join(map(str, codes)) if codes else None
        flat['8_type_description'] = bead_type.get('description')
    else:
        flat['8_type_codes'] = ','.join(map(str, bead_type)) if bead_type else None
        flat['8_type_description'] = None

    # 9_local_name_HUMAN - flatten nested object
    local_name = json_obj.get('9_local_name_HUMAN')
    if isinstance(local_name, dict):
        flat['9_local_name_exists'] = local_name.get('exists')
        names = local_name.get('names')
        flat['9_local_name_names'] = '; '.join(names) if names else None
    else:
        flat['9_local_name_exists'] = None
        flat['9_local_name_names'] = None

    # 10_relationship_ - codes + description
    relationship = json_obj.get('10_relationship_')
    if isinstance(relationship, dict):
        codes = relationship.get('codes')
        flat['10_relationship_codes'] = ','.join(map(str, codes)) if codes else None
        flat['10_relationship_description'] = relationship.get('description')
    else:
        flat['10_relationship_codes'] = ','.join(map(str, relationship)) if relationship else None
        flat['10_relationship_description'] = None

    # 11_units_of_measure - flatten nested object
    units = json_obj.get('11_units_of_measure')
    if isinstance(units, dict):
        flat['11_units_type'] = units.get('type')
        flat['11_units_description'] = units.get('description')
    else:
        flat['11_units_type'] = None
        flat['11_units_description'] = None

    # 12_bead_ethnic_ - array to semicolon-separated string
    ethnics = json_obj.get('12_bead_ethnic_')
    flat['12_bead_ethnic_'] = '; '.join(ethnics) if ethnics else None

    # 13_nature_of_exchange - code + description
    nature = json_obj.get('13_nature_of_exchange')
    if isinstance(nature, dict):
        flat['13_nature_code'] = nature.get('code')
        flat['13_nature_description'] = nature.get('description')
    else:
        flat['13_nature_code'] = nature
        flat['13_nature_description'] = None

    # notes - text
    flat['notes'] = json_obj.get('notes')

    return flat

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    reraise=True
)
def call_claude(client, entry_text):
    """Call Claude API with retry logic."""
    response = client.messages.create(
        model=MODEL_NAME,
        max_tokens=2000,  # Increased for more complex response
        temperature=0,
        system=SYSTEM_PROMPT,
        messages=[{
            "role": "user",
            "content": USER_PROMPT_TEMPLATE.format(text=entry_text)
        }]
    )
    return response

def calculate_cost(input_tokens, output_tokens):
    """Calculate API cost for Haiku."""
    input_cost = (input_tokens / 1_000_000) * 0.80
    output_cost = (output_tokens / 1_000_000) * 0.24
    return input_cost + output_cost

def print_section(title):
    """Print formatted section header."""
    print(f"\n{'='*80}")
    print(f"{title}")
    print(f"{'='*80}")

# === MAIN TEST ===

def main():
    print_section("BEAD TRADE CODING - COMPLETE 13 FIELDS VERSION")
    print(f"Input file: {INPUT_FILE}")
    print(f"Rows to test: {MAX_ROWS}")
    print(f"Model: {MODEL_NAME}")
    print(f"Fields: ALL 13 from codebook")

    # Verify API key format
    if not ANTHROPIC_API_KEY.startswith("sk-ant-"):
        print(f"WARNING: API key doesn't look right. Should start with 'sk-ant-'")
        confirm = input("\nContinue anyway? (y/n): ").strip().lower()
        if confirm != 'y':
            return

    # Load data
    print(f"\nLoading data file...")
    try:
        df = pd.read_excel(INPUT_FILE)
        print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
    except Exception as e:
        print(f"ERROR: Could not load file: {e}")
        return

    if TEXT_COLUMN not in df.columns:
        print(f"ERROR: Column '{TEXT_COLUMN}' not found")
        return

    # Initialize
    client = Anthropic(api_key=ANTHROPIC_API_KEY)
    responses = []
    total_input_tokens = 0
    total_output_tokens = 0
    session_start_time = time.time()
    success_count = 0
    error_count = 0
    skipped_count = 0
    last_processed_row_idx = -1
    last_response_text = None
    last_error = None

    # Process rows
    print_section("PROCESSING")
    print(f"Starting processing on {MAX_ROWS} rows...")
    print(f"Progress updates every 100 rows\n")

    try:
        for row_idx in range(MAX_ROWS):
            last_processed_row_idx = row_idx
            row = df.iloc[row_idx]
            entry_text = row.get(TEXT_COLUMN)

            # Skip empty
            if pd.isna(entry_text) or not str(entry_text).strip():
                responses.append(None)
                skipped_count += 1
                continue

            entry_text = str(entry_text).strip()

            # Call Claude
            response = call_claude(client, entry_text)
            total_input_tokens += response.usage.input_tokens
            total_output_tokens += response.usage.output_tokens
            response_text = response.content[0].text.strip()
            last_response_text = response_text # Store for potential error inspection

            # Parse JSON
            try:
                parsed_json = json.loads(response_text)

                # Validate
                is_valid, error = validate_json_response(parsed_json)
                if is_valid:
                    # Flatten for Excel
                    flat_json = flatten_json_for_excel(parsed_json)
                    responses.append(flat_json)
                    success_count += 1
                    last_error = None # Clear error if successful
                else:
                    responses.append({"error": error})
                    error_count += 1
                    last_error = error

            except json.JSONDecodeError as e:
                responses.append({"error": f"JSON parse error"})
                error_count += 1
                last_error = f"JSON parse error: {e}"


            # Progress
            if (row_idx + 1) % 100 == 0:
                elapsed = (time.time() - session_start_time) / 60
                cost_so_far = calculate_cost(total_input_tokens, total_output_tokens)
                rows_done = row_idx + 1
                rate = rows_done / elapsed if elapsed > 0 else 0

                print(f"Row {row_idx+1}/{MAX_ROWS} | Success: {success_count} | "
                      f"Errors: {error_count} | Cost: ${cost_so_far:.2f} | Rate: {rate:.1f} rows/min")

        # === ANALYSIS ===

        elapsed_time = (time.time() - session_start_time) / 60
        final_cost = calculate_cost(total_input_tokens, total_output_tokens)

        print_section("TEST RESULTS")

        print(f"\nProcessing Summary:")
        print(f"  Total rows: {MAX_ROWS}")
        print(f"  Successfully coded: {success_count}")
        print(f"  Errors: {error_count}")
        print(f"  Skipped (empty): {skipped_count}")
        print(f"  Success rate: {(success_count/(MAX_ROWS-skipped_count)*100):.1f}%" if (MAX_ROWS-skipped_count) > 0 else "N/A")

        print(f"\nToken Usage:")
        print(f"  Input tokens: {total_input_tokens:,}")
        print(f"  Output tokens: {total_output_tokens:,}")
        print(f"  Total tokens: {total_input_tokens + total_output_tokens:,}")

        print(f"\nCost Analysis:")
        print(f"  Test cost: ${final_cost:.2f}")
        print(f"  Cost per row: ${final_cost/(MAX_ROWS-skipped_count):.4f}" if (MAX_ROWS-skipped_count) > 0 else "N/A")

        print(f"\nTiming:")
        print(f"  Total time: {elapsed_time:.1f} minutes")
        print(f"  Rows per minute: {(MAX_ROWS-skipped_count)/elapsed_time:.1f}" if elapsed_time > 0 and (MAX_ROWS-skipped_count) > 0 else "N/A")

        # === CREATE OUTPUT ===

        print_section("SAVING OUTPUT")

        # Create output dataframe
        output_df = df.iloc[:len(responses)].copy()

        # Add response columns
        for i, resp in enumerate(responses):
            if resp and isinstance(resp, dict) and "error" not in resp:
                for key, value in resp.items():
                    if key not in output_df.columns:
                        output_df[key] = None
                    output_df.at[i, key] = value

        # Save output
        output_file = os.path.join(OUTPUT_DIR, "COMPLETE_output_13_fields.xlsx")
        output_df.to_excel(output_file, index=False)

        print(f"\n✓ Output saved: {output_file}")
        print(f"  Rows: {len(output_df)}")
        print(f"  Original columns: {len(df.columns)}")
        print(f"  New coding columns: {len(output_df.columns) - len(df.columns)}")
        print(f"  Total columns: {len(output_df.columns)}")

        # Show field coverage
        print(f"\n13 Field Coverage:")
        coding_fields = [col for col in output_df.columns if col not in df.columns]
        for field in sorted(coding_fields):
            count = output_df[field].notna().sum()
            pct = (count / success_count * 100) if success_count > 0 else 0
            print(f"  {field:30s} | {count:4d} rows ({pct:5.1f}%)")

        print()

    except KeyboardInterrupt:
        print_section("PROCESS INTERRUPTED")
        print(f"Processing stopped at row index: {last_processed_row_idx}")
        if last_response_text:
            print(f"Last API response (partial): {last_response_text[:500]}...")
        if last_error:
            print(f"Last recorded error: {last_error}")
        print("You can inspect the intermediate 'responses' list if needed.")
    except Exception as e:
        print_section("AN UNEXPECTED ERROR OCCURRED")
        print(f"Error at row index: {last_processed_row_idx}")
        print(f"Error details: {e}")
        if last_response_text:
            print(f"Last API response (partial): {last_response_text[:500]}...")


if __name__ == "__main__":
    main()


BEAD TRADE CODING - COMPLETE 13 FIELDS VERSION
Input file: Munashe_Cleaned.xlsx
Rows to test: 1453
Model: claude-haiku-4-5-20251001
Fields: ALL 13 from codebook

Loading data file...
Loaded 1453 rows, 23 columns

PROCESSING
Starting processing on 1453 rows...
Progress updates every 100 rows

Row 100/1453 | Success: 0 | Errors: 100 | Cost: $0.22 | Rate: 8.1 rows/min

PROCESS INTERRUPTED
Processing stopped at row index: 102
Last API response (partial): ```json
{
  "1_price_HUMAN": {
    "status": "yes",
    "amount": 10,
    "currency": "large blue beads",
    "description": "We would buy a somb for ten large blue beads their local value corresponds roughly to a calabash containing one or two liters of rice."
  },
  "2_size_HUMAN": {
    "code": 1,
    "description": "large blue beads"
  },
  "3_colour_HUMAN": {
    "codes": [2, 2],
    "description": "ultramarine beads and cerulean blue beads; blue beads; The Gouros are sensitive to color and...
Last recorded error: JSON parse error: E

In [None]:
ls -la ./test_output/


total 1456
drwxr-xr-x 2 root root    4096 Oct 16 13:24 [0m[01;34m.[0m/
drwxr-xr-x 1 root root    4096 Oct 16 12:13 [01;34m..[0m/
-rw-r--r-- 1 root root 1476864 Oct 16 14:58 test_output_1453_rows.xlsx
-rw-r--r-- 1 root root     381 Oct 16 14:58 test_report.txt


In [None]:
# Check the test report
cat ./test_output/test_report.txt

# Check what's in the Excel file (columns and sample data)
python3 << 'EOF'
import pandas as pd

df = pd.read_excel('./test_output/test_output_1453_rows.xlsx')

print("="*80)
print("FILE INFO:")
print("="*80)
print(f"Rows: {len(df)}")
print(f"Columns: {len(df.columns)}")
print()

print("="*80)
print("COLUMN NAMES:")
print("="*80)
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")

print()
print("="*80)
print("SAMPLE OF CODING COLUMNS (First 3 rows):")
print("="*80)

# Check for error columns
error_cols = [col for col in df.columns if 'error' in col.lower()]
if error_cols:
    print(f"\nERROR COLUMNS FOUND: {error_cols}")
    for col in error_cols:
        print(f"\n{col}:")
        print(df[col].head(3))

# Check for coding columns
coding_cols = [col for col in df.columns if any(x in col for x in ['price', 'size', 'colour', 'shape', 'type'])]
if coding_cols:
    print(f"\nCODING COLUMNS FOUND: {coding_cols[:5]}")
    for col in coding_cols[:3]:
        print(f"\n{col}:")
        print(df[col].head(3))

EOF

SyntaxError: invalid syntax (ipython-input-144328232.py, line 2)

In [None]:
"""
Bead Trade Coding - COMPLETE & FIXED Version
=============================================

All 13 fields from codebook with codes + descriptions
Bug fixes applied for JSON parsing
Ready for production use

Features:
- All 13 fields coded (1_price through 13_nature_of_exchange)
- Dual structure: codes for analysis + descriptions for research
- Robust JSON parsing (handles markdown)
- Comprehensive error handling
- Progress tracking and cost reporting

Author: Claude
Date: October 2025
"""

import subprocess
import sys
import os
import json
import time
from datetime import datetime

# === INSTALL DEPENDENCIES ===
print("Checking dependencies...")
for pkg in ["anthropic", "openpyxl", "pandas", "tenacity"]:
    try:
        __import__(pkg if pkg != "openpyxl" else "openpyxl")
    except ImportError:
        print(f"Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

import pandas as pd
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_exponential

# === API KEY SETUP ===
API_KEY_DIRECT = ""  # Option 1: Paste your key here: "sk-ant-api03-..."

ANTHROPIC_API_KEY = API_KEY_DIRECT or os.getenv("ANTHROPIC_API_KEY", "")

if not ANTHROPIC_API_KEY:
    print("\n" + "="*80)
    print("API KEY REQUIRED")
    print("="*80)
    print("\nGet your key at: https://console.anthropic.com")
    user_input = input("\nEnter your API key (or press Enter to exit): ").strip()
    if user_input:
        ANTHROPIC_API_KEY = user_input
        print("✓ API key accepted")
    else:
        print("\nExiting. Please set your API key and try again.")
        sys.exit(1)

# === CONFIGURATION ===
INPUT_FILE = "Munashe_Cleaned.xlsx"
OUTPUT_DIR = "./bead_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "claude-haiku-4-5-20251001"  # Latest Haiku 4.5
TEXT_COLUMN = "text_page_gp"
MAX_ROWS = 1453  # Change to 10 for testing

# === COMPREHENSIVE SYSTEM PROMPT ===
SYSTEM_PROMPT = """You are a historian analyzing pre-colonial African bead trade records.

TASK: Extract ALL 13 structured data fields. Be conservative - require explicit evidence.

RESPONSE FORMAT: Return ONLY a JSON object. NO markdown, NO ```json``` tags, NO extra text.

JSON STRUCTURE:
{
  "1_price_HUMAN": {
    "status": "yes|no|xo",
    "amount": "number or measurement, or null",
    "currency": "currency/commodity, or null",
    "description": "full price text from source"
  },
  "2_size_HUMAN": {
    "code": 1-6 or null,
    "description": "exact size text from source"
  },
  "3_colour_HUMAN": {
    "codes": [array of 1-14],
    "description": "exact color text, REQUIRED if code=14"
  },
  "4_location_HUMAN": {
    "codes": [array of 1-4],
    "names": "actual location names"
  },
  "5_function_HUMAN": {
    "codes": [array of 1-4],
    "description": "detailed function text"
  },
  "6_origin_of_bead": "geographic origin text or null",
  "7_shape_HUMAN": {
    "codes": [array of 1-12],
    "description": "exact shape text"
  },
  "8_type_bead_HUMAN": {
    "codes": [array of 1-14],
    "description": "exact material text"
  },
  "9_local_name_HUMAN": {
    "exists": "1|2",
    "names": ["array of names"] or null
  },
  "10_relationship_": {
    "codes": [array of 1-31],
    "description": "detailed exchange items text"
  },
  "11_units_of_measure": {
    "type": 1-4 or null,
    "description": "exact measurement text"
  },
  "12_bead_ethnic_": ["array of ethnic group names"] or null,
  "13_nature_of_exchange": {
    "code": 1-6 or null,
    "description": "exchange nature text"
  },
  "notes": "additional research context"
}

FIELD CODES:

1_price_HUMAN: status: yes=mentioned, no=not mentioned, xo=exchanged

2_size_HUMAN: 1=large, 2=medium, 3=small, 4=various, 5=thin, 6=thick

3_colour_HUMAN: 1=red, 2=blue, 3=white, 4=pink, 5=coral, 6=amber, 7=copper, 8=green, 9=yellow, 10=transparent, 11=seed glass, 12=black, 13=multicoloured, 14=other

4_location_HUMAN: 1=mountain/hill, 2=lake, 3=river/waterfall, 4=populated place

5_function_HUMAN: 1=jewellery/adornment, 2=currency/exchange, 3=ceremonial/religious, 4=status/gift

6_origin_of_bead: Text describing geographic origin

7_shape_HUMAN: 1=round, 2=tubular, 3=square, 4=oval, 5=oblong, 6=punched, 7=wound, 8=pressed, 9=decorative, 10=faceted, 11=bugle, 12=chevron

8_type_bead_HUMAN: 1=glass, 2=clay, 3=metal, 4=stone, 5=coral, 6=amber, 7=bone, 8=ivory, 9=dried seed, 10=ceramic, 11=wooden, 12=porcelain, 13=shell, 14=eggshell

9_local_name_HUMAN: exists: 1=yes (provide names), 2=unspecified

10_relationship_: 1=wire, 2=cloth, 3=shells, 4=coins, 5=livestock, 6=iron bars, 7=scarabs, 8=precious stones, 9=antiquities, 10=ostrich feathers, 11=ebony/ivory, 12=salt, 13=rubber/gum, 14=medicines, 15=spices/perfumes, 16=wax/seals, 17=leather/hides, 18=weapons, 19=dried food, 20=prints/books, 21=guns/gunpowder, 22=jewellery, 23=textiles, 24=gold/silver, 25=slaves, 26=glass objects, 27=hardware, 28=tobacco, 29=musical instruments, 30=water, 31=alcohol

11_units_of_measure: 1=string, 2=plaited/woven string, 3=necklace/bracelet/waist beads, 4=other

12_bead_ethnic_: Array of ethnic group names

13_nature_of_exchange: 1=consensual, 2=conflictual, 3=unspecified, 4=competitive/bartering, 5=social/gifts, 6=uncommercial

CRITICAL RULES:
1. Return ONLY the JSON object - NO ```json``` tags, NO markdown, NO extra text
2. Use null for missing data (not "unknown")
3. For arrays: [] if no data, null if not applicable
4. ALWAYS include description fields with verbatim text
5. When code=14 (other) or unusual items, description is MANDATORY
6. Base answers ONLY on provided text
7. Preserve exact terminology and details
"""

USER_PROMPT_TEMPLATE = """Analyze this historical text and extract ALL 13 fields:

TEXT:
{text}

Return ONLY the JSON object. No ```json``` tags, no other text."""

# === UTILITY FUNCTIONS ===

def strip_markdown_json(text):
    """Remove markdown code blocks from JSON response."""
    text = text.strip()

    # Remove ```json ... ``` or ``` ... ```
    if text.startswith('```'):
        lines = text.split('\n')
        # Remove first line (```json or ```)
        if lines[0].strip().startswith('```'):
            lines = lines[1:]
        # Remove last line (```)
        if lines and lines[-1].strip() == '```':
            lines = lines[:-1]
        text = '\n'.join(lines).strip()

    return text

def validate_json_response(json_obj):
    """Validate all required fields exist."""
    required = [
        '1_price_HUMAN', '2_size_HUMAN', '3_colour_HUMAN', '4_location_HUMAN',
        '5_function_HUMAN', '6_origin_of_bead', '7_shape_HUMAN', '8_type_bead_HUMAN',
        '9_local_name_HUMAN', '10_relationship_', '11_units_of_measure',
        '12_bead_ethnic_', '13_nature_of_exchange'
    ]

    missing = [f for f in required if f not in json_obj]
    if missing:
        return False, f"Missing fields: {', '.join(missing[:3])}"

    return True, None

def flatten_for_excel(json_obj):
    """Flatten nested JSON to Excel-friendly format."""
    flat = {}

    # 1_price_HUMAN (4 columns)
    price = json_obj.get('1_price_HUMAN', {})
    if isinstance(price, dict):
        flat['1_price_status'] = price.get('status')
        flat['1_price_amount'] = price.get('amount')
        flat['1_price_currency'] = price.get('currency')
        flat['1_price_description'] = price.get('description')
    else:
        flat['1_price_status'] = flat['1_price_amount'] = flat['1_price_currency'] = flat['1_price_description'] = None

    # 2_size_HUMAN (2 columns)
    size = json_obj.get('2_size_HUMAN', {})
    if isinstance(size, dict):
        flat['2_size_code'] = size.get('code')
        flat['2_size_description'] = size.get('description')
    else:
        flat['2_size_code'] = size
        flat['2_size_description'] = None

    # 3_colour_HUMAN (2 columns)
    color = json_obj.get('3_colour_HUMAN', {})
    if isinstance(color, dict):
        codes = color.get('codes', [])
        flat['3_colour_codes'] = ','.join(map(str, codes)) if codes else None
        flat['3_colour_description'] = color.get('description')
    else:
        flat['3_colour_codes'] = ','.join(map(str, color)) if color else None
        flat['3_colour_description'] = None

    # 4_location_HUMAN (2 columns)
    location = json_obj.get('4_location_HUMAN', {})
    if isinstance(location, dict):
        codes = location.get('codes', [])
        flat['4_location_codes'] = ','.join(map(str, codes)) if codes else None
        flat['4_location_names'] = location.get('names')
    else:
        flat['4_location_codes'] = ','.join(map(str, location)) if location else None
        flat['4_location_names'] = None

    # 5_function_HUMAN (2 columns)
    function = json_obj.get('5_function_HUMAN', {})
    if isinstance(function, dict):
        codes = function.get('codes', [])
        flat['5_function_codes'] = ','.join(map(str, codes)) if codes else None
        flat['5_function_description'] = function.get('description')
    else:
        flat['5_function_codes'] = ','.join(map(str, function)) if function else None
        flat['5_function_description'] = None

    # 6_origin_of_bead (1 column)
    flat['6_origin_of_bead'] = json_obj.get('6_origin_of_bead')

    # 7_shape_HUMAN (2 columns)
    shape = json_obj.get('7_shape_HUMAN', {})
    if isinstance(shape, dict):
        codes = shape.get('codes', [])
        flat['7_shape_codes'] = ','.join(map(str, codes)) if codes else None
        flat['7_shape_description'] = shape.get('description')
    else:
        flat['7_shape_codes'] = ','.join(map(str, shape)) if shape else None
        flat['7_shape_description'] = None

    # 8_type_bead_HUMAN (2 columns)
    bead_type = json_obj.get('8_type_bead_HUMAN', {})
    if isinstance(bead_type, dict):
        codes = bead_type.get('codes', [])
        flat['8_type_codes'] = ','.join(map(str, codes)) if codes else None
        flat['8_type_description'] = bead_type.get('description')
    else:
        flat['8_type_codes'] = ','.join(map(str, bead_type)) if bead_type else None
        flat['8_type_description'] = None

    # 9_local_name_HUMAN (2 columns)
    local = json_obj.get('9_local_name_HUMAN', {})
    if isinstance(local, dict):
        flat['9_local_name_exists'] = local.get('exists')
        names = local.get('names', [])
        flat['9_local_name_names'] = '; '.join(names) if names else None
    else:
        flat['9_local_name_exists'] = None
        flat['9_local_name_names'] = None

    # 10_relationship_ (2 columns)
    rel = json_obj.get('10_relationship_', {})
    if isinstance(rel, dict):
        codes = rel.get('codes', [])
        flat['10_relationship_codes'] = ','.join(map(str, codes)) if codes else None
        flat['10_relationship_description'] = rel.get('description')
    else:
        flat['10_relationship_codes'] = ','.join(map(str, rel)) if rel else None
        flat['10_relationship_description'] = None

    # 11_units_of_measure (2 columns)
    units = json_obj.get('11_units_of_measure', {})
    if isinstance(units, dict):
        flat['11_units_type'] = units.get('type')
        flat['11_units_description'] = units.get('description')
    else:
        flat['11_units_type'] = None
        flat['11_units_description'] = None

    # 12_bead_ethnic_ (1 column)
    ethnics = json_obj.get('12_bead_ethnic_', [])
    flat['12_bead_ethnic_'] = '; '.join(ethnics) if ethnics else None

    # 13_nature_of_exchange (2 columns)
    nature = json_obj.get('13_nature_of_exchange', {})
    if isinstance(nature, dict):
        flat['13_nature_code'] = nature.get('code')
        flat['13_nature_description'] = nature.get('description')
    else:
        flat['13_nature_code'] = nature
        flat['13_nature_description'] = None

    # Notes (1 column)
    flat['notes'] = json_obj.get('notes')

    return flat

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    reraise=True
)
def call_claude(client, entry_text):
    """Call Claude API with retry logic."""
    response = client.messages.create(
        model=MODEL_NAME,
        max_tokens=2500,
        temperature=0,
        system=SYSTEM_PROMPT,
        messages=[{
            "role": "user",
            "content": USER_PROMPT_TEMPLATE.format(text=entry_text)
        }]
    )
    return response

def calculate_cost(input_tokens, output_tokens):
    """Calculate API cost for Haiku 4.5."""
    # Haiku 4.5 pricing (estimate - verify actual pricing)
    input_cost = (input_tokens / 1_000_000) * 0.80
    output_cost = (output_tokens / 1_000_000) * 0.24
    return input_cost + output_cost

def print_header(title):
    """Print formatted section header."""
    print(f"\n{'='*80}")
    print(f"{title}")
    print(f"{'='*80}")

# === MAIN PROCESSING ===

def main():
    print_header("BEAD TRADE CODING - COMPLETE VERSION")
    print(f"Model: {MODEL_NAME}")
    print(f"Input: {INPUT_FILE}")
    print(f"Rows: {MAX_ROWS}")
    print(f"All 13 fields with codes + descriptions")

    # Verify API key
    if not ANTHROPIC_API_KEY.startswith("sk-ant-"):
        print(f"\n⚠️  Warning: API key format looks unusual")
        print(f"   Should start with: sk-ant-")
        confirm = input("Continue anyway? (y/n): ").strip().lower()
        if confirm != 'y':
            return

    # Load data
    print(f"\nLoading {INPUT_FILE}...")
    try:
        df = pd.read_excel(INPUT_FILE)
        print(f"✓ Loaded {len(df)} rows, {len(df.columns)} columns")
    except Exception as e:
        print(f"✗ Error loading file: {e}")
        return

    if TEXT_COLUMN not in df.columns:
        print(f"✗ Column '{TEXT_COLUMN}' not found")
        print(f"Available columns: {list(df.columns)}")
        return

    # Initialize
    client = Anthropic(api_key=ANTHROPIC_API_KEY)
    responses = []
    total_input_tokens = 0
    total_output_tokens = 0
    start_time = time.time()
    success = 0
    errors = 0
    skipped = 0

    print_header("PROCESSING")
    print(f"Starting... Progress every 100 rows\n")

    # Process rows
    for idx in range(MAX_ROWS):
        try:
            row = df.iloc[idx]
            text = row.get(TEXT_COLUMN)

            # Skip empty
            if pd.isna(text) or not str(text).strip():
                responses.append(None)
                skipped += 1
                continue

            text = str(text).strip()

            # Call Claude
            response = call_claude(client, text)
            total_input_tokens += response.usage.input_tokens
            total_output_tokens += response.usage.output_tokens
            response_text = response.content[0].text

            # Strip markdown and parse
            cleaned = strip_markdown_json(response_text)

            if not cleaned:
                responses.append({"error": "Empty response"})
                errors += 1
                continue

            try:
                parsed = json.loads(cleaned)

                # Validate
                valid, error = validate_json_response(parsed)
                if valid:
                    flat = flatten_for_excel(parsed)
                    responses.append(flat)
                    success += 1
                else:
                    responses.append({"error": error})
                    errors += 1

            except json.JSONDecodeError as e:
                responses.append({"error": f"JSON error: {str(e)[:40]}"})
                errors += 1

                # Debug first 3 errors
                if errors <= 3:
                    print(f"\n⚠️  Parse error at row {idx}:")
                    print(f"   First 150 chars: {cleaned[:150]}")
                    print(f"   Error: {str(e)}\n")

            # Progress
            if (idx + 1) % 100 == 0:
                elapsed = (time.time() - start_time) / 60
                cost = calculate_cost(total_input_tokens, total_output_tokens)
                rate = (idx + 1) / elapsed if elapsed > 0 else 0

                print(f"Row {idx+1}/{MAX_ROWS} | Success: {success} | Errors: {errors} | "
                      f"Cost: ${cost:.2f} | Rate: {rate:.1f}/min")

        except Exception as e:
            responses.append({"error": str(e)[:60]})
            errors += 1

    # === RESULTS ===

    elapsed = (time.time() - start_time) / 60
    final_cost = calculate_cost(total_input_tokens, total_output_tokens)

    print_header("RESULTS")

    print(f"\nProcessing Summary:")
    print(f"  Total rows: {MAX_ROWS}")
    print(f"  Successfully coded: {success}")
    print(f"  Errors: {errors}")
    print(f"  Skipped (empty): {skipped}")
    print(f"  Success rate: {(success/(MAX_ROWS-skipped)*100):.1f}%")

    print(f"\nToken Usage:")
    print(f"  Input: {total_input_tokens:,}")
    print(f"  Output: {total_output_tokens:,}")
    print(f"  Total: {total_input_tokens + total_output_tokens:,}")
    print(f"  Avg per row: {(total_input_tokens + total_output_tokens)/(MAX_ROWS-skipped):.0f}")

    print(f"\nCost:")
    print(f"  Total: ${final_cost:.2f}")
    print(f"  Per row: ${final_cost/(MAX_ROWS-skipped):.4f}")

    print(f"\nTime:")
    print(f"  Duration: {elapsed:.1f} minutes")
    print(f"  Rate: {(MAX_ROWS-skipped)/elapsed:.1f} rows/min")

    # === SAVE OUTPUT ===

    print_header("SAVING OUTPUT")

    # Create output dataframe
    output_df = df.iloc[:len(responses)].copy()

    # Add coding columns
    for i, resp in enumerate(responses):
        if resp and isinstance(resp, dict) and "error" not in resp:
            for key, value in resp.items():
                if key not in output_df.columns:
                    output_df[key] = None
                output_df.at[i, key] = value

    # Save
    output_file = os.path.join(OUTPUT_DIR, "bead_coded_complete.xlsx")
    output_df.to_excel(output_file, index=False)

    print(f"\n✓ Output saved: {output_file}")
    print(f"  Rows: {len(output_df)}")
    print(f"  Original columns: {len(df.columns)}")
    print(f"  New columns: {len(output_df.columns) - len(df.columns)}")
    print(f"  Total columns: {len(output_df.columns)}")

    # Show field coverage
    print(f"\nField Coverage (27 columns):")
    coding_cols = [c for c in output_df.columns if c not in df.columns and c != 'error']
    for col in sorted(coding_cols):
        count = output_df[col].notna().sum()
        pct = (count / success * 100) if success > 0 else 0
        print(f"  {col:35s} | {count:4d} ({pct:5.1f}%)")

    # Save report
    report_file = os.path.join(OUTPUT_DIR, "coding_report.txt")
    with open(report_file, 'w') as f:
        f.write(f"Bead Trade Coding Report\n")
        f.write(f"{'='*60}\n\n")
        f.write(f"Date: {datetime.now().isoformat()}\n")
        f.write(f"Model: {MODEL_NAME}\n")
        f.write(f"Rows processed: {MAX_ROWS}\n")
        f.write(f"Success: {success} ({(success/(MAX_ROWS-skipped)*100):.1f}%)\n")
        f.write(f"Errors: {errors}\n")
        f.write(f"Cost: ${final_cost:.2f}\n")
        f.write(f"Time: {elapsed:.1f} minutes\n")

    print(f"\n✓ Report saved: {report_file}")

    print_header("COMPLETE")

    if success / (MAX_ROWS - skipped) > 0.85:
        print("✓ High success rate! Data ready for analysis.")
    else:
        print("⚠️  Lower success rate. Review errors in output file.")

    print(f"\nNext steps:")
    print(f"  1. Open: {output_file}")
    print(f"  2. Verify: Check sample rows")
    print(f"  3. Analyze: Use codes for stats, descriptions for context")
    print()

if __name__ == "__main__":
    main()

Checking dependencies...

BEAD TRADE CODING - COMPLETE VERSION
Model: claude-haiku-4-5-20251001
Input: Munashe_Cleaned.xlsx
Rows: 1453
All 13 fields with codes + descriptions

Loading Munashe_Cleaned.xlsx...
✓ Loaded 1453 rows, 23 columns

PROCESSING
Starting... Progress every 100 rows


⚠️  Parse error at row 38:
   First 150 chars: {
  "1_price_HUMAN": {
    "status": "no",
    "amount": null,
    "currency": null,
    "description": null
  },
  "2_size_HUMAN": {
    "code": null
   Error: Extra data: line 39 column 4 (char 1246)


⚠️  Parse error at row 58:
   First 150 chars: {
  "1_price_HUMAN": {
    "status": "yes",
    "amount": "6",
    "currency": "beads of cowries",
    "description": "Turkeys are rare, and worth six
   Error: Extra data: line 39 column 4 (char 1135)

Row 100/1453 | Success: 98 | Errors: 2 | Cost: $0.17 | Rate: 11.5/min


In [None]:
"""
Bead Trade Coding - COMPLETE & FIXED Version
=============================================

All 13 fields from codebook with codes + descriptions
Bug fixes applied for JSON parsing
Ready for production use

Features:
- All 13 fields coded (1_price through 13_nature_of_exchange)
- Dual structure: codes for analysis + descriptions for research
- Robust JSON parsing (handles markdown)
- Comprehensive error handling
- Progress tracking and cost reporting

Author: Claude
Date: October 2025
"""

import subprocess
import sys
import os
import json
import time
from datetime import datetime

# === INSTALL DEPENDENCIES ===
print("Checking dependencies...")
for pkg in ["anthropic", "openpyxl", "pandas", "tenacity"]:
    try:
        __import__(pkg if pkg != "openpyxl" else "openpyxl")
    except ImportError:
        print(f"Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

import pandas as pd
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_exponential

# === API KEY SETUP ===
API_KEY_DIRECT = ""  # Option 1: Paste your key here: "sk-ant-api03-..."

ANTHROPIC_API_KEY = API_KEY_DIRECT or os.getenv("ANTHROPIC_API_KEY", "")

if not ANTHROPIC_API_KEY:
    print("\n" + "="*80)
    print("API KEY REQUIRED")
    print("="*80)
    print("\nGet your key at: https://console.anthropic.com")
    user_input = input("\nEnter your API key (or press Enter to exit): ").strip()
    if user_input:
        ANTHROPIC_API_KEY = user_input
        print("✓ API key accepted")
    else:
        print("\nExiting. Please set your API key and try again.")
        sys.exit(1)

# === CONFIGURATION ===
INPUT_FILE = "Munashe_Cleaned.xlsx"
OUTPUT_DIR = "./bead_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "claude-haiku-4-5-20251001"  # Latest Haiku 4.5
TEXT_COLUMN = "text_page_gp"
MAX_ROWS = 1453  # Change to 10 for testing

# === COMPREHENSIVE SYSTEM PROMPT ===
SYSTEM_PROMPT = """You are a historian analyzing pre-colonial African bead trade records.

TASK: Extract ALL 13 structured data fields. Be conservative - require explicit evidence.

RESPONSE FORMAT: Return ONLY a JSON object. NO markdown, NO ```json``` tags, NO extra text.

JSON STRUCTURE:
{
  "1_price_HUMAN": {
    "status": "yes|no|xo",
    "amount": "number or measurement, or null",
    "currency": "currency/commodity, or null",
    "description": "full price text from source"
  },
  "2_size_HUMAN": {
    "code": 1-6 or null,
    "description": "exact size text from source"
  },
  "3_colour_HUMAN": {
    "codes": [array of 1-14],
    "description": "exact color text, REQUIRED if code=14"
  },
  "4_location_HUMAN": {
    "codes": [array of 1-4],
    "names": "actual location names"
  },
  "5_function_HUMAN": {
    "codes": [array of 1-4],
    "description": "detailed function text"
  },
  "6_origin_of_bead": "geographic origin text or null",
  "7_shape_HUMAN": {
    "codes": [array of 1-12],
    "description": "exact shape text"
  },
  "8_type_bead_HUMAN": {
    "codes": [array of 1-14],
    "description": "exact material text"
  },
  "9_local_name_HUMAN": {
    "exists": "1|2",
    "names": ["array of names"] or null
  },
  "10_relationship_": {
    "codes": [array of 1-31],
    "description": "detailed exchange items text"
  },
  "11_units_of_measure": {
    "type": 1-4 or null,
    "description": "exact measurement text"
  },
  "12_bead_ethnic_": ["array of ethnic group names"] or null,
  "13_nature_of_exchange": {
    "code": 1-6 or null,
    "description": "exchange nature text"
  },
  "notes": "additional research context"
}

FIELD CODES:

1_price_HUMAN: status: yes=mentioned, no=not mentioned, xo=exchanged

2_size_HUMAN: 1=large, 2=medium, 3=small, 4=various, 5=thin, 6=thick

3_colour_HUMAN: 1=red, 2=blue, 3=white, 4=pink, 5=coral, 6=amber, 7=copper, 8=green, 9=yellow, 10=transparent, 11=seed glass, 12=black, 13=multicoloured, 14=other

4_location_HUMAN: 1=mountain/hill, 2=lake, 3=river/waterfall, 4=populated place

5_function_HUMAN: 1=jewellery/adornment, 2=currency/exchange, 3=ceremonial/religious, 4=status/gift

6_origin_of_bead: Text describing geographic origin

7_shape_HUMAN: 1=round, 2=tubular, 3=square, 4=oval, 5=oblong, 6=punched, 7=wound, 8=pressed, 9=decorative, 10=faceted, 11=bugle, 12=chevron

8_type_bead_HUMAN: 1=glass, 2=clay, 3=metal, 4=stone, 5=coral, 6=amber, 7=bone, 8=ivory, 9=dried seed, 10=ceramic, 11=wooden, 12=porcelain, 13=shell, 14=eggshell

9_local_name_HUMAN: exists: 1=yes (provide names), 2=unspecified

10_relationship_: 1=wire, 2=cloth, 3=shells, 4=coins, 5=livestock, 6=iron bars, 7=scarabs, 8=precious stones, 9=antiquities, 10=ostrich feathers, 11=ebony/ivory, 12=salt, 13=rubber/gum, 14=medicines, 15=spices/perfumes, 16=wax/seals, 17=leather/hides, 18=weapons, 19=dried food, 20=prints/books, 21=guns/gunpowder, 22=jewellery, 23=textiles, 24=gold/silver, 25=slaves, 26=glass objects, 27=hardware, 28=tobacco, 29=musical instruments, 30=water, 31=alcohol

11_units_of_measure: 1=string, 2=plaited/woven string, 3=necklace/bracelet/waist beads, 4=other

12_bead_ethnic_: Array of ethnic group names

13_nature_of_exchange: 1=consensual, 2=conflictual, 3=unspecified, 4=competitive/bartering, 5=social/gifts, 6=uncommercial

CRITICAL RULES:
1. Return ONLY the JSON object - NO ```json``` tags, NO markdown, NO extra text
2. Use null for missing data (not "unknown")
3. For arrays: [] if no data, null if not applicable
4. ALWAYS include description fields with verbatim text
5. When code=14 (other) or unusual items, description is MANDATORY
6. Base answers ONLY on provided text
7. Preserve exact terminology and details
"""

USER_PROMPT_TEMPLATE = """Analyze this historical text and extract ALL 13 fields:

TEXT:
{text}

Return ONLY the JSON object. No ```json``` tags, no other text."""

# === UTILITY FUNCTIONS ===

def strip_markdown_json(text):
    """Remove markdown code blocks and extract only the first complete JSON object."""
    text = text.strip()

    # Remove ```json ... ``` or ``` ... ```
    if text.startswith('```'):
        lines = text.split('\n')
        # Remove first line (```json or ```)
        if lines[0].strip().startswith('```'):
            lines = lines[1:]
        # Remove last line (```)
        if lines and lines[-1].strip() == '```':
            lines = lines[:-1]
        text = '\n'.join(lines).strip()

    # Extract only the first complete JSON object
    # Find the first { and its matching }
    if not text.startswith('{'):
        # Try to find where JSON starts
        start = text.find('{')
        if start == -1:
            return text  # No JSON found
        text = text[start:]

    # Find the matching closing brace
    brace_count = 0
    in_string = False
    escape_next = False

    for i, char in enumerate(text):
        if escape_next:
            escape_next = False
            continue

        if char == '\\':
            escape_next = True
            continue

        if char == '"':
            in_string = not in_string
            continue

        if not in_string:
            if char == '{':
                brace_count += 1
            elif char == '}':
                brace_count -= 1
                if brace_count == 0:
                    # Found the matching closing brace
                    return text[:i+1]

    return text

def validate_json_response(json_obj):
    """Validate all required fields exist."""
    required = [
        '1_price_HUMAN', '2_size_HUMAN', '3_colour_HUMAN', '4_location_HUMAN',
        '5_function_HUMAN', '6_origin_of_bead', '7_shape_HUMAN', '8_type_bead_HUMAN',
        '9_local_name_HUMAN', '10_relationship_', '11_units_of_measure',
        '12_bead_ethnic_', '13_nature_of_exchange'
    ]

    missing = [f for f in required if f not in json_obj]
    if missing:
        return False, f"Missing fields: {', '.join(missing[:3])}"

    return True, None

def flatten_for_excel(json_obj):
    """Flatten nested JSON to Excel-friendly format."""
    flat = {}

    # 1_price_HUMAN (4 columns)
    price = json_obj.get('1_price_HUMAN', {})
    if isinstance(price, dict):
        flat['1_price_status'] = price.get('status')
        flat['1_price_amount'] = price.get('amount')
        flat['1_price_currency'] = price.get('currency')
        flat['1_price_description'] = price.get('description')
    else:
        flat['1_price_status'] = flat['1_price_amount'] = flat['1_price_currency'] = flat['1_price_description'] = None

    # 2_size_HUMAN (2 columns)
    size = json_obj.get('2_size_HUMAN', {})
    if isinstance(size, dict):
        flat['2_size_code'] = size.get('code')
        flat['2_size_description'] = size.get('description')
    else:
        flat['2_size_code'] = size
        flat['2_size_description'] = None

    # 3_colour_HUMAN (2 columns)
    color = json_obj.get('3_colour_HUMAN', {})
    if isinstance(color, dict):
        codes = color.get('codes', [])
        flat['3_colour_codes'] = ','.join(map(str, codes)) if codes else None
        flat['3_colour_description'] = color.get('description')
    else:
        flat['3_colour_codes'] = ','.join(map(str, color)) if color else None
        flat['3_colour_description'] = None

    # 4_location_HUMAN (2 columns)
    location = json_obj.get('4_location_HUMAN', {})
    if isinstance(location, dict):
        codes = location.get('codes', [])
        flat['4_location_codes'] = ','.join(map(str, codes)) if codes else None
        flat['4_location_names'] = location.get('names')
    else:
        flat['4_location_codes'] = ','.join(map(str, location)) if location else None
        flat['4_location_names'] = None

    # 5_function_HUMAN (2 columns)
    function = json_obj.get('5_function_HUMAN', {})
    if isinstance(function, dict):
        codes = function.get('codes', [])
        flat['5_function_codes'] = ','.join(map(str, codes)) if codes else None
        flat['5_function_description'] = function.get('description')
    else:
        flat['5_function_codes'] = ','.join(map(str, function)) if function else None
        flat['5_function_description'] = None

    # 6_origin_of_bead (1 column)
    flat['6_origin_of_bead'] = json_obj.get('6_origin_of_bead')

    # 7_shape_HUMAN (2 columns)
    shape = json_obj.get('7_shape_HUMAN', {})
    if isinstance(shape, dict):
        codes = shape.get('codes', [])
        flat['7_shape_codes'] = ','.join(map(str, codes)) if codes else None
        flat['7_shape_description'] = shape.get('description')
    else:
        flat['7_shape_codes'] = ','.join(map(str, shape)) if shape else None
        flat['7_shape_description'] = None

    # 8_type_bead_HUMAN (2 columns)
    bead_type = json_obj.get('8_type_bead_HUMAN', {})
    if isinstance(bead_type, dict):
        codes = bead_type.get('codes', [])
        flat['8_type_codes'] = ','.join(map(str, codes)) if codes else None
        flat['8_type_description'] = bead_type.get('description')
    else:
        flat['8_type_codes'] = ','.join(map(str, bead_type)) if bead_type else None
        flat['8_type_description'] = None

    # 9_local_name_HUMAN (2 columns)
    local = json_obj.get('9_local_name_HUMAN', {})
    if isinstance(local, dict):
        flat['9_local_name_exists'] = local.get('exists')
        names = local.get('names', [])
        flat['9_local_name_names'] = '; '.join(names) if names else None
    else:
        flat['9_local_name_exists'] = None
        flat['9_local_name_names'] = None

    # 10_relationship_ (2 columns)
    rel = json_obj.get('10_relationship_', {})
    if isinstance(rel, dict):
        codes = rel.get('codes', [])
        flat['10_relationship_codes'] = ','.join(map(str, codes)) if codes else None
        flat['10_relationship_description'] = rel.get('description')
    else:
        flat['10_relationship_codes'] = ','.join(map(str, rel)) if rel else None
        flat['10_relationship_description'] = None

    # 11_units_of_measure (2 columns)
    units = json_obj.get('11_units_of_measure', {})
    if isinstance(units, dict):
        flat['11_units_type'] = units.get('type')
        flat['11_units_description'] = units.get('description')
    else:
        flat['11_units_type'] = None
        flat['11_units_description'] = None

    # 12_bead_ethnic_ (1 column)
    ethnics = json_obj.get('12_bead_ethnic_', [])
    flat['12_bead_ethnic_'] = '; '.join(ethnics) if ethnics else None

    # 13_nature_of_exchange (2 columns)
    nature = json_obj.get('13_nature_of_exchange', {})
    if isinstance(nature, dict):
        flat['13_nature_code'] = nature.get('code')
        flat['13_nature_description'] = nature.get('description')
    else:
        flat['13_nature_code'] = nature
        flat['13_nature_description'] = None

    # Notes (1 column)
    flat['notes'] = json_obj.get('notes')

    return flat

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    reraise=True
)
def call_claude(client, entry_text):
    """Call Claude API with retry logic."""
    response = client.messages.create(
        model=MODEL_NAME,
        max_tokens=2500,
        temperature=0,
        system=SYSTEM_PROMPT,
        messages=[{
            "role": "user",
            "content": USER_PROMPT_TEMPLATE.format(text=entry_text)
        }]
    )
    return response

def calculate_cost(input_tokens, output_tokens):
    """Calculate API cost for Haiku 4.5."""
    # Haiku 4.5 pricing (estimate - verify actual pricing)
    input_cost = (input_tokens / 1_000_000) * 0.80
    output_cost = (output_tokens / 1_000_000) * 0.24
    return input_cost + output_cost

def print_header(title):
    """Print formatted section header."""
    print(f"\n{'='*80}")
    print(f"{title}")
    print(f"{'='*80}")

# === MAIN PROCESSING ===

def main():
    print_header("BEAD TRADE CODING - COMPLETE VERSION")
    print(f"Model: {MODEL_NAME}")
    print(f"Input: {INPUT_FILE}")
    print(f"Rows: {MAX_ROWS}")
    print(f"All 13 fields with codes + descriptions")

    # Verify API key
    if not ANTHROPIC_API_KEY.startswith("sk-ant-"):
        print(f"\n⚠️  Warning: API key format looks unusual")
        print(f"   Should start with: sk-ant-")
        confirm = input("Continue anyway? (y/n): ").strip().lower()
        if confirm != 'y':
            return

    # Load data
    print(f"\nLoading {INPUT_FILE}...")
    try:
        df = pd.read_excel(INPUT_FILE)
        print(f"✓ Loaded {len(df)} rows, {len(df.columns)} columns")
    except Exception as e:
        print(f"✗ Error loading file: {e}")
        return

    if TEXT_COLUMN not in df.columns:
        print(f"✗ Column '{TEXT_COLUMN}' not found")
        print(f"Available columns: {list(df.columns)}")
        return

    # Initialize
    client = Anthropic(api_key=ANTHROPIC_API_KEY)
    responses = []
    total_input_tokens = 0
    total_output_tokens = 0
    start_time = time.time()
    success = 0
    errors = 0
    skipped = 0

    print_header("PROCESSING")
    print(f"Starting... Progress every 100 rows\n")

    # Process rows
    for idx in range(MAX_ROWS):
        try:
            row = df.iloc[idx]
            text = row.get(TEXT_COLUMN)

            # Skip empty
            if pd.isna(text) or not str(text).strip():
                responses.append(None)
                skipped += 1
                continue

            text = str(text).strip()

            # Call Claude
            response = call_claude(client, text)
            total_input_tokens += response.usage.input_tokens
            total_output_tokens += response.usage.output_tokens
            response_text = response.content[0].text

            # Strip markdown and parse
            cleaned = strip_markdown_json(response_text)

            if not cleaned:
                responses.append({"error": "Empty response"})
                errors += 1
                continue

            try:
                parsed = json.loads(cleaned)

                # Validate
                valid, error = validate_json_response(parsed)
                if valid:
                    flat = flatten_for_excel(parsed)
                    responses.append(flat)
                    success += 1
                else:
                    responses.append({"error": error})
                    errors += 1

            except json.JSONDecodeError as e:
                responses.append({"error": f"JSON error: {str(e)[:40]}"})
                errors += 1

                # Debug first 3 errors
                if errors <= 3:
                    print(f"\n⚠️  Parse error at row {idx}:")
                    print(f"   First 150 chars: {cleaned[:150]}")
                    print(f"   Error: {str(e)}\n")

            # Progress
            if (idx + 1) % 100 == 0:
                elapsed = (time.time() - start_time) / 60
                cost = calculate_cost(total_input_tokens, total_output_tokens)
                rate = (idx + 1) / elapsed if elapsed > 0 else 0

                print(f"Row {idx+1}/{MAX_ROWS} | Success: {success} | Errors: {errors} | "
                      f"Cost: ${cost:.2f} | Rate: {rate:.1f}/min")

        except Exception as e:
            responses.append({"error": str(e)[:60]})
            errors += 1

    # === RESULTS ===

    elapsed = (time.time() - start_time) / 60
    final_cost = calculate_cost(total_input_tokens, total_output_tokens)

    print_header("RESULTS")

    print(f"\nProcessing Summary:")
    print(f"  Total rows: {MAX_ROWS}")
    print(f"  Successfully coded: {success}")
    print(f"  Errors: {errors}")
    print(f"  Skipped (empty): {skipped}")
    print(f"  Success rate: {(success/(MAX_ROWS-skipped)*100):.1f}%")

    print(f"\nToken Usage:")
    print(f"  Input: {total_input_tokens:,}")
    print(f"  Output: {total_output_tokens:,}")
    print(f"  Total: {total_input_tokens + total_output_tokens:,}")
    print(f"  Avg per row: {(total_input_tokens + total_output_tokens)/(MAX_ROWS-skipped):.0f}")

    print(f"\nCost:")
    print(f"  Total: ${final_cost:.2f}")
    print(f"  Per row: ${final_cost/(MAX_ROWS-skipped):.4f}")

    print(f"\nTime:")
    print(f"  Duration: {elapsed:.1f} minutes")
    print(f"  Rate: {(MAX_ROWS-skipped)/elapsed:.1f} rows/min")

    # === SAVE OUTPUT ===

    print_header("SAVING OUTPUT")

    # Create output dataframe
    output_df = df.iloc[:len(responses)].copy()

    # Add coding columns
    for i, resp in enumerate(responses):
        if resp and isinstance(resp, dict) and "error" not in resp:
            for key, value in resp.items():
                if key not in output_df.columns:
                    output_df[key] = None
                output_df.at[i, key] = value

    # Save
    output_file = os.path.join(OUTPUT_DIR, "bead_coded_complete.xlsx")
    output_df.to_excel(output_file, index=False)

    print(f"\n✓ Output saved: {output_file}")
    print(f"  Rows: {len(output_df)}")
    print(f"  Original columns: {len(df.columns)}")
    print(f"  New columns: {len(output_df.columns) - len(df.columns)}")
    print(f"  Total columns: {len(output_df.columns)}")

    # Show field coverage
    print(f"\nField Coverage (27 columns):")
    coding_cols = [c for c in output_df.columns if c not in df.columns and c != 'error']
    for col in sorted(coding_cols):
        count = output_df[col].notna().sum()
        pct = (count / success * 100) if success > 0 else 0
        print(f"  {col:35s} | {count:4d} ({pct:5.1f}%)")

    # Save report
    report_file = os.path.join(OUTPUT_DIR, "coding_report.txt")
    with open(report_file, 'w') as f:
        f.write(f"Bead Trade Coding Report\n")
        f.write(f"{'='*60}\n\n")
        f.write(f"Date: {datetime.now().isoformat()}\n")
        f.write(f"Model: {MODEL_NAME}\n")
        f.write(f"Rows processed: {MAX_ROWS}\n")
        f.write(f"Success: {success} ({(success/(MAX_ROWS-skipped)*100):.1f}%)\n")
        f.write(f"Errors: {errors}\n")
        f.write(f"Cost: ${final_cost:.2f}\n")
        f.write(f"Time: {elapsed:.1f} minutes\n")

    print(f"\n✓ Report saved: {report_file}")

    print_header("COMPLETE")

    if success / (MAX_ROWS - skipped) > 0.85:
        print("✓ High success rate! Data ready for analysis.")
    else:
        print("⚠️  Lower success rate. Review errors in output file.")

    print(f"\nNext steps:")
    print(f"  1. Open: {output_file}")
    print(f"  2. Verify: Check sample rows")
    print(f"  3. Analyze: Use codes for stats, descriptions for context")
    print()

if __name__ == "__main__":
    main()

In [1]:
"""
Bead Trade Coding - COMPLETE & FIXED Version
=============================================

All 13 fields from codebook with codes + descriptions
Bug fixes applied for JSON parsing
Ready for production use

Features:
- All 13 fields coded (1_price through 13_nature_of_exchange)
- Dual structure: codes for analysis + descriptions for research
- Robust JSON parsing (handles markdown)
- Comprehensive error handling
- Progress tracking and cost reporting

Author: Claude
Date: October 2025
"""

import subprocess
import sys
import os
import json
import time
from datetime import datetime

# === INSTALL DEPENDENCIES ===
print("Checking dependencies...")
for pkg in ["anthropic", "openpyxl", "pandas", "tenacity"]:
    try:
        __import__(pkg if pkg != "openpyxl" else "openpyxl")
    except ImportError:
        print(f"Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

import pandas as pd
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_exponential

# === API KEY SETUP ===
API_KEY_DIRECT = ""  # Option 1: Paste your key here: "sk-ant-api03-..."

ANTHROPIC_API_KEY = API_KEY_DIRECT or os.getenv("ANTHROPIC_API_KEY", "")

if not ANTHROPIC_API_KEY:
    print("\n" + "="*80)
    print("API KEY REQUIRED")
    print("="*80)
    print("\nGet your key at: https://console.anthropic.com")
    user_input = input("\nEnter your API key (or press Enter to exit): ").strip()
    if user_input:
        ANTHROPIC_API_KEY = user_input
        print("✓ API key accepted")
    else:
        print("\nExiting. Please set your API key and try again.")
        sys.exit(1)

# === CONFIGURATION ===
INPUT_FILE = "Munashe_Cleaned.xlsx"
OUTPUT_DIR = "./bead_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "claude-haiku-4-5-20251001"  # Latest Haiku 4.5
TEXT_COLUMN = "text_page_gp"
MAX_ROWS = 1453  # Change to 10 for testing

# === COMPREHENSIVE SYSTEM PROMPT ===
SYSTEM_PROMPT = """You are a historian analyzing pre-colonial African bead trade records.

TASK: Extract ALL 13 structured data fields. Be conservative - require explicit evidence.

RESPONSE FORMAT: Return ONLY a JSON object. NO markdown, NO ```json``` tags, NO extra text.

JSON STRUCTURE:
{
  "1_price_HUMAN": {
    "status": "yes|no|xo",
    "amount": "number or measurement, or null",
    "currency": "currency/commodity, or null",
    "description": "full price text from source"
  },
  "2_size_HUMAN": {
    "code": 1-6 or null,
    "description": "exact size text from source"
  },
  "3_colour_HUMAN": {
    "codes": [array of 1-14],
    "description": "exact color text, REQUIRED if code=14"
  },
  "4_location_HUMAN": {
    "codes": [array of 1-4],
    "names": "actual location names"
  },
  "5_function_HUMAN": {
    "codes": [array of 1-4],
    "description": "detailed function text"
  },
  "6_origin_of_bead": "geographic origin text or null",
  "7_shape_HUMAN": {
    "codes": [array of 1-12],
    "description": "exact shape text"
  },
  "8_type_bead_HUMAN": {
    "codes": [array of 1-14],
    "description": "exact material text"
  },
  "9_local_name_HUMAN": {
    "exists": "1|2",
    "names": ["array of names"] or null
  },
  "10_relationship_": {
    "codes": [array of 1-31],
    "description": "detailed exchange items text"
  },
  "11_units_of_measure": {
    "type": 1-4 or null,
    "description": "exact measurement text"
  },
  "12_bead_ethnic_": ["array of ethnic group names"] or null,
  "13_nature_of_exchange": {
    "code": 1-6 or null,
    "description": "exchange nature text"
  },
  "notes": "additional research context"
}

FIELD CODES:

1_price_HUMAN: status: yes=mentioned, no=not mentioned, xo=exchanged

2_size_HUMAN: 1=large, 2=medium, 3=small, 4=various, 5=thin, 6=thick

3_colour_HUMAN: 1=red, 2=blue, 3=white, 4=pink, 5=coral, 6=amber, 7=copper, 8=green, 9=yellow, 10=transparent, 11=seed glass, 12=black, 13=multicoloured, 14=other

4_location_HUMAN: 1=mountain/hill, 2=lake, 3=river/waterfall, 4=populated place

5_function_HUMAN: 1=jewellery/adornment, 2=currency/exchange, 3=ceremonial/religious, 4=status/gift

6_origin_of_bead: Text describing geographic origin

7_shape_HUMAN: 1=round, 2=tubular, 3=square, 4=oval, 5=oblong, 6=punched, 7=wound, 8=pressed, 9=decorative, 10=faceted, 11=bugle, 12=chevron

8_type_bead_HUMAN: 1=glass, 2=clay, 3=metal, 4=stone, 5=coral, 6=amber, 7=bone, 8=ivory, 9=dried seed, 10=ceramic, 11=wooden, 12=porcelain, 13=shell, 14=eggshell

9_local_name_HUMAN: exists: 1=yes (provide names), 2=unspecified

10_relationship_: 1=wire, 2=cloth, 3=shells, 4=coins, 5=livestock, 6=iron bars, 7=scarabs, 8=precious stones, 9=antiquities, 10=ostrich feathers, 11=ebony/ivory, 12=salt, 13=rubber/gum, 14=medicines, 15=spices/perfumes, 16=wax/seals, 17=leather/hides, 18=weapons, 19=dried food, 20=prints/books, 21=guns/gunpowder, 22=jewellery, 23=textiles, 24=gold/silver, 25=slaves, 26=glass objects, 27=hardware, 28=tobacco, 29=musical instruments, 30=water, 31=alcohol

11_units_of_measure: 1=string, 2=plaited/woven string, 3=necklace/bracelet/waist beads, 4=other

12_bead_ethnic_: Array of ethnic group names

13_nature_of_exchange: 1=consensual, 2=conflictual, 3=unspecified, 4=competitive/bartering, 5=social/gifts, 6=uncommercial

CRITICAL RULES:
1. Return ONLY the JSON object - NO ```json``` tags, NO markdown, NO extra text
2. Use null for missing data (not "unknown")
3. For arrays: [] if no data, null if not applicable
4. ALWAYS include description fields with verbatim text
5. When code=14 (other) or unusual items, description is MANDATORY
6. Base answers ONLY on provided text
7. Preserve exact terminology and details
"""

USER_PROMPT_TEMPLATE = """Analyze this historical text and extract ALL 13 fields:

TEXT:
{text}

Return ONLY the JSON object. No ```json``` tags, no other text."""

# === UTILITY FUNCTIONS ===

def strip_markdown_json(text):
    """Remove markdown code blocks and extract only the first complete JSON object."""
    text = text.strip()

    # Remove ```json ... ``` or ``` ... ```
    if text.startswith('```'):
        lines = text.split('\n')
        # Remove first line (```json or ```)
        if lines[0].strip().startswith('```'):
            lines = lines[1:]
        # Remove last line (```)
        if lines and lines[-1].strip() == '```':
            lines = lines[:-1]
        text = '\n'.join(lines).strip()

    # Extract only the first complete JSON object
    # Find the first { and its matching }
    if not text.startswith('{'):
        # Try to find where JSON starts
        start = text.find('{')
        if start == -1:
            return text  # No JSON found
        text = text[start:]

    # Find the matching closing brace
    brace_count = 0
    in_string = False
    escape_next = False

    for i, char in enumerate(text):
        if escape_next:
            escape_next = False
            continue

        if char == '\\':
            escape_next = True
            continue

        if char == '"':
            in_string = not in_string
            continue

        if not in_string:
            if char == '{':
                brace_count += 1
            elif char == '}':
                brace_count -= 1
                if brace_count == 0:
                    # Found the matching closing brace
                    return text[:i+1]

    return text

def validate_json_response(json_obj):
    """Validate all required fields exist."""
    required = [
        '1_price_HUMAN', '2_size_HUMAN', '3_colour_HUMAN', '4_location_HUMAN',
        '5_function_HUMAN', '6_origin_of_bead', '7_shape_HUMAN', '8_type_bead_HUMAN',
        '9_local_name_HUMAN', '10_relationship_', '11_units_of_measure',
        '12_bead_ethnic_', '13_nature_of_exchange'
    ]

    missing = [f for f in required if f not in json_obj]
    if missing:
        return False, f"Missing fields: {', '.join(missing[:3])}"

    return True, None

def flatten_for_excel(json_obj):
    """Flatten nested JSON to Excel-friendly format."""
    flat = {}

    # 1_price_HUMAN (4 columns)
    price = json_obj.get('1_price_HUMAN', {})
    if isinstance(price, dict):
        flat['1_price_status'] = price.get('status')
        flat['1_price_amount'] = price.get('amount')
        flat['1_price_currency'] = price.get('currency')
        flat['1_price_description'] = price.get('description')
    else:
        flat['1_price_status'] = flat['1_price_amount'] = flat['1_price_currency'] = flat['1_price_description'] = None

    # 2_size_HUMAN (2 columns)
    size = json_obj.get('2_size_HUMAN', {})
    if isinstance(size, dict):
        flat['2_size_code'] = size.get('code')
        flat['2_size_description'] = size.get('description')
    else:
        flat['2_size_code'] = size
        flat['2_size_description'] = None

    # 3_colour_HUMAN (2 columns)
    color = json_obj.get('3_colour_HUMAN', {})
    if isinstance(color, dict):
        codes = color.get('codes', [])
        flat['3_colour_codes'] = ','.join(map(str, codes)) if codes else None
        flat['3_colour_description'] = color.get('description')
    else:
        flat['3_colour_codes'] = ','.join(map(str, color)) if color else None
        flat['3_colour_description'] = None

    # 4_location_HUMAN (2 columns)
    location = json_obj.get('4_location_HUMAN', {})
    if isinstance(location, dict):
        codes = location.get('codes', [])
        flat['4_location_codes'] = ','.join(map(str, codes)) if codes else None
        flat['4_location_names'] = location.get('names')
    else:
        flat['4_location_codes'] = ','.join(map(str, location)) if location else None
        flat['4_location_names'] = None

    # 5_function_HUMAN (2 columns)
    function = json_obj.get('5_function_HUMAN', {})
    if isinstance(function, dict):
        codes = function.get('codes', [])
        flat['5_function_codes'] = ','.join(map(str, codes)) if codes else None
        flat['5_function_description'] = function.get('description')
    else:
        flat['5_function_codes'] = ','.join(map(str, function)) if function else None
        flat['5_function_description'] = None

    # 6_origin_of_bead (1 column)
    flat['6_origin_of_bead'] = json_obj.get('6_origin_of_bead')

    # 7_shape_HUMAN (2 columns)
    shape = json_obj.get('7_shape_HUMAN', {})
    if isinstance(shape, dict):
        codes = shape.get('codes', [])
        flat['7_shape_codes'] = ','.join(map(str, codes)) if codes else None
        flat['7_shape_description'] = shape.get('description')
    else:
        flat['7_shape_codes'] = ','.join(map(str, shape)) if shape else None
        flat['7_shape_description'] = None

    # 8_type_bead_HUMAN (2 columns)
    bead_type = json_obj.get('8_type_bead_HUMAN', {})
    if isinstance(bead_type, dict):
        codes = bead_type.get('codes', [])
        flat['8_type_codes'] = ','.join(map(str, codes)) if codes else None
        flat['8_type_description'] = bead_type.get('description')
    else:
        flat['8_type_codes'] = ','.join(map(str, bead_type)) if bead_type else None
        flat['8_type_description'] = None

    # 9_local_name_HUMAN (2 columns)
    local = json_obj.get('9_local_name_HUMAN', {})
    if isinstance(local, dict):
        flat['9_local_name_exists'] = local.get('exists')
        names = local.get('names', [])
        flat['9_local_name_names'] = '; '.join(names) if names else None
    else:
        flat['9_local_name_exists'] = None
        flat['9_local_name_names'] = None

    # 10_relationship_ (2 columns)
    rel = json_obj.get('10_relationship_', {})
    if isinstance(rel, dict):
        codes = rel.get('codes', [])
        flat['10_relationship_codes'] = ','.join(map(str, codes)) if codes else None
        flat['10_relationship_description'] = rel.get('description')
    else:
        flat['10_relationship_codes'] = ','.join(map(str, rel)) if rel else None
        flat['10_relationship_description'] = None

    # 11_units_of_measure (2 columns)
    units = json_obj.get('11_units_of_measure', {})
    if isinstance(units, dict):
        flat['11_units_type'] = units.get('type')
        flat['11_units_description'] = units.get('description')
    else:
        flat['11_units_type'] = None
        flat['11_units_description'] = None

    # 12_bead_ethnic_ (1 column)
    ethnics = json_obj.get('12_bead_ethnic_', [])
    flat['12_bead_ethnic_'] = '; '.join(ethnics) if ethnics else None

    # 13_nature_of_exchange (2 columns)
    nature = json_obj.get('13_nature_of_exchange', {})
    if isinstance(nature, dict):
        flat['13_nature_code'] = nature.get('code')
        flat['13_nature_description'] = nature.get('description')
    else:
        flat['13_nature_code'] = nature
        flat['13_nature_description'] = None

    # Notes (1 column)
    flat['notes'] = json_obj.get('notes')

    return flat

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    reraise=True
)
def call_claude(client, entry_text):
    """Call Claude API with retry logic."""
    response = client.messages.create(
        model=MODEL_NAME,
        max_tokens=2500,
        temperature=0,
        system=SYSTEM_PROMPT,
        messages=[{
            "role": "user",
            "content": USER_PROMPT_TEMPLATE.format(text=entry_text)
        }]
    )
    return response

def calculate_cost(input_tokens, output_tokens):
    """Calculate API cost for Haiku 4.5."""
    # Haiku 4.5 pricing (estimate - verify actual pricing)
    input_cost = (input_tokens / 1_000_000) * 0.80
    output_cost = (output_tokens / 1_000_000) * 0.24
    return input_cost + output_cost

def print_header(title):
    """Print formatted section header."""
    print(f"\n{'='*80}")
    print(f"{title}")
    print(f"{'='*80}")

# === MAIN PROCESSING ===

def main():
    print_header("BEAD TRADE CODING - COMPLETE VERSION")
    print(f"Model: {MODEL_NAME}")
    print(f"Input: {INPUT_FILE}")
    print(f"Rows: {MAX_ROWS}")
    print(f"All 13 fields with codes + descriptions")

    # Verify API key
    if not ANTHROPIC_API_KEY.startswith("sk-ant-"):
        print(f"\n⚠️  Warning: API key format looks unusual")
        print(f"   Should start with: sk-ant-")
        confirm = input("Continue anyway? (y/n): ").strip().lower()
        if confirm != 'y':
            return

    # Load data
    print(f"\nLoading {INPUT_FILE}...")
    try:
        df = pd.read_excel(INPUT_FILE)
        print(f"✓ Loaded {len(df)} rows, {len(df.columns)} columns")
    except Exception as e:
        print(f"✗ Error loading file: {e}")
        return

    if TEXT_COLUMN not in df.columns:
        print(f"✗ Column '{TEXT_COLUMN}' not found")
        print(f"Available columns: {list(df.columns)}")
        return

    # Initialize
    client = Anthropic(api_key=ANTHROPIC_API_KEY)
    responses = []
    total_input_tokens = 0
    total_output_tokens = 0
    start_time = time.time()
    success = 0
    errors = 0
    skipped = 0

    print_header("PROCESSING")
    print(f"Starting... Progress every 100 rows\n")

    # Process rows
    for idx in range(MAX_ROWS):
        try:
            row = df.iloc[idx]
            text = row.get(TEXT_COLUMN)

            # Skip empty
            if pd.isna(text) or not str(text).strip():
                responses.append(None)
                skipped += 1
                continue

            text = str(text).strip()

            # Call Claude
            response = call_claude(client, text)
            total_input_tokens += response.usage.input_tokens
            total_output_tokens += response.usage.output_tokens
            response_text = response.content[0].text

            # Strip markdown and parse
            cleaned = strip_markdown_json(response_text)

            if not cleaned:
                responses.append({"error": "Empty response"})
                errors += 1
                continue

            try:
                parsed = json.loads(cleaned)

                # Validate
                valid, error = validate_json_response(parsed)
                if valid:
                    flat = flatten_for_excel(parsed)
                    responses.append(flat)
                    success += 1
                else:
                    responses.append({"error": error})
                    errors += 1

            except json.JSONDecodeError as e:
                responses.append({"error": f"JSON error: {str(e)[:40]}"})
                errors += 1

                # Debug first 3 errors
                if errors <= 3:
                    print(f"\n⚠️  Parse error at row {idx}:")
                    print(f"   First 150 chars: {cleaned[:150]}")
                    print(f"   Error: {str(e)}\n")

            # Progress
            if (idx + 1) % 100 == 0:
                elapsed = (time.time() - start_time) / 60
                cost = calculate_cost(total_input_tokens, total_output_tokens)
                rate = (idx + 1) / elapsed if elapsed > 0 else 0

                print(f"Row {idx+1}/{MAX_ROWS} | Success: {success} | Errors: {errors} | "
                      f"Cost: ${cost:.2f} | Rate: {rate:.1f}/min")

        except Exception as e:
            responses.append({"error": str(e)[:60]})
            errors += 1

    # === RESULTS ===

    elapsed = (time.time() - start_time) / 60
    final_cost = calculate_cost(total_input_tokens, total_output_tokens)

    print_header("RESULTS")

    print(f"\nProcessing Summary:")
    print(f"  Total rows: {MAX_ROWS}")
    print(f"  Successfully coded: {success}")
    print(f"  Errors: {errors}")
    print(f"  Skipped (empty): {skipped}")
    print(f"  Success rate: {(success/(MAX_ROWS-skipped)*100):.1f}%")

    print(f"\nToken Usage:")
    print(f"  Input: {total_input_tokens:,}")
    print(f"  Output: {total_output_tokens:,}")
    print(f"  Total: {total_input_tokens + total_output_tokens:,}")
    print(f"  Avg per row: {(total_input_tokens + total_output_tokens)/(MAX_ROWS-skipped):.0f}")

    print(f"\nCost:")
    print(f"  Total: ${final_cost:.2f}")
    print(f"  Per row: ${final_cost/(MAX_ROWS-skipped):.4f}")

    print(f"\nTime:")
    print(f"  Duration: {elapsed:.1f} minutes")
    print(f"  Rate: {(MAX_ROWS-skipped)/elapsed:.1f} rows/min")

    # === SAVE OUTPUT ===

    print_header("SAVING OUTPUT")

    # Create output dataframe
    output_df = df.iloc[:len(responses)].copy()

    # Add coding columns
    for i, resp in enumerate(responses):
        if resp and isinstance(resp, dict) and "error" not in resp:
            for key, value in resp.items():
                if key not in output_df.columns:
                    output_df[key] = None
                output_df.at[i, key] = value

    # Save
    output_file = os.path.join(OUTPUT_DIR, "bead_coded_complete.xlsx")
    output_df.to_excel(output_file, index=False)

    print(f"\n✓ Output saved: {output_file}")
    print(f"  Rows: {len(output_df)}")
    print(f"  Original columns: {len(df.columns)}")
    print(f"  New columns: {len(output_df.columns) - len(df.columns)}")
    print(f"  Total columns: {len(output_df.columns)}")

    # Show field coverage
    print(f"\nField Coverage (27 columns):")
    coding_cols = [c for c in output_df.columns if c not in df.columns and c != 'error']
    for col in sorted(coding_cols):
        count = output_df[col].notna().sum()
        pct = (count / success * 100) if success > 0 else 0
        print(f"  {col:35s} | {count:4d} ({pct:5.1f}%)")

    # Save report
    report_file = os.path.join(OUTPUT_DIR, "coding_report.txt")
    with open(report_file, 'w') as f:
        f.write(f"Bead Trade Coding Report\n")
        f.write(f"{'='*60}\n\n")
        f.write(f"Date: {datetime.now().isoformat()}\n")
        f.write(f"Model: {MODEL_NAME}\n")
        f.write(f"Rows processed: {MAX_ROWS}\n")
        f.write(f"Success: {success} ({(success/(MAX_ROWS-skipped)*100):.1f}%)\n")
        f.write(f"Errors: {errors}\n")
        f.write(f"Cost: ${final_cost:.2f}\n")
        f.write(f"Time: {elapsed:.1f} minutes\n")

    print(f"\n✓ Report saved: {report_file}")

    print_header("COMPLETE")

    if success / (MAX_ROWS - skipped) > 0.85:
        print("✓ High success rate! Data ready for analysis.")
    else:
        print("⚠️  Lower success rate. Review errors in output file.")

    print(f"\nNext steps:")
    print(f"  1. Open: {output_file}")
    print(f"  2. Verify: Check sample rows")
    print(f"  3. Analyze: Use codes for stats, descriptions for context")
    print()

if __name__ == "__main__":
    main()

Checking dependencies...
Installing anthropic...

BEAD TRADE CODING - COMPLETE VERSION
Model: claude-haiku-4-5-20251001
Input: Munashe_Cleaned.xlsx
Rows: 1453
All 13 fields with codes + descriptions

Loading Munashe_Cleaned.xlsx...
✓ Loaded 1453 rows, 23 columns

PROCESSING
Starting... Progress every 100 rows

Row 100/1453 | Success: 98 | Errors: 2 | Cost: $0.17 | Rate: 11.5/min
Row 200/1453 | Success: 198 | Errors: 2 | Cost: $0.33 | Rate: 12.0/min
Row 300/1453 | Success: 298 | Errors: 2 | Cost: $0.50 | Rate: 11.8/min
Row 400/1453 | Success: 396 | Errors: 4 | Cost: $0.67 | Rate: 11.8/min
Row 500/1453 | Success: 495 | Errors: 5 | Cost: $0.88 | Rate: 11.7/min
Row 600/1453 | Success: 595 | Errors: 5 | Cost: $1.04 | Rate: 11.8/min
Row 700/1453 | Success: 695 | Errors: 5 | Cost: $1.22 | Rate: 11.9/min
Row 800/1453 | Success: 795 | Errors: 5 | Cost: $1.44 | Rate: 11.5/min
Row 900/1453 | Success: 895 | Errors: 5 | Cost: $1.61 | Rate: 11.5/min
Row 1000/1453 | Success: 995 | Errors: 5 | Cost: $