# Forecast Review

Pulls all data sources needed for forecast review:
1. CT metadata (fetched from API, saved for future use)
2. VP data (user-provided local file)
3. VOVI forecasts (US + CA, AMZL, premium)
4. Pipeline artifacts from S3

In [None]:
# name: setup | type: python
import os
from pathlib import Path
from datetime import datetime, timedelta
import pytz

# The target date is "tomorrow" on the US/Pacific clock, as YYYY-MM-DD.
pacific = pytz.timezone('US/Pacific')
now_pacific = datetime.now(pacific)
tomorrow = now_pacific + timedelta(days=1)
target_date = f'{tomorrow:%Y-%m-%d}'

# Every run gets a fresh context ID and an output directory named after it.
ctx_id = generate_ctx_id('forecast_review')
ctx_dir = contexts_dir / ctx_id
ctx_dir.mkdir(parents=True, exist_ok=True)

# Echo the run parameters before validating, so a failure below still shows
# which VP path was being used.
for _label, _value in (
    ('Context ID', ctx_id),
    ('Target date', target_date),
    ('VP file', vp_file_path),
    ('Output', ctx_dir),
):
    print(f'{_label}: {_value}')

# The VP file is supplied by the user; stop here if it is missing.
vp_path = Path(vp_file_path)
if not vp_path.exists():
    raise FileNotFoundError(f'VP file not found: {vp_file_path}')

# Cell output: identifiers for this run.
result = dict(
    ctx_id=ctx_id,
    target_date=target_date,
    ctx_dir=str(ctx_dir),
)

In [None]:
# name: fetch_ct | type: python
# Fetch CT metadata from API — saved for future use, not actively used in v1
import json as _json

# fetch_ct_metadata hands back a JSON string; decode it into a dict.
ct_result_json = fetch_ct_metadata(ctx_id)
ct_result = _json.loads(ct_result_json)

# A failed fetch is only reported: nothing later in v1 depends on this data.
if not ct_result.get('success'):
    print(f'CT metadata fetch failed (non-blocking): {ct_result.get("error", "unknown")}')
    print('Continuing without CT metadata — not required for v1')
else:
    print(f'CT metadata: {ct_result["row_count"]} stations')
    print(f'Saved to: {ct_result["output_file"]}')

# Cell output: the decoded response, success or not.
result = ct_result

In [None]:
# name: load_vp | type: python
# Load user-provided VP file into DuckDB
import shutil

# Copy the user-provided VP file into the context directory so the run's
# inputs are kept next to its outputs.
vp_ctx_dir = ctx_dir / 'vp'
vp_ctx_dir.mkdir(parents=True, exist_ok=True)
vp_dest = vp_ctx_dir / vp_path.name
shutil.copy2(str(vp_path), str(vp_dest))

# Load the copy into a DuckDB table named `vp`.
# The path ends up inside a single-quoted SQL string literal, so any
# apostrophe in the file name or a parent directory must be doubled;
# otherwise it would terminate the literal early and break the statement.
vp_dest_sql = str(vp_dest).replace("'", "''")
conn.execute(f"CREATE OR REPLACE TABLE vp AS SELECT * FROM read_csv_auto('{vp_dest_sql}')")
vp_count = conn.execute('SELECT COUNT(*) FROM vp').fetchone()[0]
vp_cols = [col[0] for col in conn.execute('DESCRIBE vp').fetchall()]

print(f'VP data: {vp_count:,} rows, {len(vp_cols)} columns')
print(f'Columns: {vp_cols}')
print(f'Saved to: {vp_dest}')

# Cell output: shape of the loaded table and where the copy lives.
result = {
    'rows': vp_count,
    'columns': vp_cols,
    'path': str(vp_dest)
}

In [None]:
# name: fetch_vovi | type: python
# Fetch VOVI forecasts for US and CA
import subprocess
import json as _json

import shutil

# Cookie file handed to curl via -b.
cookie_path = str(Path.home() / '.midway' / 'cookie')
vovi_base = 'https://prod.vovi.last-mile.amazon.dev/api/forecast/list_approved'

# Resolve the curl binary once: curl.exe where it exists (the original
# behaviour), plain curl otherwise. The last fallback keeps the original name
# so a missing binary still surfaces through the per-country handler below.
curl_bin = shutil.which('curl.exe') or shutil.which('curl') or 'curl.exe'

countries = ['US', 'CA']
business_type = 'amzl'
shipping_type = 'premium'

vovi_ctx_dir = ctx_dir / 'vovi'
vovi_ctx_dir.mkdir(parents=True, exist_ok=True)

# One entry per country, recording either the loaded table or the failure.
vovi_results = []

for country in countries:
    url = f'{vovi_base}?country={country}&cptDateKey={target_date}&shippingType={shipping_type}&businessType={business_type}'
    print(f'Fetching VOVI: {country} / {business_type} / {shipping_type} / {target_date}...')

    try:
        # -s hides the progress meter; -S keeps curl's error text on stderr so
        # the failure branch below has something to report.
        curl_result = subprocess.run(
            [curl_bin, '--location-trusted', '-b', cookie_path, '-s', '-S', url],
            capture_output=True, text=True
        )

        if curl_result.returncode != 0:
            print(f'  {country}: curl failed - {curl_result.stderr[:100]}')
            vovi_results.append({'country': country, 'success': False, 'error': curl_result.stderr[:200]})
            continue

        # A non-JSON body raises here and is recorded by the handler below.
        data = _json.loads(curl_result.stdout)
        df = pd.DataFrame(data)

        # Save to context directory
        csv_file = vovi_ctx_dir / f'vovi_{country.lower()}_{business_type}_{shipping_type}.csv'
        df.to_csv(csv_file, index=False)

        # Register in DuckDB
        table_name = f'vovi_{country.lower()}'
        conn.register(table_name, df)

        print(f'  {country}: {len(df):,} rows -> {table_name}')
        vovi_results.append({'country': country, 'success': True, 'rows': len(df), 'table': table_name, 'path': str(csv_file)})

    except Exception as e:
        # Best effort per country: record the failure and move on to the next.
        print(f'  {country}: failed - {e}')
        vovi_results.append({'country': country, 'success': False, 'error': str(e)})

# Also create a combined vovi view over every country that loaded.
# The country label comes from the recorded result rather than being parsed
# back out of the table name.
try:
    succeeded = [r for r in vovi_results if r.get('success')]
    if succeeded:
        union_sql = ' UNION ALL '.join(
            f"SELECT *, '{r['country']}' as country FROM {r['table']}" for r in succeeded
        )
        conn.execute(f'CREATE OR REPLACE VIEW vovi AS {union_sql}')
        vovi_total = conn.execute('SELECT COUNT(*) FROM vovi').fetchone()[0]
        print(f'\nCombined vovi view: {vovi_total:,} rows')
except Exception as e:
    print(f'Could not create combined view: {e}')

# Cell output: the per-country result list.
result = vovi_results

In [None]:
# name: download_artifacts | type: python
# Download latest pipeline artifacts from S3
import boto3
import re

artifact_bucket = 'lma-glue-pipeline'
sort_code = 'DS-A'
artifacts_dir = ctx_dir / 'pipeline_artifacts'
artifacts_dir.mkdir(parents=True, exist_ok=True)

session = boto3.Session(profile_name=aws_profile)
s3 = session.client('s3')

s3_prefix = f'pipeline_output/internal_sort_code={sort_code}/target_forecast_date={target_date}/'
print(f'Scanning: s3://{artifact_bucket}/{s3_prefix}')

# File names look like <base>_<YYYYMMDD>_<HHMMSS><suffix>.csv. Compile the
# pattern once instead of re-parsing it for every listed object.
artifact_pattern = re.compile(r'^(.+?)_(\d{8}_\d{6})(.*)\.csv$')

# Group by artifact type (base + suffix), keeping only the newest timestamp.
# The fixed-width timestamp compares correctly as a plain string.
paginator = s3.get_paginator('list_objects_v2')
files_by_type = {}

for page in paginator.paginate(Bucket=artifact_bucket, Prefix=s3_prefix):
    for obj in page.get('Contents', []):
        key = obj['Key']
        filename = key.split('/')[-1]
        match = artifact_pattern.match(filename)
        if not match:
            continue
        base_name, timestamp, suffix = match.groups()
        artifact_type = base_name + suffix
        current = files_by_type.get(artifact_type)
        if current is None or timestamp > current['timestamp']:
            files_by_type[artifact_type] = {
                'key': key, 'filename': filename, 'timestamp': timestamp,
                'size': obj['Size'], 'artifact_type': artifact_type
            }

print(f'Found {len(files_by_type)} artifact types')

# Download each artifact and load it into a DuckDB table named after its type.
artifact_results = []
for atype, info in sorted(files_by_type.items()):
    dest_file = artifacts_dir / info['filename']
    size_mb = info['size'] / 1024 / 1024

    try:
        print(f'  {info["filename"]} ({size_mb:.1f} MB)...', flush=True)
        s3.download_file(artifact_bucket, info['key'], str(dest_file))

        # The table name comes from an S3 file name, so it may contain
        # characters that are not valid in a bare identifier (hyphens, dots,
        # a leading digit). Quote it, doubling any embedded double quote.
        table_ident = '"' + atype.replace('"', '""') + '"'
        # The path goes inside a single-quoted SQL literal; double apostrophes.
        dest_sql = str(dest_file).replace("'", "''")

        conn.execute(f"CREATE OR REPLACE TABLE {table_ident} AS SELECT * FROM read_csv_auto('{dest_sql}')")
        row_count = conn.execute(f'SELECT COUNT(*) FROM {table_ident}').fetchone()[0]

        artifact_results.append({'artifact': atype, 'rows': row_count, 'size_mb': round(size_mb, 1), 'path': str(dest_file)})
    except Exception as e:
        # Best effort per artifact: record the error and continue with the rest.
        print(f'    FAILED: {e}')
        artifact_results.append({'artifact': atype, 'error': str(e)})

# Only entries that reached the row count carry a 'rows' key.
loaded_count = sum('rows' in a for a in artifact_results)
print(f'\nDownloaded and registered {loaded_count}/{len(files_by_type)} artifacts')

# Cell output: one entry per artifact type, with rows/size/path or an error.
result = artifact_results

In [None]:
# name: summary | type: python
# Print summary of all loaded data
# Banner with the identifiers of this run.
print(f'\n{"="*60}')
print(f'Forecast Review Context: {ctx_id}')
print(f'Target Date: {target_date}')
print(f'Output Directory: {ctx_dir}')
print(f'{"="*60}')

# List everything information_schema reports in the main schema.
tables = conn.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'").fetchall()
print(f'\nRegistered tables ({len(tables)}):')
for t in tables:
    name = t[0]
    # Double any embedded double quote so the name stays one quoted identifier.
    quoted_name = '"' + name.replace('"', '""') + '"'
    try:
        count = conn.execute(f'SELECT COUNT(*) FROM {quoted_name}').fetchone()[0]
    except Exception as e:
        # Keep the listing going, but show why the count failed rather than
        # labelling every failure as a view. Catching Exception (not a bare
        # except) lets KeyboardInterrupt/SystemExit propagate.
        print(f'  {name}: (row count unavailable: {e})')
    else:
        print(f'  {name}: {count:,} rows')

# Cell output: run identifiers plus the names of all listed tables.
result = {
    'status': 'success',
    'ctx_id': ctx_id,
    'target_date': target_date,
    'ctx_dir': str(ctx_dir),
    'tables': [t[0] for t in tables]
}