# Forecast Setup

Data pulls, then transforms.

Inputs (injected variables):
1. `site_list_path` — Site list Excel file
2. `ct_file_path` — VP/CT CSV file

Data pulls:
3. VOVI forecasts (US + CA, AMZL, premium)
4. Pipeline artifacts from S3
5. Intraday PBA data from S3

Transforms:
- VP pivot (long to wide, util columns, grid keys)
- Site list + VP full outer join with `available_inputs` flag, then VOVI left-joined on station and CPT time (UTC)
- PBA query (separate output file)

In [None]:
# name: setup | type: python
# Resolve the two injected input paths, derive the target date and create the
# per-run context directory that the later cells write into.
# Reads from the runtime: site_list_path, ct_file_path, generate_ctx_id,
# contexts_dir. Sets: result (dict of the resolved settings, all as strings).
from pathlib import Path
from datetime import datetime, timedelta
import pytz

site_list, ct_file = Path(site_list_path), Path(ct_file_path)

# Stop early when an input is missing; the site list is checked first.
if not site_list.exists():
    raise FileNotFoundError(f'Site list not found: {site_list_path}')
if not ct_file.exists():
    raise FileNotFoundError(f'CT file not found: {ct_file_path}')

# Target date is tomorrow's calendar date as seen from the US/Pacific zone.
pacific = pytz.timezone('US/Pacific')
now_pacific = datetime.now(tz=pacific)
tomorrow = now_pacific + timedelta(days=1)
target_date = format(tomorrow, '%Y-%m-%d')

# Each run gets its own directory under contexts_dir, named by the context ID.
ctx_id = generate_ctx_id('forecast_setup')
ctx_dir = contexts_dir / ctx_id
ctx_dir.mkdir(parents=True, exist_ok=True)

print(
    f'Site list: {site_list}',
    f'CT file: {ct_file}',
    f'Target date: {target_date}',
    f'Context ID: {ctx_id}',
    f'Output: {ctx_dir}',
    sep='\n',
)

result = {
    'site_list_path': str(site_list),
    'ct_file_path': str(ct_file),
    'target_date': target_date,
    'ctx_id': ctx_id,
    'ctx_dir': str(ctx_dir),
}

In [None]:
-- name: load_site_list | type: sql
-- Build the site_list table from the Excel workbook at site_list_path,
-- renaming the three source columns to snake_case.
CREATE OR REPLACE TABLE site_list AS
SELECT
    "Station" AS station,
    "Cycle" AS cycle,
    "Business Org" AS business_org
-- site_list_path is an injected variable, not a table or column defined here.
FROM read_xlsx(site_list_path)
-- Keep only the rows whose Business Org is exactly AMZL.
WHERE "Business Org" = 'AMZL'

In [None]:
-- name: load_vp | type: sql
-- Load the VP/CT CSV at ct_file_path (an injected variable) unchanged into
-- vp_raw. Column names and types are whatever read_csv_auto detects.
CREATE OR REPLACE TABLE vp_raw AS
SELECT * FROM read_csv_auto(ct_file_path)

In [None]:
# name: fetch_vovi | type: python
# Fetch approved VOVI forecasts for US and CA (amzl / premium) for target_date.
#
# Reads from earlier cells / the runtime: Path, ctx_dir, target_date, pd, conn.
# For each country: call the VOVI list_approved endpoint through curl with the
# Midway cookie file, save the payload as CSV under ctx_dir/vovi and register
# the DataFrame in DuckDB as vovi_<country>. The tables that loaded are then
# combined into the `vovi` view.
# A failure for one country is recorded in vovi_results and does not stop the
# other country (deliberate best-effort).
# Sets: result (the list of per-country status dicts).
import subprocess
import json as _json

cookie_path = str(Path.home() / '.midway' / 'cookie')
vovi_base = 'https://prod.vovi.last-mile.amazon.dev/api/forecast/list_approved'

countries = ['US', 'CA']
business_type = 'amzl'
shipping_type = 'premium'

vovi_ctx_dir = ctx_dir / 'vovi'
vovi_ctx_dir.mkdir(parents=True, exist_ok=True)

vovi_results = []

for country in countries:
    url = f'{vovi_base}?country={country}&cptDateKey={target_date}&shippingType={shipping_type}&businessType={business_type}'
    print(f'Fetching VOVI: {country} / {business_type} / {shipping_type} / {target_date}...')

    try:
        # -s hides the progress meter; -S keeps curl's error text on stderr so
        # the failure branch below has something to report.
        # --fail turns an HTTP 4xx/5xx response into a non-zero exit code
        # instead of handing the error body to the JSON parser.
        curl_result = subprocess.run(
            ['curl.exe', '--location-trusted', '-b', cookie_path, '-s', '-S', '--fail', url],
            capture_output=True, text=True
        )

        if curl_result.returncode != 0:
            curl_error = curl_result.stderr.strip() or f'curl exit code {curl_result.returncode}'
            print(f'  {country}: curl failed - {curl_error[:100]}')
            vovi_results.append({'country': country, 'success': False, 'error': curl_error[:200]})
            continue

        try:
            data = _json.loads(curl_result.stdout)
        except ValueError as json_err:
            # The body was not JSON (e.g. an HTML page). Include the start of
            # the body so the cause is visible in the recorded error.
            body_start = curl_result.stdout[:100].strip()
            raise ValueError(
                f'response is not valid JSON ({json_err}); body starts with: {body_start!r}'
            ) from json_err
        df = pd.DataFrame(data)

        # Save to context directory
        csv_file = vovi_ctx_dir / f'vovi_{country.lower()}_{business_type}_{shipping_type}.csv'
        df.to_csv(csv_file, index=False)

        # Register in DuckDB
        table_name = f'vovi_{country.lower()}'
        conn.register(table_name, df)

        print(f'  {country}: {len(df):,} rows, {len(df.columns)} cols -> {table_name}')
        vovi_results.append({'country': country, 'success': True, 'rows': len(df), 'table': table_name, 'path': str(csv_file)})

    except Exception as e:
        # Best-effort per country: record the failure and move on.
        print(f'  {country}: failed - {e}')
        vovi_results.append({'country': country, 'success': False, 'error': str(e)})

# Create combined vovi view using UNION ALL BY NAME (handles differing column counts)
try:
    tables_to_union = [r['table'] for r in vovi_results if r.get('success')]
    if tables_to_union:
        union_sql = ' UNION ALL BY NAME '.join(f"SELECT * FROM {t}" for t in tables_to_union)
        conn.execute(f'CREATE OR REPLACE VIEW vovi AS {union_sql}')
        vovi_total = conn.execute('SELECT COUNT(*) FROM vovi').fetchone()[0]
        print(f'\nCombined vovi view: {vovi_total:,} rows')
    else:
        # Nothing loaded, so the view is left untouched; say so explicitly
        # because later cells read from it.
        print('\nNo VOVI data fetched; combined vovi view was not created or refreshed')
except Exception as e:
    print(f'Could not create combined view: {e}')

result = vovi_results

In [None]:
# name: download_artifacts | type: python
# Download the latest pipeline artifacts for target_date from S3 and load each
# one into its own DuckDB table.
#
# Reads from earlier cells / the runtime: ctx_dir, target_date, conn,
# make_boto3_session, aws_profile.
# Files are grouped by artifact type and only the newest timestamp per type is
# downloaded. A failure on one artifact is recorded and the loop continues.
# Sets: result (list of per-artifact dicts; 'rows' on success, 'error' on failure).
import re

artifact_bucket = 'lma-glue-pipeline'
sort_code = 'DS-A'
artifacts_dir = ctx_dir / 'pipeline_artifacts'
artifacts_dir.mkdir(parents=True, exist_ok=True)

session = make_boto3_session(profile_name=aws_profile)
s3 = session.client('s3')

s3_prefix = f'pipeline_output/internal_sort_code={sort_code}/target_forecast_date={target_date}/'
print(f'Scanning: s3://{artifact_bucket}/{s3_prefix}')

# File names look like <base>_<8 digits>_<6 digits><suffix>.csv.
# base + suffix identifies the artifact type; the digit group is its timestamp.
# Compiled once here rather than on every listed object.
artifact_name_re = re.compile(r'^(.+?)_(\d{8}_\d{6})(.*)\.csv$')

# Group by artifact type, keep latest timestamp
paginator = s3.get_paginator('list_objects_v2')
files_by_type = {}

for page in paginator.paginate(Bucket=artifact_bucket, Prefix=s3_prefix):
    for obj in page.get('Contents', []):
        key = obj['Key']
        filename = key.split('/')[-1]
        match = artifact_name_re.match(filename)
        if match:
            base_name = match.group(1)
            timestamp = match.group(2)
            suffix = match.group(3)
            artifact_type = base_name + suffix
            # Timestamps are fixed-width digit strings, so plain string
            # comparison orders them correctly.
            if artifact_type not in files_by_type or timestamp > files_by_type[artifact_type]['timestamp']:
                files_by_type[artifact_type] = {
                    'key': key, 'filename': filename, 'timestamp': timestamp,
                    'size': obj['Size'], 'artifact_type': artifact_type
                }

print(f'Found {len(files_by_type)} artifact types')

# Download and register each
artifact_results = []
for atype, info in sorted(files_by_type.items()):
    dest_file = artifacts_dir / info['filename']
    size_mb = info['size'] / 1024 / 1024

    # The table name comes from an S3 file name, so quote it as an identifier:
    # names with '-', '.' or a leading digit would otherwise be a syntax error.
    table_ident = '"' + atype.replace('"', '""') + '"'
    # Double any single quote so the path stays a valid SQL string literal.
    csv_literal = "'" + str(dest_file).replace("'", "''") + "'"

    try:
        print(f'  {info["filename"]} ({size_mb:.1f} MB)...', flush=True)
        s3.download_file(artifact_bucket, info['key'], str(dest_file))

        # Register in DuckDB
        conn.execute(f'CREATE OR REPLACE TABLE {table_ident} AS SELECT * FROM read_csv_auto({csv_literal})')
        row_count = conn.execute(f'SELECT COUNT(*) FROM {table_ident}').fetchone()[0]

        artifact_results.append({'artifact': atype, 'rows': row_count, 'size_mb': round(size_mb, 1), 'path': str(dest_file)})
    except Exception as e:
        # Best-effort per artifact: record the failure and keep going.
        print(f'    FAILED: {e}')
        artifact_results.append({'artifact': atype, 'error': str(e)})

# Only successful entries carry a 'rows' key.
registered_count = sum('rows' in a for a in artifact_results)
print(f'\nDownloaded and registered {registered_count}/{len(files_by_type)} artifacts')

result = artifact_results

In [None]:
# name: download_pba | type: python
# Download the intraday PBA parquet for the latest run_time of target_date from
# S3 (last-mile-staging bucket) and load it into the intraday_pba DuckDB table.
#
# Reads from earlier cells / the runtime: ctx_dir, target_date, conn,
# make_boto3_session.
# Sets: result - {'success': False, 'error': ...} when nothing is found,
# otherwise a dict with run_time, rows, columns, size_mb and path.
# A failed download or load raises; this cell has no try/except.
import re as _re

pba_bucket = 'last-mile-staging'
sort_code = 'DS-A'
pba_dir = ctx_dir / 'intraday_pba'
pba_dir.mkdir(parents=True, exist_ok=True)

pba_session = make_boto3_session(profile_name='last-mile-staging')
pba_s3 = pba_session.client('s3')

pba_prefix = f'intraday-pba/partition_internal_sort_code={sort_code}/partition_target_date={target_date}/'
print(f'Scanning: s3://{pba_bucket}/{pba_prefix}')

# List all objects under target date to find run_time partitions
paginator = pba_s3.get_paginator('list_objects_v2')
run_times = {}

for page in paginator.paginate(Bucket=pba_bucket, Prefix=pba_prefix):
    for obj in page.get('Contents', []):
        key = obj['Key']
        # A key ending in '/' has no file name (split('/')[-1] is ''), so it
        # cannot be downloaded to a file path; skip it.
        if key.endswith('/'):
            continue
        match = _re.search(r'partition_run_time=(\d+)/', key)
        if match:
            run_time = int(match.group(1))
            # NOTE(review): one entry per run_time - if a partition holds
            # several objects, only the last one listed is kept. Confirm each
            # run_time partition contains a single parquet file.
            run_times[run_time] = {'key': key, 'size': obj['Size']}

if not run_times:
    print(f'No intraday PBA data found for {target_date}')
    result = {'success': False, 'error': f'No data for {target_date}'}
else:
    # run_time values are compared as integers; the largest is taken as latest.
    latest_run_time = max(run_times)
    latest = run_times[latest_run_time]
    filename = latest['key'].split('/')[-1]
    size_mb = latest['size'] / 1024 / 1024
    dest_file = pba_dir / filename

    print(f'Latest run_time: {latest_run_time}')
    print(f'Downloading: {filename} ({size_mb:.1f} MB)...', flush=True)

    pba_s3.download_file(pba_bucket, latest['key'], str(dest_file))

    # Register in DuckDB. Double any single quote so the path stays a valid
    # SQL string literal.
    parquet_literal = "'" + str(dest_file).replace("'", "''") + "'"
    conn.execute(f'CREATE OR REPLACE TABLE intraday_pba AS SELECT * FROM read_parquet({parquet_literal})')
    pba_count = conn.execute('SELECT COUNT(*) FROM intraday_pba').fetchone()[0]
    pba_cols = [col[0] for col in conn.execute('DESCRIBE intraday_pba').fetchall()]

    print(f'Intraday PBA: {pba_count:,} rows, {len(pba_cols)} columns')
    print(f'Columns: {pba_cols}')
    print(f'Saved to: {dest_file}')

    result = {
        'success': True,
        'run_time': latest_run_time,
        'rows': pba_count,
        'columns': pba_cols,
        'size_mb': round(size_mb, 1),
        'path': str(dest_file)
    }

In [None]:
-- name: vp_pivot | type: sql
-- Pivot vp_raw from long form (one row per metric_name / metric_value) into the
-- wide vp table: one row per node, plan_start_date, ofd_dates, demand_types and
-- cpts, with one column per metric, plus HH:MM strings, grid keys and two
-- utilisation ratios.
-- The session time zone is set to UTC so the AT TIME ZONE and strftime calls
-- below are evaluated against UTC.
SET TimeZone = 'UTC';

CREATE OR REPLACE TABLE vp AS
WITH pivot_all AS (
    SELECT
        node,
        plan_start_date,
        ofd_dates,
        demand_types,
        CAST(cpts AS TIMESTAMP) AS cpts,
        -- One output column per metric_name. MAX is used only to collapse the
        -- group to a single value: if a group holds several rows for the same
        -- metric, the largest metric_value wins.
        -- NOTE(review): metric_value is cast to FLOAT further down, so it may
        -- be stored as text, in which case MAX compares as text - confirm.
        MAX(CASE WHEN metric_name = 'total_volume_available' THEN metric_value END) AS total_volume_available,
        MAX(CASE WHEN metric_name = 'automated_uncapped_slam_forecast' THEN metric_value END) AS automated_uncapped_slam_forecast,
        MAX(CASE WHEN metric_name = 'current_slam' THEN metric_value END) AS current_slam,
        MAX(CASE WHEN metric_name = 'weekly_uncapped_slam_forecast' THEN metric_value END) AS weekly_uncapped_slam_forecast,
        MAX(CASE WHEN metric_name = 'post_cutoff_adjustment' THEN metric_value END) AS post_cutoff_adjustment,
        MAX(CASE WHEN metric_name = 'total_backlog' THEN metric_value END) AS total_backlog,
        MAX(CASE WHEN metric_name = 'automated_confidence' THEN metric_value END) AS automated_confidence,
        MAX(CASE WHEN metric_name = 'uncapped_slam_forecast' THEN metric_value END) AS uncapped_slam_forecast,
        MAX(CASE WHEN metric_name = 'atrops_soft_cap' THEN metric_value END) AS atrops_soft_cap,
        MAX(CASE WHEN metric_name = 'confidence_anomaly' THEN metric_value END) AS confidence_anomaly,
        MAX(CASE WHEN metric_name = 'net_volume_adjustments' THEN metric_value END) AS net_volume_adjustments,
        MAX(CASE WHEN metric_name = 'adjusted_uncapped_slam_forecast' THEN metric_value END) AS adjusted_uncapped_slam_forecast,
        MAX(CASE WHEN metric_name = 'cap_target_buffer' THEN metric_value END) AS cap_target_buffer,
        MAX(CASE WHEN metric_name = 'earlies_expected' THEN metric_value END) AS earlies_expected,
        MAX(CASE WHEN metric_name = 'returns' THEN metric_value END) AS returns,
        MAX(CASE WHEN metric_name = 'sideline_in' THEN metric_value END) AS sideline_in,
        MAX(CASE WHEN metric_name = 'vovi_uncapped_slam_forecast' THEN metric_value END) AS vovi_uncapped_slam_forecast,
        MAX(CASE WHEN metric_name = 'in_station_backlog' THEN metric_value END) AS in_station_backlog,
        MAX(CASE WHEN metric_name = 'mnr_expected' THEN metric_value END) AS mnr_expected,
        MAX(CASE WHEN metric_name = 'mnr_received' THEN metric_value END) AS mnr_received,
        MAX(CASE WHEN metric_name = 'current_schedule' THEN metric_value END) AS current_schedule,
        MAX(CASE WHEN metric_name = 'vovi_adjustment' THEN metric_value END) AS vovi_adjustment,
        MAX(CASE WHEN metric_name = 'forecast_type' THEN metric_value END) AS forecast_type,
        MAX(CASE WHEN metric_name = 'earlies_received' THEN metric_value END) AS earlies_received,
        MAX(CASE WHEN metric_name = 'latest_deployed_cap' THEN metric_value END) AS latest_deployed_cap,
        MAX(CASE WHEN metric_name = 'atrops_hard_cap' THEN metric_value END) AS atrops_hard_cap,
        MAX(CASE WHEN metric_name = 'capped_slam_forecast' THEN metric_value END) AS capped_slam_forecast
    FROM vp_raw
    -- Keep only rows whose plan start date equals the OFD date.
    WHERE plan_start_date::DATE = ofd_dates::DATE
    GROUP BY node, plan_start_date, ofd_dates, demand_types, cpts
)
SELECT
    *,
    -- HH:MM renderings of cpts and the grid keys built from them
    -- (ofd_dates # node # HH:MM).
    -- NOTE(review): cpts is a zone-less TIMESTAMP and the session zone is UTC,
    -- so the _local and _utc variants appear to produce the same HH:MM -
    -- confirm whether they are meant to differ.
    strftime(cpts, '%H:%M') AS cpts_local,
    strftime(cpts AT TIME ZONE 'UTC', '%H:%M') AS cpts_utc,
    ofd_dates || '#' || node || '#' || strftime(cpts, '%H:%M') AS grid_key_local,
    ofd_dates || '#' || node || '#' || strftime(cpts AT TIME ZONE 'UTC', '%H:%M') AS grid_key_utc,
    -- Both ratios share one denominator: the larger of latest_deployed_cap and
    -- atrops_soft_cap, with NULL treated as 0. The ratio is NULL when that
    -- denominator is not positive, otherwise rounded to 4 decimals.
    -- auto_forecast_util: automated_uncapped_slam_forecast over the denominator.
    CASE WHEN GREATEST(COALESCE(latest_deployed_cap::FLOAT, 0), COALESCE(atrops_soft_cap::FLOAT, 0)) > 0
         THEN ROUND(COALESCE(automated_uncapped_slam_forecast::FLOAT, 0) / GREATEST(COALESCE(latest_deployed_cap::FLOAT, 0), COALESCE(atrops_soft_cap::FLOAT, 0)), 4)
         ELSE NULL END AS auto_forecast_util,
    -- util: current_schedule over the same denominator.
    CASE WHEN GREATEST(COALESCE(latest_deployed_cap::FLOAT, 0), COALESCE(atrops_soft_cap::FLOAT, 0)) > 0
         THEN ROUND(COALESCE(current_schedule::FLOAT, 0) / GREATEST(COALESCE(latest_deployed_cap::FLOAT, 0), COALESCE(atrops_soft_cap::FLOAT, 0)), 4)
         ELSE NULL END AS util
FROM pivot_all
ORDER BY node, cpts

In [None]:
-- name: joined | type: sql
-- Build the joined table: site_list full-outer-joined to vp by station, with
-- the combined vovi view left-joined on station and CPT time. VOVI columns are
-- exposed with a vovi_ prefix.
SET TimeZone = 'UTC';

CREATE OR REPLACE TABLE joined AS
SELECT
    COALESCE(sl.station, vp.node) AS station,
    sl.cycle,
    sl.business_org,
    -- Which inputs contributed the row: vp_list when the station is in both
    -- the site list and VP, vp when only in VP, list when only in the site
    -- list. VOVI presence is not part of this flag.
    CASE
        WHEN sl.station IS NOT NULL AND vp.node IS NOT NULL THEN 'vp_list'
        WHEN vp.node IS NOT NULL THEN 'vp'
        ELSE 'list'
    END AS available_inputs,
    vp.plan_start_date,
    vp.ofd_dates,
    vp.demand_types,
    vp.cpts,
    vp.cpts_local,
    vp.cpts_utc,
    vp.grid_key_local,
    vp.grid_key_utc,
    vp.forecast_type,
    vp.automated_confidence,
    vp.auto_forecast_util,
    vp.util,
    vp.vovi_uncapped_slam_forecast,
    vp.uncapped_slam_forecast,
    vp.adjusted_uncapped_slam_forecast,
    vp.capped_slam_forecast,
    vp.atrops_soft_cap,
    vp.atrops_hard_cap,
    vp.latest_deployed_cap,
    vp.cap_target_buffer,
    vp.current_slam,
    vp.current_schedule,
    vp.total_volume_available,
    vp.total_backlog,
    vp.in_station_backlog,
    vp.post_cutoff_adjustment,
    vp.net_volume_adjustments,
    vp.vovi_adjustment,
    vp.confidence_anomaly,
    vp.automated_uncapped_slam_forecast,
    vp.weekly_uncapped_slam_forecast,
    vp.earlies_expected,
    vp.earlies_received,
    vp.returns,
    vp.sideline_in,
    vp.mnr_expected,
    vp.mnr_received,
    -- Columns taken from the vovi view, renamed with a vovi_ prefix.
    v.modified_user AS vovi_modified_user,
    v.proposed_cap AS vovi_proposed_cap,
    v.post_cutoff_adjustment AS vovi_post_cutoff_adjustment,
    v.adjusted_forecast AS vovi_adjusted_forecast,
    v.forecast_source AS vovi_forecast_source,
    v.original_forecast AS vovi_original_forecast,
    v.forecast_status AS vovi_forecast_status,
    v.forecast_adjustment AS vovi_forecast_adjustment,
    v.current_slammed AS vovi_current_slammed,
    v.current_scheduled AS vovi_current_scheduled,
    v.soft_cap AS vovi_soft_cap,
    v.hard_cap AS vovi_hard_cap,
    -- Time this table was built, stored without a time zone.
    NOW()::TIMESTAMP AS execution_ts
FROM site_list sl
-- Full outer join keeps stations that appear in only one of the two inputs.
FULL OUTER JOIN vp ON sl.station = vp.node
-- VOVI matches on station plus the HH:MM of station_cpt rendered in UTC.
-- Rows with no VP data have a NULL cpts_utc and therefore never match VOVI.
-- NOTE(review): to_timestamp reads station_cpt as epoch seconds - confirm the
-- unit VOVI returns.
-- NOTE(review): the match is on time of day only, not on date - confirm this
-- is intended if vp covers more than one OFD date.
LEFT JOIN vovi v
    ON COALESCE(sl.station, vp.node) = v.station
    AND vp.cpts_utc = strftime(CAST(timezone('UTC', to_timestamp(v.station_cpt)) AS TIMESTAMP), '%H:%M')
ORDER BY station, cpts

In [None]:
# name: summary | type: python
# Report how the rows of `joined` split across available_inputs values, plus
# overall station and row totals.
# Reads from the runtime: conn. Sets: result (status, totals and breakdown).
summary = conn.execute("""
    SELECT available_inputs, COUNT(DISTINCT station) AS stations, COUNT(*) AS rows
    FROM joined
    GROUP BY available_inputs
    ORDER BY available_inputs
""").fetchdf()
print(summary.to_string(index=False))

# Each count query returns a single one-column row.
(total,) = conn.execute("SELECT COUNT(*) FROM joined").fetchone()
(distinct,) = conn.execute("SELECT COUNT(DISTINCT station) FROM joined").fetchone()
print(f'\nTotal: {distinct} stations, {total} rows')

result = dict(
    status='success',
    total_rows=total,
    distinct_stations=distinct,
    breakdown=summary.to_dict(orient='records'),
)

In [None]:
# name: save_joined | type: python
# Export the joined table to joined.csv inside the run's context directory.
# Reads from earlier cells / the runtime: ctx_dir, conn.
# Sets: result (output file path and exported row count).
output_file = ctx_dir / 'joined.csv'
# Double any single quote so the path stays a valid SQL string literal.
output_literal = str(output_file).replace("'", "''")
conn.execute(f"COPY joined TO '{output_literal}' (HEADER, DELIMITER ',')")
row_count = conn.execute("SELECT COUNT(*) FROM joined").fetchone()[0]
print(f'Saved joined table: {row_count} rows -> {output_file}')

result = {'output_file': str(output_file), 'rows': row_count}