# Download Pipeline Artifacts

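Downloads the latest run of each artifact type for a given sort code and target forecast date from the lma-glue-pipeline S3 bucket. "Latest" is determined by the `YYYYMMDD_HHMMSS` timestamp embedded in each artifact's filename.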

**Artifacts downloaded:**
- `flatline_execute` — Flatline execution results
- `flatline_execute_flags` — Flatline flags
- `flatline_keys` — Flatline keys
- `flatline_signal` — Flatline signal data
- `cumulative` — Cumulative forecast
- `day_classifier` — Day classifier
- `forecast` — Forecast output
- `horizon_aggregates` — Horizon aggregates

In [None]:
# name: setup | type: python
import boto3
import os
from pathlib import Path
from datetime import datetime
import pytz

# Variables injected from notebook metadata (can be overridden at runtime)
# target_date, sort_code, bucket, aws_profile, output_dir are set by variable injection

# Expand '~' in the injected output_dir and make sure the directory exists
# (parents are created as needed; an existing directory is not an error).
output_path = Path(output_dir).expanduser()
output_path.mkdir(parents=True, exist_ok=True)

# Echo the effective run parameters, one per line.
print(
    f'Target date: {target_date}',
    f'Sort code: {sort_code}',
    f'Bucket: {bucket}',
    f'AWS profile: {aws_profile}',
    f'Output: {output_path}',
    sep='\n',
)

# Cell result: the parameters this run resolved to.
result = dict(
    target_date=target_date,
    sort_code=sort_code,
    output_dir=str(output_path),
)

In [None]:
# name: list_artifacts | type: python
import re

# Artifact filenames embed a run timestamp: '<base>_<YYYYMMDD>_<HHMMSS><suffix>.csv'.
# The artifact type is the filename with the timestamp removed, e.g.
#   'flatline_execute_20260208_190041_flags.csv' -> 'flatline_execute_flags'
#   'flatline_execute_20260208_190041.csv'       -> 'flatline_execute'
# Compiled once here instead of importing/looking up the pattern per S3 object.
_ARTIFACT_FILENAME_RE = re.compile(r'^(.+?)_(\d{8}_\d{6})(.*)\.csv$')

session = boto3.Session(profile_name=aws_profile)
s3 = session.client('s3')

prefix = f'pipeline_output/internal_sort_code={sort_code}/target_forecast_date={target_date}/'
print(f'Scanning: s3://{bucket}/{prefix}')

# List all objects under the prefix and keep, per artifact type, the object
# whose filename carries the greatest timestamp.
paginator = s3.get_paginator('list_objects_v2')
files_by_type = {}

for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    # Pages without a 'Contents' entry are treated as having no objects.
    for obj in page.get('Contents', []):
        key = obj['Key']
        filename = key.split('/')[-1]

        match = _ARTIFACT_FILENAME_RE.match(filename)
        if match is None:
            # Not a timestamped CSV artifact; ignore it.
            continue

        base_name, timestamp, suffix = match.groups()  # suffix is e.g. '_flags' or ''
        artifact_type = base_name + suffix

        # Timestamps are fixed-width digit strings, so string comparison
        # orders them chronologically. On a tie the first object seen wins.
        current = files_by_type.get(artifact_type)
        if current is None or timestamp > current['timestamp']:
            files_by_type[artifact_type] = {
                'key': key,
                'filename': filename,
                'timestamp': timestamp,
                'size': obj['Size'],
                'last_modified': str(obj['LastModified']),
                'artifact_type': artifact_type
            }

print(f'\nFound {len(files_by_type)} artifact types:')
for atype, info in sorted(files_by_type.items()):
    size_mb = info['size'] / 1024 / 1024
    print(f'  {atype}: {info["filename"]} ({size_mb:.1f} MB)')

total_mb = sum(f['size'] for f in files_by_type.values()) / 1024 / 1024
print(f'\nTotal: {total_mb:.1f} MB')

# Cell result: how many artifact types were found, their combined size, and their names.
result = {
    'artifact_count': len(files_by_type),
    'total_mb': round(total_mb, 1),
    'artifacts': list(files_by_type.keys())
}

In [None]:
# name: download | type: python
# Fetch every selected artifact into output_path. Downloads are best-effort:
# a failure on one file is recorded and the loop moves on to the next.
downloaded, failed = [], []

for artifact_type in sorted(files_by_type):
    meta = files_by_type[artifact_type]
    target = output_path / meta['filename']
    megabytes = meta['size'] / 1024 / 1024

    try:
        print(f'Downloading {meta["filename"]} ({megabytes:.1f} MB)...', flush=True)
        s3.download_file(bucket, meta['key'], str(target))
    except Exception as err:
        print(f'  FAILED: {err}')
        failed.append({'artifact_type': artifact_type, 'error': str(err)})
    else:
        downloaded.append({
            'artifact_type': artifact_type,
            'filename': meta['filename'],
            'size_mb': round(megabytes, 1),
            'path': str(target)
        })

# Summary: successes out of the total, then one line per failure.
print(f'\nDownloaded: {len(downloaded)}/{len(files_by_type)}')
if failed:
    print(f'Failed: {len(failed)}')
    for failure in failed:
        print(f'  {failure["artifact_type"]}: {failure["error"]}')

# Cell result: 'partial' when at least one download failed, else 'success'.
result = {
    'status': 'partial' if failed else 'success',
    'downloaded': len(downloaded),
    'failed': len(failed),
    'output_dir': str(output_path),
    'files': downloaded
}

In [None]:
# name: register_tables | type: python
# Register downloaded CSVs as DuckDB tables for immediate querying.
# NOTE(review): `conn` is not defined in this notebook's visible cells; it is
# presumably a DuckDB connection supplied by the runtime - confirm.
registered = []
registration_failures = []

for file_info in downloaded:
    table_name = file_info['artifact_type']
    file_path = file_info['path']

    # Table names come from S3 filenames and paths from output_dir, so neither
    # is guaranteed to be SQL-safe. Quote the identifier (doubling embedded
    # double quotes) and escape single quotes in the path literal so that
    # characters such as '-', '.', or "'" cannot break or alter the statement.
    table_ident = '"' + table_name.replace('"', '""') + '"'
    path_literal = "'" + file_path.replace("'", "''") + "'"

    try:
        conn.execute(f"CREATE OR REPLACE TABLE {table_ident} AS SELECT * FROM read_csv_auto({path_literal})")
        row_count = conn.execute(f'SELECT COUNT(*) FROM {table_ident}').fetchone()[0]
    except Exception as e:
        # Best-effort: one bad file must not prevent the others from loading,
        # but the failure is recorded so the cell result can report it.
        print(f'  {table_name}: FAILED - {e}')
        registration_failures.append({'table': table_name, 'error': str(e)})
    else:
        registered.append({'table': table_name, 'rows': row_count})
        print(f'  {table_name}: {row_count:,} rows')

print(f'\nRegistered {len(registered)} tables in DuckDB')

# Cell result: 'partial' when any registration failed (mirrors the download
# cell's status convention) instead of unconditionally reporting 'success'.
result = {
    'status': 'success' if not registration_failures else 'partial',
    'tables_registered': len(registered),
    'failed': len(registration_failures),
    'tables': registered,
    'output_dir': str(output_path)
}