# AWS Authentication with Temporary Credentials

This notebook provides flexible AWS authentication methods, including support for temporary credentials.

## Features:
- Multiple authentication methods (IAM user, temporary credentials, SSO)
- Session token support for MFA-protected accounts
- Credential verification and testing
- Automatic credential refresh handling
- Profile-based authentication
- Environment variable configuration

## 1. Setup and Dependencies

In [13]:
# Install required packages if needed
import sys
!{sys.executable} -m pip install boto3 --quiet

In [2]:
import boto3
import json
import os
from datetime import datetime, timezone
from typing import Dict, Optional, Any
from botocore.exceptions import ClientError, NoCredentialsError, ProfileNotFound
from botocore.session import Session
import warnings

warnings.filterwarnings('ignore')

print("Libraries imported successfully!")
print(f"Boto3 version: {boto3.__version__}")

Libraries imported successfully!
Boto3 version: 1.37.3


## 2. AWS Authentication Methods

Choose one of the following authentication methods:

### Method 1: Temporary Credentials (STS Assume Role or MFA)

In [None]:
# OPTION 1: Enter temporary credentials directly
# Uncomment and fill in your temporary credentials

TEMP_ACCESS_KEY_ID = ""  # e.g., "ASIA..."
TEMP_SECRET_ACCESS_KEY = ""  # Your temporary secret key
TEMP_SESSION_TOKEN = ""  # Your session token (required for temporary credentials)

# Optional: Set expiration time if known
TEMP_EXPIRATION = ""  # e.g., "2024-01-20T12:00:00Z"

In [8]:
def create_session_with_temp_credentials(
    access_key_id: str,
    secret_access_key: str,
    session_token: str,
    region_name: str = 'us-west-2'
) -> boto3.Session:
    """
    Create a boto3 session using temporary credentials.
    
    Args:
        access_key_id: Temporary AWS access key ID
        secret_access_key: Temporary AWS secret access key
        session_token: AWS session token
        region_name: AWS region (default: us-east-1)
        
    Returns:
        boto3.Session object configured with temporary credentials
    """
    session = boto3.Session(
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        aws_session_token=session_token,
        region_name=region_name
    )
    
    return session

# Create session with temporary credentials if provided
if TEMP_ACCESS_KEY_ID and TEMP_SECRET_ACCESS_KEY and TEMP_SESSION_TOKEN:
    session = create_session_with_temp_credentials(
        TEMP_ACCESS_KEY_ID,
        TEMP_SECRET_ACCESS_KEY,
        TEMP_SESSION_TOKEN
    )
    print("✅ Session created with temporary credentials")
else:
    print("⚠️ Temporary credentials not provided")

✅ Session created with temporary credentials


## 3. Credential Verification and Testing

In [9]:
def verify_credentials(session: Optional[boto3.Session] = None) -> Dict[str, Any]:
    """
    Verify AWS credentials and get caller identity.
    
    Args:
        session: boto3 Session object (uses default if None)
        
    Returns:
        Dictionary with account information or None if failed
    """
    try:
        if session:
            print('using session')
            sts_client = session.client('sts')
        else:
            sts_client = boto3.client('sts')
        
        # Get caller identity
        response = sts_client.get_caller_identity()
        
        print("✅ AWS Credentials Valid!")
        print(f"\nAccount Details:")
        print(f"  Account ID: {response['Account']}")
        print(f"  User ARN: {response['Arn']}")
        print(f"  User ID: {response['UserId']}")
        
        # Check if using temporary credentials
        if 'assumed-role' in response['Arn'] or response['Arn'].startswith('arn:aws:sts'):
            print("\n📌 Using temporary credentials (assumed role or session token)")
        
        return response
        
    except NoCredentialsError:
        print("❌ No credentials found. Please configure credentials using one of the methods above.")
        return None
    except ClientError as e:
        print(f"❌ Error verifying credentials: {e}")
        return None

# Verify current credentials
if 'session' in locals():
    identity = verify_credentials(session)
else:
    identity = verify_credentials()

using session
✅ AWS Credentials Valid!

Account Details:
  Account ID: 444055461661
  User ARN: arn:aws:iam::444055461661:user/klesinger
  User ID: AIDAWOY6ET4O25743W2P7


## 4. Test AWS Service Access

In [10]:
def test_s3_access(session: Optional[boto3.Session] = None, bucket_name: Optional[str] = None):
    """
    Test S3 access with current credentials.
    
    Args:
        session: boto3 Session object (uses default if None)
        bucket_name: Specific bucket to test (lists all buckets if None)
    """
    try:
        if session:
            s3_client = session.client('s3')
        else:
            s3_client = boto3.client('s3')
        
        if bucket_name:
            # Test specific bucket access
            response = s3_client.head_bucket(Bucket=bucket_name)
            print(f"✅ Can access bucket: {bucket_name}")
            
            # Try to list objects
            response = s3_client.list_objects_v2(Bucket=bucket_name, MaxKeys=5)
            object_count = response.get('KeyCount', 0)
            print(f"  Found {object_count} objects (showing max 5)")
            
        else:
            # List all accessible buckets
            response = s3_client.list_buckets()
            buckets = response.get('Buckets', [])
            
            print(f"✅ S3 Access Confirmed")
            print(f"\nAccessible Buckets ({len(buckets)} total):")
            
            for bucket in buckets[:10]:  # Show first 10
                print(f"  - {bucket['Name']} (created: {bucket['CreationDate'].strftime('%Y-%m-%d')})")
            
            if len(buckets) > 10:
                print(f"  ... and {len(buckets) - 10} more")
        
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == 'AccessDenied':
            print(f"❌ Access Denied to S3")
        elif error_code == 'NoSuchBucket':
            print(f"❌ Bucket '{bucket_name}' does not exist")
        else:
            print(f"❌ Error accessing S3: {e}")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

# # Test S3 access
# if 'session' in locals():
#     test_s3_access(session)
# else:
#     test_s3_access()

# Test specific bucket (uncomment and modify):
test_s3_access(session, "nasa-disasters")

✅ Can access bucket: nasa-disasters
  Found 5 objects (showing max 5)


## 8. Integration with S3 Bucket Crawler

In [11]:
# Once authenticated, you can use the session with the S3 bucket crawler
# or any other AWS operations

if 'session' in locals() and session:
    # Create S3 client from authenticated session
    s3_client = session.client('s3')
    
    print("✅ Ready to use with S3 bucket crawler!")
    print("\nYou can now:")
    print("1. Import the S3BucketCrawler class from the other notebook")
    print("2. Pass this s3_client to the crawler: S3BucketCrawler('bucket-name', s3_client)")
    print("\nExample:")
    print("  crawler = S3BucketCrawler('nasa-disasters', s3_client)")
    print("  result = crawler.crawl()")
else:
    print("⚠️ Please complete authentication setup first using one of the methods above.")

✅ Ready to use with S3 bucket crawler!

You can now:
1. Import the S3BucketCrawler class from the other notebook
2. Pass this s3_client to the crawler: S3BucketCrawler('bucket-name', s3_client)

Example:
  crawler = S3BucketCrawler('nasa-disasters', s3_client)
  result = crawler.crawl()


In [12]:
import time
from tqdm.notebook import tqdm
from collections import defaultdict
import pandas as pd

class S3BucketCrawler:
    """
    Crawls S3 bucket structure and creates a nested dictionary of .tif files.
    """
    
    def __init__(self, bucket_name: str, s3_client=None):
        """
        Initialize the crawler.
        
        Args:
            bucket_name: Name of the S3 bucket
            s3_client: Boto3 S3 client (creates new one if not provided)
        """
        self.bucket_name = bucket_name.replace('s3://', '').rstrip('/')
        self.s3_client = s3_client or boto3.client('s3')
        self.total_files = 0
        self.total_size = 0
        self.total_directories = set()
        
    def build_nested_structure(self, file_list: list) -> Dict:
        """
        Convert flat S3 paths to nested dictionary structure.
        
        Args:
            file_list: List of dictionaries containing file information
            
        Returns:
            Nested dictionary representing directory structure
        """
        root = {}
        
        for file_info in file_list:
            path_parts = file_info['key'].split('/')
            current_level = root
            
            # Navigate/create the directory structure
            for i, part in enumerate(path_parts[:-1]):
                if part not in current_level:
                    current_level[part] = {
                        '_type': 'directory',
                        '_path': '/'.join(path_parts[:i+1]) + '/',
                        '_files': [],
                        '_subdirs': {}
                    }
                    current_level = current_level[part]['_subdirs']
                else:
                    current_level = current_level[part]['_subdirs']
            
            # Add the file to its directory
            file_name = path_parts[-1]
            if '_files' not in current_level:
                current_level['_files'] = []
            
            current_level['_files'].append({
                'name': file_name,
                'path': file_info['key'],
                'size_bytes': file_info['size'],
                'size_readable': self._format_size(file_info['size']),
                'last_modified': file_info['last_modified'],
                'storage_class': file_info.get('storage_class', 'STANDARD')
            })
        
        return root
    
    def crawl(self, 
              prefix: str = '', 
              show_progress: bool = True) -> Dict:
        """
        Crawl the bucket and return nested structure of .tif files only.
        
        Args:
            prefix: Start from this prefix (subdirectory)
            show_progress: Show progress bar
            
        Returns:
            Nested dictionary representing bucket structure with .tif files
        """
        print(f"🔍 Starting crawl of s3://{self.bucket_name}/{prefix}")
        print("📌 Filtering for .tif files only")
        
        # Reset statistics
        self.total_files = 0
        self.total_size = 0
        self.total_directories.clear()
        
        # Collect all .tif files
        all_tif_files = []
        
        # Use paginator for large buckets
        paginator = self.s3_client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(
            Bucket=self.bucket_name,
            Prefix=prefix
        )
        
        # Process pages
        print("Scanning bucket...")
        for page in page_iterator:
            if 'Contents' in page:
                for obj in page['Contents']:
                    key = obj['Key']
                    
                    # Filter for .tif files only
                    if key.lower().endswith('.tif'):
                        self.total_files += 1
                        self.total_size += obj.get('Size', 0)
                        
                        # Track directories
                        dir_path = '/'.join(key.split('/')[:-1])
                        if dir_path:
                            self.total_directories.add(dir_path)
                        
                        # Add file info
                        all_tif_files.append({
                            'key': key,
                            'size': obj.get('Size', 0),
                            'last_modified': obj.get('LastModified').isoformat() if obj.get('LastModified') else None,
                            'storage_class': obj.get('StorageClass', 'STANDARD')
                        })
                        
                        # Show progress
                        if show_progress and self.total_files % 100 == 0:
                            print(f"  Found {self.total_files} .tif files...", end="\r")
        
        print(f"\n✅ Found {self.total_files} .tif files")
        print(f"📁 Across {len(self.total_directories)} directories")
        print(f"💾 Total size: {self._format_size(self.total_size)}")
        
        # Build nested structure
        structure = self.build_nested_structure(all_tif_files)
        
        # Create final result with metadata
        result = {
            "_metadata": {
                "bucket": self.bucket_name,
                "prefix": prefix,
                "crawled_at": datetime.now().isoformat(),
                "file_filter": ".tif",
                "total_files": self.total_files,
                "total_directories": len(self.total_directories),
                "total_size_bytes": self.total_size,
                "total_size_readable": self._format_size(self.total_size)
            },
            "structure": structure
        }
        
        return result
    
    def _format_size(self, size_bytes: int) -> str:
        """Format bytes to human-readable size."""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.2f} {unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.2f} PB"

print("✅ S3BucketCrawler class defined successfully!")

✅ S3BucketCrawler class defined successfully!


In [13]:
class S3DisastersCrawler:
    """
    Simplified crawler for drcs_activations directory that creates clean nested structure.
    """
    
    def __init__(self, bucket_name: str, s3_client=None):
        """
        Initialize the crawler.
        
        Args:
            bucket_name: Name of the S3 bucket
            s3_client: Boto3 S3 client (creates new one if not provided)
        """
        self.bucket_name = bucket_name.replace('s3://', '').rstrip('/')
        self.s3_client = s3_client or boto3.client('s3')
        self.total_files = 0
        self.activation_events = set()
        
    def build_clean_structure(self, file_list: list, prefix: str) -> Dict:
        """
        Build a clean nested structure organized by activation events.
        
        Args:
            file_list: List of S3 keys
            prefix: The prefix to remove (e.g., 'drcs_activations/')
            
        Returns:
            Nested dictionary with activation events and their files
        """
        structure = {}
        
        for key in file_list:
            # Remove the prefix to get relative path
            relative_path = key.replace(prefix, '', 1) if key.startswith(prefix) else key
            parts = relative_path.split('/')
            
            # Skip if not enough parts
            if len(parts) < 2:
                continue
            
            # First part is the activation event (e.g., '202301_Flood_CA')
            activation_event = parts[0]
            self.activation_events.add(activation_event)
            
            # Initialize activation event if not exists
            if activation_event not in structure:
                structure[activation_event] = {}
            
            # Build nested structure for remaining parts
            current_level = structure[activation_event]
            
            # Navigate through subdirectories
            for part in parts[1:-1]:
                if part not in current_level:
                    current_level[part] = {}
                # Check if current_level[part] is a list (files), if so convert to dict
                if isinstance(current_level[part], list):
                    current_level[part] = {'_files': current_level[part]}
                current_level = current_level[part]
            
            # Add the file
            filename = parts[-1]
            if '_files' not in current_level:
                current_level['_files'] = []
            current_level['_files'].append(filename)
        
        return structure
    
    def crawl_drcs_activations(self, show_progress: bool = True) -> Dict:
        """
        Crawl only the drcs_activations directory for .tif files.
        
        Args:
            show_progress: Show progress during crawl
            
        Returns:
            Clean nested dictionary of activation events and their .tif files
        """
        prefix = 'drcs_activations/'
        print(f"🔍 Crawling s3://{self.bucket_name}/{prefix}")
        print("📌 Filtering for .tif files in activation events")
        
        # Reset counters
        self.total_files = 0
        self.activation_events.clear()
        
        # Collect all .tif files
        tif_files = []
        
        # Use paginator for large buckets
        paginator = self.s3_client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(
            Bucket=self.bucket_name,
            Prefix=prefix
        )
        
        # Process pages
        print("Scanning activation events...")
        for page in page_iterator:
            if 'Contents' in page:
                for obj in page['Contents']:
                    key = obj['Key']
                    
                    # Filter for .tif files only
                    if key.lower().endswith('.tif'):
                        tif_files.append(key)
                        self.total_files += 1
                        
                        # Show progress
                        if show_progress and self.total_files % 100 == 0:
                            print(f"  Found {self.total_files} .tif files...", end="\r")
        
        print(f"\n✅ Found {self.total_files} .tif files")
        
        # Build clean structure
        structure = self.build_clean_structure(tif_files, prefix)
        
        print(f"📁 Across {len(self.activation_events)} activation events")
        
        # Create result with drcs_activations as root
        result = {
            "drcs_activations": structure,
            "_metadata": {
                "bucket": self.bucket_name,
                "crawled_at": datetime.now().isoformat(),
                "total_tif_files": self.total_files,
                "total_activation_events": len(self.activation_events),
                "activation_events": sorted(list(self.activation_events))
            }
        }
        
        return result

print("✅ S3DisastersCrawler class defined successfully!")

✅ S3DisastersCrawler class defined successfully!


## 10. Helper Functions for Data Management

In [14]:
def save_to_json(data: Dict, filename: str, indent: int = 2) -> str:
    """
    Save dictionary to JSON file.
    
    Args:
        data: Dictionary to save
        filename: Output filename
        indent: JSON indentation (None for compact)
        
    Returns:
        Path to saved file
    """
    filepath = os.path.abspath(filename)
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, default=str)
    
    file_size = os.path.getsize(filepath)
    print(f"✅ Saved to: {filepath}")
    print(f"📊 File size: {file_size:,} bytes")
    
    return filepath

def print_structure_preview(structure: Dict, max_depth: int = 3, current_depth: int = 0, prefix: str = ""):
    """
    Print a tree-like preview of the structure.
    
    Args:
        structure: Nested dictionary structure
        max_depth: Maximum depth to display
        current_depth: Current recursion depth
        prefix: Prefix for tree display
    """
    if current_depth > max_depth:
        return
    
    # Process directories
    for key, value in structure.items():
        if isinstance(value, dict) and '_type' in value and value['_type'] == 'directory':
            print(f"{prefix}📁 {key}/")
            
            # Show files in this directory
            if '_files' in value:
                for file in value['_files'][:3]:  # Show first 3 files
                    print(f"{prefix}  📄 {file['name']} ({file['size_readable']})")
                if len(value['_files']) > 3:
                    print(f"{prefix}  ... and {len(value['_files']) - 3} more .tif files")
            
            # Recurse into subdirectories
            if '_subdirs' in value and current_depth < max_depth:
                print_structure_preview(value['_subdirs'], max_depth, current_depth + 1, prefix + "  ")

def get_all_file_paths(structure: Dict) -> list:
    """
    Extract all file paths from nested structure.
    
    Args:
        structure: Nested dictionary structure
        
    Returns:
        List of all file paths
    """
    file_paths = []
    
    def extract_recursive(obj):
        for key, value in obj.items():
            if isinstance(value, dict):
                if '_files' in value:
                    for file_info in value['_files']:
                        file_paths.append(file_info['path'])
                if '_subdirs' in value:
                    extract_recursive(value['_subdirs'])
    
    if 'structure' in structure:
        extract_recursive(structure['structure'])
    else:
        extract_recursive(structure)
    
    return file_paths

def get_statistics(structure: Dict) -> pd.DataFrame:
    """
    Generate statistics from crawled structure.
    
    Args:
        structure: Crawled structure dictionary
        
    Returns:
        DataFrame with statistics
    """
    files_data = []
    
    def analyze_recursive(obj, current_dir=""):
        for key, value in obj.items():
            if isinstance(value, dict):
                if '_files' in value:
                    for file_info in value['_files']:
                        files_data.append({
                            'directory': current_dir if current_dir else '/',
                            'filename': file_info['name'],
                            'full_path': file_info['path'],
                            'size_bytes': file_info['size_bytes'],
                            'size_readable': file_info['size_readable'],
                            'last_modified': file_info['last_modified']
                        })
                if '_subdirs' in value:
                    new_dir = f"{current_dir}/{key}" if current_dir else key
                    analyze_recursive(value['_subdirs'], new_dir)
    
    if 'structure' in structure:
        analyze_recursive(structure['structure'])
    else:
        analyze_recursive(structure)
    
    df = pd.DataFrame(files_data)
    
    if not df.empty:
        print("\n📊 Statistics:")
        print(f"Total .tif files: {len(df)}")
        print(f"Total size: {df['size_bytes'].sum():,} bytes")
        print(f"Unique directories: {df['directory'].nunique()}")
        
        # Top directories by file count
        print("\nTop directories by file count:")
        top_dirs = df['directory'].value_counts().head(5)
        for dir_name, count in top_dirs.items():
            print(f"  {dir_name}: {count} files")
    
    return df

print("✅ Helper functions defined successfully!")

✅ Helper functions defined successfully!


## 11. Main Execution - Crawl S3 Buckets for .tif Files

In [15]:
# Configuration
BUCKET_NAME = "nasa-disasters"  # Change this to your target bucket
PREFIX = "drcs_activations"                      # Leave empty for entire bucket, or specify a path like "drcs_activations/"
OUTPUT_FILE = "s3_tif_files_structure.json"  # Output filename

print(f"Configuration:")
print(f"  Bucket: s3://{BUCKET_NAME}/{PREFIX}")
print(f"  File filter: .tif files only")
print(f"  Output file: {OUTPUT_FILE}")

Configuration:
  Bucket: s3://nasa-disasters/drcs_activations
  File filter: .tif files only
  Output file: s3_tif_files_structure.json


In [16]:
# Initialize crawler and run
if 'session' in locals() and session:
    # Create S3 client from authenticated session
    s3_client = session.client('s3')
    
    # Initialize crawler
    crawler = S3BucketCrawler(BUCKET_NAME, s3_client)
    
    # Crawl the bucket
    print("🚀 Starting crawl... This may take a while for large buckets.\n")
    
    start_time = time.time()
    
    # Perform the crawl
    result = crawler.crawl(
        prefix=PREFIX,
        show_progress=True
    )
    
    elapsed_time = time.time() - start_time
    
    print(f"\n✅ Crawl completed in {elapsed_time:.2f} seconds")
else:
    print("❌ Please complete authentication setup first using the cells above.")

🚀 Starting crawl... This may take a while for large buckets.

🔍 Starting crawl of s3://nasa-disasters/drcs_activations
📌 Filtering for .tif files only
Scanning bucket...
  Found 34400 .tif files...
✅ Found 34460 .tif files
📁 Across 1518 directories
💾 Total size: 5.53 TB

✅ Crawl completed in 47.71 seconds


## 12. Preview and Save Results

In [17]:
# Display structure preview
if 'result' in locals():
    print("🌳 Directory Structure Preview (max depth 3):\n")
    print(f"s3://{BUCKET_NAME}/")
    print_structure_preview(result['structure'], max_depth=3)
else:
    print("No results to display. Run the crawl first.")

🌳 Directory Structure Preview (max depth 3):

s3://nasa-disasters/
📁 drcs_activations/
  📁 2020/
    📁 aegean_sea_earthquake_202010/
      📁 aria/
    📁 bolivia_fires/
      📁 aster/
    📁 california_fires/
      📁 aria/
      📁 aster/
      📁 dnbr/
      📁 master/
      📁 sentinel2/
      📁 uavsar/
    📁 colorado_fires/
      📁 aster/
      📁 sentinel2/
    📁 hurricane_delta/
      📁 landsat8/
      📁 radarsat2/
      📁 sentinel1/
      📁 sentinel2/
    📁 hurricane_sally/
      📁 dfo/
      📁 landsat8/
      📁 sentinel1/
      📁 sentinel2/
    📁 hurricane_zeta/
      📁 landsat8/
      📁 sentinel1/
      📁 sentinel2/
    📁 hurricanes_eta_iota/
      📁 aria/
      📁 dartmouth_flood_observatory/
      📁 planet/
      📁 sentinel1/
      📁 sentinel2/
    📁 mi_dam_failure_2020/
      📁 aria/
      📁 landsat8/
      📁 sentinel2/
    📁 neuse_river_nc/
      📁 planet/
    📁 volcano_research/
      📁 lewotolok_20201129/
    📁 washington_oregon_fires/
      📁 aster/
      📁 sentinel2/
  📁 2021/


In [18]:
# Save the complete structure to JSON
if 'result' in locals():
    filepath = save_to_json(result, OUTPUT_FILE)
    print(f"\n📝 Structure saved to: {filepath}")
    
    # Option to save compact version (no indentation, smaller file)
    compact_file = OUTPUT_FILE.replace('.json', '_compact.json')
    compact_path = save_to_json(result, compact_file, indent=None)
    print(f"📝 Compact version saved to: {compact_path}")
else:
    print("No results to save. Run the crawl first.")

✅ Saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/s3_tif_files_structure.json
📊 File size: 18,984,715 bytes

📝 Structure saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/s3_tif_files_structure.json
✅ Saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/s3_tif_files_structure_compact.json
📊 File size: 11,176,941 bytes
📝 Compact version saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/s3_tif_files_structure_compact.json


## 13. Generate Statistics and Export Options

In [19]:
# Generate and display statistics
if 'result' in locals():
    df_stats = get_statistics(result)
    
    # Save statistics to CSV
    if not df_stats.empty:
        stats_file = OUTPUT_FILE.replace('.json', '_statistics.csv')
        df_stats.to_csv(stats_file, index=False)
        print(f"\n📊 Statistics saved to: {stats_file}")
        
        # Show sample
        print("\nSample of .tif files:")
        display(df_stats.head(10))
else:
    print("No results to analyze. Run the crawl first.")

In [20]:
# Export flat list of all .tif file paths
if 'result' in locals():
    all_files = get_all_file_paths(result)
    print(f"\nTotal .tif files in bucket: {len(all_files)}")
    print("\nFirst 10 file paths:")
    for path in all_files[:10]:
        print(f"  s3://{BUCKET_NAME}/{path}")
    
    # Save file list
    file_list_path = OUTPUT_FILE.replace('.json', '_file_list.txt')
    with open(file_list_path, 'w') as f:
        for path in all_files:
            f.write(f"s3://{BUCKET_NAME}/{path}\n")
    print(f"\n📄 File list saved to: {file_list_path}")
else:
    print("No results to export. Run the crawl first.")


Total .tif files in bucket: 0

First 10 file paths:

📄 File list saved to: s3_tif_files_structure_file_list.txt


## 14. Load and Search Previously Saved Results

Use these functions to load and search through previously crawled data.

In [21]:
def load_json_structure(filename: str) -> Dict:
    """
    Load a previously saved JSON structure.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"✅ Loaded structure from: {filename}")
    
    if '_metadata' in data:
        meta = data['_metadata']
        print(f"\nMetadata:")
        print(f"  Bucket: {meta.get('bucket')}")
        print(f"  Crawled at: {meta.get('crawled_at')}")
        print(f"  Total .tif files: {meta.get('total_files')}")
        print(f"  Total directories: {meta.get('total_directories')}")
        print(f"  Total size: {meta.get('total_size_readable')}")
    
    return data

def search_files(structure: Dict, pattern: str) -> list:
    """
    Search for files matching a pattern in the structure.
    """
    matches = []
    
    def search_recursive(obj):
        for key, value in obj.items():
            if isinstance(value, dict):
                if '_files' in value:
                    for file_info in value['_files']:
                        if pattern.lower() in file_info['name'].lower():
                            matches.append(file_info)
                if '_subdirs' in value:
                    search_recursive(value['_subdirs'])
    
    if 'structure' in structure:
        search_recursive(structure['structure'])
    else:
        search_recursive(structure)
    
    return matches

# Example: Load saved file
# loaded_data = load_json_structure(OUTPUT_FILE)

# Example: Search for specific files
# matching_files = search_files(loaded_data, "flood")
# print(f"Found {len(matching_files)} files containing 'flood'")

## 15. Crawl DRCS Activations Directory Only

Focused crawler for disaster activation events with simplified output.

In [22]:
# Configuration for DRCS Activations
BUCKET_NAME = "nasa-disasters"
OUTPUT_FILE = "drcs_activations_tif_files.json"

print(f"Configuration:")
print(f"  Bucket: s3://{BUCKET_NAME}/drcs_activations/")
print(f"  File filter: .tif files only")
print(f"  Output file: {OUTPUT_FILE}")

Configuration:
  Bucket: s3://nasa-disasters/drcs_activations/
  File filter: .tif files only
  Output file: drcs_activations_tif_files.json


In [23]:
# Run the focused crawler
if 'session' in locals() and session:
    # Create S3 client from authenticated session
    s3_client = session.client('s3')
    
    # Initialize the disasters crawler
    disasters_crawler = S3DisastersCrawler(BUCKET_NAME, s3_client)
    
    # Crawl drcs_activations
    print("🚀 Starting focused crawl of drcs_activations...\n")
    
    start_time = time.time()
    
    # Perform the crawl
    drcs_result = disasters_crawler.crawl_drcs_activations(show_progress=True)
    
    elapsed_time = time.time() - start_time
    
    print(f"\n✅ Crawl completed in {elapsed_time:.2f} seconds")
    
    # Show sample activation events
    if '_metadata' in drcs_result:
        events = drcs_result['_metadata']['activation_events'][:10]
        print(f"\nSample activation events:")
        for event in events:
            print(f"  - {event}")
        if len(drcs_result['_metadata']['activation_events']) > 10:
            print(f"  ... and {len(drcs_result['_metadata']['activation_events']) - 10} more")
else:
    print("❌ Please complete authentication setup first.")

🚀 Starting focused crawl of drcs_activations...

🔍 Crawling s3://nasa-disasters/drcs_activations/
📌 Filtering for .tif files in activation events
Scanning activation events...
  Found 34400 .tif files...
✅ Found 34460 .tif files
📁 Across 42 activation events

✅ Crawl completed in 52.65 seconds

Sample activation events:
  - 2020
  - 2021
  - 2022
  - 2023
  - 202301_Flood_CA
  - 202302_Earthquake_Turkiye
  - 202305_Typhoon_Mawar
  - 20230719_SevereWx_NC
  - 202307_Fire_Greece
  - 202307_Flood_VT
  ... and 32 more


In [25]:
# Preview the structure
if 'drcs_result' in locals():
    print("📊 Structure Preview:\n")
    
    # Show first 3 activation events and their structure
    events = list(drcs_result['drcs_activations'].keys())[:3]
    
    for event in events:
        print(f"📁 {event}/")
        event_data = drcs_result['drcs_activations'][event]
        
        # Show subdirectories and file counts
        for subdir, content in event_data.items():
            if isinstance(content, dict):
                if '_files' in content:
                    print(f"  └── {subdir}/ ({len(content['_files'])} .tif files)")
                else:
                    # Count nested files
                    total = sum(len(v['_files']) if isinstance(v, dict) and '_files' in v else 0 
                               for v in content.values())
                    print(f"  └── {subdir}/ ({total} .tif files in subdirs)")
            elif isinstance(content, list):
                print(f"  └── {len(content)} .tif files directly in {event}/")
        print()
else:
    print("No results to preview. Run the crawler first.")

📊 Structure Preview:

📁 2020/
  └── aegean_sea_earthquake_202010/ (2 .tif files in subdirs)
  └── bolivia_fires/ (1 .tif files in subdirs)
  └── california_fires/ (43 .tif files in subdirs)
  └── colorado_fires/ (28 .tif files in subdirs)
  └── hurricane_delta/ (574 .tif files in subdirs)
  └── hurricane_sally/ (8 .tif files in subdirs)
  └── hurricane_zeta/ (8 .tif files in subdirs)
  └── hurricanes_eta_iota/ (535 .tif files in subdirs)
  └── mi_dam_failure_2020/ (22 .tif files in subdirs)
  └── neuse_river_nc/ (12 .tif files)
  └── volcano_research/ (0 .tif files in subdirs)
  └── washington_oregon_fires/ (1 .tif files in subdirs)

📁 2021/
  └── australia_floods/ (11 .tif files in subdirs)
  └── california_fires/ (1 .tif files in subdirs)
  └── costarica_panama_flooding/ (6 .tif files in subdirs)
  └── guyana_floods/ (26 .tif files in subdirs)
  └── haiti_earthquake_202108/ (6 .tif files in subdirs)
  └── hurricane_elsa/ (174 .tif files in subdirs)
  └── hurricane_ida/ (380 .tif file

In [26]:
# Save the simplified structure
if 'drcs_result' in locals():
    filepath = save_to_json(drcs_result, OUTPUT_FILE)
    print(f"\n📝 Simplified structure saved to: {filepath}")
    
    # Also save a compact version
    compact_file = OUTPUT_FILE.replace('.json', '_compact.json')
    compact_path = save_to_json(drcs_result, compact_file, indent=None)
    print(f"📝 Compact version saved to: {compact_path}")
else:
    print("No results to save. Run the crawler first.")

✅ Saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/drcs_activations_tif_files.json
📊 File size: 2,256,239 bytes

📝 Simplified structure saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/drcs_activations_tif_files.json
✅ Saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/drcs_activations_tif_files_compact.json
📊 File size: 1,701,317 bytes
📝 Compact version saved to: /Users/klesinger/github/conversion_scripts/s3-crawler/drcs_activations_tif_files_compact.json


In [27]:
# Access example - show how to work with the data
if 'drcs_result' in locals():
    print("🔍 Access Examples:\n")
    
    # Get all activation events
    all_events = list(drcs_result['drcs_activations'].keys())
    print(f"Total activation events: {len(all_events)}")
    
    # Access files for a specific event
    if all_events:
        sample_event = all_events[0]
        print(f"\nAccessing files for '{sample_event}':")
        
        event_data = drcs_result['drcs_activations'][sample_event]
        
        # Count total files in this event
        def count_files(obj):
            total = 0
            if isinstance(obj, dict):
                if '_files' in obj:
                    total += len(obj['_files'])
                for value in obj.values():
                    if isinstance(value, dict):
                        total += count_files(value)
            return total
        
        total_files = count_files(event_data)
        print(f"  Total .tif files: {total_files}")
        
        # Show how to iterate through all files
        def get_all_files(obj, prefix=''):
            files = []
            if isinstance(obj, dict):
                if '_files' in obj:
                    for f in obj['_files']:
                        files.append(prefix + f)
                for key, value in obj.items():
                    if key != '_files' and isinstance(value, dict):
                        files.extend(get_all_files(value, prefix + key + '/'))
            return files
        
        all_files = get_all_files(event_data, f'{sample_event}/')
        print(f"\n  First 5 file paths:")
        for f in all_files[:5]:
            print(f"    s3://nasa-disasters/drcs_activations/{f}")
else:
    print("No results available. Run the crawler first.")

🔍 Access Examples:

Total activation events: 42

Accessing files for '2020':
  Total .tif files: 4384

  First 5 file paths:
    s3://nasa-disasters/drcs_activations/2020/aegean_sea_earthquake_202010/aria/ARIA_DPM_Sentinel-1_v0.3.tif
    s3://nasa-disasters/drcs_activations/2020/aegean_sea_earthquake_202010/aria/S1_A131_20201030_20201024_disp_cm.tif
    s3://nasa-disasters/drcs_activations/2020/bolivia_fires/aster/santacruz-nite-tif.tif
    s3://nasa-disasters/drcs_activations/2020/california_fires/aria/ARIA_S1_DPM_CreekFire_Sep_13_Sep_19_7am.tif
    s3://nasa-disasters/drcs_activations/2020/california_fires/aria/ARIA_S1_DPM_CreekFire_Sep_7_7am.tif


## 16. Split DRCS Data by Year

Automatically split the crawled DRCS activation data into separate files for each year (2020-2025).

In [28]:
def extract_year_from_event_name(event_name):
    """Extract year from event name."""
    # Check if it's a plain year (e.g., "2020")
    if event_name in ["2020", "2021", "2022", "2023", "2024", "2025"]:
        return int(event_name)
    
    # Check if it starts with a year (e.g., "202301_Flood_CA" or "20230719_SevereWx_NC")
    if event_name[:4].isdigit():
        year = int(event_name[:4])
        if 2020 <= year <= 2025:
            return year
    
    return None

def count_files_recursive(data):
    """Count total .tif files in a nested structure."""
    total = 0
    
    if isinstance(data, dict):
        if '_files' in data:
            total += len(data['_files'])
        for key, value in data.items():
            if key != '_files' and isinstance(value, dict):
                total += count_files_recursive(value)
    
    return total

def split_drcs_by_year(drcs_result):
    """
    Split DRCS activation data into separate files by year.
    
    Args:
        drcs_result: The crawled DRCS data dictionary
        
    Returns:
        Dictionary mapping years to their data
    """
    # Initialize year-based dictionaries
    years_data = {year: {} for year in range(2020, 2026)}
    
    # Process each activation event
    drcs_data = drcs_result.get('drcs_activations', {})
    
    for event_name, event_data in drcs_data.items():
        year = extract_year_from_event_name(event_name)
        
        if year and year in years_data:
            # If it's a plain year folder (e.g., "2020"), merge its contents
            if event_name == str(year):
                # This is a year folder, add all its contents
                for sub_event, sub_data in event_data.items():
                    years_data[year][sub_event] = sub_data
            else:
                # This is a named event (e.g., "202301_Flood_CA")
                years_data[year][event_name] = event_data
    
    return years_data

print("✅ Year-splitting functions defined successfully!")

✅ Year-splitting functions defined successfully!


In [29]:
# Automatically split DRCS data by year after saving
if 'drcs_result' in locals():
    print("\n📅 Splitting DRCS data by year (2020-2025)...\n")
    
    # Split the data by year
    years_data = split_drcs_by_year(drcs_result)
    
    # Create separate JSON files for each year
    for year in range(2020, 2026):
        output_file = f'drcs_activations_{year}.json'
        
        # Count statistics
        event_count = len(years_data[year])
        total_files = sum(count_files_recursive(event_data) 
                         for event_data in years_data[year].values())
        
        # Create output structure
        output_data = {
            "drcs_activations": years_data[year],
            "_metadata": {
                "year": year,
                "extracted_from": OUTPUT_FILE,
                "created_at": datetime.now().isoformat(),
                "total_events": event_count,
                "total_tif_files": total_files,
                "events": sorted(list(years_data[year].keys()))
            }
        }
        
        # Save to file
        with open(output_file, 'w') as f:
            json.dump(output_data, f, indent=2)
        
        print(f"✅ Created {output_file}")
        print(f"   - Events: {event_count}")
        print(f"   - Total .tif files: {total_files}")
        if event_count > 0:
            sample_events = list(years_data[year].keys())[:3]
            if event_count > 3:
                sample_events.append(f"... and {event_count - 3} more")
            print(f"   - Sample events: {sample_events}")
        print()
    
    print("📊 Year-based splitting complete!")
else:
    print("⚠️ No DRCS data available. Run the DRCS crawler first.")


📅 Splitting DRCS data by year (2020-2025)...

✅ Created drcs_activations_2020.json
   - Events: 12
   - Total .tif files: 4384
   - Sample events: ['aegean_sea_earthquake_202010', 'bolivia_fires', 'california_fires', '... and 9 more']

✅ Created drcs_activations_2021.json
   - Events: 18
   - Total .tif files: 2257
   - Sample events: ['australia_floods', 'california_fires', 'costarica_panama_flooding', '... and 15 more']

✅ Created drcs_activations_2022.json
   - Events: 12
   - Total .tif files: 2223
   - Sample events: ['australia_flooding_202203', 'bangladesh_flooding_202206', 'brazil_flooding_202205', '... and 9 more']

✅ Created drcs_activations_2023.json
   - Events: 24
   - Total .tif files: 3821
   - Sample events: ['california_atmospheric_river', 'greece_wildfires', 'hawaii_wildfires_202308', '... and 21 more']

✅ Created drcs_activations_2024.json
   - Events: 40
   - Total .tif files: 8799
   - Sample events: ['bangladesh_flood_202408', 'brasil_flood_202405', 'california_a