# Complete CALIPSO HDF4 to COPC Conversion Pipeline

This notebook demonstrates the complete conversion pipeline:
1. HDF4 → HDF5 (using h4toh5convert)
2. HDF5 → Text (extracting 3D variables)
3. Text → LAS (using PDAL)
4. LAS → COPC (Cloud-Optimized Point Cloud)

The final COPC format is optimized for streaming and cloud storage.

In [None]:
# Import required libraries
import json
import subprocess
from pathlib import Path

import h5py
import pandas as pd

from calipso_tool.converter import (
    h4_to_h5, h5_to_txt, txt_to_las_pipeline,
    las_to_copc_pipeline, h4_to_copc
)

## Step 0: Setup and File Check

In [None]:
# Input granule and the name of the dataset to extract from it
hdf4_file = Path("CAL_LID_L3_Tropospheric_APro_AllSky-Standard-V4-20.2018-12D.hdf")
variable_name = "var_to_grab"  # UPDATE THIS with your actual variable name!

# Report up front whether the input is present; later cells re-check this themselves
if not hdf4_file.exists():
    print(f"✗ Input file not found: {hdf4_file}")
    print("  Please ensure the HDF4 file is in the current directory")
else:
    input_size_mb = hdf4_file.stat().st_size / (1024 * 1024)
    print(f"✓ Input file found: {hdf4_file}")
    print(f"  File size: {input_size_mb:.2f} MB")

## Method 1: One-Step Complete Pipeline

The easiest way: convert HDF4 directly to COPC with a single function call.

In [None]:
if hdf4_file.exists():
    try:
        # One wrapper call drives every stage and hands back the path of each output
        copc_file, h5_file, txt_file, las_file = h4_to_copc(
            input_h4=hdf4_file,
            variable_name=variable_name,
            altitude_units="km",  # Will convert to meters
            keep_intermediates=True  # Keep all intermediate files for inspection
        )

        print(f"\nAll files created:")
        # Each intermediate is listed only when the value returned for it is truthy
        for stage_label, stage_path in (("HDF5", h5_file), ("Text", txt_file), ("LAS", las_file)):
            if stage_path:
                stage_mb = stage_path.stat().st_size / (1024*1024)
                print(f"  {stage_label}: {stage_path} ({stage_mb:.2f} MB)")
        copc_mb = copc_file.stat().st_size / (1024*1024)
        print(f"  COPC: {copc_file} ({copc_mb:.2f} MB)")

    except Exception as e:
        # Any failure in the chain ends up here; print the cause plus a short checklist
        print(f"Pipeline failed: {e}")
        print("\nTroubleshooting:")
        for hint in (
            "1. Make sure 'variable_name' matches a variable in your HDF5 file",
            "2. Ensure PDAL is installed (conda install -c conda-forge pdal)",
            "3. Check that the h4toh5convert binary is available",
        ):
            print(hint)

## Method 2: Step-by-Step Pipeline

For more control and debugging, run each step separately.

In [None]:
# Step 1: HDF4 to HDF5
if hdf4_file.exists():
    # The HDF5 output sits next to the input and differs only in its extension
    h5_file = hdf4_file.with_suffix('.h5')

    try:
        print("Step 1: Converting HDF4 to HDF5...")
        h4_to_h5(hdf4_file, h5_file)
        # Measuring the size inside the try means a missing output is reported as a failure too
        h5_mb = h5_file.stat().st_size / (1024*1024)
        print(f"✓ Created: {h5_file} ({h5_mb:.2f} MB)")
    except Exception as e:
        print(f"✗ HDF4 to HDF5 conversion failed: {e}")

In [None]:
# Explore HDF5 structure to find correct variable name
if 'h5_file' in locals() and h5_file.exists():
    with h5py.File(h5_file, 'r') as h5_handle:
        print("Available variables in HDF5 file:")
        print("=" * 50)

        def list_variables(name, obj):
            """Print one line per dataset, flagging those with exactly three dimensions."""
            if not isinstance(obj, h5py.Dataset):
                return  # non-dataset objects are visited but not listed
            if len(obj.shape) != 3:
                print(f"  {name}: shape={obj.shape}")
            else:
                print(f"✓ {name}: shape={obj.shape} (3D variable)")

        # Walk every object in the file and let the callback decide what to print
        h5_handle.visititems(list_variables)

        print("\nNote: You need a 3D variable for point cloud conversion.")
        print("Update 'variable_name' with one of the 3D variables above.")

In [None]:
# Step 2: HDF5 to Text
if 'h5_file' in locals() and h5_file.exists():
    # Same base name as the HDF5 file, with a .txt extension
    txt_file = h5_file.with_suffix('.txt')

    try:
        print("Step 2: Converting HDF5 to text...")
        h5_to_txt(h5_file, txt_file, variable_name, altitude_units="km")
        txt_mb = txt_file.stat().st_size / (1024*1024)
        print(f"✓ Created: {txt_file} ({txt_mb:.2f} MB)")

        # Read only the first five rows so the preview stays cheap on large outputs
        df_preview = pd.read_csv(txt_file, sep=' ', nrows=5)
        print(f"\nPreview of text file:")
        print(df_preview)

    except Exception as e:
        print(f"✗ HDF5 to text conversion failed: {e}")

In [None]:
# Step 3: Text to LAS
if 'txt_file' in locals() and txt_file.exists():
    # Same base name as the text file, with a .las extension
    las_file = txt_file.with_suffix('.las')

    try:
        print("Step 3: Converting text to LAS...")
        txt_to_las_pipeline(txt_file, las_file, variable_name)
        las_mb = las_file.stat().st_size / (1024*1024)
        print(f"✓ Created: {las_file} ({las_mb:.2f} MB)")

    except Exception as e:
        print(f"✗ Text to LAS conversion failed: {e}")

In [None]:
# Step 4: LAS to COPC
if 'las_file' in locals() and las_file.exists():
    # Swap the .las extension for the .copc.laz double extension
    copc_file = las_file.with_name(f"{las_file.stem}.copc.laz")

    try:
        print("Step 4: Converting LAS to COPC...")
        las_to_copc_pipeline(las_file, copc_file)
        copc_size = copc_file.stat().st_size / (1024*1024)
        print(f"✓ Created: {copc_file} ({copc_size:.2f} MB)")

        # Percentage by which the COPC output is smaller than the LAS input
        las_size = las_file.stat().st_size / (1024*1024)
        compression = (1 - copc_size/las_size) * 100
        print(f"\nCompression achieved: {compression:.1f}%")

    except Exception as e:
        print(f"✗ LAS to COPC conversion failed: {e}")

## Inspect Final COPC File

In [None]:
# Get detailed information about the COPC file
def summarize_copc_stats(info, dims):
    """
    Build the report lines for a parsed ``pdal info --all`` result.

    ``info["stats"]["statistic"]`` is treated as a list of per-dimension
    records, each carrying its own ``name``, ``count``, ``minimum`` and
    ``maximum``. Records are looked up by ``name``, so each requested
    dimension is matched to its own record rather than to the first one.

    Parameters:
        info: dict decoded from the JSON printed by ``pdal info``.
        dims: iterable of dimension names to report, in output order.

    Returns:
        List of strings ready to print. Empty when ``info`` carries no
        statistics. Missing or non-numeric values are rendered as ``N/A``.
    """
    def fmt(value, spec):
        # Numeric format specs raise on strings, so the placeholder is substituted here
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return format(value, spec)
        return "N/A"

    entries = (info.get("stats") or {}).get("statistic") or []
    if isinstance(entries, dict):
        # Tolerate a single record that was not wrapped in a list
        entries = [entries]
    if not entries:
        return []

    by_name = {entry.get("name"): entry for entry in entries}

    # The point count is read from the first record
    lines = [f"Point count: {fmt(entries[0].get('count'), ',')}", "\nDimensions:"]
    for dim in dims:
        entry = by_name.get(dim)
        if entry is None:
            continue  # dimension not present in this file
        low = fmt(entry.get('minimum'), '.3f')
        high = fmt(entry.get('maximum'), '.3f')
        lines.append(f"  {dim}: [{low}, {high}]")
    return lines


# Sorted so the inspected file does not depend on directory listing order
copc_files = sorted(Path(".").glob("*.copc.laz"))

if copc_files:
    copc_file = copc_files[0]
    print(f"Inspecting COPC file: {copc_file}\n")

    # Get PDAL info
    try:
        result = subprocess.run(
            ["pdal", "info", str(copc_file), "--all"],
            capture_output=True,
            text=True
        )
    except FileNotFoundError:
        # Raised when no 'pdal' executable is on PATH
        result = None
        print("Error getting COPC info: 'pdal' executable not found "
              "(conda install -c conda-forge pdal)")

    if result is not None and result.returncode == 0:
        try:
            info = json.loads(result.stdout)
        except json.JSONDecodeError as parse_error:
            print(f"Error getting COPC info: could not parse PDAL output ({parse_error})")
        else:
            # Extract key information
            for report_line in summarize_copc_stats(info, ['X', 'Y', 'Z', variable_name]):
                print(report_line)
    elif result is not None:
        print(f"Error getting COPC info: {result.stderr}")
else:
    print("No COPC files found")

## Batch Processing

Convert multiple CALIPSO files at once.

In [None]:
# Find all HDF4 files in directory (sorted so the processing order is reproducible)
hdf4_files = sorted(Path(".").glob("*.hdf"))

if len(hdf4_files) > 1:
    print(f"Found {len(hdf4_files)} HDF4 files to process\n")

    successful = []  # (input path, COPC path) pairs
    failed = []      # (input path, error message) pairs

    # Loop names are distinct from the notebook-level `hdf4_file` / `copc_file`
    # so running this cell does not change what the single-file cells act on.
    for batch_input in hdf4_files:
        try:
            print(f"Processing: {batch_input.name}")
            # Only the first returned item (the COPC path) is needed here;
            # indexing avoids binding `_`, which IPython uses for the last output.
            batch_copc = h4_to_copc(
                batch_input,
                variable_name=variable_name,
                altitude_units="km",  # same units the single-file cells pass
                keep_intermediates=False  # Clean up intermediate files
            )[0]
            successful.append((batch_input, batch_copc))
            print(f"✓ Success: {batch_copc.name}\n")

        except Exception as e:
            # Record the failure and move on so one bad file does not stop the batch
            failed.append((batch_input, str(e)))
            print(f"✗ Failed: {e}\n")

    print(f"\nBatch processing complete:")
    print(f"  Successful: {len(successful)}")
    print(f"  Failed: {len(failed)}")

    if failed:
        print(f"\nFailed files:")
        for failed_input, error in failed:
            print(f"  - {failed_input.name}: {error}")

elif len(hdf4_files) == 1:
    print("Only one HDF4 file found. Use the methods above for single file processing.")
else:
    print("No HDF4 files found in current directory.")

## Cleanup Utilities

In [None]:
# Clean up intermediate files if needed
def cleanup_intermediates(keep_copc=True, directory=".", dry_run=False):
    """
    Remove intermediate conversion files, optionally keeping COPC files.

    WARNING: matching is by extension only. Every ``*.h5``, ``*.txt`` and
    ``*.las`` file directly inside ``directory`` is deleted, whether or not
    this notebook created it. Call with ``dry_run=True`` first to review
    the list.

    Parameters:
        keep_copc: when False, ``*.copc.laz`` files are removed as well.
        directory: folder to clean; defaults to the current directory.
        dry_run: when True, list the files that would be removed without
            deleting anything.

    Returns:
        None. A summary is printed instead.
    """
    patterns = ['*.h5', '*.txt', '*.las']
    if not keep_copc:
        patterns.append('*.copc.laz')

    root = Path(directory)
    removed = []
    not_removed = []
    for pattern in patterns:
        for candidate in sorted(root.glob(pattern)):
            # A glob match can be a directory; unlink() would raise on it
            if not candidate.is_file():
                continue
            if not dry_run:
                try:
                    candidate.unlink()
                except OSError as err:
                    # Keep going so one locked file does not abort the cleanup
                    not_removed.append((candidate.name, err))
                    continue
            removed.append(candidate.name)

    if removed:
        verb = "Would remove" if dry_run else "Removed"
        print(f"{verb} {len(removed)} files:")
        for name in removed:
            print(f"  - {name}")
    else:
        print("No intermediate files to clean up.")

    if not_removed:
        print(f"Could not remove {len(not_removed)} files:")
        for name, err in not_removed:
            print(f"  - {name}: {err}")

# Uncomment to run cleanup
# cleanup_intermediates(keep_copc=True)

## Summary and Next Steps

If the cells above ran without errors, you have converted CALIPSO HDF4 data to the Cloud-Optimized Point Cloud (COPC) format!

**What you can do with COPC files:**
1. **Visualize** in point cloud viewers (CloudCompare, QGIS, Potree)
2. **Stream** efficiently from cloud storage (S3, Azure, GCS)
3. **Process** with PDAL for filtering, classification, or analysis
4. **Integrate** into GIS workflows

**Command-line usage:**
```bash
# Convert single file
python -m calipso_tool.converter input.hdf -v Extinction_Coefficient_532

# Batch conversion
for f in *.hdf; do
    python -m calipso_tool.converter "$f" -v Temperature_Met
done
```