Code used to transform [Global Mangrove Distribution, Aboveground Biomass, and Canopy Height](https://daac.ornl.gov/CMS/guides/CMS_Global_Map_Mangrove_Canopy.html) from .tif to Cloud Optimized GeoTIFF (COG).

-Author: Kyle Lesinger

In [1]:
import os
import pandas as pd
import json
import tempfile
import boto3
import rasterio
import rioxarray as rxr
import s3fs
from rasterio.warp import calculate_default_transform, reproject, Resampling
import botocore
from pathlib import Path
from dotenv import load_dotenv

In [None]:
# Helper functions for TIFF predictor fix
def check_tiff_predictor(filepath):
    """
    Inspect a raster's data type and its TIFF compression/predictor tags.

    Parameters
    ----------
    filepath : str or path-like
        Anything accepted by ``rasterio.open``.

    Returns
    -------
    tuple
        ``(needs_fix, data_type)``.

        * ``needs_fix`` is True when the first band is float32/float64 and the
          ``IMAGE_STRUCTURE`` tags report either a ``PREDICTOR`` other than
          ``'1'`` or any ``COMPRESSION``. It is also True when the check fails
          after a float dtype has already been read.
        * ``data_type`` is the dtype string of the first band, or None when
          the file could not be opened.

    Notes
    -----
    This function does not raise: any exception is reported with a printed
    warning and the best-known result is returned.
    """
    float_dtypes = ('float32', 'float64')
    needs_fix = False
    data_type = None

    try:
        with rasterio.open(filepath) as src:
            # Only the first band's dtype is considered.
            data_type = src.dtypes[0]

            if data_type in float_dtypes:
                # Compression details live in the IMAGE_STRUCTURE tag namespace.
                img_struct = src.tags(ns='IMAGE_STRUCTURE')
                predictor = img_struct.get('PREDICTOR')

                # Predictor 2 (horizontal differencing) is the problematic
                # setting for float data; anything other than '1' is flagged.
                if predictor is not None and predictor != '1':
                    needs_fix = True
                    print(f"⚠️ Float{data_type[-2:]} data with predictor={predictor} detected - needs fix")
                elif 'COMPRESSION' in img_struct:
                    # Even without explicit predictor, some compressions may use it implicitly
                    print(f"ℹ️ Float{data_type[-2:]} data with {img_struct['COMPRESSION']} compression - will use safe settings")
                    needs_fix = True

    except Exception as e:
        print(f"Warning: Could not check predictor settings: {e}")
        # If we can't check but it's float data, better to be safe
        if data_type in float_dtypes:
            needs_fix = True

    return needs_fix, data_type

In [9]:
# Notebook-wide settings: where the raw rasters are read from and where the
# generated COGs are written.
config = {
    # Source of the raw .tif files
    "data_acquisition_method": "s3",
    "raw_data_bucket": "nasa-disasters",
    "raw_data_prefix": "drcs_activations/202410_Hurricane_Milton/landsat",

    # Destination of the converted COGs
    "cog_data_bucket": "nasa-disasters",
    "cog_data_prefix": "tmp-cog-speed-test",
    "local_output_dir": "output/transformed_cogs",  # local copy of every COG

    # Placeholder; no cell visible in this notebook reads it
    "transformation": {},
}

In [10]:
# Pull settings from a local .env file into the process environment
load_dotenv()

# AWS credentials are read from the environment; each is None when unset
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN')

Approach

1. Read the .tif files from the S3 bucket
2. Convert them to COGs on the local drive
3. Move the converted COGs to their final S3 location

In [11]:
# NOTE(review): `session` and `fs` are not referenced by any cell visible in
# this notebook; they are kept in case other code relies on them.
session = boto3.session.Session()

# S3 client used for every download/upload below, built from the credentials
# read from the environment.
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
)

# `bucket_name` and `cog_data_bucket` both point at the COG destination bucket.
bucket_name = config["cog_data_bucket"]
raw_data_bucket = config["raw_data_bucket"]
raw_data_prefix = config["raw_data_prefix"]

cog_data_bucket = config['cog_data_bucket']
cog_data_prefix = config["cog_data_prefix"]

fs = s3fs.S3FileSystem()

# Discover the raw rasters to convert. Later cells iterate over `keys`, but no
# cell in the notebook defined it, so a fresh "Restart & Run All" failed with
# a NameError.
# NOTE(review): the filter (every .tif/.tiff object under the raw prefix,
# including sub-folders) is an assumption — confirm it matches the original
# selection.
_raw_listing_prefix = raw_data_prefix.rstrip("/") + "/"
keys = [
    obj["Key"]
    for page in s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=raw_data_bucket, Prefix=_raw_listing_prefix
    )
    for obj in page.get("Contents", [])
    if obj["Key"].lower().endswith((".tif", ".tiff"))
]
print(f"Found {len(keys)} raster file(s) under s3://{raw_data_bucket}/{_raw_listing_prefix}")

In [None]:
# Define COG profile for rasterio
# Standard COG profile for most data
COG_PROFILE = {
    "driver": "COG",
    "compress": "DEFLATE",
}

# Special COG profile for float32/float64 data to avoid predictor issues.
# Only creation options understood by GDAL's COG driver are listed here:
# the GTiff-style options (tiled / blockxsize / blockysize / zlevel) are not
# COG creation options, so the intended tile size and compression level are
# expressed with `blocksize` and `level` instead.
FLOAT_COG_PROFILE = {
    "driver": "COG",
    "compress": "DEFLATE",
    "predictor": "NO",   # no predictor, to avoid issues with float data
    "level": 6,          # DEFLATE compression level
    "bigtiff": "YES",    # Use BigTIFF for large float datasets
    "blocksize": 512,    # square internal tiles of 512 x 512 pixels
}

In [None]:
def convert_to_proper_CRS_and_cogify(name, cog_filename, cog_data_bucket, cog_data_prefix, local_output_dir=None):
    """
    Download one raster from S3, reproject it to EPSG:4326, write it as a COG
    and upload the result.

    Relies on the notebook-level names ``s3_client``, ``raw_data_bucket``,
    ``check_tiff_predictor``, ``COG_PROFILE`` and ``FLOAT_COG_PROFILE``.

    NOTE(review): a later cell in this notebook defines a function with the
    same name; when the notebook is run top to bottom that later definition
    is the one that takes effect. Keep only one of the two.

    Parameters
    ----------
    name : str
        Object key of the source raster inside ``raw_data_bucket``.
    cog_filename : str
        File name of the generated COG (also used for the intermediate file).
    cog_data_bucket : str
        Bucket the COG is uploaded to.
    cog_data_prefix : str
        Key prefix; the COG is uploaded to ``{cog_data_prefix}/{cog_filename}``.
    local_output_dir : str, optional
        When given, a copy of the COG is also saved in this directory.

    Raises
    ------
    Exception
        Any failure is logged with an ``[ERROR]`` line and re-raised. The
        downloaded input and the reprojected intermediate are removed either
        way.
    """
    import shutil  # local import: the notebook's import cell does not provide it

    float_dtypes = ('float32', 'float64')
    s3_key = f"{cog_data_prefix}/{cog_filename}"
    os.makedirs('reproj', exist_ok=True)
    reproject_filename = f"reproj/{cog_filename}"

    # Local copy of the downloaded S3 object
    temp_input_file = f"temp_{os.path.basename(name)}"

    try:
        # 1) Download the file from S3 first
        print(f"[DOWNLOAD] Downloading {name} from S3...")
        s3_client.download_file(raw_data_bucket, name, temp_input_file)

        # Prints a diagnostic about predictor/compression and reports the dtype
        _, data_type = check_tiff_predictor(temp_input_file)

        # 2) Reproject using the local file
        print(f"[REPROJECT] {name} → {reproject_filename} (EPSG:4326)")
        with rasterio.open(temp_input_file) as src:
            # Fall back to the opened dataset if the check could not read the dtype
            data_type = data_type or src.dtypes[0]
            is_float = data_type in float_dtypes

            dst_crs = "EPSG:4326"
            transform, width, height = calculate_default_transform(
                src.crs, dst_crs, src.width, src.height, *src.bounds
            )

            # Settings shared by both kinds of intermediate file
            kwargs = src.meta.copy()
            kwargs.update({
                "compress": "DEFLATE",
                "crs": dst_crs,
                "transform": transform,
                "width": width,
                "height": height,
            })

            if is_float:
                # Float data: plain tiled GTiff with the predictor disabled
                print(f"[FLOAT DATA] Detected {data_type} - using safe compression settings")
                kwargs.update({
                    "driver": "GTiff",  # Use GTiff for intermediate file
                    "predictor": 1,  # Explicitly set to NONE to avoid issues
                    "tiled": True,
                    "blockxsize": 512,
                    "blockysize": 512,
                })
            else:
                # Standard settings for non-float data
                kwargs["driver"] = "COG"

            with rasterio.open(f"{reproject_filename}", "w", **kwargs) as dst:
                # The output is created with src.count bands, so every band
                # must be reprojected; warping only band 1 would leave the
                # remaining bands of a multi-band raster empty.
                for band_index in range(1, src.count + 1):
                    reproject(
                        source=rasterio.band(src, band_index),
                        destination=rasterio.band(dst, band_index),
                        src_transform=src.transform,
                        src_crs=src.crs,
                        dst_transform=transform,
                        dst_crs=dst_crs,
                        resampling=Resampling.nearest,
                        wrapdateline=True
                    )

        # 3) COGify & upload
        print(f"[COGIFY] {reproject_filename} → s3://{cog_data_bucket}/{s3_key}")
        reprojected = rxr.open_rasterio(reproject_filename)
        try:
            ds = reprojected.rename({"y": "lat", "x": "lon"})
            ds.rio.set_spatial_dims("lon", "lat", inplace=True)

            # Check data type and set appropriate nodata value
            if ds.dtype == 'uint8':
                # For uint8, use a value within 0-255 range (e.g., 255)
                nodata_value = 255
            elif is_float:
                # For float data, use NaN as nodata
                nodata_value = float("nan")
            else:
                # For other data types, use the original -9999
                nodata_value = -9999

            ds.rio.write_nodata(nodata_value, inplace=True)

            # A temporary directory (rather than an open NamedTemporaryFile)
            # lets the raster writer create the output file itself.
            with tempfile.TemporaryDirectory() as tmp_dir:
                cog_path = os.path.join(tmp_dir, os.path.basename(cog_filename))

                # Choose appropriate COG profile based on data type
                if is_float:
                    print(f"[COG PROFILE] Using float-safe COG profile for {data_type}")
                    ds.rio.to_raster(cog_path, **FLOAT_COG_PROFILE)
                else:
                    ds.rio.to_raster(cog_path, **COG_PROFILE)

                # Verify the output can be read (especially important for float
                # data) before anything is published.
                print(f"[VERIFY] Checking output file readability...")
                with rasterio.open(cog_path) as verify_ds:
                    verify_ds.read(
                        1,
                        window=((0, min(100, verify_ds.height)), (0, min(100, verify_ds.width))),
                    )
                    print(f"[VERIFY] ✅ Output file is readable - dtype: {verify_ds.dtypes[0]}")

                # Upload to S3
                s3_client.upload_file(
                    Filename=cog_path,
                    Bucket=cog_data_bucket,
                    Key=s3_key)
                print(f"[SUCCESS] Uploaded to s3://{cog_data_bucket}/{s3_key}")

                # Save locally if output directory is specified
                if local_output_dir:
                    os.makedirs(local_output_dir, exist_ok=True)
                    local_path = os.path.join(local_output_dir, cog_filename)

                    # Copy the COG file to local directory
                    shutil.copy(cog_path, local_path)
                    print(f"[LOCAL SAVE] Saved COG to {local_path}")
        finally:
            # Release the handle on the intermediate file so it can be deleted
            reprojected.close()

    except Exception as e:
        print(f"[ERROR] Failed to process {name}: {str(e)}")
        raise

    finally:
        # Clean up temporary input file
        if os.path.exists(temp_input_file):
            os.remove(temp_input_file)
            print(f"[CLEANUP] removed temporary input file {temp_input_file}")

        # Clean up local intermediate
        if os.path.exists(reproject_filename):
            os.remove(reproject_filename)
            print(f"[CLEANUP] removed intermediate {reproject_filename}")

In [22]:
def convert_to_proper_CRS_and_cogify(name, cog_filename, cog_data_bucket, cog_data_prefix, local_output_dir=None):
    """
    Download one raster from S3, reproject it to EPSG:4326, write it as a COG
    and upload the result.

    NOTE(review): an earlier cell defines a function with this same name (the
    float-aware variant). Because this definition comes later, it is the one
    in effect when the notebook runs top to bottom, so it applies the same
    float-safe handling via ``check_tiff_predictor`` and
    ``FLOAT_COG_PROFILE``. One of the two definitions should be deleted.

    Relies on the notebook-level names ``s3_client``, ``raw_data_bucket``,
    ``check_tiff_predictor``, ``COG_PROFILE`` and ``FLOAT_COG_PROFILE``.

    Parameters
    ----------
    name : str
        Object key of the source raster inside ``raw_data_bucket``.
    cog_filename : str
        File name of the generated COG (also used for the intermediate file).
    cog_data_bucket : str
        Bucket the COG is uploaded to.
    cog_data_prefix : str
        Key prefix; the COG is uploaded to ``{cog_data_prefix}/{cog_filename}``.
    local_output_dir : str, optional
        When given, a copy of the COG is also saved in this directory.

    Raises
    ------
    Exception
        Any failure is logged with an ``[ERROR]`` line and re-raised. The
        downloaded input and the reprojected intermediate are removed either
        way.
    """
    import shutil  # local import: the notebook's import cell does not provide it

    s3_key = f"{cog_data_prefix}/{cog_filename}"
    os.makedirs('reproj', exist_ok=True)
    reproject_filename = f"reproj/{cog_filename}"

    # Local copy of the downloaded S3 object
    temp_input_file = f"temp_{os.path.basename(name)}"

    try:
        # 1) Download the file from S3 first
        print(f"[DOWNLOAD] Downloading {name} from S3...")
        s3_client.download_file(raw_data_bucket, name, temp_input_file)

        # Prints a diagnostic about predictor/compression and reports the dtype
        _, data_type = check_tiff_predictor(temp_input_file)

        # 2) Reproject using the local file
        print(f"[REPROJECT] {name} → {reproject_filename} (EPSG:4326)")
        with rasterio.open(temp_input_file) as src:
            # Fall back to the opened dataset if the check could not read the dtype
            data_type = data_type or src.dtypes[0]
            float_input = data_type in ('float32', 'float64')

            dst_crs = "EPSG:4326"
            transform, width, height = calculate_default_transform(
                src.crs, dst_crs, src.width, src.height, *src.bounds
            )

            kwargs = src.meta.copy()
            kwargs.update({
                "driver": "COG",                 # write a COG instead of plain GTiff
                "compress": "DEFLATE",           # or "LZW"
                "crs": dst_crs,
                "transform": transform,
                "width": width,
                "height": height
            })
            if float_input:
                # Float data: tiled GTiff intermediate with the predictor disabled
                print(f"[FLOAT DATA] Detected {data_type} - using safe compression settings")
                kwargs.update({
                    "driver": "GTiff",
                    "predictor": 1,
                    "tiled": True,
                    "blockxsize": 512,
                    "blockysize": 512
                })

            with rasterio.open(f"{reproject_filename}", "w", **kwargs) as dst:
                # Reproject every band: the output carries src.count bands, so
                # warping only band 1 would leave the other bands empty.
                for bidx in src.indexes:
                    reproject(
                        source=rasterio.band(src, bidx),
                        destination=rasterio.band(dst, bidx),
                        src_transform=src.transform,
                        src_crs=src.crs,
                        dst_transform=transform,
                        dst_crs=dst_crs,
                        resampling=Resampling.nearest,
                        wrapdateline=True
                    )

        # 3) COGify & upload
        print(f"[COGIFY] {reproject_filename} → s3://{cog_data_bucket}/{s3_key}")
        opened = rxr.open_rasterio(reproject_filename)
        try:
            ds = opened.rename({"y": "lat", "x": "lon"})
            ds.rio.set_spatial_dims("lon", "lat", inplace=True)

            # Check data type and set appropriate nodata value
            if ds.dtype == 'uint8':
                # For uint8, use a value within 0-255 range (e.g., 255)
                nodata_value = 255
            elif float_input:
                # For float data, use NaN as nodata
                nodata_value = float("nan")
            else:
                # For other data types, use the original -9999
                nodata_value = -9999

            ds.rio.write_nodata(nodata_value, inplace=True)

            # A temporary directory lets the raster writer create the file itself
            with tempfile.TemporaryDirectory() as workdir:
                cog_path = os.path.join(workdir, os.path.basename(cog_filename))

                if float_input:
                    print(f"[COG PROFILE] Using float-safe COG profile for {data_type}")
                    ds.rio.to_raster(cog_path, **FLOAT_COG_PROFILE)
                else:
                    ds.rio.to_raster(cog_path, **COG_PROFILE)

                # Make sure the COG can be read back before publishing it
                print(f"[VERIFY] Checking output file readability...")
                with rasterio.open(cog_path) as verify_ds:
                    verify_ds.read(
                        1,
                        window=((0, min(100, verify_ds.height)), (0, min(100, verify_ds.width))),
                    )
                    print(f"[VERIFY] ✅ Output file is readable - dtype: {verify_ds.dtypes[0]}")

                # Upload to S3
                s3_client.upload_file(
                    Filename=cog_path,
                    Bucket=cog_data_bucket,
                    Key=s3_key)
                print(f"[SUCCESS] Uploaded to s3://{cog_data_bucket}/{s3_key}")

                # Save locally if output directory is specified
                if local_output_dir:
                    os.makedirs(local_output_dir, exist_ok=True)
                    local_path = os.path.join(local_output_dir, cog_filename)

                    # Copy the COG file to local directory
                    shutil.copy(cog_path, local_path)
                    print(f"[LOCAL SAVE] Saved COG to {local_path}")
        finally:
            # Release the handle on the intermediate file so it can be deleted
            opened.close()

    except Exception as e:
        print(f"[ERROR] Failed to process {name}: {str(e)}")
        raise

    finally:
        # Clean up temporary input file
        if os.path.exists(temp_input_file):
            os.remove(temp_input_file)
            print(f"[CLEANUP] removed temporary input file {temp_input_file}")

        # Clean up local intermediate
        if os.path.exists(reproject_filename):
            os.remove(reproject_filename)
            print(f"[CLEANUP] removed intermediate {reproject_filename}")

In [23]:
# Records of processed files; turned into the `files_processed` DataFrame once
# the loop ends (building the frame in one go replaces the private
# `DataFrame._append` call and avoids growing a DataFrame row by row).
processed_records = []

# Get local output directory from config
local_output_dir = config.get("local_output_dir")

# Create output directories
if local_output_dir:
    os.makedirs(local_output_dir, exist_ok=True)
    print(f"Local COGs will be saved to: {local_output_dir}")

# Process all files
try:
    for name in sorted(keys):
        print(f"\nProcessing: {name}")
        cog_filename = Path(name).name

        print(f"Output filename: {cog_filename}")

        # Process the file with local output directory
        convert_to_proper_CRS_and_cogify(name, cog_filename, cog_data_bucket, cog_data_prefix, local_output_dir)

        # Track the successfully converted file
        processed_records.append({"file_name": name, "COGs_created": cog_filename})
        print(f"Generated and saved COG: {cog_filename}")
finally:
    # Built in `finally` so files converted before a failure are still tracked
    files_processed = pd.DataFrame(processed_records, columns=["file_name", "COGs_created"])

print("\nDone generating COGs")
if local_output_dir:
    print(f"COGs saved locally to: {local_output_dir}")

Local COGs will be saved to: output/transformed_cogs

Processing: drcs_activations/202410_Hurricane_Milton/landsat/LC08_L1_colorInfrared_20241012_154915_015040.tif
Output filename: LC08_L1_colorInfrared_20241012_154915_015040.tif
[DOWNLOAD] Downloading drcs_activations/202410_Hurricane_Milton/landsat/LC08_L1_colorInfrared_20241012_154915_015040.tif from S3...
[REPROJECT] drcs_activations/202410_Hurricane_Milton/landsat/LC08_L1_colorInfrared_20241012_154915_015040.tif → reproj/LC08_L1_colorInfrared_20241012_154915_015040.tif (EPSG:4326)
[COGIFY] reproj/LC08_L1_colorInfrared_20241012_154915_015040.tif → s3://nasa-disasters/tmp-cog-speed-test/LC08_L1_colorInfrared_20241012_154915_015040.tif
[SUCCESS] Uploaded to s3://nasa-disasters/tmp-cog-speed-test/LC08_L1_colorInfrared_20241012_154915_015040.tif
[LOCAL SAVE] Saved COG to output/transformed_cogs/LC08_L1_colorInfrared_20241012_154915_015040.tif
[CLEANUP] removed temporary input file temp_LC08_L1_colorInfrared_20241012_154915_015040.tif
[

In [24]:
# Save metadata if there are processed files
if len(files_processed) > 0:
    # Metadata is taken from the first processed file (the raw source raster)
    sample_file = files_processed.iloc[0]['file_name']
    temp_sample_file = f"temp_{os.path.basename(sample_file)}"

    try:
        # Download sample file to extract metadata
        s3_client.download_file(raw_data_bucket, sample_file, temp_sample_file)

        with rasterio.open(temp_sample_file) as src:
            metadata = {
                "description": src.tags(),
                "driver": src.driver,
                "dtype": str(src.dtypes[0]),
                # A NaN nodata would be serialized as the non-standard JSON
                # token `NaN`; store null instead (None stays None).
                "nodata": None if pd.isna(src.nodata) else src.nodata,
                "width": src.width,
                "height": src.height,
                "count": src.count,
                "crs": str(src.crs),
                "transform": list(src.transform),
                "bounds": list(src.bounds),
                "total_files_processed": len(files_processed),
            }
    finally:
        # Clean up sample file, also when the download or the read failed
        if os.path.exists(temp_sample_file):
            os.remove(temp_sample_file)

    # Upload metadata
    with tempfile.NamedTemporaryFile(mode="w+") as fp:
        json.dump(metadata, fp, indent=2)
        fp.flush()  # make sure the content is on disk before uploading by name

        s3_client.upload_file(
            Filename=fp.name,
            Bucket=bucket_name,
            Key=f"{cog_data_prefix}/metadata.json",
        )
        print(f"Uploaded metadata to s3://{bucket_name}/{cog_data_prefix}/metadata.json")

# Save the files_processed DataFrame to CSV using the same s3_client
with tempfile.NamedTemporaryFile(mode="w+", suffix=".csv") as fp:
    files_processed.to_csv(fp.name, index=False)
    fp.flush()

    s3_client.upload_file(
        Filename=fp.name,
        Bucket=bucket_name,
        Key=f"{cog_data_prefix}/files_converted.csv",
    )
    print(f"Saved processing log to s3://{bucket_name}/{cog_data_prefix}/files_converted.csv")

Uploaded metadata to s3://nasa-disasters/tmp-cog-speed-test/metadata.json
Saved processing log to s3://nasa-disasters/tmp-cog-speed-test/files_converted.csv


In [25]:
# Report how many files were discovered and converted, then show the log table
print(
    "\nProcessing Summary:",
    f"Total files found: {len(keys)}",
    f"Files processed: {len(files_processed)}",
    "\nProcessed files:",
    sep="\n",
)
files_processed


Processing Summary:
Total files found: 27
Files processed: 27

Processed files:


Unnamed: 0,file_name,COGs_created
0,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_colorInfrared_20241012_154915_015040.tif
1,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_colorInfrared_20241012_154939_015041.tif
2,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_colorInfrared_20241012_15503_015042.tif
3,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_naturalColor_20241012_154915_015040.tif
4,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_naturalColor_20241012_154939_015041.tif
5,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_naturalColor_20241012_15503_015042.tif
6,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_trueColor_20241012_154915_015040.tif
7,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_trueColor_20241012_154939_015041.tif
8,drcs_activations/202410_Hurricane_Milton/lands...,LC08_L1_trueColor_20241012_15503_015042.tif
9,drcs_activations/202410_Hurricane_Milton/lands...,LC09_colorInfrared_20240824_merged.tif
