Code used to transform [Global Mangrove Distribution, Aboveground Biomass, and Canopy Height](https://daac.ornl.gov/CMS/guides/CMS_Global_Map_Mangrove_Canopy.html) from .tif to Cloud Optimized Geotiff.

-Author: Kyle Lesinger

In [1]:
import os
import pandas as pd
import json
import tempfile
import boto3
import rasterio
import rioxarray as rxr
import s3fs
from rasterio.warp import calculate_default_transform, reproject, Resampling
import botocore
from pathlib import Path
from dotenv import load_dotenv

# Useful links
[drcs_activations OLD Directory](https://data.disasters.openveda.cloud/browseui/browseui/#drcs_activations/)

List of new 2nd level directories

    "Sentinel-1"
    "Sentinel-2"
    "Landsat"
    "MODIS"
    "VIIRS"
    "ASTER"
    "MASTER"
    "ECOSTRESS"
    "Planet"
    "Maxar"
    "HLS"
    "IMERG"
    "GOES"
    "SMAP"
    "ICESat"
    "GEDI"
    "COMSAR"
    "UAVSAR"
    "WB-57"

In [2]:
# DO NOT CHANGE
DIR_OLD_BASE = 'drcs_activations'
DIR_NEW_BASE = 'drcs_activations_new'

In [None]:
EVENT_NAME = '202405_Flood_TX'
PRODUCT_NAME = 'sentinel1'

RENAME_PRODUCT = 'Sentinel-1'

PATH_OLD = f'{DIR_OLD_BASE}/{EVENT_NAME}/{PRODUCT_NAME}'  # Updated to use actual available directory
DIRECTORY_NEW = f'{DIR_NEW_BASE}/'

## Load TIF Files from DRCS Data

This cell loads the pre-analyzed DRCS activation data from `drcs_activations_tif_files.json` which contains a complete inventory of all .tif files in the NASA Disasters S3 bucket.

The code will:
1. Load the JSON file containing the file inventory
2. Parse the `PATH_OLD` variable to find the corresponding directory
3. Extract all .tif filenames from that directory
4. Store them in `files_to_process` for later use


In [8]:
# Load the pre-analyzed DRCS TIF files data
import json
from pathlib import Path

# Load the drcs_activations_tif_files.json
drcs_json_path = Path('../s3-crawler/drcs_activations_tif_files.json')

if drcs_json_path.exists():
    with open(drcs_json_path, 'r') as f:
        drcs_data = json.load(f)
    print(f"✅ Loaded DRCS data from {drcs_json_path}")
else:
    print(f"⚠️ DRCS data file not found at {drcs_json_path}")
    drcs_data = None

# Function to get TIF files from a specific directory path
def get_tif_files_from_path(path_old, drcs_data):
    """
    Extract .tif files from the specified path in DRCS data.
    
    Args:
        path_old: Path like 'drcs_activations/202405_Flood_TX/planet'
        drcs_data: The loaded DRCS JSON data
    
    Returns:
        List of .tif filenames found in that directory
    """
    if not drcs_data or 'drcs_activations' not in drcs_data:
        return []
    
    # Parse the path
    path_parts = path_old.replace(DIR_OLD_BASE, '').strip('/').split('/')
    
    if len(path_parts) < 1:
        print(f"⚠️ Invalid path: {path_old}")
        return []
    
    # Navigate through the JSON structure
    current = drcs_data['drcs_activations']
    
    for part in path_parts:
        if part and part in current:
            current = current[part]
        else:
            print(f"⚠️ Directory '{part}' not found in path: {path_old}")
            
            # Suggest available directories
            if isinstance(current, dict):
                available = [k for k in current.keys() if k not in ['_files', '_metadata']]
                if available:
                    print(f"   Available directories at this level: {', '.join(available[:5])}")
            return []
    
    # Get files if they exist
    if isinstance(current, dict) and '_files' in current:
        return current['_files']
    else:
        print(f"⚠️ No files found in: {path_old}")
        return []

# Get TIF files from the specified PATH_OLD
tif_files = get_tif_files_from_path(PATH_OLD, drcs_data)

if tif_files:
    print(f"\\n📁 Found {len(tif_files)} .tif files in {PATH_OLD}:")
    print("\\nFirst 10 files:")
    for i, file in enumerate(tif_files[:10], 1):
        print(f"  {i:2d}. {file}")
    if len(tif_files) > 10:
        print(f"  ... and {len(tif_files) - 10} more files")
    
    # Store in a variable for later use
    files_to_process = [f"{PATH_OLD}/{file}" for file in tif_files]
    print(f"\n✅ Files ready for processing. Stored in 'files_to_process' variable.")
else:
    print(f"\n❌ No files found. Please check the PATH_OLD variable.")
    files_to_process = []


✅ Loaded DRCS data from ../s3-crawler/drcs_activations_tif_files.json
\n📁 Found 11 .tif files in drcs_activations/202405_Flood_TX/sentinel1:
\nFirst 10 files:
   1. S1A_IW_20240430T002653_DVR_RTC20_G_gpuned_0610_WM.tif
   2. S1A_IW_20240430T002653_DVR_RTC20_G_gpuned_0610_rgb.tif
   3. S1A_IW_20240430T002719_DVR_RTC20_G_gpuned_F141_WM.tif
   4. S1A_IW_20240430T002719_DVR_RTC20_G_gpuned_F141_rgb.tif
   5. S1A_IW_20240507T122323_DVR_RTC20_G_gpuned_5BA0_WM.tif
   6. S1A_IW_20240507T122323_DVR_RTC20_G_gpuned_5BA0_rgb.tif
   7. S1A_IW_20240512T002655_DVR_RTC20_G_gpuned_EC9C_WM.tif
   8. S1A_IW_20240512T002720_DVR_RTC20_G_gpuned_D32B_WM.tif
   9. S1A_IW_20240512T002745_DVR_RTC20_G_gpuned_3F78_WM.tif
  10. S1_20240430_20240507_WM_diff.tif
  ... and 1 more files

✅ Files ready for processing. Stored in 'files_to_process' variable.


In [7]:
files_to_process

['drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240430T002653_DVR_RTC20_G_gpuned_0610_WM.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240430T002653_DVR_RTC20_G_gpuned_0610_rgb.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240430T002719_DVR_RTC20_G_gpuned_F141_WM.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240430T002719_DVR_RTC20_G_gpuned_F141_rgb.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240507T122323_DVR_RTC20_G_gpuned_5BA0_WM.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240507T122323_DVR_RTC20_G_gpuned_5BA0_rgb.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240512T002655_DVR_RTC20_G_gpuned_EC9C_WM.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240512T002720_DVR_RTC20_G_gpuned_D32B_WM.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1A_IW_20240512T002745_DVR_RTC20_G_gpuned_3F78_WM.tif',
 'drcs_activations/202405_Flood_TX/sentinel1/S1_20240430_20240507_WM_diff.tif',
 'drcs_a

In [None]:
config = {
    "data_acquisition_method": "s3",
    "raw_data_bucket" : "nasa-disasters", #DO NOT CHANGE
    "raw_data_prefix": F"{PATH_OLD}",
    "cog_data_bucket": "nasa-disasters", #DO NOT CHANGE
    "cog_data_prefix": "transformed_cogs/CMS_Global_Map_Mangrove_Canopy_Biomass",
    "local_output_dir": f"output/{EVENT_NAME}",  # Local directory to save COGs
    "transformation": {}
}

In [7]:
# Load environment variables from .env file
load_dotenv()

# AWS Credentials from environment variables
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')

Approach

1.) Read .tif files from S3 bucket
2.) Convert to COGs on local drive
3.) Move converted COGs to their final S3 location

In [8]:
session = boto3.session.Session()
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token = AWS_SESSION_TOKEN

)

bucket_name = config["cog_data_bucket"]
raw_data_bucket = config["raw_data_bucket"]
raw_data_prefix= config["raw_data_prefix"]

cog_data_bucket = config['cog_data_bucket']
cog_data_prefix= config["cog_data_prefix"]

fs = s3fs.S3FileSystem()

In [9]:
def get_all_s3_keys(bucket, model_name, ext):
    """Get a list of all keys in an S3 bucket."""
    keys = []

    kwargs = {"Bucket": bucket, "Prefix": f"{model_name}/"}
    while True:
        resp = s3_client.list_objects_v2(**kwargs)
        for obj in resp["Contents"]:
            if obj["Key"].endswith(ext) and "historical" not in obj["Key"]:
                keys.append(obj["Key"])

        try:
            kwargs["ContinuationToken"] = resp["NextContinuationToken"]
        except KeyError:
            break

    return keys

keys = get_all_s3_keys(raw_data_bucket, raw_data_prefix, ".tif")
keys

['coastal-observatory/data/Mangrove_agb_AndamanAndNicobar.tif',
 'coastal-observatory/data/Mangrove_agb_Angola.tif',
 'coastal-observatory/data/Mangrove_agb_Anguilla.tif',
 'coastal-observatory/data/Mangrove_agb_AntiguaAndBarbuda.tif',
 'coastal-observatory/data/Mangrove_agb_Aruba.tif',
 'coastal-observatory/data/Mangrove_agb_Australia.tif',
 'coastal-observatory/data/Mangrove_agb_Bahamas.tif',
 'coastal-observatory/data/Mangrove_agb_Bahrain.tif',
 'coastal-observatory/data/Mangrove_agb_Bangladesh.tif',
 'coastal-observatory/data/Mangrove_agb_Barbados.tif',
 'coastal-observatory/data/Mangrove_agb_Belize.tif',
 'coastal-observatory/data/Mangrove_agb_Benin.tif',
 'coastal-observatory/data/Mangrove_agb_Brazil.tif',
 'coastal-observatory/data/Mangrove_agb_BritishVirginIslands.tif',
 'coastal-observatory/data/Mangrove_agb_Brunei.tif',
 'coastal-observatory/data/Mangrove_agb_Cambodia.tif',
 'coastal-observatory/data/Mangrove_agb_Cameroon.tif',
 'coastal-observatory/data/Mangrove_agb_Carribea

In [10]:
def create_cog_filename(f):
    
    f = Path(f).stem
    # Example: "Mangrove_agb_AndamanAndNicobar.tif" -> "Mangrove_agb_AndamanAndNicobar_2000year.tif"
    # Example: "Mangrove_hmax95_Yemen.tif" -> "Mangrove_hmax95_Yemen_2000year.tif"
    
    # Simply append 2000year to the stem
    cog_filename = f"{f}_2000year.tif"
    return cog_filename

In [11]:
# Define COG profile for rasterio
COG_PROFILE = {
    "driver": "COG",
    "compress": "DEFLATE",
}

In [None]:
def validate_cog(filepath):
    """
    Validate that a file is a proper Cloud Optimized GeoTIFF.
    
    Args:
        filepath: Path to the file to validate
    
    Returns:
        tuple: (is_valid, details_dict) where is_valid is boolean and 
               details_dict contains validation information
    """
    import rasterio
    from rasterio.env import Env
    
    validation_details = {
        'is_cog': False,
        'has_tiles': False,
        'has_overviews': False,
        'tile_size': None,
        'overview_levels': [],
        'compression': None,
        'driver': None,
        'errors': []
    }
    
    try:
        with Env(GDAL_DISABLE_READDIR_ON_OPEN='EMPTY_DIR'):
            with rasterio.open(filepath) as src:
                # Check driver
                validation_details['driver'] = src.driver
                
                # Check if it's a GeoTIFF
                if src.driver != 'GTiff' and src.driver != 'COG':
                    validation_details['errors'].append(f"Invalid driver: {src.driver}, expected GTiff or COG")
                    return False, validation_details
                
                # Check for tiling
                if src.profile.get('tiled', False):
                    validation_details['has_tiles'] = True
                    validation_details['tile_size'] = (
                        src.profile.get('blockxsize', 0),
                        src.profile.get('blockysize', 0)
                    )
                else:
                    validation_details['errors'].append("File is not tiled")
                
                # Check for overviews
                overviews = src.overviews(1)  # Check band 1
                if overviews:
                    validation_details['has_overviews'] = True
                    validation_details['overview_levels'] = overviews
                else:
                    validation_details['errors'].append("No overviews found")
                
                # Check compression
                compression = src.profile.get('compress', None)
                validation_details['compression'] = compression
                if compression not in ['DEFLATE', 'LZW', 'ZSTD', 'WEBP', 'JPEG']:
                    validation_details['errors'].append(f"Compression '{compression}' may not be optimal for COG")
                
                # Check if file structure is cloud optimized
                # A COG should have IFD (Image File Directory) offsets arranged properly
                # This is a simplified check - true COG validation would check IFD ordering
                is_likely_cog = (
                    validation_details['has_tiles'] and 
                    validation_details['has_overviews'] and
                    validation_details['compression'] in ['DEFLATE', 'LZW', 'ZSTD', 'WEBP', 'JPEG']
                )
                
                validation_details['is_cog'] = is_likely_cog
                
                # Additional check for internal structure
                if hasattr(src, 'is_tiled') and src.is_tiled:
                    # Check tile size is reasonable (typically 256 or 512)
                    tile_x, tile_y = validation_details['tile_size']
                    if tile_x not in [256, 512, 1024] or tile_y not in [256, 512, 1024]:
                        validation_details['errors'].append(f"Non-standard tile size: {tile_x}x{tile_y}")
                
                return is_likely_cog, validation_details
                
    except Exception as e:
        validation_details['errors'].append(f"Validation error: {str(e)}")
        return False, validation_details

print("✅ COG validation function added")

In [None]:
def convert_to_proper_CRS_and_cogify(name, cog_filename, cog_data_bucket, cog_data_prefix, local_output_dir=None):
    s3_key = f"{cog_data_prefix}/{cog_filename}"
    reproject_filename = f"reproj/{cog_filename}"
    
    # Create necessary directories
    os.makedirs("reproj", exist_ok=True)
    
    # Create a temporary file for the downloaded S3 object
    temp_input_file = f"temp_{os.path.basename(name)}"

    try:
        # Download the file from S3 first
        print(f"[DOWNLOAD] Downloading {name} from S3...")
        s3_client.download_file(raw_data_bucket, name, temp_input_file)
        
        # Reproject using the local file
        print(f"[REPROJECT] {name} → {reproject_filename} (EPSG:4326)")
        with rasterio.open(temp_input_file) as src:
            dst_crs = "EPSG:4326"
            transform, width, height = calculate_default_transform(
                src.crs, dst_crs, src.width, src.height, *src.bounds
            )
            kwargs = src.meta.copy()
            kwargs.update({
                "driver": "COG",                 # write a COG instead of plain GTiff
                "compress": "DEFLATE",           # or "LZW"
                "crs": dst_crs,
                "transform": transform,
                "width": width,
                "height": height
            })

            with rasterio.open(f"{reproject_filename}", "w", **kwargs) as dst:
                reproject(
                    source=rasterio.band(src, 1),
                    destination=rasterio.band(dst, 1),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=dst_crs,
                    resampling=Resampling.nearest,
                    wrapdateline=True
                )

        # 3) COGify & upload
        print(f"[COGIFY] {reproject_filename} → s3://{cog_data_bucket}/{s3_key}")
        ds = rxr.open_rasterio(reproject_filename)
        ds = ds.rename({"y": "lat", "x": "lon"})
        ds.rio.set_spatial_dims("lon", "lat", inplace=True)
        ds.rio.write_nodata(-9999, inplace=True)

        with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as tmp:
            tmp_name = tmp.name
            ds.rio.to_raster(tmp_name, **COG_PROFILE)
            
            # Validate COG before uploading
            print(f"[VALIDATE] Checking if {cog_filename} is a valid COG...")
            is_valid_cog, validation_details = validate_cog(tmp_name)
            
            if is_valid_cog:
                print(f"[VALIDATE] ✅ Valid COG confirmed:")
                print(f"  - Tiled: {validation_details['has_tiles']} (tile size: {validation_details['tile_size']})")
                print(f"  - Overviews: {len(validation_details['overview_levels'])} levels {validation_details['overview_levels']}")
                print(f"  - Compression: {validation_details['compression']}")
            else:
                print(f"[VALIDATE] ⚠️ COG validation warnings:")
                if validation_details['errors']:
                    for error in validation_details['errors']:
                        print(f"  - {error}")
                
                # Decide whether to continue or fail based on severity
                critical_errors = [e for e in validation_details['errors'] if 'Invalid driver' in e]
                if critical_errors:
                    raise ValueError(f"Critical COG validation failed: {', '.join(critical_errors)}")
                else:
                    print(f"[VALIDATE] Proceeding with upload despite warnings...")
            
            # Upload to S3
            s3_client.upload_file(
                Filename = tmp_name, 
                Bucket = cog_data_bucket, 
                Key = s3_key)
            print(f"[SUCCESS] Uploaded to s3://{cog_data_bucket}/{s3_key}")
            
            # Save locally if output directory is specified
            if local_output_dir:
                os.makedirs(local_output_dir, exist_ok=True)
                local_path = os.path.join(local_output_dir, cog_filename)
                
                # Copy the COG file to local directory
                import shutil
                shutil.copy(tmp_name, local_path)
                print(f"[LOCAL SAVE] Saved COG to {local_path}")
            
    except Exception as e:
        print(f"[ERROR] Failed to process {name}: {str(e)}")
        raise
            
    finally:
        # Clean up temporary files
        if os.path.exists(temp_input_file):
            os.remove(temp_input_file)
            print(f"[CLEANUP] Removed temporary input file {temp_input_file}")
            
        # Clean up local intermediate
        if os.path.exists(reproject_filename):
            os.remove(reproject_filename)
            print(f"[CLEANUP] Removed intermediate {reproject_filename}")
            
        # Clean up temp COG file
        if 'tmp_name' in locals() and os.path.exists(tmp_name):
            os.remove(tmp_name)
            print(f"[CLEANUP] Removed temporary COG file")

In [None]:
# Initialize DataFrame to track processed files
files_processed = pd.DataFrame(columns=["file_name", "COGs_created"])

# Get local output directory from config
local_output_dir = config.get("local_output_dir")

# Create output directories
if local_output_dir:
    os.makedirs(local_output_dir, exist_ok=True)
    print(f"Local COGs will be saved to: {local_output_dir}")

# Process all files
for name in sorted(keys):
    cog_filename = create_cog_filename(name, start_str, end_str)
    print(f"\nProcessing: {name}")
    print(f"Output filename: {cog_filename}")
    
    # Process the file with local output directory
    convert_to_proper_CRS_and_cogify(name, cog_filename, cog_data_bucket, cog_data_prefix, local_output_dir)
    
    # Add to tracking DataFrame
    files_processed = files_processed._append(
        {"file_name": name, "COGs_created": cog_filename},
        ignore_index=True,
    )
    print(f"Generated and saved COG: {cog_filename}")

print("\nDone generating COGs")
if local_output_dir:
    print(f"COGs saved locally to: {local_output_dir}")

Local COGs will be saved to: output/cms-global-map-mangrove

Processing: coastal-observatory/data/Mangrove_agb_AndamanAndNicobar.tif
Output filename: Mangrove_agb_AndamanAndNicobar_2000-01-01day_2009-12-31.tif
[DOWNLOAD] Downloading coastal-observatory/data/Mangrove_agb_AndamanAndNicobar.tif from S3...
[REPROJECT] coastal-observatory/data/Mangrove_agb_AndamanAndNicobar.tif → reproj/Mangrove_agb_AndamanAndNicobar_2000-01-01day_2009-12-31.tif (EPSG:4326)
[COGIFY] reproj/Mangrove_agb_AndamanAndNicobar_2000-01-01day_2009-12-31.tif → s3://ghgc-data-store-dev/transformed_cogs/CMS_Global_Map_Mangrove_Canopy/Mangrove_agb_AndamanAndNicobar_2000-01-01day_2009-12-31.tif


In [None]:
# Save metadata if there are processed files
if len(files_processed) > 0:
    # Get metadata from one of the processed files
    sample_file = files_processed.iloc[0]['file_name']
    temp_sample_file = f"temp_{os.path.basename(sample_file)}"
    
    # Download sample file to extract metadata
    s3_client.download_file(raw_data_bucket, sample_file, temp_sample_file)
    
    with rasterio.open(temp_sample_file) as src:
        metadata = {
            "description": src.tags(),
            "driver": src.driver,
            "dtype": str(src.dtypes[0]),
            "nodata": src.nodata,
            "width": src.width,
            "height": src.height,
            "count": src.count,
            "crs": str(src.crs),
            "transform": list(src.transform),
            "bounds": list(src.bounds),
            "total_files_processed": len(files_processed),
            "year": "2000"
        }
    
    # Upload metadata
    with tempfile.NamedTemporaryFile(mode="w+") as fp:
        json.dump(metadata, fp, indent=2)
        fp.flush()
        
        s3_client.upload_file(
            Filename=fp.name,
            Bucket=bucket_name,
            Key=f"{cog_data_prefix}/metadata.json",
        )
        print(f"Uploaded metadata to s3://{bucket_name}/{cog_data_prefix}/metadata.json")
    
    # Clean up sample file
    if os.path.exists(temp_sample_file):
        os.remove(temp_sample_file)

# Save the files_processed DataFrame to CSV using the same s3_client
with tempfile.NamedTemporaryFile(mode="w+", suffix=".csv") as fp:
    files_processed.to_csv(fp.name, index=False)
    fp.flush()
    
    s3_client.upload_file(
        Filename=fp.name,
        Bucket=bucket_name,
        Key=f"{cog_data_prefix}/files_converted.csv",
    )
    print(f"Saved processing log to s3://{bucket_name}/{cog_data_prefix}/files_converted.csv")

In [None]:
# Display summary
print(f"\nProcessing Summary:")
print(f"Total files found: {len(keys)}")
print(f"Files processed: {len(files_processed)}")
print(f"\nProcessed files:")
files_processed


Processing Summary:
Total files found: 348
Files processed: 348

Processed files:


Unnamed: 0,file_name,COGs_created
0,coastal-observatory/data/Mangrove_agb_AndamanA...,Mangrove_agb_AndamanAndNicobar_2000-01-01day_2...
1,coastal-observatory/data/Mangrove_agb_Angola.tif,Mangrove_agb_Angola_2000-01-01day_2009-12-31.tif
2,coastal-observatory/data/Mangrove_agb_Anguilla...,Mangrove_agb_Anguilla_2000-01-01day_2009-12-31...
3,coastal-observatory/data/Mangrove_agb_AntiguaA...,Mangrove_agb_AntiguaAndBarbuda_2000-01-01day_2...
4,coastal-observatory/data/Mangrove_agb_Aruba.tif,Mangrove_agb_Aruba_2000-01-01day_2009-12-31.tif
...,...,...
343,coastal-observatory/data/Mangrove_hmax95_Venez...,Mangrove_hmax95_Venezuela_2000-01-01day_2009-1...
344,coastal-observatory/data/Mangrove_hmax95_Vietn...,Mangrove_hmax95_Vietnam_2000-01-01day_2009-12-...
345,coastal-observatory/data/Mangrove_hmax95_Virgi...,Mangrove_hmax95_VirginIslandsUs_2000-01-01day_...
346,coastal-observatory/data/Mangrove_hmax95_Walli...,Mangrove_hmax95_WallisAndFutuna_2000-01-01day_...
