<a href="https://colab.research.google.com/github/larasauser/master/blob/main/ML_sarafa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SSGP-Toolbox - Sarafanov et al. (2020)

# Configuration

In [None]:
# `os` is imported here because this configuration cell is executed before the
# notebook's main import cell; without it the path joins below raise NameError.
import os

# Root of the project on the mounted Google Drive and its working subfolders.
ROOT_DRIVE_PATH = '/content/drive/MyDrive/SSGP_L8G'
INPUTS_TIF = os.path.join(ROOT_DRIVE_PATH, 'inputs')    # GeoTIFFs to be gap-filled
HISTORY_TIF = os.path.join(ROOT_DRIVE_PATH, 'history')  # historical GeoTIFFs
EXTRA_TIF = os.path.join(ROOT_DRIVE_PATH, 'extra')      # auxiliary rasters (biome)
FILLED_DIR = os.path.join(ROOT_DRIVE_PATH, 'filled')    # final gap-filled products

In [None]:
# Area of interest as a bounding box; the values are passed to the WGS84 -> UTM
# conversion and to the Earth Engine region rectangle further down.
EXTENT = dict(
    minX=6.38,
    minY=46.54,
    maxX=6.5,
    maxY=46.63,
)

In [None]:
# Sentinel pixel values handed to the gap filler and stored in each metadata file.
KEY_VALUES = dict(
    gap=-100.0,
    skip=-200.0,
    NoData=-32768.0,
)

In [None]:
# Target pixel size of the output grid along x and y (used as UTM grid spacing).
RESOLUTION = dict(
    xRes=30,
    yRes=30,
)

In [None]:
# Biome (land-cover) raster settings.
# Switches: run the Earth Engine export, and resample its result to the 30 m grid.
EXPORT_BIOME = True
RESAMPLE_TO_30M = True
# Export scale requested from Earth Engine: the native MODIS resolution. The
# exported raster is resampled to 30 m later in this notebook.
BIOME_GEE_SCALE = 500
# Final location of the biome raster inside the extra folder.
BIOME_FILENAME = os.path.join(EXTRA_TIF, "biome.tif")

In [None]:
# Options forwarded to the SSGP gap filler.
# Method name as used in the toolbox's SimpleSpatialGapfiller.
SSGP_METHOD = "RandomForest"
# Predictor selection strategy and hyper-parameter search mode.
PREDICTOR_CONFIG = "Biome"
HYPERPARAMS = "RandomGridSearch"
# Passed as `add_outputs` to fill_gaps.
ADD_OUTPUTS_TO_HISTORY = False
# Run pixel filling in parallel where the class supports it.
PARALLEL = True

# Environment setup

In [None]:
# Mount Google Drive
# The project paths in the configuration section all live under
# /content/drive/MyDrive, so the Drive has to be mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install SSGP toolbox from GitHub (if not already installed)
# IPython shell magic: only works inside a notebook/IPython session.
!pip install git+https://github.com/Dreamlone/SSGP-toolbox.git

In [None]:
# Install Earth Engine Python API if biome export required
# EXPORT_BIOME comes from the configuration section; the `!` line is an IPython
# shell magic and only works inside a notebook/IPython session.
if EXPORT_BIOME:
    !pip install earthengine-api

In [None]:
# Install the GDAL system packages (gdal-bin, libgdal-dev) and then pinned
# versions of rasterio and pyproj. Note: geopandas is NOT installed by this cell.
# All lines are IPython shell magics and only work inside a notebook session.
!apt-get update -qq
!apt-get install -y -qq gdal-bin libgdal-dev
!pip install rasterio==1.3.8 pyproj==3.5.0

# Imports

In [None]:
import os, re, json, shutil, time
import numpy as np
from glob import glob
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.enums import Resampling as RioResampling
from datetime import datetime
import pyproj

In [None]:
# SSGP toolbox imports
# The import is best-effort: a failure is only reported, not raised, so a
# missing SimpleSpatialGapfiller will surface later as a NameError when the
# gap filler is instantiated.
try:
    # NOTE(review): S3_L2_LST is not referenced anywhere else in the visible
    # notebook — confirm whether this import is still needed.
    from SSGPToolbox.preparators.Sentinel3.S3_L2_LST import S3_L2_LST
    # The toolbox provides a main class for gap filling; we'll use their SimpleSpatialGapfiller implementation
    from SSGPToolbox.gapfillers.SimpleSpatialGapfiller import SimpleSpatialGapfiller
except Exception as e:
    # If the above structure is different, user installed toolbox but import paths may vary
    print('SSGP toolbox import failed (this may be OK if toolbox structure differs). Install completed?')
    print(e)

# Utility functions

In [None]:
def ensure_dirs():
    """Create the inputs, history, extra and filled folders when missing.

    Existing folders are left untouched.
    """
    required_folders = (INPUTS_TIF, HISTORY_TIF, EXTRA_TIF, FILLED_DIR)
    for folder_path in required_folders:
        os.makedirs(folder_path, exist_ok=True)

In [None]:
# Clock time written as HH:MM:SS that is NOT part of a longer colon-separated
# digit run. The look-arounds stop the date half of a TIFF-style
# "YYYY:MM:DD HH:MM:SS" stamp from being mistaken for a time ("YY:MM:DD").
_TAG_CLOCK_RE = re.compile(r'(?<![\d:])(\d{2}):(\d{2}):(\d{2})(?![\d:])')
# Compact HHMMSS, optionally followed by fractional seconds and/or a trailing Z.
_TAG_COMPACT_RE = re.compile(r'(\d{2})(\d{2})(\d{2})(?:\.\d+)?Z?')


def _time_of_day_from_tag(tag_value):
    """Return the time of day in *tag_value* as 'HHMMSS', or None if absent."""
    text = str(tag_value).strip()
    hit = _TAG_CLOCK_RE.search(text) or _TAG_COMPACT_RE.fullmatch(text)
    return ''.join(hit.groups()) if hit else None


def parse_date_from_filename(fname):
    """Build an SSGP-style ``YYYYMMDDTHHMMSS`` timestamp for a raster file.

    The date is taken from the file name (``YYYY-MM-DD``, ``YYYY_MM_DD`` or
    ``YYYYMMDD``), e.g. ``NDVI_2023-10-07.tif -> 20231007T000000``.

    The time of day is resolved in this order:
      1. a time found in one of the GeoTIFF tags listed below (preferred);
      2. a ``THHMMSS`` suffix already present after the date in the file name
         (keeps already-renamed files stable when the tags carry no time);
      3. midnight (``000000``).

    If the file name contains no date at all, the file's modification time
    (UTC) is used for both date and time.

    Args:
        fname: Path to the raster file.

    Returns:
        The timestamp string formatted as ``YYYYMMDDTHHMMSS``.

    Raises:
        OSError: If the name holds no date and the file's modification time
            cannot be read.
    """
    # Local import: the file-level import only brings `datetime` into scope.
    from datetime import timezone

    stem = os.path.splitext(os.path.basename(fname))[0]

    # Date, optionally followed by an already-embedded THHMMSS time.
    date_hit = re.search(
        r'(\d{4})[-_]?([01]\d)[-_]?([0-3]\d)(?:T(\d{2})(\d{2})(\d{2}))?', stem)
    if not date_hit:
        # Fallback to file modified time (timezone-aware UTC).
        modified = datetime.fromtimestamp(os.path.getmtime(fname), tz=timezone.utc)
        return modified.strftime('%Y%m%dT%H%M%S')

    y, mm, dd = date_hit.group(1, 2, 3)
    name_time = ''.join(date_hit.group(4, 5, 6)) if date_hit.group(4) else '000000'
    fallback = f"{y}{mm}{dd}T{name_time}"

    try:
        with rasterio.open(fname) as src:
            tags = src.tags()
        # Try several common keys, in priority order.
        for key in ('ACQUISITION_TIME', 'SCENE_CENTER_TIME', 'TIFFTAG_DATETIME',
                    'ACQUISITION_DATETIME', 'DATE_TIME'):
            value = tags.get(key)
            if not value:
                continue
            hhmmss = _time_of_day_from_tag(value)
            if hhmmss:
                return f"{y}{mm}{dd}T{hhmmss}"
    except Exception:
        # Reading tags is best-effort: an unreadable file must not block the
        # renaming, so any failure falls back to the name-derived timestamp.
        return fallback
    return fallback

In [None]:
def rename_files_to_ssgp_format(folder):
    """Rename every ``*.tif`` in *folder* to ``L8_NDVI_YYYYMMDDTHHMMSS.tif`` in place.

    The timestamp comes from ``parse_date_from_filename``. Files that already
    carry their target name are left alone. A file whose target name is
    already taken by another file is skipped with a printed warning instead
    of overwriting that file (``os.rename`` would silently replace it on
    POSIX systems).

    Args:
        folder: Directory containing the GeoTIFFs.

    Returns:
        List of the new paths of the files that were actually renamed.
    """
    renamed = []
    for current_path in sorted(glob(os.path.join(folder, '*.tif'))):
        stamp = parse_date_from_filename(current_path)
        target_path = os.path.join(folder, f"L8_NDVI_{stamp}.tif")
        if os.path.basename(current_path) == os.path.basename(target_path):
            continue
        if os.path.exists(target_path):
            # Two sources resolved to the same timestamp: keep both files.
            print(f"Skipping rename of {current_path}: {target_path} already exists")
            continue
        os.rename(current_path, target_path)
        renamed.append(target_path)
    return renamed

In [None]:
def get_utm_code_and_extent(extent):
    """Pick the UTM zone for a WGS84 bounding box and project its corners.

    The zone and hemisphere are chosen from the centroid of *extent*. The
    lower-left and upper-right corners are then projected to that UTM CRS.

    The projection uses ``pyproj.Transformer`` with ``always_xy=True`` so the
    coordinates are always interpreted as (longitude, latitude); the
    deprecated ``pyproj.transform`` honours the axis order declared by the
    CRS, which for EPSG:4326 is latitude first.

    Args:
        extent: Dict with ``minX``/``maxX`` (longitude) and ``minY``/``maxY``
            (latitude) in degrees.

    Returns:
        Tuple ``(utm_code, utm_extent)`` where ``utm_code`` is the integer
        EPSG code (326xx north / 327xx south) and ``utm_extent`` is a dict
        with ``minX``, ``minY``, ``maxX``, ``maxY`` taken from the two
        projected corners.
    """
    minX, minY, maxX, maxY = extent['minX'], extent['minY'], extent['maxX'], extent['maxY']

    # Hemisphere from the latitude centroid, zone number from the longitude centroid.
    y_centroid = (minY + maxY) / 2.0
    base_code = 32700 if y_centroid < 0 else 32600
    x_centroid = (minX + maxX) / 2.0
    zone = int(((x_centroid + 180) / 6.0) % 60) + 1
    utm_code = base_code + zone

    to_utm = pyproj.Transformer.from_crs('EPSG:4326', f'EPSG:{utm_code}', always_xy=True)
    min_corner = to_utm.transform(minX, minY)
    max_corner = to_utm.transform(maxX, maxY)
    utm_extent = {
        'minX': min_corner[0],
        'minY': min_corner[1],
        'maxX': max_corner[0],
        'maxY': max_corner[1],
    }
    return utm_code, utm_extent

In [None]:
def geotiff_to_npy_and_metadata(tif_path, out_folder, extent, resolution, key_values,
                                resampling=None):
    """Warp a GeoTIFF onto the project UTM grid and save it as ``.npy`` plus JSON metadata.

    Band 1 of the raster is read as float32 and reprojected in a single step
    from its own CRS onto the grid defined by *extent* (converted to UTM) and
    *resolution*. One warp is enough to change CRS, extent and pixel size, so
    no intermediate reprojection is performed.

    Destination pixels not covered by the source keep ``key_values['NoData']``
    (it is passed as ``dst_nodata`` so the warp does not re-initialise the
    array with another value). NaN pixels in the result are replaced by
    ``key_values['gap']``.

    Args:
        tif_path: Path of the source GeoTIFF.
        out_folder: Folder receiving ``<basename>.npy`` and
            ``<basename>_metadata.json``; created if missing.
        extent: WGS84 bounding box dict (``minX``, ``minY``, ``maxX``, ``maxY``).
        resolution: Dict with ``xRes`` and ``yRes`` (target pixel size).
        key_values: Dict with at least ``gap`` and ``NoData`` sentinel values.
        resampling: Optional ``rasterio`` resampling method. Defaults to
            bilinear; pass ``Resampling.nearest`` for categorical rasters.

    Returns:
        Tuple ``(npy_path, meta_path)`` of the two files written.
    """
    if resampling is None:
        resampling = Resampling.bilinear

    # Read the source band together with its georeferencing.
    with rasterio.open(tif_path) as src:
        arr = src.read(1).astype(np.float32)
        src_transform = src.transform
        src_crs = src.crs

    # Compute UTM projection and target extent.
    utm_code, utm_extent = get_utm_code_and_extent(extent)
    dst_crs = f'EPSG:{utm_code}'

    # Build target transform and shape (origin at the upper-left corner).
    xres, yres = resolution['xRes'], resolution['yRes']
    minx, miny = utm_extent['minX'], utm_extent['minY']
    maxx, maxy = utm_extent['maxX'], utm_extent['maxY']
    width = int(np.ceil((maxx - minx) / xres))
    height = int(np.ceil((maxy - miny) / yres))
    dst_transform = rasterio.transform.from_origin(minx, maxy, xres, yres)

    # Create destination array and reproject straight onto the target grid.
    nodata_value = key_values.get('NoData')
    dest = np.full((height, width), nodata_value, dtype=np.float32)
    reproject(
        source=arr,
        destination=dest,
        src_transform=src_transform,
        src_crs=src_crs,
        dst_transform=dst_transform,
        dst_crs=dst_crs,
        dst_nodata=nodata_value,
        resampling=resampling,
    )

    # Replace NaN with the gap marker so the gap filler treats it as missing.
    dest[np.isnan(dest)] = key_values.get('gap')

    # Save npy
    os.makedirs(out_folder, exist_ok=True)
    basename = os.path.splitext(os.path.basename(tif_path))[0]
    npy_path = os.path.join(out_folder, f"{basename}.npy")
    np.save(npy_path, dest.astype(np.float32))

    # Save metadata similar to SSGP
    metadata = {
        'file_name': os.path.basename(tif_path),
        'satellite': 'L8',
        # Last underscore-separated token of the name (the timestamp for
        # files named L8_NDVI_<timestamp>.tif).
        'datetime': basename.split('_')[-1],
        'extent': extent,
        'utm_code': utm_code,
        'utm_extent': utm_extent,
        'resolution': resolution,
        'key_values': key_values,
    }
    meta_path = os.path.join(out_folder, f"{basename}_metadata.json")
    with open(meta_path, 'w') as f:
        json.dump(metadata, f, indent=4)
    return npy_path, meta_path

# Biome export

In [None]:
if EXPORT_BIOME:
    import ee

    # Reuse stored credentials when possible; otherwise run the interactive
    # authentication flow once and initialise again.
    try:
        ee.Initialize()
    except Exception:
        ee.Authenticate()
        ee.Initialize()

    # Define region geometry from EXTENT
    region = ee.Geometry.Rectangle([EXTENT['minX'], EXTENT['minY'], EXTENT['maxX'], EXTENT['maxY']])

    # MODIS landcover (MCD12Q1). LC_Type1 uses IGBP classification
    # NOTE(review): the Earth Engine catalog may list a newer version of this
    # collection — confirm the ID is still current before relying on it.
    modis = ee.ImageCollection('MODIS/006/MCD12Q1').select('LC_Type1')

    # Use the most recent year available in 2019-2023, or in the whole
    # collection if that window is empty. An empty collection cannot be
    # detected with `is None` on the client (ee.Image(...) is always an
    # object), so the size is requested from the server instead.
    recent_years = modis.filter(ee.Filter.calendarRange(2019, 2023, 'year'))
    source_collection = recent_years if recent_years.size().getInfo() > 0 else modis
    year_image = ee.Image(source_collection.sort('system:time_start', False).first())

    # Clip to region. No .resample() call: Earth Engine resamples with nearest
    # neighbour by default, and ee.Image.resample only accepts 'bilinear' or
    # 'bicubic', neither of which suits class codes.
    biome_img = year_image.clip(region)

    # Export to Drive at the MODIS scale; the 30 m resampling happens locally below.
    task = ee.batch.Export.image.toDrive(
        image=biome_img,
        description='export_biome_modis',
        folder='SSGP_L8G_gee_exports',
        fileNamePrefix='biome_modis',
        region=region.bounds().getInfo()['coordinates'],
        scale=BIOME_GEE_SCALE,
        crs='EPSG:4326',
        maxPixels=1e13
    )
    task.start()
    print('Biome export started in Earth Engine. Monitor Tasks in your GEE account. Waiting for completion...')

    # Simple waiting loop (polling). In Colab you may prefer to check the Tasks tab manually.
    status = task.status()
    while status['state'] in ['READY', 'RUNNING']:
        print('Task state:', status)
        time.sleep(10)
        status = task.status()
    print('Task finished with state:', status)
    if status['state'] != 'COMPLETED':
        print('WARNING: the export did not complete; any biome file found below comes from an earlier export.')

    # The exported file will be in your Google Drive in folder 'SSGP_L8G_gee_exports'.
    # Attempt auto-detection of the Drive export artifact.
    # NOTE(review): the file may presumably take a moment to appear in the
    # mounted Drive after the task completes — re-run this cell if it is not found.
    drive_export_folder = '/content/drive/MyDrive/SSGP_L8G_gee_exports'
    exported_candidates = glob(os.path.join(drive_export_folder, 'biome_modis*.tif'))
    if not exported_candidates:
        # The later conversion step looks for BIOME_FILENAME, so that is the
        # name a manually copied file must have.
        print(f"No exported biome found automatically. Please move the exported TIFF from your "
              f"Drive's SSGP_L8G_gee_exports to the extra folder and name it "
              f"{os.path.basename(BIOME_FILENAME)}")
    else:
        # The extra folder may not exist yet at this point of the notebook.
        os.makedirs(EXTRA_TIF, exist_ok=True)
        # Prefer the newest file in case earlier exports are still in the folder.
        src_biome = max(exported_candidates, key=os.path.getmtime)
        dst_biome = BIOME_FILENAME.replace('.tif', '_modis500m.tif')
        shutil.copy(src_biome, dst_biome)
        print('Copied biome export to', dst_biome)

        if RESAMPLE_TO_30M:
            # Resample to the 30 m UTM grid with nearest neighbour (class codes).
            with rasterio.open(dst_biome) as srcf:
                data = srcf.read(1)
                src_transform = srcf.transform
                src_crs = srcf.crs

            # Determine target UTM extent and transform (same EXTENT, RESOLUTION grid)
            utm_code, utm_extent = get_utm_code_and_extent(EXTENT)
            xres, yres = RESOLUTION['xRes'], RESOLUTION['yRes']
            minx, miny = utm_extent['minX'], utm_extent['minY']
            maxx, maxy = utm_extent['maxX'], utm_extent['maxY']
            width = int(np.ceil((maxx - minx) / xres))
            height = int(np.ceil((maxy - miny) / yres))
            dst_transform = rasterio.transform.from_origin(minx, maxy, xres, yres)

            dst_profile = {
                'driver': 'GTiff',
                'height': height,
                'width': width,
                'count': 1,
                'dtype': data.dtype,
                'crs': f'EPSG:{utm_code}',
                'transform': dst_transform
            }
            biome_out = BIOME_FILENAME
            with rasterio.open(biome_out, 'w', **dst_profile) as dst:
                reproject(
                    source=data,
                    destination=rasterio.band(dst, 1),
                    src_transform=src_transform,
                    src_crs=src_crs,
                    dst_transform=dst_transform,
                    dst_crs=f'EPSG:{utm_code}',
                    resampling=RioResampling.nearest
                )
            print('Biome resampled and saved to', biome_out)
        else:
            # Without local resampling the export itself becomes the biome
            # raster, so the later conversion step can still find it.
            shutil.copy(dst_biome, BIOME_FILENAME)
            print('Biome saved without resampling to', BIOME_FILENAME)

# Prepare data (rename + npy)

In [None]:
# Make sure the working folders exist, then rename the rasters of both the
# inputs and history folders in place to L8_NDVI_<timestamp>.tif.
ensure_dirs()
print('Renaming input and history TIFFs to SSGP filename format...')
rename_files_to_ssgp_format(INPUTS_TIF)
rename_files_to_ssgp_format(HISTORY_TIF)

In [None]:
# Convert every TIFF of the history and inputs folders to .npy + metadata.
print('Converting TIFFs to NPY+metadata...')
# (source folder, SSGP project subfolder): SSGP expects History/Inputs/Extra.
conversion_plan = ((HISTORY_TIF, 'History'), (INPUTS_TIF, 'Inputs'))
for tif_dir, ssgp_subfolder in conversion_plan:
    target_dir = os.path.join(ROOT_DRIVE_PATH, ssgp_subfolder)
    for tif_file in sorted(glob(os.path.join(tif_dir, '*.tif'))):
        # Created lazily so an empty source folder leaves no output folder behind.
        os.makedirs(target_dir, exist_ok=True)
        npy_path, meta_path = geotiff_to_npy_and_metadata(
            tif_file, target_dir, EXTENT, RESOLUTION, KEY_VALUES)
        print('Saved', npy_path, meta_path)

In [None]:
# Convert the biome raster (if it exists) into the project's Extra folder.
if os.path.exists(BIOME_FILENAME):
    out_extra = os.path.join(ROOT_DRIVE_PATH, 'Extra')
    os.makedirs(out_extra, exist_ok=True)
    biome_npy, _ = geotiff_to_npy_and_metadata(BIOME_FILENAME, out_extra, EXTENT, RESOLUTION, KEY_VALUES)
    # Copy (not rename) the array to the expected name Extra.npy; the original
    # <basename>.npy stays next to its metadata file.
    extra_dst = os.path.join(out_extra, 'Extra.npy')
    shutil.copy(biome_npy, extra_dst)
    print('Biome converted and saved as', extra_dst)
else:
    print('No biome file found in extra; SSGP Biome mode will fail unless Extra/Extra.npy exists.')

# Gapfilling

In [None]:
# The SSGP toolbox expects a project directory with subfolders 'History', 'Inputs', 'Extra'.
# Those subfolders are created under ROOT_DRIVE_PATH by the conversion cells
# above, so the Drive root doubles as the project directory.
PROJECT_DIR = ROOT_DRIVE_PATH # it already contains History, Inputs, Extra subfolders now

In [None]:
# Instantiate the gapfiller
# Any failure is reported with a hint and then re-raised unchanged, so the
# notebook stops here. This includes a NameError when the earlier best-effort
# toolbox import did not succeed.
try:
    gapfiller = SimpleSpatialGapfiller(PROJECT_DIR, parallel=PARALLEL)
except Exception as e:
    print('Failed to import SimpleSpatialGapfiller from installed toolbox. Ensure the toolbox is installed and import path matches.')
    raise

In [None]:
# Run fill_gaps with specified options
# All arguments come from the configuration section; `params=None` passes no
# explicit model parameters.
gapfiller.fill_gaps(method=SSGP_METHOD,
                    predictor_configuration=PREDICTOR_CONFIG,
                    hyperparameters=HYPERPARAMS,
                    params=None,
                    add_outputs=ADD_OUTPUTS_TO_HISTORY,
                    key_values=KEY_VALUES)

In [None]:
# Outputs are saved to PROJECT_DIR/Outputs by the class. Copy them to
# FILLED_DIR and also export each one as a GeoTIFF using the metadata JSON
# written during the TIFF -> NPY conversion.
outputs_dir = os.path.join(PROJECT_DIR, 'Outputs')
if not os.path.exists(outputs_dir):
    print('No Outputs folder found. Did the gapfiller run successfully?')
else:
    os.makedirs(FILLED_DIR, exist_ok=True)
    for f in sorted(glob(os.path.join(outputs_dir, '*.npy'))):
        base = os.path.splitext(os.path.basename(f))[0]
        out_npy = os.path.join(FILLED_DIR, os.path.basename(f))
        shutil.copy(f, out_npy)

        # Metadata was saved alongside the input npys: look in History first,
        # then in Inputs.
        meta_candidate = os.path.join(PROJECT_DIR, 'History', base + '_metadata.json')
        if not os.path.exists(meta_candidate):
            meta_candidate = os.path.join(PROJECT_DIR, 'Inputs', base + '_metadata.json')
        if not os.path.exists(meta_candidate):
            print('Metadata not found for', base, '— saved only .npy')
            continue

        with open(meta_candidate, 'r') as mf:
            meta = json.load(mf)

        # Reconstruct the georeferencing from the metadata and the array shape.
        arr = np.load(f)
        utm_code = meta.get('utm_code')
        utm_ext = meta.get('utm_extent')
        res = meta.get('resolution')
        xres, yres = res['xRes'], res['yRes']
        minx, maxy = utm_ext['minX'], utm_ext['maxY']
        height, width = arr.shape[0], arr.shape[1]
        transform = rasterio.transform.from_origin(minx, maxy, xres, yres)
        profile = {
            'driver': 'GTiff',
            'height': height,
            'width': width,
            'count': 1,
            'dtype': 'float32',
            'crs': f'EPSG:{utm_code}',
            'transform': transform
        }
        # Tag the NoData sentinel so GIS software does not read it as data.
        nodata_value = (meta.get('key_values') or {}).get('NoData')
        if nodata_value is not None:
            profile['nodata'] = nodata_value

        out_tif = os.path.join(FILLED_DIR, base + '.tif')
        with rasterio.open(out_tif, 'w', **profile) as dst:
            dst.write(arr.astype(np.float32), 1)
        print('Saved filled GeoTIFF:', out_tif)


print('ALL DONE. Filled files are in', FILLED_DIR)