In [4]:
import os 
import numpy as np
import rasterio
import torch
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

def get_best_gpu():
    """Selects the best available GPU based on memory size."""
    if not torch.cuda.is_available():
        return 'CPU'
    
    num_gpus = torch.cuda.device_count()
    if num_gpus == 1:
        return 'cuda:0'
    
    best_gpu = max(range(num_gpus), key=lambda i: torch.cuda.get_device_properties(i).total_memory)
    return f'cuda:{best_gpu}'

def read_dem(dem_path):
    """Reads a DEM file and returns the data as a NumPy array along with its metadata."""
    with rasterio.open(dem_path) as src:
        data = src.read(1).astype(np.float32)  # Read first band
        meta = src.meta
    return data, meta

def mask_invalid_values(data, nodata_value=-9999, min_valid=-999, max_valid=10000):
    """Masks out invalid values by setting them to NaN."""
    data = np.where((data <= min_valid) | (data >= max_valid) | (data == nodata_value), np.nan, data)
    return data

def interpolate_missing_values(data, model_type='catboost'):
    """Performs interpolation on missing values in a DEM using the specified model."""
    mask = np.isfinite(data)
    coords = np.array(np.nonzero(mask)).T
    values = data[mask]
    missing_coords = np.array(np.nonzero(~mask)).T
    
    if len(values) == 0 or len(missing_coords) == 0:
        print("No valid data to train the model or no missing values to interpolate.")
        return data
    
    model = None
    device = get_best_gpu()
    
    if model_type == 'rf':
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif model_type == 'catboost':
        model = CatBoostRegressor(iterations=1000, verbose=100, task_type='GPU' if 'cuda' in device else 'CPU', devices=[int(device.split(':')[-1])] if 'cuda' in device else None)
    elif model_type == 'lightgbm':
        model = LGBMRegressor(n_estimators=100, learning_rate=0.1)
    elif model_type == 'xgboost':
        model = XGBRegressor(n_estimators=100, learning_rate=0.1, objective='reg:squarederror')
    else:
        raise ValueError("Unsupported model type. Choose from 'rf', 'catboost', 'lightgbm', or 'xgboost'.")
    
    model.fit(coords, values)
    data[~mask] = model.predict(missing_coords)
    return data

def save_dem(dem_path, data, meta):
    """Saves the processed DEM back to a file."""
    meta.update(dtype=rasterio.float32, nodata=np.nan)
    with rasterio.open(dem_path, 'w', **meta) as dst:
        dst.write(data, 1)

def demvfill_byML(dem_ipath, dem_opath, model_type='catboost'):
    """Full pipeline: Read, mask, interpolate, and save the DEM."""
    print('read_dem')
    data, meta = read_dem(dem_ipath)
    print('mask_invalid_values')
    data = mask_invalid_values(data)
    print('interpolate_missing_values')
    data = interpolate_missing_values(data, model_type)
    print('save_dem')
    save_dem(dem_opath, data, meta)
    print('demvfill_byML')


In [5]:
get_best_gpu()

'cuda:1'

In [9]:
from upaths import OUT_TILES_DPATH
import os

X = 30
outdir = f"{OUT_TILES_DPATH}/DEMVFILL/TILES{X}"
tiles_xdpath = f"{OUT_TILES_DPATH}/TILES{X}"

- do hpo ? you mean ensemble here , and check other places as well 
- if this works, rolfs criteria: only radar data, and no extra data to merge, purly unsupervised ::: write a whole paper here 
- add features like dem derivatives to help the model make train and make prediction
- check if the file exisit before loading data to run this, save the model in models directoy and load if existing 
- train a bigger model with all the files, and their voids, so it becomes void-filler no self training need, and compare it with self local training output 
- in the tile analysed at 30m, my method outperfomrs edem (gold standard) expected elevation and deviation 
- do for ag local and global too 

# modules: 
- a) voidfiller, 
- b)feature recovery 
- c)baias reduction 
- d)ground separation 
- d)morphometry

In [12]:
tilenames = os.listdir(tiles_xdpath)

In [15]:
for tilename in tilenames:
    #tilename = 'N10E105'
    dem_ipath = f"{tiles_xdpath}/{tilename}/{tilename}_tdem_DEM__Fw.tif"
    tile_odpath = f"{outdir}/{tilename}/" 
    os.makedirs(tile_odpath, exist_ok=True)
    dem_opath = f"{tile_odpath}/{tilename}_tdem_DEM__iML.tif"

    print(os.path.isfile(dem_ipath))
    demvfill_byML(dem_ipath, dem_opath, model_type='catboost')
    #print(dem_ipath)

True
read_dem
mask_invalid_values
interpolate_missing_values
Learning rate set to 0.132733
0:	learn: 1.8310222	total: 25.1ms	remaining: 25.1s
100:	learn: 1.2310032	total: 2.31s	remaining: 20.5s
200:	learn: 1.2066771	total: 4.59s	remaining: 18.3s
300:	learn: 1.1944815	total: 6.87s	remaining: 16s
400:	learn: 1.1866061	total: 9.15s	remaining: 13.7s
500:	learn: 1.1798905	total: 11.4s	remaining: 11.4s
600:	learn: 1.1744044	total: 13.7s	remaining: 9.12s
700:	learn: 1.1691966	total: 16s	remaining: 6.82s
800:	learn: 1.1653670	total: 18.3s	remaining: 4.54s
900:	learn: 1.1620099	total: 20.6s	remaining: 2.26s
999:	learn: 1.1589038	total: 22.8s	remaining: 0us
save_dem
demvfill_byML
True
read_dem
mask_invalid_values
interpolate_missing_values
Learning rate set to 0.116232
0:	learn: 2.1691458	total: 11.4ms	remaining: 11.4s
100:	learn: 1.9065536	total: 974ms	remaining: 8.67s
200:	learn: 1.8634730	total: 1.92s	remaining: 7.62s
300:	learn: 1.8326660	total: 2.88s	remaining: 6.69s
400:	learn: 1.8109035	t

models:
- RF [x]: too expensive 
- CB [x]
- same method as agglomerative :: not really got gpu - about 2mins tile at 30m -> should be 10 mins at 12 