In [1]:
import os 
import time

In [2]:
def print_context(text):
    """
    Print the given text inside a bordered "ML - Process Log" banner.

    The output is, in order: a line holding a single space, a border of 60
    "=" characters, the title line, a second border, every line of ``text``
    indented by four spaces, and a closing border.

    Parameters:
        text (str): The text to print. It is split on newline characters and
            each resulting line is printed separately.
    """
    # Spacer line that visually separates this banner from earlier output.
    print(' ')

    # Border symbols
    border_char = "=" * 60
    padding_char = " " * 4

    # Print formatted text
    print(border_char)
    print(f"{padding_char}ML - Process Log")
    print(border_char)

    for line in text.split("\n"):
        print(f"{padding_char}{line}")

    print(border_char)

def measure_time_beautifully(task_description, task_function, *args, **kwargs):
    """
    Call ``task_function(*args, **kwargs)``, print a timing report, return its result.

    The duration is measured with ``time.perf_counter`` around the call and
    reported in seconds, minutes, hours and days, each with two decimals.

    Parameters:
        task_description (str): Label shown on the "Task:" line of the report.
        task_function (callable): The function to run and time.
        *args: Positional arguments forwarded to ``task_function``.
        **kwargs: Keyword arguments forwarded to ``task_function``.

    Returns:
        Whatever ``task_function`` returned.
    """
    started_at = time.perf_counter()
    result = task_function(*args, **kwargs)
    elapsed_seconds = time.perf_counter() - started_at

    # Each unit is derived from the previous one: seconds -> minutes -> hours -> days.
    durations = [("seconds", elapsed_seconds)]
    for unit, per_previous_unit in (("minutes", 60), ("hours", 60), ("days", 24)):
        durations.append((unit, durations[-1][1] / per_previous_unit))

    rule = "=" * 60
    indent = " " * 4

    report_lines = [
        rule,
        f"{indent}Task Performance Report",
        rule,
        f"{indent}Task: {task_description}",
        f"{indent}Elapsed Time:",
    ]
    report_lines.extend(f"{indent * 2}{value:.2f} {unit}" for unit, value in durations)
    report_lines.append(rule)

    for report_line in report_lines:
        print(report_line)

    return result


In [3]:
import pandas as pd 
from ud_tilepartquets import list_files_by_tilenames
from pprint import pprint

In [4]:
# Full column list; its first eight entries are the ones reused as features in `fcol`.
ftcol = [
    'egm08', 'egm96', 'tdem_hem',
    'multi_s1_band1', 'multi_s1_band2',
    'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3',
    'edem_w84', 'tdem_dem__fw', 'multi_dtm_lidar',
]

# --- Run settings passed to full_pipeline ---
N = 1000   # rows sampled per parquet file
X = 90     # NOTE(review): presumably the tile resolution/size selector — confirm
model_type = "catboost"
num_rounds = 10000  # d:1000

# --- Column roles ---
yvar = "zdif"             # name of the computed target column
tcol = 'edem_w84'         # "edem"
rcol = 'multi_dtm_lidar'  # column subtracted from `tcol` to build `yvar`
fcol = [
    'egm08',
    'egm96',
    'tdem_hem',
    'multi_s1_band1',
    'multi_s1_band2',
    'multi_s2_band1',
    'multi_s2_band2',
    'multi_s2_band3',
]  # 'edem_w84' is deliberately left out

def estimate_nsamples(target_samples=81_000_000, num_tiles=6, multipliers=(0.2, 0.3, 0.5, 0.8, 1, 2, 3)):
    """
    Compute one per-tile sample size for each multiplier.

    The base size is ``target_samples // num_tiles`` (integer division); each
    returned value is that base times a multiplier, truncated with ``int()``.

    Parameters:
        target_samples (int): Total number of samples to spread across tiles.
        num_tiles (int): Number of tiles the total is divided by.
        multipliers (iterable of numbers): Scale factors applied to the
            per-tile base size. The default is an immutable tuple so it
            cannot be modified through the shared default object.

    Returns:
        list[int]: One sample size per multiplier, in the same order.

    Raises:
        ZeroDivisionError: If ``num_tiles`` is 0.
    """
    samples_per_tile = target_samples // num_tiles
    return [int(samples_per_tile * multiplier) for multiplier in multipliers]

# Compute one per-tile sample size for each multiplier.
multipliers = [0.1, 0.2, 0.3, 0.5, 0.8, 1, 2, 3]
target_samples = 81_000_000
num_tiles = 6
Nsamples = estimate_nsamples(target_samples, num_tiles, multipliers)
print(Nsamples)
# Sanity check: exactly one sample size must come back per multiplier.
assert len(Nsamples) == len(multipliers), "Multipliers do not match"

[1350000, 2700000, 4050000, 6750000, 10800000, 13500000, 27000000, 40500000]


In [7]:
from uvars import RES_DPATH, tilenames_lidar
from ud_tilepartquets import dropnulls_bycol,check_fillnulls
from sklearn.model_selection import train_test_split

In [101]:
def full_pipeline(model_type="catboost", num_rounds=100, X=12, N=1000):
    """
    Full pipeline for training a model with given parameters.

    Steps: create the output directory, list the parquet files for the lidar
    tiles, draw up to ``N`` rows from each file, compute the target column
    ``zdif`` as ``edem_w84 - multi_dtm_lidar``, split the rows 80/20 into
    training and validation sets, and pass both to ``train_model``.

    Parameters:
        model_type (str): Forwarded to ``train_model`` as ``model_type``.
        num_rounds (int): Forwarded to ``train_model`` as ``num_rounds``.
        X: Forwarded to ``list_files_by_tilenames`` and embedded in the
            output path. NOTE(review): presumably the tile resolution/size
            selector (e.g. 12, 90) — confirm.
        N (int): Maximum number of rows sampled from each parquet file.

    Raises:
        ValueError: If no parquet file was found, so there is nothing to train on.
    """
    yvar = "zdif"
    tcol = 'edem_w84'  # "edem"
    rcol = 'multi_dtm_lidar'
    fcol = ['egm08', 'egm96', 'tdem_hem',
            'multi_s1_band1', 'multi_s1_band2',
            'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3']

    out_dpath = f'output/cb_train_by_sample/{X}/{yvar}/nsample{N}_num_rounds{num_rounds}'
    os.makedirs(out_dpath, exist_ok=True)
    print(f"Output directory created or already exists: {out_dpath}")

    print_context('# Step 1: List files by tilenames')
    fparquet_list, tile_files_list = list_files_by_tilenames(RES_DPATH, X, tilenames_lidar)
    print(f"Found {len(fparquet_list)} parquet files and {len(tile_files_list)} tile files.")
    pprint(fparquet_list)

    # Step 2: Sample up to N rows from every tile file
    dflist = []
    for fparquet in fparquet_list:
        df = pd.read_parquet(fparquet)
        print(f"Dropping nulls from column '{tcol}'...")
        df = dropnulls_bycol(df, col=tcol)
        print(f'Sampling from {os.path.basename(fparquet)}')
        L = len(df)
        if L < N:
            print(f"Warning: Requested sample size {N} exceeds available rows {L}. Sampling all rows instead.")
            df = df.sample(L)
        else:
            df = df.sample(N)
        # Append in BOTH cases: previously this sat inside the `else`, so tiles
        # with fewer than N rows were silently left out of the training data.
        dflist.append(df)

    if not dflist:
        # pd.concat would raise a less specific ValueError on an empty list.
        raise ValueError(f"No parquet files found for X={X}; nothing to train on.")
    df = pd.concat(dflist, ignore_index=True)

    df = check_fillnulls(df)

    # Step 3: Build the target column
    print(f"Calculating {yvar} as the difference between '{tcol}' and '{rcol}'...")
    df[yvar] = df[tcol].subtract(df[rcol])
    print(f"First few values of {yvar}:\n", df[yvar].head())

    # Step 4: Split dataframe into training and validation sets
    print("Splitting dataframe into training and validation sets...")
    train_df, valid_df = train_test_split(df, test_size=0.2, random_state=43)
    print(f"Training set size: {train_df.shape[0]} rows, Validation set size: {valid_df.shape[0]} rows.")
    del df  # Free memory

    # Step 5: Train the model
    # NOTE(review): train_model is not defined or imported in the visible
    # cells — confirm it is in scope before running on a fresh kernel.
    print(f"Training {model_type} model...")
    train_model(train_data=train_df,
                valid_data=valid_df,
                target_col=yvar,
                features_col=fcol,
                output_dir=out_dpath,  # becomes outdir
                model_type=model_type,
                num_rounds=num_rounds)
    print("Model training complete.")
    # The original printed an undefined name `fname` here (NameError after
    # training finished); report the output directory instead.
    print(f"Output directory: {out_dpath}")


    

In [102]:
# Run the whole pipeline once with the notebook-level settings and print how long it took.
measure_time_beautifully(
    "Full Model Training Pipeline Training Single Model",
    full_pipeline,
    model_type=model_type,
    num_rounds=num_rounds,
    X=X,
    N=N,
)

Output directory created or already exists: output/cb_train_by_sample/90/zdif/nsample1000_num_rounds10000
 
    ML - Process Log
    # Step 1: List files by tilenames
Found 6 parquet files and 6 tile files.
['/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES90/N09E105/N09E105_byldem.parquet',
 '/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES90/N09E106/N09E106_byldem.parquet',
 '/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES90/N10E104/N10E104_byldem.parquet',
 '/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES90/N10E105/N10E105_byldem.parquet',
 '/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES90/N10E106/N10E106_byldem.parquet',
 '/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES90/N13E103/N13E103_byldem.parquet']
    Task Performance Report
    Task: Full Model Training Pipeline Training Single Model
    Elapsed Time:
        0.00 seconds
        0.00 minutes
        0.00 hours
        0.00 days


In [77]:
# Load a single tile's parquet file into `df` for manual inspection.
# NOTE(review): absolute, machine-specific path — this cell will not run on
# another machine unless the same drive layout exists.
f = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES90/N09E106/N09E106_byldem.parquet"
df = pd.read_parquet(f)

In [80]:
# Show the column names of the loaded tile as a plain Python list.
print(df.columns.tolist())

['egm08', 'egm96', 'tdem_hem', 'multi_s1_band1', 'multi_s1_band2', 'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3', 'edem_w84', 'tdem_dem__fw', 'multi_dtm_lidar']
