PREDICT STEPS:
- load files [x]
- load df [x]
- predict zdif [x]
- get dtm [x]
- write raster [x]
- merge block [-] 
    - check transitions manually [-]
    -  if not, predict blovk [-]
- use df to get error metrics 
- make plots histogram,lineplot,river-cross section 
- erro maps difference, rmse 

In [14]:
import os
import sys
from paths import libpath
sys.path.append(libpath)
from utilsdf import list_files_by_tilenames,tile_files_to_parquet_parallel
from uvars import (tilenames_mkd, tilenames_tls,tilenames_rgn,
                   tilenames_lidar,RES_DPATH)

from uvars import aux_ending12,s1_ending12,s2_ending12,tar_ending12

-by tile per region predictions best:
- RNG [x]
- TLS [x]

if can find the code from geotile to merge all over the faces would be great, use code like rive-floodpain merge to do $$


-by block per region predictions:
- TLS 
- RNG

In [16]:
import rasterio
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from utilsdf import check_fillnulls

def get_parquets_and_geotifs_by_tile(RES_DPATH, X, tilenames,vending_all):
    fparquet_list,tile_files_list = list_files_by_tilenames(RES_DPATH, X, tilenames)
    assert len(fparquet_list) == len(tile_files_list), 'len(fparquet_list) != len(tile_files_list)'
    _, fparquet_list = tile_files_to_parquet_parallel(tilenames, 
                                                       RES_DPATH, 
                                                       X, 
                                                       vending_all)
    
    return fparquet_list,tile_files_list   


def write_predictions(predictions, tile_file, output_raster_path, block_size=(128, 128)):
    """
    Writes predictions to a new raster file using metadata from an existing tile file in blocks,
    optimised for large rasters.

    Parameters:
    - predictions (array-like): 1D array of predicted values matching the flattened raster size.
    - tile_file (str): Path to the raster file from which metadata will be read.
    - output_raster_path (str): Path where the new raster file will be saved.
    - block_size (tuple): Tuple specifying the block size for processing (default is (128, 128)).

    Returns:
    - None
    """
    # Read metadata and raster dimensions from the tile file
    with rasterio.open(tile_file) as src:
        meta = src.meta.copy()
        raster_shape = (src.height, src.width)
        transform = src.transform
        crs = src.crs

    # Reshape predictions to match the raster's dimensions
    try:
        predictions_reshaped = np.array(predictions).reshape(raster_shape)
    except ValueError:
        raise ValueError(f"Predictions array size {len(predictions)} does not match raster dimensions {raster_shape}.")

    # Update metadata for writing a new raster
    meta.update({
        "dtype": rasterio.float32,  # Ensure predictions are stored as float32
        "count": 1,  # Single band
        "compress": "lzw"  # Optional: Add compression
    })

    # Write the new raster in blocks
    with rasterio.open(output_raster_path, "w", **meta) as dst:
        print(f"Writing raster in blocks of size: {block_size}")
        for y_start in range(0, raster_shape[0], block_size[0]):
            for x_start in range(0, raster_shape[1], block_size[1]):
                # Calculate window bounds
                y_end = min(y_start + block_size[0], raster_shape[0])
                x_end = min(x_start + block_size[1], raster_shape[1])
                window = ((y_start, y_end), (x_start, x_end))

                # Slice the data for the current block
                block_data = predictions_reshaped[y_start:y_end, x_start:x_end]

                # Write the block to the corresponding window
                dst.write(block_data.astype(rasterio.float32), 1, window=rasterio.windows.Window.from_slices(*window))

    print(f"Raster written successfully to {output_raster_path}")



def get_tile_file(tile_files):
    tile_file = [i for i in tile_files if i.endswith('edem_W84.tif')] #[0]
    assert len(tile_file) == 1, 'len(tile_file) != 1'
    tile_file = tile_file[0]
    return tile_file

def load_prediction_data(fparquet,fcol):
    df = pd.read_parquet(fparquet)
    df[fcol] = check_fillnulls(df[fcol])
    return df

def load_cb_model(modelpath):
    assert os.path.exists(modelpath), f'{modelpath} does not exist'
    model = CatBoostRegressor()
    model.load_model(modelpath)
    return model

def get_prediction_df(model,df,fcol,yvar,tcol):
    df[f'ml_{yvar}'] = model.predict(Pool(df[fcol]))
    df[f'ml_{tcol}'] = df[tcol].subtract(df[f'ml_{yvar}'])
    return df

def cb_predict_workflow(outdir,modelpath,fparquet,tile_files,
                        fcol,yvar,tcol,ps,bsize=256):
    tile_ifile = get_tile_file(tile_files)
    tile_odir = os.path.join(outdir,tile_ifile.split('/')[-2])
    os.makedirs(tile_odir,exist_ok=True)
    tile_ofile = os.path.join(tile_odir,tile_ifile.split('/')[-1].replace('.tif','_ML.tif'))

    if not os.path.isfile(tile_ofile):
        f'writing:: {tile_ofile} ...'
        df = load_prediction_data(fparquet,fcol)
        assert len(df) == ps * ps, 'Grid size does not match'
        model = load_cb_model(modelpath)
        df = get_prediction_df(model,df,fcol,yvar,tcol)

        
        write_predictions(predictions=df[f'ml_{tcol}'], #
                        tile_file=tile_ifile, 
                        output_raster_path=tile_ofile, 
                        block_size=(bsize, bsize))
    else:
        print(f'{tile_ofile} already exists')


yvar = "zdif"
tcol = 'edem_w84'
rcol = 'multi_dtm_lidar'
fcol = ['egm08', 'egm96', 'tdem_hem', 'multi_s1_band1', 'multi_s1_band2',
        'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3']
tar_ending,aux_ending,s1_ending,s2_ending = aux_ending12,s1_ending12,s2_ending12,tar_ending12
vending_all = tar_ending+aux_ending+s1_ending+s2_ending
X = 12 
ps = 9001
tilenames = tilenames_mkd
bsize = 512 # match with grid size X 
modelpath = ''
outdir = ''
fparquet_list,tile_files_list = get_parquets_and_geotifs_by_tile(RES_DPATH, X, tilenames_mkd,vending_all)

# for fparquet,tile_files in zip(fparquet_list,tile_files_list):
#     cb_predict_workflow(outdir,modelpath,fparquet,tile_files,
#                         fcol,yvar,tcol,ps,bsize=256)
    
## clean this code, and make it more modular and improve any thing that needs improvement
## add timer per cb_predict_workflow 
# add timer for the whole process too see how long it takes to predict all the tiles
# print the timer at the end and write both times to the log file txt t

Filtered files count: 8/25
Parquet file already exists: /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES12/N09E105/N09E105_byldem.parquet
Filtered files count: 8/25
Parquet file already exists: /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES12/N09E106/N09E106_byldem.parquet
Filtered files count: 8/25
Filtered files count: 8/25
Parquet file already exists: /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES12/N10E104/N10E104_byldem.parquet
Filtered files count: 8/25
Parquet file already exists: /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES12/N10E105/N10E105_byldem.parquet
Parquet file already exists: /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES12/N10E106/N10E106_byldem.parquet


In [11]:
modelpath = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/train_cb_bysample/12/zdif/iter10000_n81000000_eq6xtile/catboost_10000_42_model.txt"
outdir = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/PREDICTIONS/TESTING/"


In [88]:
import logging

def get_best_model(predictor):
    """
    Safely retrieves the best model for a given AutoGluon predictor.

    Args:
        predictor (autogluon.tabular.TabularPredictor): The trained AutoGluon predictor.

    Returns:
        str: The name of the best model if available.

    Raises:
        ValueError: If no models are available for inference.
    """
    try:
        # Check if the predictor is valid
        if predictor is None:
            raise ValueError("The predictor instance is None. Please provide a valid AutoGluon predictor.")
        
        # Attempt to get the best model
        best_model = predictor.model_best
        if not best_model:
            raise ValueError("No best model found. Ensure the predictor has been trained and contains models.")
        
        return best_model
    except AssertionError as e:
        # Handle specific assertion errors
        error_message = f"AssertionError encountered: {e}. Ensure the predictor has fit models that can infer."
        logging.error(error_message)
        raise ValueError(error_message) from e
    except Exception as e:
        # Handle other unexpected errors
        logging.error(f"Unexpected error: {e}")
        raise

def safe_predict(predictor, data):
    """
    Safely makes predictions using a given AutoGluon predictor.

    Args:
        predictor (autogluon.tabular.TabularPredictor): The trained AutoGluon predictor.
        data (pd.DataFrame): The input data for prediction.

    Returns:
        pd.DataFrame: The predictions.

    Raises:
        ValueError: If no models are available or if data is invalid.
    """
    try:
        # Check if the predictor and data are valid
        if predictor is None:
            raise ValueError("The predictor instance is None. Please provide a valid AutoGluon predictor.")
        
        if data is None or data.empty:
            raise ValueError("The input data is None or empty. Provide valid data for prediction.")

        # Attempt prediction
        predictions = predictor.predict(data)
        return predictions
    except AssertionError as e:
        # Handle specific assertion errors
        error_message = f"AssertionError encountered: {e}. Ensure the predictor has fit models and valid data."
        logging.error(error_message)
        raise ValueError(error_message) from e
    except Exception as e:
        # Handle other unexpected errors
        logging.error(f"Unexpected error during prediction: {e}")
        raise


In [None]:


def ag_mbest_predict_workflow(outdir,dirname, modelpath,
                             fparquet,tile_files,
                            fcol,yvar,tcol,ps,bsize=256):
    
    predictor = TabularPredictor.load(modelpath)

    # load the model first, and the perfom he cheks if not working, then dont even load the data 
    # if it passes then go one to load the data and make the predictions
    
    
    outpath = os.path.join(outdir,dirname)
    tile_ifile = get_tile_file(tile_files)
    tile_odir = os.path.join(outpath,tile_ifile.split('/')[-2])
    os.makedirs(tile_odir,exist_ok=True)
    tile_ofile = os.path.join(tile_odir,tile_ifile.split('/')[-1].replace('.tif','_ML.tif'))

    if not os.path.isfile(tile_ofile):
        f'writing:: {tile_ofile} ...'
        df = load_prediction_data(fparquet,fcol)
        assert len(df) == ps * ps, 'Grid size does not match'
      
        df[f'ml_{yvar}'] = predictor.predict(df[fcol])
        df[f'ml_{tcol}'] = df[tcol].subtract(df[f'ml_{yvar}'])

        
        write_predictions(predictions=df[f'ml_{tcol}'], #
                        tile_file=tile_ifile, 
                        output_raster_path=tile_ofile, 
                        block_size=(bsize, bsize))
    else:
        print(f'{tile_ofile} already exists')

In [19]:
idx = 0
fparquet,tile_files  = fparquet_list[0],tile_files_list[0] 
#df = load_prediction_data(fparquet,fcol)
#assert len(df) == ps * ps, 'Grid size does not match'

In [40]:
modelpath_ag = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit1140_best_quality_135000000/"
#modelpath_ag = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit1140_best_quality_135000000/models/trainer.pkl"

In [98]:
import os
from autogluon.tabular import TabularPredictor

def ag_mbest_predict_workflow(outdir, dirname, modelpath, 
                              fparquet, tile_files, 
                              fcol, yvar, tcol, ps, bsize=256):
    """
    Workflow to perform predictions using a pre-trained model. The function checks 
    the model before loading the data to avoid unnecessary costs if the model is invalid.

    Args:
        outdir (str): Output directory path.
        dirname (str): Directory name for organising outputs.
        modelpath (str): Path to the saved model.
        fparquet (str): Path to the input parquet file containing prediction data.
        tile_files (list): List of tile files.
        fcol (str): Feature column used for predictions.
        yvar (str): Variable to predict.
        tcol (str): Target column.
        ps (int): Grid size parameter.
        bsize (int, optional): Block size for raster output. Defaults to 256.

    Returns:
        None
    """
    # Step 1: Load and validate the model
    try:
        predictor = TabularPredictor.load(modelpath)
    except Exception as e:
        raise RuntimeError(f"Failed to load the model from {modelpath}: {e}")

    # Perform model validation checks
    # Example: Check for necessary metadata or expected feature columns
    expected_features = fcol
    model_features = predictor.feature_metadata.get_features()
    print(f"Model features: {model_features}")
    print('-'*50)
    if not all(feature in model_features for feature in expected_features):
        raise ValueError(f"Model validation failed: Missing required feature columns {expected_features}")

    # If the model passes validation, proceed
    print("Model loaded and validated successfully.")

    # Step 2: Prepare output paths
    outpath = os.path.join(outdir, dirname)
    tile_ifile = get_tile_file(tile_files)
    tile_odir = os.path.join(outpath, tile_ifile.split('/')[-2])
    os.makedirs(tile_odir, exist_ok=True)
    tile_ofile = os.path.join(tile_odir, tile_ifile.split('/')[-1].replace('.tif', '_ML.tif'))

    # Step 3: Check if output already exists
    if not os.path.isfile(tile_ofile):
        print(f"Writing: {tile_ofile} ...")
        
        # Step 4: Load prediction data
        df = load_prediction_data(fparquet, fcol)
        if len(df) != ps * ps:
            raise ValueError(f"Grid size mismatch: Expected {ps * ps}, got {len(df)}")

        # Step 5: Perform predictions and write results
        df[f'ml_{yvar}'] = predictor.predict(df[fcol])
        df[f'ml_{tcol}'] = df[tcol] - df[f'ml_{yvar}']

        write_predictions(predictions=df[f'ml_{tcol}'],  # Residuals
                          tile_file=tile_ifile,
                          output_raster_path=tile_ofile,
                          block_size=(bsize, bsize))
    else:
        print(f"{tile_ofile} already exists")



In [99]:
outdir = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/PREDICTIONS/TESTING_AG/"
dirname = "tlimit120_good_quality"
modelpath = predictor_path = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit120_good_quality/"
idx = 0
fparquet,tile_files  = fparquet_list[0],tile_files_list[0] 

In [100]:

ag_mbest_predict_workflow(outdir, dirname, modelpath, 
                              fparquet, tile_files, 
                              fcol, yvar, tcol, ps, bsize=256)

Model features: ['egm08', 'egm96', 'tdem_hem', 'multi_s1_band1', 'multi_s1_band2', 'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3']
--------------------------------------------------
Model loaded and validated successfully.
Writing: /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/PREDICTIONS/TESTING_AG/tlimit120_good_quality/N09E105/N09E105_edem_W84_ML.tif ...
Writing raster in blocks of size: (256, 256)
Raster written successfully to /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/PREDICTIONS/TESTING_AG/tlimit120_good_quality/N09E105/N09E105_edem_W84_ML.tif


In [57]:
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon
#print(autogluon.__version__)


In [58]:
import pkg_resources
print(pkg_resources.get_distribution("autogluon").version)


1.2


In [72]:
predictor_path = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit1140_best_quality_135000000/"
predictor_path = "/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit120_good_quality/"

'/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit120_good_quality'

In [74]:
agx_path = '/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit1140_best_quality_135000000/utils/data/X_val.pkl'
agy_path = '/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit1140_best_quality_135000000/utils/data/y_val.pkl'

In [64]:
d = pd.read_pickle(agx_path)

In [None]:
case 1: ---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[62], line 1
----> 1 best_model = predictor.model_best()
      2 print(f"The best model is: {best_model}")

File ~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/tabular/predictor/predictor.py:3720, in TabularPredictor.model_best(self)
   3709 @property
   3710 def model_best(self) -> str:
   3711     """
   3712     Returns the string model name of the best model by validation score that can infer.
   3713     This is the same model used during inference when `predictor.predict` is called without specifying a model.
   (...)
   3718     String model name of the best model
   3719     """
-> 3720     return self._model_best(can_infer=True)

File ~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/tabular/predictor/predictor.py:3729, in TabularPredictor._model_best(self, can_infer)
   3727     if self._trainer.model_best in models:
   3728         return self._trainer.model_best
-> 3729 return self._trainer.get_model_best(can_infer=can_infer)

File ~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/core/trainer/abstract_trainer.py:1651, in AbstractTrainer.get_model_best(self, can_infer, allow_full, infer_limit, infer_limit_as_child)
   1649 models = self.get_model_names(can_infer=can_infer)
   1650 if not models:
-> 1651     raise AssertionError("Trainer has no fit models that can infer.")
   1652 models_full = self.get_models_attribute_dict(models=models, attribute="refit_full_parent")
   1653 if not allow_full:

AssertionError: Trainer has no fit models that can infer.

case 2:

{
	"name": "AssertionError",
	"message": "Trainer has no fit models that can infer.",
	"stack": "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)\nCell \u001b[0;32mIn[48], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m d1 \u001b[38;5;241m=\u001b[39m \u001b[43mpredictor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\nFile \u001b[0;32m~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/tabular/predictor/predictor.py:2364\u001b[0m, in \u001b[0;36mTabularPredictor.predict\u001b[0;34m(self, data, model, as_pandas, transform_features, decision_threshold)\u001b[0m\n\u001b[1;32m   2362\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m decision_threshold \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   2363\u001b[0m     decision_threshold \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecision_threshold\n\u001b[0;32m-> 2364\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_learner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mas_pandas\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mas_pandas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtransform_features\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecision_threshold\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecision_threshold\u001b[49m\u001b[43m)\u001b[49m\n\nFile \u001b[0;32m~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/tabular/learner/abstract_learner.py:208\u001b[0m, in \u001b[0;36mAbstractTabularLearner.predict\u001b[0;34m(self, X, model, as_pandas, inverse_transform, transform_features, decision_threshold)\u001b[0m\n\u001b[1;32m    206\u001b[0m     decision_threshold \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.5\u001b[39m\n\u001b[1;32m    207\u001b[0m X_index \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(X\u001b[38;5;241m.\u001b[39mindex) \u001b[38;5;28;01mif\u001b[39;00m as_pandas \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 208\u001b[0m y_pred_proba \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict_proba\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    209\u001b[0m \u001b[43m    \u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mas_pandas\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mas_multiclass\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minverse_transform\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransform_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtransform_features\u001b[49m\n\u001b[1;32m    210\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    211\u001b[0m problem_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlabel_cleaner\u001b[38;5;241m.\u001b[39mproblem_type_transform \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mproblem_type\n\u001b[1;32m    212\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m get_pred_from_proba(y_pred_proba\u001b[38;5;241m=\u001b[39my_pred_proba, problem_type\u001b[38;5;241m=\u001b[39mproblem_type, decision_threshold\u001b[38;5;241m=\u001b[39mdecision_threshold)\n\nFile \u001b[0;32m~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/tabular/learner/abstract_learner.py:189\u001b[0m, in \u001b[0;36mAbstractTabularLearner.predict_proba\u001b[0;34m(self, X, model, as_pandas, as_multiclass, inverse_transform, transform_features)\u001b[0m\n\u001b[1;32m    187\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m transform_features:\n\u001b[1;32m    188\u001b[0m         X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtransform_features(X)\n\u001b[0;32m--> 189\u001b[0m     y_pred_proba \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_trainer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict_proba\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    190\u001b[0m y_pred_proba \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_post_process_predict_proba(\n\u001b[1;32m    191\u001b[0m     y_pred_proba\u001b[38;5;241m=\u001b[39my_pred_proba, as_pandas\u001b[38;5;241m=\u001b[39mas_pandas, index\u001b[38;5;241m=\u001b[39mX_index, as_multiclass\u001b[38;5;241m=\u001b[39mas_multiclass, inverse_transform\u001b[38;5;241m=\u001b[39minverse_transform\n\u001b[1;32m    192\u001b[0m )\n\u001b[1;32m    193\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m y_pred_proba\n\nFile \u001b[0;32m~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/core/trainer/abstract_trainer.py:955\u001b[0m, in \u001b[0;36mAbstractTrainer.predict_proba\u001b[0;34m(self, X, model)\u001b[0m\n\u001b[1;32m    953\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mpredict_proba\u001b[39m(\u001b[38;5;28mself\u001b[39m, X: pd\u001b[38;5;241m.\u001b[39mDataFrame, model: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m np\u001b[38;5;241m.\u001b[39mndarray:\n\u001b[1;32m    954\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m model \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 955\u001b[0m         model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_best\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    956\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_predict_proba_model(X\u001b[38;5;241m=\u001b[39mX, model\u001b[38;5;241m=\u001b[39mmodel)\n\nFile \u001b[0;32m~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/core/trainer/abstract_trainer.py:962\u001b[0m, in \u001b[0;36mAbstractTrainer._get_best\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    960\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_best\n\u001b[1;32m    961\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 962\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_model_best\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\nFile \u001b[0;32m~/miniconda3/envs/agluon/lib/python3.11/site-packages/autogluon/core/trainer/abstract_trainer.py:1651\u001b[0m, in \u001b[0;36mAbstractTrainer.get_model_best\u001b[0;34m(self, can_infer, allow_full, infer_limit, infer_limit_as_child)\u001b[0m\n\u001b[1;32m   1649\u001b[0m models \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_model_names(can_infer\u001b[38;5;241m=\u001b[39mcan_infer)\n\u001b[1;32m   1650\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m models:\n\u001b[0;32m-> 1651\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAssertionError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTrainer has no fit models that can infer.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   1652\u001b[0m models_full \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_models_attribute_dict(models\u001b[38;5;241m=\u001b[39mmodels, attribute\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrefit_full_parent\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   1653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m allow_full:\n\n\u001b[0;31mAssertionError\u001b[0m: Trainer has no fit models that can infer."
}


def ag_predict_mbest(predictor_path,df, fcol, yvar, tcol):
    #predict using the best model 
    predictor = TabularPredictor.load(predictor_path)
    # check that predict is load, and able to make predictions, and not corrupted or  error  case 1 and 2
    df[f'ml_{yvar}'] = predictor.predict(df[fcol])
    df[f'ml_{tcol}'] = df[tcol].subtract(df[f'ml_{yvar}'])
    return df # we could discard all other variables should we just need to go one with one

In [85]:
def ag_predict_mbest(predictor_path,df, fcol, yvar, tcol):
    #predict using the best model 
    predictor = TabularPredictor.load(predictor_path)
    # check that predict is load, and able to make predictions, and not corrupted or  error  case 1 and 2
    df[f'ml_{yvar}'] = predictor.predict(df[fcol])
    df[f'ml_{tcol}'] = df[tcol].subtract(df[f'ml_{yvar}'])
    return df # we could discard all other variables should we just need to go one with one

def ag_predict_mtopfive(predictor_path,df, fcol, yvar, tcol):
    # predict using the top five models 
    pass 
    

In [86]:
df =  ag_predict_mbest(predictor_path,df, fcol, yvar, tcol)

In [87]:
df

Unnamed: 0,egm08,egm96,tdem_hem,multi_s1_band1,multi_s1_band2,multi_s2_band1,multi_s2_band2,multi_s2_band3,edem_w84,tdem_dem__fw,multi_dtm_lidar,zdif,ml_zdif,ml_edem_w84
0,-8.909122,-9.052000,6.867803,-18.357485,-27.563616,0.103014,0.116391,0.128022,-8.909220,,,,-8.009323,-0.899897
1,-8.908708,-9.051620,6.971410,-18.355732,-27.560644,0.103996,0.117849,0.129290,-8.908882,,,,-8.024690,-0.884192
2,-8.908293,-9.051241,7.819718,-18.277824,-27.549456,0.103668,0.117686,0.129413,-8.908545,,,,-8.036531,-0.872014
3,-8.907878,-9.050861,6.184140,-18.199917,-27.538267,0.103340,0.117522,0.129536,-8.908208,,,,-8.036531,-0.871676
4,-8.907463,-9.050482,7.063742,-18.128281,-27.599447,0.103559,0.117577,0.129275,-8.907657,,,,-8.006196,-0.901461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81017996,-2.750880,-2.905636,-32767.000000,-18.797722,-29.085106,0.087559,0.123204,0.146951,-2.750668,,,,-2.033511,-0.717157
81017997,-2.750468,-2.905227,-32767.000000,-18.593487,-29.173258,0.087127,0.124098,0.146870,-2.750334,,,,-2.034126,-0.716208
81017998,-2.750057,-2.904818,-32767.000000,-18.651117,-29.158873,0.086188,0.123317,0.146588,-2.749999,,,,-2.034126,-0.715873
81017999,-2.749645,-2.904409,-32767.000000,-18.708750,-29.144487,0.085248,0.122536,0.146306,-2.749665,,,,-2.041474,-0.708191


In [75]:
# predictor = TabularPredictor.load(predictor_path)
# predictor.path

'/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/MODELS/autogluon_study/12/zdif/tlimit120_good_quality'

In [76]:
dd = predictor.predict(d)

In [82]:
type(dd)

pandas.core.series.Series

In [81]:
predictor.model_best

'LightGBMXT_BAG_L1_FULL'

In [78]:
d2 = predictor.predict(df[fcol])

In [79]:
d2.shape

(81018001,)

In [80]:
# why the big model is not working? investigate the issue further and send to git hub 
# make prediction with small model to set up the pipelline 

In [59]:
ndf = df[yvar] = df[tcol].subtract(df[rcol])

In [62]:
best_model = predictor.model_best()
print(f"The best model is: {best_model}")

AssertionError: Trainer has no fit models that can infer.

In [53]:
predictor.get_model_best
<bound method TabularPredictorDeprecatedMixin.get_model_best of <autogluon.tabular.predictor.predictor.TabularPredictor object at 0x73a7b69dfa50>>
# how to use that to make predictions on tabular dataset, i have got 1.2 version of autogluon

<bound method TabularPredictorDeprecatedMixin.get_model_best of <autogluon.tabular.predictor.predictor.TabularPredictor object at 0x73a7b69dfa50>>

In [55]:
predictor.save_path

AttributeError: 'TabularPredictor' object has no attribute 'save_path'

In [54]:
print(predictor.get_model_names())
ValueError: `get_model_names` has been deprecated and will be removed in version 1.2. Please use `model_names` instead
# how to find my version of autogluon


ValueError: `get_model_names` has been deprecated and will be removed in version 1.2. Please use `model_names` instead

In [48]:
d1 = predictor.predict(df)

AssertionError: Trainer has no fit models that can infer.

In [46]:
df.columns

Index(['egm08', 'egm96', 'tdem_hem', 'multi_s1_band1', 'multi_s1_band2',
       'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3', 'edem_w84',
       'tdem_dem__fw', 'multi_dtm_lidar'],
      dtype='object')

In [30]:
d1 = predictor.predict(df[fcol])

AssertionError: Trainer has no fit models that can infer.

In [27]:
def get_prediction_df(model,df,fcol,yvar,tcol):
    df[f'ml_{yvar}'] = model.predict(df[fcol])
    df[f'ml_{tcol}'] = df[tcol].subtract(df[f'ml_{yvar}'])
    return df

In [28]:
df = get_prediction_df(predictor,df,fcol,yvar,tcol)

AssertionError: Trainer has no fit models that can infer.

In [64]:

model = load_cb_model(modelpath)
df = get_prediction_df(model,df,fcol,yvar,tcol)

tile_ifile = get_tile_file(tile_files)
tile_odir = os.path.join(outdir,tile_ifile.split('/')[-2])
os.makedirs(tile_odir,exist_ok=True)
tile_ofile = os.path.join(tile_odir,tile_ifile.split('/')[-1].replace('.tif','_ML.tif'))
write_predictions(predictions=df[f'ml_{tcol}'], #
                 tile_file=tile_ifile, 
                 output_raster_path=tile_ofile, 
                 block_size=(bsize, bsize))

In [74]:
df.columns

Index(['egm08', 'egm96', 'tdem_hem', 'multi_s1_band1', 'multi_s1_band2',
       'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3', 'edem_w84',
       'tdem_dem__fw', 'multi_dtm_lidar', 'ml_zdif', 'ml_edem_w84'],
      dtype='object')

In [73]:
pdf.columns

Index(['egm08', 'egm96', 'tdem_hem', 'multi_s1_band1', 'multi_s1_band2',
       'multi_s2_band1', 'multi_s2_band2', 'multi_s2_band3', 'edem_w84',
       'tdem_dem__fw', 'multi_dtm_lidar', 'ml_zdif', 'ml_edem_w84'],
      dtype='object')

In [86]:

tile_ifile

'/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/TILES12/N09E105/N09E105_edem_W84.tif'

'N09E105'

'/media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/PREDICTIONS/TESTING/N09E105/N09E105_edem_W84_ml.tif'

Writing raster in blocks of size: (512, 512)
Raster written successfully to /media/ljp238/12TBWolf/RSPROX/OUTPUT_TILES/PREDICTIONS/TESTING/N09E105/N09E105_edem_W84_ml.tif
