In [0]:
import glob
import os
import re
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import rasterio as rio
import xgboost as xgb
from rasterio.merge import merge as rio_merge

In [0]:
def convert(seconds):
    """Format a duration given in seconds as an ``HH:MM:SS`` string.

    The hour field is not wrapped at 24, so a duration of 25 hours is
    rendered as ``25:00:00`` rather than ``01:00:00``. Fractional seconds
    are truncated. Negative durations are rendered with a leading ``-``.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds.

    Returns
    -------
    str
        The duration as zero-padded hours, minutes and seconds.
    """
    total = int(seconds)
    sign = '-' if total < 0 else ''
    minutes, secs = divmod(abs(total), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}"

def split_dataframe(df, chunk_size):
    """Cut ``df`` row-wise into consecutive slices of at most ``chunk_size`` rows.

    Returns a list of ``df.iloc`` slices in row order; the final slice holds
    the remainder when the row count is not a multiple of ``chunk_size``.
    """
    full_chunks, leftover = divmod(len(df), chunk_size)
    chunk_count = full_chunks + (1 if leftover else 0)
    pieces = []
    for chunk_no in range(chunk_count):
        begin = chunk_no * chunk_size
        pieces.append(df.iloc[begin:begin + chunk_size])
    return pieces

def extract_index(filename):
    """Return the integer N found in a ``class_N.csv`` file name.

    The pattern is searched anywhere in ``filename``; -1 is returned when it
    does not occur.
    """
    found = re.search(r'class_(\d+)\.csv', filename)
    if found is None:
        return -1
    return int(found.group(1))

def get_model_feature_names(model_path):
    """Load a model with joblib and return the feature names it reports.

    The loaded object's own ``feature_names`` attribute is preferred; failing
    that, the ``feature_names`` of the object returned by ``get_booster()``.
    Returns None (after printing a message) when neither is available or when
    anything goes wrong while loading or inspecting the model.
    """
    try:
        # NOTE(review): joblib.load unpickles the file, so it should only be
        # pointed at trusted model artefacts.
        loaded = joblib.load(model_path)
        if not hasattr(loaded, 'feature_names'):
            if not hasattr(loaded, 'get_booster'):
                print("Model does not have feature_names attribute.")
                return None
            return loaded.get_booster().feature_names
        return loaded.feature_names
    except Exception as err:
        print(f"Error loading model or extracting feature names: {err}")
        return None

In [0]:
def predictClass(infile, outpath, i, classifier, scaler, feature_names):
    """Predict class and confidence for one CSV chunk and write ``class_{i}.csv``.

    The classifier is round-tripped through a temporary JSON model file into
    an ``xgb.XGBClassifier`` so that both ``predict`` and ``predict_proba``
    are available. The output CSV contains every input column plus ``Class``
    (predicted label + 1) and ``Confidence`` (highest class probability,
    clipped to [0, 1]).

    Parameters
    ----------
    infile : str or path-like
        CSV file holding the chunk; must contain every column in
        ``feature_names``.
    outpath : str or path-like
        Directory that receives ``class_{i}.csv``.
    i : int
        Chunk index, used in log messages and in the output file name.
    classifier : object
        Trained model exposing ``save_model(path)``.
    scaler : object
        Unused; kept so existing callers do not need to change.
    feature_names : list of str
        Columns passed to the classifier, in this order.

    Raises
    ------
    ValueError
        If the saved model cannot be loaded into ``xgb.XGBClassifier``.
    Exception
        Any other failure (reading the chunk, predicting, writing the CSV)
        is printed and re-raised unchanged.
    """
    # Initialised before the try block so the cleanup in `finally` never hits
    # an unbound name when an early step (e.g. reading the CSV) fails.
    temp_model_file = None
    try:
        # Load data
        df = pd.read_csv(infile)
        print(f"Processing chunk {i}")
        df_pred = df[feature_names]
        # Use only XGBClassifier for both class and probability prediction.
        # A unique temporary path avoids collisions between concurrent runs
        # and does not depend on the working directory being writable; the
        # .json suffix is kept so the model is saved in JSON format.
        fd, temp_model_file = tempfile.mkstemp(prefix='temp_model_', suffix='.json')
        os.close(fd)
        classifier.save_model(temp_model_file)
        sk_classifier = xgb.XGBClassifier()
        try:
            sk_classifier.load_model(temp_model_file)
        except Exception as e:
            raise ValueError(f"Failed to load model into XGBClassifier: {e}")
        # Predict class values
        class_values = sk_classifier.predict(df_pred)
        # Compute probabilities
        prob_values = sk_classifier.predict_proba(df_pred)
        confidence_values = np.max(prob_values, axis=1)
        confidence_values = np.clip(confidence_values, 0, 1)  # Clip confidence to [0, 1]
        print(f"Chunk {i} confidence range: {confidence_values.min():.6f} to {confidence_values.max():.6f}, dtype: {confidence_values.dtype}")
        print(f"Unique confidence values (sample): {np.unique(confidence_values)[:10]}")
        # Add predictions to output DataFrame
        df['Class'] = class_values + 1  # Adjust to 1-based labels
        df['Confidence'] = confidence_values
        # Save to CSV with error handling
        output_file = os.path.join(outpath, f'class_{i}.csv')
        try:
            df.to_csv(output_file, index=False)
        except Exception as e:
            print(f"An error occurred while writing CSV: {e}")
            raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise
    finally:
        if temp_model_file is not None and os.path.exists(temp_model_file):
            os.remove(temp_model_file)
            print(f"Removed temporary model file: {temp_model_file}")

In [0]:
def _data_chunk_index(path):
    """Return N for a ``data_N.csv`` path, or -1 when the file name does not match."""
    match = re.search(r'data_(\d+)\.csv$', Path(path).name)
    return int(match.group(1)) if match else -1


def predict(grid):
    """Classify every pixel of one grid tile and write class and confidence GeoTIFFs.

    All inputs and outputs live under ``/dbfs/mnt/lab/unrestricted/KritiM``:

    1. Load the model and scaler with joblib and resolve the model's feature
       names (falling back to the scaler's ``feature_names_in_``).
    2. Read band 1 of every ``<grid>*.tif`` found in each sub-directory of
       ``GRID/`` into one table (one column per raster plus ``EAST``/``NORTH``
       coordinates) and save it as ``Table/<grid>.csv``.
    3. Replace the reference raster's nodata value with NaN, remember which
       rows contained any NaN, then impute (training median for numeric
       columns, training mode for categorical columns).
    4. Scale the numeric features, split the table into 100000-row chunks
       under ``Predict/<grid>/`` and run ``predictClass`` on each chunk.
    5. Reassemble the predictions in chunk order, set originally-NaN pixels
       to nodata and write ``Predict/<grid>_predict_xgb.tif`` and
       ``Predict/<grid>_confidence_xgb.tif`` as float32 rasters.

    Parameters
    ----------
    grid : str
        Tile prefix used to match raster file names. The variable name of a
        raster is taken from ``file.name[6:-4]``, i.e. the code assumes a
        five-character prefix followed by one separator character.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Failures while loading the model/scaler, creating the table directory
        or reading the reference raster are printed and re-raised. Failures
        from the scaling step onwards are printed and swallowed, and the
        function returns None.
    """
    base = Path('/dbfs/mnt/lab/unrestricted/KritiM')

    # Load the pre-trained model and scaler
    try:
        model_path = str(base / 'classification' / 'model_10_features.joblib')
        scaler_path = str(base / 'classification' / 'scaler.joblib')
        best_model = joblib.load(model_path)
        scaler = joblib.load(scaler_path)
        scaler_feature_names = scaler.feature_names_in_.tolist()
        print(f"Scaler feature names (full numerical): {scaler_feature_names}")
        model_feature_names = get_model_feature_names(model_path)
        if model_feature_names is None:
            print("Falling back to scaler feature names for model.")
            model_feature_names = scaler_feature_names
        print(f"Model feature names (subset): {model_feature_names}")
    except Exception as e:
        print(f"Error loading model or scaler: {e}")
        raise

    # Define categorical columns
    categorical_cols = ['Landcover_LE', 'Profile_depth', 'CaCO3_rank', 'Texture_group',
                        'Aggregate_texture', 'Aquifers', 'bedrock_raster_50m', 'ALC_old']

    # Imputation statistics come from the training sample
    dftrain = pd.read_csv(base / 'classification' / 'trainingSample.csv')
    mode_values = {col: dftrain[col].mode()[0] for col in categorical_cols if col in dftrain.columns}
    median_values = dftrain.select_dtypes(include='number').median()

    pathtogrids = base / 'GRID'
    subdirectories = sorted(subdir for subdir in pathtogrids.iterdir() if subdir.is_dir())

    # Collect one flattened band per raster; the DataFrame is built once at
    # the end instead of being grown column by column.
    columns = {}
    for folder in subdirectories:
        print(f'Working on folder: {folder}')
        files = [file for file in folder.glob(grid + '*.tif') if file.is_file()]
        if not files:
            print(f"No files found for grid {grid} in {folder}")
            continue
        for file in files:
            grid_name = file.name[:5]
            var = file.name[6:-4]
            print(f"Processing file: {file.name}, Grid: {grid_name}, Variable: {var}")
            with rio.open(file, 'r') as src:
                data = src.read(1).ravel()
                if 'EAST' not in columns:
                    # Coordinates are derived once, from the first raster read
                    rows, cols = np.meshgrid(
                        np.arange(src.height),
                        np.arange(src.width),
                        indexing="ij"
                    )
                    xs, ys = rio.transform.xy(src.transform, rows, cols)
                    columns['EAST'] = np.array(xs).ravel()
                    columns['NORTH'] = np.array(ys).ravel()
                columns[var] = data
    df = pd.DataFrame(columns)

    print(f'Created dataframe for {grid}...')
    print("Dataframe columns:", df.columns.tolist())

    # Check for missing features: everything the scaler and the model need
    # must be present, otherwise later column selections fail with a KeyError.
    expected_features = list(dict.fromkeys(list(scaler_feature_names) + list(model_feature_names)))
    missing_features = [col for col in expected_features if col not in df.columns]
    if missing_features:
        print(f"Error: Missing features for grid {grid}: {missing_features}")
        return

    # Write dataframe to CSV
    outdir = base / 'Table'
    try:
        outdir.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        print(f"Failed to create directory {outdir}: {e}")
        raise
    df.to_csv(outdir / (grid + '.csv'), index=False)

    # Read one of the raster grid files for profile information
    pathraster = base / 'GRID' / 'Elevation_dm'
    raster = f"{grid}_Elevation_dm.tif"
    try:
        with rio.open(os.path.join(pathraster, raster), 'r') as src:
            profile = src.profile
            profile.update(count=1)
            raster_shape = (src.height, src.width)
            # A raster without a declared nodata value reports None; fall
            # back to -9999 so masking and the output profile stay valid.
            nodata_value = src.nodata if src.nodata is not None else -9999
    except Exception as e:
        print(f"Error reading raster file: {e}")
        raise
    pixel_count = raster_shape[0] * raster_shape[1]

    # Read the dataframe for prediction
    df = pd.read_csv(outdir / f"{grid}.csv")
    # NOTE(review): only the reference raster's nodata value is mapped to NaN;
    # other input rasters are assumed to share it - confirm.
    df = df.replace(nodata_value, np.nan)

    # Save original NaN mask for later masking in raster
    nan_mask = df.isna().any(axis=1)

    # Impute NaNs: numerical with median, categorical with mode
    for col in df.columns:
        if col in categorical_cols:
            if col in mode_values:
                df[col] = df[col].fillna(mode_values[col])
        elif pd.api.types.is_numeric_dtype(df[col]):
            if col in median_values:
                df[col] = df[col].fillna(median_values[col])

    present_cat_cols = [col for col in categorical_cols if col in df.columns]
    for col in present_cat_cols:
        df[col] = df[col].astype('category')

    # Downcast numerical columns
    for col in df.select_dtypes(include='number').columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    print("Columns in df (after imputation):", df.columns.tolist())

    # Scale numerical columns
    num_cols = [col for col in scaler_feature_names if col not in categorical_cols]
    try:
        print('Scaling the prediction dataframe...')
        df_num_scaled = pd.DataFrame(
            scaler.transform(df[num_cols]),
            columns=num_cols, index=df.index
        )
        # Only categorical columns that exist for this grid can be attached
        df_scaled = pd.concat([df_num_scaled, df[present_cat_cols]], axis=1)
        print('Scaling completed...')

        # Select only expected features
        df_scaled_select = df_scaled[model_feature_names]
        print("Columns in scaled dataset:", df_scaled_select.columns.tolist())
        print('Shape of dataframe:', df_scaled_select.shape)

        chunk_size = 100000
        chunks = split_dataframe(df_scaled_select, chunk_size)

        tmp = base / 'Predict' / grid
        tmp.mkdir(parents=True, exist_ok=True)

        # Reuse chunk files left by an earlier run only when the set is
        # complete; otherwise discard stale files and write them afresh.
        existing_indices = {_data_chunk_index(p) for p in tmp.glob('data_*.csv')}
        if existing_indices != set(range(len(chunks))):
            for stale in list(tmp.glob('data_*.csv')) + list(tmp.glob('class_*.csv')):
                stale.unlink()
            for i, chunk in enumerate(chunks):
                chunk.to_csv(tmp / f'data_{i}.csv', index=False)
        # Build the list from the indices rather than from glob order, so
        # class_{i}.csv always holds the predictions for data_{i}.csv.
        inFiles = [tmp / f'data_{i}.csv' for i in range(len(chunks))]

        for i, file in enumerate(inFiles):
            print(f'Predicting classification for grid {grid}, chunk {i}')
            predictClass(file, tmp, i, best_model, scaler, model_feature_names)

        print('Merging the classified data...')
        toMergeC = list(tmp.glob('class_*.csv'))
        if not toMergeC:
            print(f"No class_*.csv files found for grid {grid}. Skipping merge step.")
            return

        toMergeCF = sorted(toMergeC, key=lambda x: extract_index(str(x)))
        dfsC = []
        for class_file in toMergeCF:
            print(f"Reading dataframe {class_file}")
            df_chunk = pd.read_csv(class_file)
            if 'Confidence' not in df_chunk.columns:
                print(f"Error: 'Confidence' column missing in {class_file}")
                return
            dfsC.append(df_chunk[['Class', 'Confidence']])
        MergedC = pd.concat(dfsC)

        # Create full prediction and confidence arrays. Casting to float32
        # yields fresh writable arrays that can hold the nodata value and
        # match the dtype written to the rasters.
        full_class_pred = MergedC['Class'].to_numpy().astype(np.float32)
        full_confidence_pred = MergedC['Confidence'].to_numpy().astype(np.float32)

        # Reshape the 'Class' and 'Confidence' arrays
        if len(full_class_pred) != pixel_count:
            raise ValueError(f"Prediction array length {len(full_class_pred)} does not match raster pixel count {pixel_count}.")
        S_class = full_class_pred.reshape(raster_shape)
        # Clip confidence to [0, 1] before masking so nodata is never clipped
        S_confidence = np.clip(full_confidence_pred, 0, 1).reshape(raster_shape)

        # Apply the original NaN mask as NoData
        nan_mask_2d = nan_mask.to_numpy().reshape(raster_shape)
        S_class[nan_mask_2d] = nodata_value
        S_confidence[nan_mask_2d] = nodata_value

        # Print debug info before writing
        print(f"S_class shape: {S_class.shape}, dtype: {S_class.dtype}, min: {np.nanmin(S_class)}, max: {np.nanmax(S_class)}")
        print(f"S_confidence shape: {S_confidence.shape}, dtype: {S_confidence.dtype}, min: {np.nanmin(S_confidence)}, max: {np.nanmax(S_confidence)}")
        print(f"Unique S_confidence values (sample): {np.unique(S_confidence[~np.isnan(S_confidence)])[:10]}")

        # The profile was copied from the elevation raster; declare the dtype
        # actually written (float32) together with the nodata value.
        profile.update(dtype='float32', nodata=nodata_value)
        out_dir = base / 'Predict'
        out_dir.mkdir(parents=True, exist_ok=True)
        local_class_path = f'/tmp/{grid}_predict_xgb.tif'
        local_conf_path = f'/tmp/{grid}_confidence_xgb.tif'
        dbfs_class_path = out_dir / f'{grid}_predict_xgb.tif'
        dbfs_conf_path = out_dir / f'{grid}_confidence_xgb.tif'
        try:
            # Rasters are written to local /tmp first and then moved
            with rio.open(local_class_path, 'w', **profile) as dst:
                dst.write(S_class.astype(rio.float32), 1)
            shutil.move(local_class_path, dbfs_class_path)
            print(f"Moved {local_class_path} to {dbfs_class_path}")
            with rio.open(local_conf_path, 'w', **profile) as dst:
                dst.write(S_confidence.astype(rio.float32), 1)
            shutil.move(local_conf_path, dbfs_conf_path)
            print(f"Moved {local_conf_path} to {dbfs_conf_path}")
        except Exception as e:
            import traceback
            print(f"Failed to write raster for grid {grid}: {e}")
            traceback.print_exc()
            return

        # Delete the temporary folder
        shutil.rmtree(tmp)
    except Exception as e:
        print(f'Error predicting for {grid}: {e}')
        return

In [0]:
# Main execution: run predict() for every grid tile that has an elevation
# raster and no class raster in the Predict directory yet.
start = time.time()  # start of the run; the final cell reports the elapsed time
path = Path('/dbfs/mnt/lab/unrestricted/KritiM/GRID/Elevation_dm/')
# Sorted so the tiles are processed in a reproducible order
files = sorted(file for file in path.glob('*.tif') if file.is_file())
for file in files:
    grid = file.name[:5]
    output_file = Path('/dbfs/mnt/lab/unrestricted/KritiM/Predict') / f'{grid}_predict_xgb.tif'
    # Skip tiles that already have an output, which makes re-runs resumable
    if not output_file.exists():
        predict(grid)



In [0]:
def _merge_rasters(input_files, merged_file, nodata=-9999):
    """Mosaic ``input_files`` into ``merged_file`` with ``gdal_merge.py``.

    Returns True when the command exits with status 0. Returns False, after
    printing the reason, when there is nothing to merge, the tool cannot be
    started, or it exits with a non-zero status.
    """
    if not input_files:
        print(f"No input rasters found for {merged_file}; skipping merge.")
        return False
    parent = os.path.dirname(merged_file)
    if parent:
        # gdal_merge.py does not create the output directory itself
        os.makedirs(parent, exist_ok=True)
    # Passing an argument list avoids shell quoting problems in file names
    cmd = ['gdal_merge.py', '-a_nodata', str(nodata), '-o', merged_file, *input_files]
    try:
        result = subprocess.run(cmd, check=False)
    except OSError as e:
        print(f"Could not run gdal_merge.py: {e}")
        return False
    if result.returncode != 0:
        print(f"gdal_merge.py exited with status {result.returncode} for {merged_file}")
        return False
    return True


merge_start = time.time()
print('Merging the grids together...')
directory = '/dbfs/mnt/lab/unrestricted/KritiM/Predict'
os.makedirs(directory, exist_ok=True)

# Merge class rasters
filenames = sorted(glob.glob(os.path.join(directory, '*_predict_xgb.tif')))
output_file = '/dbfs/mnt/lab/unrestricted/KritiM/soil_predict_xgb_.tif'
_merge_rasters(filenames, output_file)

# Merge confidence rasters
# NOTE(review): this output path is relative to the working directory, unlike
# the class mosaic above - confirm that is intended.
confidence_filenames = sorted(glob.glob(os.path.join(directory, '*_confidence_xgb.tif')))
confidence_output_file = 'FinalOutputs/soil_confidence_xgb_.tif'
_merge_rasters(confidence_filenames, confidence_output_file)

end = time.time()
# Use the run-wide `start` timestamp when an earlier cell defined one;
# otherwise only the duration of this merge step can be reported.
run_start = globals().get('start', merge_start)
timetaken = convert(end - run_start)
print('Time taken for processing: ', timetaken)