tasks:
- train 
- predict 
- check rasters in QGIS
- package to send to Rolf, @@

In [46]:
from utils import (collect_tiff_paths,split_data_two,load_many_patch_to_df,
                   get_row_paths_and_names,load_patch_to_df)
import lightgbm as lgb
from sklearn.metrics import root_mean_squared_error
import time 
import matplotlib.pyplot as plt 
import pandas as pd 

In [42]:
# LightGBM parameters for the regression model.
# NOTE(review): the training cell also passes `num_boost_round=nboost` and an
# `lgb.early_stopping(100)` callback; confirm which of those values and the
# 'n_estimators' / 'early_stopping_rounds' entries below are meant to apply.
rparams = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',  # alternatives noted earlier: ['rmse', 'mae', 'mse']
    # learning_rate=0.8935583206145743,  # currently disabled
    num_leaves=2080,
    max_depth=16,
    min_data_in_leaf=600,
    lambda_l1=45,
    lambda_l2=35,
    min_gain_to_split=10,
    bagging_fraction=0.9,
    bagging_freq=10,
    feature_fraction=1.0,
    verbose=0,
    n_estimators=500,
    seed=123,
    early_stopping_rounds=20,
    first_metric_only=True,
)

# Binary-classification variant: an independent (shallow) copy of the
# regression parameters with only the objective and metric swapped.
cparams = {
    **rparams,
    'objective': 'binary',        # use 'binary' for binary classification
    'metric': 'binary_logloss',   # common metric for binary classification
}

# For multi-class classification, the following overrides could be used instead:
# cparams['objective'] = 'multiclass'
# cparams['metric'] = 'multi_logloss'
# cparams['num_class'] = <number_of_classes>  # specify the number of classes

In [52]:
# --- Run configuration -------------------------------------------------------
roi = 'tsap'        # tag embedded in the saved model / scores file names
nboost = 10000      # passed to lgb.train as num_boost_round
sa = 0.1 #1         # fraction of patch rows sampled before splitting (1 = all rows)
st = 0.1            # test_pct handed to split_data_two
sample_seed = 123   # fixes the row sample below; same value as rparams' 'seed'

# Root of the patch datasets and the single tile used in this run.
path = "/media/ljp238/12TBWolf/ARCHIEVE/ZOUT/datasets_TILES12_patches256/"
dpath = path + "N13E103"
dfpath = collect_tiff_paths(dpath)

# All layer columns listed for the path table (kept for reference).
acols = ['cdem_dem', 'cdem_wbm', 'edem_dem', 'edem_demw84', 'egm08', 'egm96',
       'esawc', 'ldem_label', 'ldtm', 'pdem', 'pdem_label', 'tdem_dem_clean',
       'tdem_dem_clean_binmask', 'tdem_dem_filled', 'tdem_dem_hsd',
       'tdem_dem_rgx', 'tdem_dem_slp', 'tdem_dem_tpi', 'tdem_dem_tri',
       'tdem_hem']

# Targets (classification / regression) and the feature layers fed to the model.
fcolYc = 'ldem_label'
fcolYr = 'ldtm'
fcolX =  ['egm08', 'egm96','tdem_hem','tdem_dem_filled']
FTCOLSC =  fcolX + [fcolYc , fcolYr]
pathdf = dfpath[FTCOLSC]
print(pathdf.shape) # TODO: filter out pad ones for the img

# Sample a fraction of the rows with a fixed random_state so that re-running
# the notebook selects the same patches.
# NOTE(review): split_data_two may apply its own shuffling; confirm it is
# seeded as well if a fully reproducible split is required.
train_paths, valid_paths = split_data_two(
    pathdf.sample(frac=sa, random_state=sample_seed), test_pct=st
)
print(train_paths.shape, valid_paths.shape)#, test_paths.shape)

(1521, 6)
(136, 6) (16, 6)


REGRESSION

In [None]:
# --- Validation set: loaded once and reused for every training batch ---------
# NOTE(review): only fcolYr is dropped, so fcolYc ('ldem_label') stays in the
# feature matrix for the regression model — confirm this is intended.
val_data_df = load_many_patch_to_df(valid_paths[FTCOLSC])
X_vali = val_data_df.drop(fcolYr, axis=1)
y_vali = val_data_df[fcolYr]
val_data = lgb.Dataset(X_vali, label=y_vali, free_raw_data=False)

# Hold-out test set (disabled: the split above yields no test_paths).
# test_data_df = load_many_patch_to_df(test_paths[FTCOLSC])
# X_test = test_data_df.drop(fcolYr, axis=1)
# y_test = test_data_df[fcolYr]
# test_data = lgb.Dataset(X_test, label=y_test, free_raw_data=False)

ti = time.perf_counter()

# Per-batch RMSE history (one entry per successfully trained batch).
train_accuracies = []
val_accuracies = []
failed_batches = []   # indices of batches that raised during load/train/eval

# `model` is the most recent successfully trained booster. It starts as None so
# the first successful batch trains from scratch (init_model=None), even when
# batch 0 itself fails.
model = None

# Best model so far, judged by validation RMSE (lower is better).
best_model = None
best_val_score = float('inf')

# Incremental training: one patch row per batch, each batch continuing from the
# previous booster via init_model.
# NOTE(review): rparams contains 'n_estimators' (an alias of num_boost_round)
# and 'early_stopping_rounds'; per the LightGBM docs, aliases given in params
# take precedence over the num_boost_round argument — confirm that nboost and
# early_stopping(100) are the values actually in effect.
n_batches = len(train_paths)
for i in range(n_batches):
    #if i > 100: break
    print(f'[INFO]:: {i}/ {n_batches}')
    paths, names = get_row_paths_and_names(train_paths[FTCOLSC], i)
    try:
        di = load_patch_to_df(paths, names)
        X_train = di.drop(fcolYr, axis=1)
        y_train = di[fcolYr]
        train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)

        # A single call covers both cases: init_model=None trains a fresh
        # model, otherwise boosting continues from the previous booster.
        model = lgb.train(rparams, train_data, valid_sets=[train_data, val_data],
                          init_model=model, num_boost_round=nboost,
                          callbacks=[lgb.early_stopping(100)])

        # RMSE of the updated model on this batch and on the validation set.
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_vali)
        train_accuracy = root_mean_squared_error(y_train, y_train_pred)
        val_accuracy = root_mean_squared_error(y_vali, y_val_pred)

        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

        # Keep the booster with the lowest validation RMSE seen so far.
        if val_accuracy < best_val_score:
            best_val_score = val_accuracy
            best_model = model

    except Exception as e:
        # Best effort: a bad patch must not abort the run, but record it so the
        # failure is visible in the summary below.
        failed_batches.append(i)
        print(f"Error in batch {i}: {e}")

if failed_batches:
    print(f'[WARN]:: {len(failed_batches)}/{n_batches} batches failed: {failed_batches}')

# TODO: replace valid by test?
# Without at least one successful batch there is nothing to save or plot.
if best_model is None:
    raise RuntimeError('No batch trained successfully; no model to save.')

# Save the best model
best_model.save_model(f'lgbR_{roi}_modeli.txt')

# Save RMSE scores to CSV
accuracy_df = pd.DataFrame({
    'train_RMSE': train_accuracies,
    'val_RMSE': val_accuracies
})
accuracy_df.to_csv(f'lgbR_{roi}_scoresi.csv')

# Plot training and validation RMSE over (successful) batches
fig, ax = plt.subplots()
ax.plot(train_accuracies, label='Train RMSE')
ax.plot(val_accuracies, label='Valid RMSE')
ax.set_xlabel('Batch')
ax.set_ylabel('RMSE')
ax.legend()
ax.set_title('Training and Validation RMSE Over Batches')
fig.savefig('RMSE_plot.png')
#plt.show()

print('Finished ')

tf = time.perf_counter() - ti
print(f'run.time={tf/60} min(s)')