In [28]:
import time, warnings
import pandas as pd
import numpy as np
import rasterio
import os
from joblib import parallel_backend

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

warnings.filterwarnings("ignore")

In [29]:
# Load the station measurements table. The file is semicolon-separated and is
# read from an absolute path on the author's machine, so this cell only runs
# where that path exists.
df = pd.read_csv('/Users/jonny.sanchez/Documents/tesis/estaciones.csv',sep=";")
# Show the first rows as the cell output for a quick sanity check.
df.head()

Unnamed: 0,fuente,ciudad,codigo_estacion,fecha_toma,anio,mes,dia,medicion,sea_water,fresh_water,...,clouds,bare_ground,veg,lst,ndbi,ndvi,ndwi,st_emissivity,norte,este
0,dimar,barranquilla,0000000004,2020-03-29 07:00:00.000,2020,3,29,26.4,0.04,0.88,...,0.0,0.0,0.0,29.455811,-0.01172,-0.016581,0.00206,0.988,2785929.0,4798071.0
1,dimar,barranquilla,0000000004,2021-02-12 13:00:00.000,2021,2,12,28.12,0.08,0.72,...,0.0,0.0,0.0,28.509033,-0.013787,0.060807,-0.105339,0.988,2785929.0,4798071.0
2,dimar,barranquilla,0000000004,2022-02-23 13:00:00.000,2022,2,23,27.81,0.25,0.7,...,0.0,0.0,0.0,29.517365,-0.096698,0.052519,-0.084309,0.988,2785929.0,4798071.0
3,dimar,barranquilla,0CP03FL033,2020-03-29 13:00:00.000,2020,3,29,28.5,0.0,0.28,...,0.0,0.04,0.0,36.818237,0.073587,0.277937,-0.362358,0.9669,2778642.0,4801240.0
4,dimar,barranquilla,0CP03FL033,2021-02-12 13:00:00.000,2021,2,12,28.6,0.0,0.32,...,0.0,0.2,0.0,34.975922,0.049616,0.316371,-0.36871,0.9669,2778642.0,4801240.0


In [30]:
# Candidate predictor columns, in the order they will be fed to the models.
bands = [
    'anio', 'mes', 'dia',
    'sea_water', 'fresh_water', 'builds', 'clouds', 'bare_ground', 'veg',
    'lst', 'ndbi',
    'norte', 'este',
]

# Keep only the candidates that exist in the CSV, preserving the order of
# `bands`. Missing columns are dropped silently, so X may end up with fewer
# columns than `bands` lists.
columnas_csv = df.columns
available_cols = [nombre for nombre in bands if nombre in columnas_csv]

# Feature matrix (DataFrame, keeps column names) and target vector (ndarray).
X = df[available_cols]
y = df['medicion'].values

In [31]:
# Pairwise correlation matrix of the predictors (pandas default method).
# NOTE(review): the `clouds` row/column is all NaN in the captured output;
# presumably the column has no variance in this dataset -- confirm, since such
# a column carries no information for the models.
X.corr()

Unnamed: 0,anio,mes,dia,sea_water,fresh_water,builds,clouds,bare_ground,veg,lst,ndbi,norte,este
anio,1.0,-0.354468,0.56946,0.019247,0.059732,-0.087528,,0.037695,0.026397,-0.095305,-0.031912,0.01354,0.002468
mes,-0.354468,1.0,-0.161194,-0.005282,-0.008689,0.076982,,0.11206,-0.139635,0.153652,0.013609,0.017312,0.010795
dia,0.56946,-0.161194,1.0,0.008334,0.114353,-0.024364,,0.032062,-0.040714,-0.036427,-0.074084,0.01886,0.00556
sea_water,0.019247,-0.005282,0.008334,1.0,-0.119137,-0.552997,,-0.345073,-0.487346,-0.755212,0.083814,0.06211,-0.013509
fresh_water,0.059732,-0.008689,0.114353,-0.119137,1.0,-0.063321,,-0.057463,-0.059554,-0.095509,-0.01494,0.039124,-0.010322
builds,-0.087528,0.076982,-0.024364,-0.552997,-0.063321,1.0,,-0.059784,-0.280787,0.782371,-0.042382,-0.177271,-0.22519
clouds,,,,,,,,,,,,,
bare_ground,0.037695,0.11206,0.032062,-0.345073,-0.057463,-0.059784,,1.0,-0.060457,0.379727,-0.027004,0.03961,0.0353
veg,0.026397,-0.139635,-0.040714,-0.487346,-0.059554,-0.280787,,-0.060457,1.0,-0.033817,-0.044652,0.071006,0.240616
lst,-0.095305,0.153652,-0.036427,-0.755212,-0.095509,0.782371,,0.379727,-0.033817,1.0,-0.051527,-0.289507,-0.244737


In [32]:
# Hold out 20% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _pipeline(estimador, escalar=False):
    """Wrap `estimador` in a Pipeline whose final step is named "model".

    When `escalar` is true a StandardScaler step named "scaler" is placed in
    front of it. The step names matter: the grids below address
    hyper-parameters through the "model__" prefix.
    """
    pasos = [("scaler", StandardScaler())] if escalar else []
    pasos.append(("model", estimador))
    return Pipeline(pasos)


# name -> (pipeline, hyper-parameter grid). Insertion order is the order in
# which the models are later trained and reported.
models = {
    'LinearRegression': (
        _pipeline(LinearRegression(), escalar=True),
        {},  # nothing to tune: the grid search evaluates a single candidate
    ),
    'RandomForestRegressor': (
        _pipeline(RandomForestRegressor(random_state=42, n_jobs=-1)),
        {
            "model__n_estimators": [200],
            "model__max_depth": [None, 8],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2],
        },
    ),
    'ExtraTrees': (
        _pipeline(ExtraTreesRegressor(random_state=42, n_jobs=-1)),
        {
            "model__n_estimators": [300, 500],
            "model__max_depth": [None, 15],
            "model__min_samples_split": [2, 5],
            "model__min_samples_leaf": [1, 2],
        },
    ),
    'GradientBoostingRegressor': (
        _pipeline(GradientBoostingRegressor(random_state=42)),
        {
            "model__n_estimators": [200, 400],
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [2, 3],
        },
    ),
    'SVR': (
        _pipeline(SVR(), escalar=True),
        {
            "model__C": [1.0, 10.0],
            "model__epsilon": [0.1, 0.3],
            "model__kernel": ["rbf"],
        },
    ),
    'MLPRegressor': (
        _pipeline(MLPRegressor(random_state=42, max_iter=3000), escalar=True),
        {
            "model__hidden_layer_sizes": [(64, 32), (128, 64, 32)],
            "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2],
            "model__activation": ["relu", "tanh"],
            "model__solver": ["adam", "lbfgs"],
        },
    ),
}

In [33]:
# Shared 10-fold splitter, used both for the grid search and for the
# out-of-fold (OOF) predictions below.
k = 10
cv = KFold(n_splits=k, shuffle=True, random_state=42)

results = []       # one summary row (dict) per model
best_models = {}   # model name -> pipeline refit with the best parameters


def _rmse_mae_r2(y_true, y_hat):
    """Return (RMSE, MAE, R2) of `y_hat` against `y_true` as plain floats."""
    return (
        float(np.sqrt(mean_squared_error(y_true, y_hat))),
        float(mean_absolute_error(y_true, y_hat)),
        float(r2_score(y_true, y_hat)),
    )


for name, (pipe, grid) in models.items():
    # Grid search on the training split only; refit=True retrains the best
    # candidate on all of X_train so `gs` can predict directly.
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring="neg_root_mean_squared_error",
        cv=cv,
        n_jobs=1,
        refit=True,
    )
    inicio = time.perf_counter()
    gs.fit(X_train, y_train)
    fit_time = round(time.perf_counter() - inicio, 3)

    # Hold-out metrics.
    rmse, mae, r2 = _rmse_mae_r2(y_test, gs.predict(X_test))

    # Out-of-fold predictions over the FULL dataset (train + test rows) with
    # the selected configuration; only the cross_val_predict call is timed.
    with parallel_backend("threading"):
        inicio = time.perf_counter()
        y_pred_oof = cross_val_predict(gs.best_estimator_, X, y, cv=cv, n_jobs=-1)
        cv_pred_time = round(time.perf_counter() - inicio, 3)

    rmse_oof, mae_oof, r2_oof = _rmse_mae_r2(y, y_pred_oof)

    print(f"\n=== {name} ===")
    print("Mejores params CV: ", gs.best_params_)
    print(f"Tiempo entrenamiento: {fit_time}s | Tiempo OOF-predict: {cv_pred_time}s")
    print(f"[TEST]   RMSE={rmse:.3f}  MAE={mae:.3f}  R2={r2:.3f}")
    print(f"[OOF-CV] RMSE={rmse_oof:.3f}  MAE={mae_oof:.3f}  R2={r2_oof:.3f}")

    fila = {
        "modelo": name,
        "best_params": gs.best_params_,
        "fit_time_s": fit_time,
        "cv_pred_time_s": cv_pred_time,
        "test_RMSE": rmse,
        "test_MAE": mae,
        "test_R2": r2,
        "oof_RMSE": rmse_oof,
        "oof_MAE": mae_oof,
        "oof_R2": r2_oof,
        # best_score_ is a negated RMSE, so flip the sign back
        "cv_best_RMSE": float(-gs.best_score_),
    }
    results.append(fila)

    best_models[name] = gs.best_estimator_

# Summary table; ascending=False puts the HIGHEST test RMSE in the first row.
res_df = pd.DataFrame(results).sort_values(by="test_RMSE", ascending=False)
display(res_df)


=== LinearRegression ===
Mejores params CV:  {}
Tiempo entrenamiento: 0.031s | Tiempo OOF-predict: 0.029s
[TEST]   RMSE=1.161  MAE=0.832  R2=0.784
[OOF-CV] RMSE=2.922  MAE=1.095  R2=-0.212

=== RandomForestRegressor ===
Mejores params CV:  {'model__max_depth': 8, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Tiempo entrenamiento: 6.672s | Tiempo OOF-predict: 0.741s
[TEST]   RMSE=0.895  MAE=0.531  R2=0.872
[OOF-CV] RMSE=1.368  MAE=0.576  R2=0.735

=== ExtraTrees ===
Mejores params CV:  {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}
Tiempo entrenamiento: 17.247s | Tiempo OOF-predict: 0.761s
[TEST]   RMSE=0.572  MAE=0.410  R2=0.948
[OOF-CV] RMSE=1.326  MAE=0.507  R2=0.751

=== GradientBoostingRegressor ===
Mejores params CV:  {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 400}
Tiempo entrenamiento: 4.766s | Tiempo OOF-predict: 0.503s
[TEST]   RMSE=0

Unnamed: 0,modelo,best_params,fit_time_s,cv_pred_time_s,test_RMSE,test_MAE,test_R2,oof_RMSE,oof_MAE,oof_R2,cv_best_RMSE
5,MLPRegressor,"{'model__activation': 'tanh', 'model__alpha': ...",210.105,10.408,1.819087,1.005553,0.471032,2.268098,0.970881,0.269902,1.602329
0,LinearRegression,{},0.031,0.029,1.161387,0.831977,0.784386,2.922069,1.095325,-0.211821,2.434841
4,SVR,"{'model__C': 10.0, 'model__epsilon': 0.1, 'mod...",0.082,0.027,1.040065,0.791301,0.827081,1.395885,0.777529,0.723461,1.251794
1,RandomForestRegressor,"{'model__max_depth': 8, 'model__min_samples_le...",6.672,0.741,0.895211,0.531452,0.871893,1.367622,0.576078,0.734546,1.07718
3,GradientBoostingRegressor,"{'model__learning_rate': 0.1, 'model__max_dept...",4.766,0.503,0.828785,0.481909,0.890199,1.362998,0.53645,0.736338,1.083108
2,ExtraTrees,"{'model__max_depth': None, 'model__min_samples...",17.247,0.761,0.57235,0.409715,0.947634,1.325563,0.507131,0.750622,1.046853


In [25]:
# Column order of the training matrix. `clasificar_rasters` reads this global
# to stack raster layers in the same order, so this cell has to be re-run
# whenever X changes.
feature_order = X.columns.tolist()
print("feature_order:", feature_order)

feature_order: ['anio', 'mes', 'dia', 'sea_water', 'fresh_water', 'builds', 'clouds', 'bare_ground', 'veg', 'lst', 'norte', 'este']


In [26]:
def cargar_banda(path):
    """Read every band of the raster at `path`.

    Returns a `(data, profile)` tuple: `data` is what `src.read()` yields for
    the opened dataset and `profile` is the dataset's `profile` attribute. The
    dataset is closed before returning.
    """
    with rasterio.open(path) as dataset:
        pixeles = dataset.read()
        perfil = dataset.profile
    return pixeles, perfil

def grilla_xy(profile):
    """Build per-pixel coordinate grids from a raster profile.

    Reads `profile["transform"]`, `profile["width"]` and `profile["height"]`.
    For column j the x value is `c + j * a`; for row i the y value is
    `f + i * e`. Only the a, c, e and f coefficients of the transform are
    used (the remaining terms are not read).

    NOTE(review): with a rasterio affine transform, c/f are presumably the
    outer corner of the first pixel, so these are pixel-corner rather than
    pixel-centre coordinates -- confirm this is the intended convention.

    Returns `(xs, ys)`, two float32 arrays of shape (height, width).
    """
    transform = profile["transform"]
    columnas = np.arange(profile["width"])
    filas = np.arange(profile["height"])

    coord_x = transform.c + columnas * transform.a
    coord_y = transform.f + filas * transform.e

    malla_x, malla_y = np.meshgrid(coord_x, coord_y)
    return malla_x.astype("float32"), malla_y.astype("float32")

def read_resample(path, ref_profile):
    """Read a raster as float32, resizing it to the reference grid if needed.

    Parameters
    ----------
    path : path of the raster to read.
    ref_profile : mapping with the "width", "height" and "transform" of the
        reference raster.

    Returns
    -------
    numpy array of dtype float32 with shape (bands, height, width). When the
    source size or transform differs from the reference, the data is read
    with `out_shape` set to the reference height/width using nearest-neighbour
    resampling; otherwise it is read as is.

    The band dimension of `out_shape` follows `src.count`, so multi-band
    rasters keep all of their bands instead of being forced into one.

    NOTE(review): a resized read only changes the pixel grid size of the
    source extent. If the source covers a different extent than the reference
    (different origin), the result has the right shape but is presumably not
    aligned -- confirm inputs share the same footprint.
    """
    ref_size = (ref_profile["width"], ref_profile["height"])
    with rasterio.open(path) as src:
        misma_grilla = (
            (src.width, src.height) == ref_size
            and src.transform == ref_profile["transform"]
        )
        if misma_grilla:
            return src.read().astype("float32")

        data = src.read(
            out_shape=(src.count, ref_profile["height"], ref_profile["width"]),
            resampling=rasterio.enums.Resampling.nearest,
        )
        return data.astype("float32")

# File-name suffix of each input raster inside a scene folder -> layer name.
_SUFIJOS_CAPAS = {
    "c0_sea_water.TIF": "sea_water",
    "c1_fresh_water.TIF": "fresh_water",
    "c2_builds.TIF": "builds",
    "c3_clouds.TIF": "clouds",
    "c4_bare_ground.TIF": "bare_ground",
    "c5_vegetation.TIF": "veg",
    "LST.TIF": "lst",
    "NDBI.TIF": "ndbi",
    "NDVI.TIF": "ndvi",
    "NDWI.TIF": "ndwi",
    "ST_EMIS.TIF": "st_emissivity",
}


def _fecha_de_carpeta(folder_name):
    """Return (year, month, day) from the 4th '_'-separated token, or None if absent."""
    partes = folder_name.split("_")
    if len(partes) < 4:
        return None
    fecha = partes[3]
    if len(fecha) < 8 or not fecha[:8].isdigit():
        return None
    return int(fecha[0:4]), int(fecha[4:6]), int(fecha[6:8])


def _buscar_rasters(folder_path):
    """Map layer name -> file path for every known raster suffix found in the folder."""
    rutas = {}
    for file_name in os.listdir(folder_path):
        for sufijo, capa in _SUFIJOS_CAPAS.items():
            if file_name.endswith(sufijo):
                rutas[capa] = os.path.join(folder_path, file_name)
                break
    return rutas


def _columnas_modelo(model, features=None):
    """Resolve the feature columns a model expects, in order."""
    if features is not None:
        return list(features)
    nombres = getattr(model, "feature_names_in_", None)
    if nombres is not None:
        return [str(n) for n in nombres]
    # Last resort: the notebook-level list captured from the training matrix.
    return list(feature_order)


def clasificar_rasters(path_raiz, best_models, features=None):
    """Predict a temperature raster per model for every scene folder.

    For each sub-folder of `path_raiz`:

    1. The acquisition date is parsed from the 4th '_'-separated token of the
       folder name (YYYYMMDD) and turned into constant `anio`/`mes`/`dia`
       layers.
    2. The eleven input rasters are located by file-name suffix and loaded
       with `cargar_banda`; all must share the same height/width.
    3. `este`/`norte` layers are generated from the LST raster's transform
       with `grilla_xy`.
    4. For every model the required layers are stacked into a pixel table
       (one row per pixel, one column per feature), the model predicts on it
       and the result is written next to the inputs as
       `Temperature_2<model name>.TIF` (single band, float32, NaN nodata),
       reusing the LST raster's profile.

    Parameters
    ----------
    path_raiz : directory containing one sub-folder per scene.
    best_models : mapping of model name -> fitted estimator with `predict`.
    features : optional explicit, ordered list of feature names. When omitted
        the columns come from each model's `feature_names_in_` (set by
        scikit-learn when the model was fitted on a DataFrame) and, failing
        that, from the notebook-level `feature_order`. Resolving columns from
        the model avoids predicting with a stale `feature_order` left over
        from an earlier run of the notebook.

    Folders whose name carries no date, or that lack any of the expected
    rasters, are reported and skipped so one incomplete scene does not abort
    the whole batch.

    Raises
    ------
    AssertionError : the rasters of a scene do not share height/width.
    KeyError : a model asks for a feature that is not among the built layers.
    """
    for folder_name in os.listdir(path_raiz):
        folder_path = os.path.join(path_raiz, folder_name)
        if not os.path.isdir(folder_path):
            continue

        fecha = _fecha_de_carpeta(folder_name)
        if fecha is None:
            print("Omitido (carpeta sin fecha AAAAMMDD en el nombre): ", folder_path)
            continue
        anio_file, mes_file, dia_file = fecha

        rutas = _buscar_rasters(folder_path)
        faltantes = [c for c in _SUFIJOS_CAPAS.values() if c not in rutas]
        if faltantes:
            print("Omitido (faltan rasters: " + ", ".join(faltantes) + "): ", folder_path)
            continue

        # Load every layer; the LST raster's profile is the georeference used
        # for the coordinate grid and for the output files.
        capa = {}
        profile = None
        for nombre, ruta in rutas.items():
            capa[nombre], perfil = cargar_banda(ruta)
            if nombre == "lst":
                profile = perfil

        formas = {datos.shape[1:] for datos in capa.values()}
        assert len(formas) == 1, "Las dimensiones no coinciden."

        _, H, W = capa["lst"].shape
        capa["anio"] = np.full((1, H, W), anio_file, dtype="float32")
        capa["mes"] = np.full((1, H, W), mes_file, dtype="float32")
        capa["dia"] = np.full((1, H, W), dia_file, dtype="float32")
        este, norte = grilla_xy(profile)
        capa["este"] = este[np.newaxis, :, :]
        capa["norte"] = norte[np.newaxis, :, :]

        # Work on a copy so the profile returned by the reader is not mutated.
        profile_out = profile.copy()
        profile_out.update(dtype=rasterio.float32, count=1, height=H, width=W, nodata=np.nan)

        # The pixel table is rebuilt only when a model needs different columns
        # than the previous one; keeping a single table bounds memory use.
        columnas_tabla = None
        df_pix = None
        for name, model in best_models.items():
            columnas = _columnas_modelo(model, features)
            if columnas != columnas_tabla:
                desconocidas = [c for c in columnas if c not in capa]
                if desconocidas:
                    raise KeyError(
                        f"El modelo '{name}' requiere capas no disponibles: {desconocidas}"
                    )
                multiband = np.concatenate([capa[c] for c in columnas], axis=0)
                # (bands, H, W) -> (H*W, bands): one row per pixel
                data = multiband.reshape(multiband.shape[0], -1).T
                df_pix = pd.DataFrame(data, columns=columnas)
                columnas_tabla = columnas

            preds = model.predict(df_pix)
            temperatura = preds.reshape(H, W)

            out_path = os.path.join(folder_path, f"Temperature_2{name}.TIF")
            with rasterio.open(out_path, "w", **profile_out) as dst:
                dst.write(temperatura.astype("float32"), 1)

            print("Procesado: ", out_path)

In [27]:
# Root folder holding one sub-folder per scene (absolute path on the author's
# machine). Every model in `best_models` writes one output raster into each
# scene folder.
path = '/Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena'
clasificar_rasters(path,best_models)

Procesado:  /Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena/LC08_L2SP_009053_20230117_20230131_02_T1/Temperature_2LinearRegression.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena/LC08_L2SP_009053_20230117_20230131_02_T1/Temperature_2RandomForestRegressor.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena/LC08_L2SP_009053_20230117_20230131_02_T1/Temperature_2ExtraTrees.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena/LC08_L2SP_009053_20230117_20230131_02_T1/Temperature_2GradientBoostingRegressor.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena/LC08_L2SP_009053_20230117_20230131_02_T1/Temperature_2SVR.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena/LC08_L2SP_009053_20230117_20230131_02_T1/Temperature_2MLPRegressor.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/10-heat_island_2/cartagena/LC08_L2SP_009053_20210212_2