In [64]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from joblib import parallel_backend

import numpy as np
import pandas as pd
import time, warnings
import rasterio
import os

warnings.filterwarnings("ignore")

In [65]:
df = pd.read_csv('/Users/jonny.sanchez/Documents/tesis/data_maestria_final.csv')
df.head(5)

Unnamed: 0,id,ciudad,cobertura,anio,b2,b3,b4,b5,b6,b7,b8,lst,ndvi,ndbi,ndwi,geometria
0,1,cartagena,Agua de mar,2020,0.029048,0.058033,0.017965,0.006085,0.008698,0.008175,0.106367,29.910431,-0.493971,0.176729,0.739323,POINT (-75.57684056914488 10.2832248167126)
1,2,barranquilla,Suelo desnudo,2022,0.050772,0.078548,0.10393,0.305972,0.290408,0.159287,0.109771,39.039948,0.492904,-0.026099,-0.574216,POINT (-74.89247223516077 10.981728222216416)
2,3,santa_marta,Construcciones,2022,0.073185,0.100107,0.114655,0.273715,0.24341,0.166493,0.148554,42.495575,0.409558,-0.058603,-0.417162,POINT (-74.19021265998644 11.22816510954076)
3,4,barranquilla,Agua dulce,2024,0.065348,0.09315,0.09722,0.04288,0.010127,0.006085,0.114099,31.759583,-0.387866,-0.617884,0.803878,POINT (-74.799935240058 11.031285067372442)
4,5,santa_marta,Construcciones,2022,0.08556,0.119935,0.14122,0.248057,0.264998,0.218165,0.133605,43.469696,0.274451,0.033018,-0.376852,POINT (-74.1657521159878 11.228641543231959)


In [66]:
le = LabelEncoder()

df['cobertura_le'] = le.fit_transform(df['cobertura'])
y = df['cobertura_le'].values

bands = ['b2','b3','b4','b6','lst','ndvi','ndbi']
available_cols = [c for c in bands if c in df.columns]

X = df[available_cols]

labels = le.classes_

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

models = {
    'RandomForest': (
        Pipeline([("model", RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'))]),
        {"model__n_estimators": [200],
        "model__max_depth": [None, 8],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2]}
        ),
    'ExtraTrees':(
        Pipeline([("model",ExtraTreesClassifier(random_state=42,n_jobs=-1))]),
        {
        "model__n_estimators": [300, 500],
        "model__max_depth": [None, 15],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2]
        }
    ),
    'GradientBoosting': (
        Pipeline([("model", GradientBoostingClassifier(random_state=42))]),
        {"model__n_estimators": [200, 400],
        "model__learning_rate": [0.05, 0.1],
        "model__max_depth": [2, 3]}
        ),
    'SVC': (
        Pipeline([("scaler", StandardScaler()), ("model", SVC(probability=True, class_weight='balanced', random_state=42))]),
        {
        "model__C": [0.5, 1, 2],
        "model__gamma": ["scale", "auto"],
        "model__kernel": ["rbf"]
        }
        ),
    'MLP': (
        Pipeline([("scaler", StandardScaler()),("model", MLPClassifier(max_iter=1000, early_stopping=True, n_iter_no_change=25, random_state=42))]),
        {"model__hidden_layer_sizes": [(64, 32), (128, 64, 32)], 
         "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2], 
         "model__activation": ["relu","tanh"],
         "model__solver": ["adam"]}
    ),
}

In [68]:
k          = 10
cv         = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

results = []
best_models = {}
labels = np.unique(y)

for name, (pipe, param_grid) in models.items():
    gs = GridSearchCV(pipe, param_grid, scoring="accuracy", cv=cv, n_jobs=-1, refit=True)
    tic = time.perf_counter()
    gs.fit(X_train, y_train)
    fit_time = round(time.perf_counter() - tic, 3)

    # ---- Predicción en TEST (hold-out) ----
    y_pred_test = gs.predict(X_test)
    acc_test = accuracy_score(y_test, y_pred_test)
    f1_test  = f1_score(y_test, y_pred_test, average="macro", zero_division=0)
    cm_test  = confusion_matrix(y_test, y_pred_test, labels=labels)

    # ---- OOF predict (CV sobre TODO X,y) cronometrado ----
    with parallel_backend("threading"):
        tic = time.perf_counter()
        y_pred_oof = cross_val_predict(gs.best_estimator_, X, y, cv=cv, n_jobs=-1)
        cv_time = round(time.perf_counter() - tic, 3)

    acc_oof = accuracy_score(y, y_pred_oof)
    f1_oof  = f1_score(y, y_pred_oof, average="macro", zero_division=0)
    cm_oof  = confusion_matrix(y, y_pred_oof, labels=labels)

    # ---- Imprimir como NB1 (simple y directo) ----
    print(f"\n=== {name} ===")
    print("Mejores params CV:", gs.best_params_)
    print(f"Tiempo entrenamiento: {fit_time:.3f}s | Tiempo OOF-predict: {cv_time:.3f}s")
    print(f"[TEST]   ACC={acc_test:.3f}  F1={f1_test:.3f}")
    print(f"[OOF-CV] ACC={acc_oof:.3f}  F1={f1_oof:.3f}")

    print("Reporte de clasificación (TEST):")
    display(pd.DataFrame(classification_report(y_test, y_pred_test, output_dict=True, zero_division=0)).T.round(3))
    print("Matriz de confusión (TEST):")
    display(pd.DataFrame(cm_test, index=labels, columns=labels))
    print("Matriz de confusión (OOF-CV):")
    display(pd.DataFrame(cm_oof, index=labels, columns=labels))

    # ---- Guardar resumen y mejor estimador ----
    results.append({
        "modelo": name,
        "best_cv_score": float(gs.best_score_),
        "ACC_test": float(acc_test),
        "F1_test": float(f1_test),
        "ACC_oof": float(acc_oof),
        "F1_oof": float(f1_oof),
        "fit_time_s": fit_time,
        "cv_pred_time_s": cv_time
    })
    best_models[name] = gs.best_estimator_

res_df = pd.DataFrame(results).sort_values("best_cv_score", ascending=False)
print("\nResumen (ordenado por score CV):")
display(res_df)


=== RandomForest ===
Mejores params CV: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Tiempo entrenamiento: 9.083s | Tiempo OOF-predict: 2.200s
[TEST]   ACC=0.974  F1=0.976
[OOF-CV] ACC=0.973  F1=0.976
Reporte de clasificación (TEST):


Unnamed: 0,precision,recall,f1-score,support
0,0.971,0.975,0.973,408.0
1,0.961,0.955,0.958,287.0
2,0.979,0.981,0.98,698.0
3,1.0,1.0,1.0,140.0
4,0.968,0.962,0.965,599.0
5,0.977,0.98,0.978,644.0
accuracy,0.974,0.974,0.974,0.974
macro avg,0.976,0.975,0.976,2776.0
weighted avg,0.974,0.974,0.974,2776.0


Matriz de confusión (TEST):


Unnamed: 0,0,1,2,3,4,5
0,398,10,0,0,0,0
1,12,274,0,0,0,1
2,0,0,685,0,9,4
3,0,0,0,140,0,0
4,0,0,13,0,576,10
5,0,1,2,0,10,631


Matriz de confusión (OOF-CV):


Unnamed: 0,0,1,2,3,4,5
0,1380,21,0,0,1,1
1,34,898,0,0,1,3
2,0,0,2191,0,46,15
3,0,0,0,469,0,0
4,0,0,32,0,1986,41
5,1,6,7,0,37,2082



=== ExtraTrees ===
Mejores params CV: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}
Tiempo entrenamiento: 9.008s | Tiempo OOF-predict: 1.630s
[TEST]   ACC=0.978  F1=0.980
[OOF-CV] ACC=0.978  F1=0.982
Reporte de clasificación (TEST):


Unnamed: 0,precision,recall,f1-score,support
0,0.978,0.988,0.983,408.0
1,0.979,0.965,0.972,287.0
2,0.983,0.979,0.981,698.0
3,1.0,1.0,1.0,140.0
4,0.97,0.965,0.967,599.0
5,0.975,0.984,0.98,644.0
accuracy,0.978,0.978,0.978,0.978
macro avg,0.981,0.98,0.98,2776.0
weighted avg,0.978,0.978,0.978,2776.0


Matriz de confusión (TEST):


Unnamed: 0,0,1,2,3,4,5
0,403,5,0,0,0,0
1,9,277,0,0,0,1
2,0,0,683,0,10,5
3,0,0,0,140,0,0
4,0,0,11,0,578,10
5,0,1,1,0,8,634


Matriz de confusión (OOF-CV):


Unnamed: 0,0,1,2,3,4,5
0,1390,11,0,0,1,1
1,19,914,0,0,0,3
2,0,0,2182,0,52,18
3,0,0,0,469,0,0
4,0,0,20,0,2004,35
5,0,5,7,0,28,2093



=== GradientBoosting ===
Mejores params CV: {'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__n_estimators': 400}
Tiempo entrenamiento: 103.265s | Tiempo OOF-predict: 19.826s
[TEST]   ACC=0.971  F1=0.973
[OOF-CV] ACC=0.971  F1=0.973
Reporte de clasificación (TEST):


Unnamed: 0,precision,recall,f1-score,support
0,0.968,0.978,0.973,408.0
1,0.961,0.944,0.953,287.0
2,0.97,0.974,0.972,698.0
3,1.0,1.0,1.0,140.0
4,0.971,0.958,0.965,599.0
5,0.972,0.981,0.977,644.0
accuracy,0.971,0.971,0.971,0.971
macro avg,0.974,0.973,0.973,2776.0
weighted avg,0.971,0.971,0.971,2776.0


Matriz de confusión (TEST):


Unnamed: 0,0,1,2,3,4,5
0,399,9,0,0,0,0
1,13,271,0,0,0,3
2,0,0,680,0,11,7
3,0,0,0,140,0,0
4,0,0,17,0,574,8
5,0,2,4,0,6,632


Matriz de confusión (OOF-CV):


Unnamed: 0,0,1,2,3,4,5
0,1364,37,0,0,1,1
1,35,896,1,0,0,4
2,0,0,2186,0,51,15
3,0,0,0,468,1,0
4,0,0,35,0,1989,35
5,1,9,12,0,29,2082



=== SVC ===
Mejores params CV: {'model__C': 2, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
Tiempo entrenamiento: 3.827s | Tiempo OOF-predict: 1.066s
[TEST]   ACC=0.943  F1=0.940
[OOF-CV] ACC=0.943  F1=0.942
Reporte de clasificación (TEST):


Unnamed: 0,precision,recall,f1-score,support
0,0.927,0.873,0.899,408.0
1,0.816,0.909,0.86,287.0
2,0.978,0.97,0.974,698.0
3,1.0,1.0,1.0,140.0
4,0.948,0.94,0.944,599.0
5,0.96,0.963,0.961,644.0
accuracy,0.943,0.943,0.943,0.943
macro avg,0.938,0.942,0.94,2776.0
weighted avg,0.944,0.943,0.943,2776.0


Matriz de confusión (TEST):


Unnamed: 0,0,1,2,3,4,5
0,356,52,0,0,0,0
1,26,261,0,0,0,0
2,0,0,677,0,18,3
3,0,0,0,140,0,0
4,0,0,13,0,563,23
5,2,7,2,0,13,620


Matriz de confusión (OOF-CV):


Unnamed: 0,0,1,2,3,4,5
0,1243,159,0,0,0,1
1,79,855,0,0,0,2
2,0,0,2168,0,71,13
3,0,0,0,469,0,0
4,0,0,29,0,1944,86
5,9,22,10,0,44,2048



=== MLP ===
Mejores params CV: {'model__activation': 'tanh', 'model__alpha': 0.01, 'model__hidden_layer_sizes': (128, 64, 32), 'model__solver': 'adam'}
Tiempo entrenamiento: 15.563s | Tiempo OOF-predict: 22.056s
[TEST]   ACC=0.977  F1=0.977
[OOF-CV] ACC=0.974  F1=0.974
Reporte de clasificación (TEST):


Unnamed: 0,precision,recall,f1-score,support
0,0.966,0.978,0.972,408.0
1,0.964,0.944,0.954,287.0
2,0.983,0.989,0.986,698.0
3,1.0,1.0,1.0,140.0
4,0.981,0.965,0.973,599.0
5,0.974,0.984,0.979,644.0
accuracy,0.977,0.977,0.977,0.977
macro avg,0.978,0.977,0.977,2776.0
weighted avg,0.977,0.977,0.977,2776.0


Matriz de confusión (TEST):


Unnamed: 0,0,1,2,3,4,5
0,399,9,0,0,0,0
1,14,271,0,0,0,2
2,0,0,690,0,4,4
3,0,0,0,140,0,0
4,0,0,10,0,578,11
5,0,1,2,0,7,634


Matriz de confusión (OOF-CV):


Unnamed: 0,0,1,2,3,4,5
0,1343,55,0,0,2,3
1,33,897,0,0,0,6
2,0,0,2202,0,39,11
3,0,0,0,469,0,0
4,0,0,20,0,2009,30
5,0,7,7,0,30,2089



Resumen (ordenado por score CV):


Unnamed: 0,modelo,best_cv_score,ACC_test,F1_test,ACC_oof,F1_oof,fit_time_s,cv_pred_time_s
1,ExtraTrees,0.97483,0.978026,0.980458,0.978383,0.981841,9.008,1.63
2,GradientBoosting,0.968346,0.971182,0.973227,0.971141,0.973224,103.265,19.826
0,RandomForest,0.967729,0.974063,0.975706,0.973411,0.976326,9.083,2.2
4,MLP,0.965721,0.976945,0.977357,0.973735,0.974136,15.563,22.056
3,SVC,0.938235,0.942723,0.93969,0.943256,0.941555,3.827,1.066


In [69]:
feature_order = list(X.columns)
print("feature_order:", feature_order)

feature_order: ['b2', 'b3', 'b4', 'b6', 'lst', 'ndvi', 'ndbi']


In [70]:
def cargar_banda(path):
    with rasterio.open(path) as src:
        return src.read(), src.profile

def clasificar_rasters(path_raiz, best_models, le, feature_order):

    for folder_name in os.listdir(path_raiz):
        folder_path = os.path.join(path_raiz, folder_name)
        if os.path.isdir(folder_path):
            b2 = b3 =b4 = b6 = lst = ndvi = ndbi = ndwi = None
            for file_name in os.listdir(folder_path):
                if file_name.endswith("B2.TIF"):
                    b2 = os.path.join(folder_path,file_name)
                elif file_name.endswith("B3.TIF"):
                    b3 = os.path.join(folder_path,file_name)
                elif file_name.endswith("B4.TIF"):
                    b4 = os.path.join(folder_path,file_name)
                elif file_name.endswith("B6.TIF"):
                    b6 = os.path.join(folder_path,file_name)                      
                elif file_name.endswith("LST.TIF"):
                    lst = os.path.join(folder_path,file_name)           
                elif file_name.endswith("NDVI.TIF"):
                    ndvi = os.path.join(folder_path,file_name)
                elif file_name.endswith("NDBI.TIF"):
                    ndbi = os.path.join(folder_path,file_name)
                elif file_name.endswith("NDWI.TIF"):
                    ndwi = os.path.join(folder_path,file_name)

        # cargar y apilar
            b2, profile = cargar_banda(b2)   # 3×H×W
            b3, _ = cargar_banda(b3)
            b4, _ = cargar_banda(b4)
            b6, _ = cargar_banda(b6)
            lst, _  = cargar_banda(lst)
            ndvi, _ = cargar_banda(ndvi)
            ndbi, _ = cargar_banda(ndbi)
            ndwi, _ = cargar_banda(ndwi)

            capa = {
                "b2": b2, "b3": b3, "b4": b4, "b6": b6,
                "banda2": b2, "banda3": b3, "banda4": b4, "banda6": b6,
                "lst": lst, "ndvi": ndvi, "ndbi": ndbi, "ndwi": ndwi,
            }

            arrays = [capa[f] for f in feature_order]  # cada uno es (1, H, W)

            multiband = np.concatenate((arrays), axis=0)
            num_bands, alto, ancho = multiband.shape
        
            data = multiband.reshape(num_bands, -1).T
            df_pix = pd.DataFrame(data, columns=feature_order)

            profile_out = profile.copy()
            profile_out.update(dtype=rasterio.uint8, count=1, height=int(alto), width=int(ancho))

            for name, model in best_models.items():
                preds = model.predict(df_pix)
                classification = preds.reshape(alto, ancho)

                out_path = os.path.join(folder_path, f"Land_Cover_{name}.TIF")

                with rasterio.open(out_path, "w", **profile) as dst:
                    dst.write(classification.astype(np.uint8), 1)
                    
                print("Procesado: ", out_path)

In [75]:
path = '/Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla'

clasificar_rasters(
    path_raiz     = path,
    best_models   = best_models,   
    le            = le,          
    feature_order = feature_order
)

Procesado:  /Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla/LC08_L2SP_009052_20180204_20200902_02_T1/Land_Cover_RandomForest.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla/LC08_L2SP_009052_20180204_20200902_02_T1/Land_Cover_ExtraTrees.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla/LC08_L2SP_009052_20180204_20200902_02_T1/Land_Cover_GradientBoosting.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla/LC08_L2SP_009052_20180204_20200902_02_T1/Land_Cover_SVC.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla/LC08_L2SP_009052_20180204_20200902_02_T1/Land_Cover_MLP.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla/LC08_L2SP_009052_20170116_20200905_02_T1/Land_Cover_RandomForest.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-classification/barranquilla/LC08_L2SP_009052_20170116_20200905_02_T1/Land_Cov

In [72]:
mapping = dict(zip(le.classes_, range(len(le.classes_))))
print("Codificación de 'cobertura':", mapping)

Codificación de 'cobertura': {'Agua de mar': 0, 'Agua dulce': 1, 'Construcciones': 2, 'Nubes': 3, 'Suelo desnudo': 4, 'Vegetación': 5}
