In [119]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

In [120]:
# Load the training samples from disk and print summary statistics
# for the numeric columns.
ruta_csv = '/Users/jonny.sanchez/Documents/tesis/data_maestria_final.csv'
df = pd.read_csv(ruta_csv)
resumen = df.describe()
print(resumen)

                id         anio           b2           b3           b4  \
count  9252.000000  9252.000000  9252.000000  9252.000000  9252.000000   
mean   4626.500000  2019.366732     0.081777     0.111034     0.113503   
std    2670.966679     3.075961     0.128197     0.126808     0.136314   
min       1.000000  2015.000000     0.000104     0.004600    -0.009776   
25%    2313.750000  2018.000000     0.024785     0.054623     0.037071   
50%    4626.500000  2020.000000     0.048765     0.077200     0.084817   
75%    6939.250000  2022.000000     0.086199     0.119227     0.138223   
max    9252.000000  2024.000000     1.089833     1.118130     1.158033   

                b5           b6           b7           b8          lst  \
count  9252.000000  9252.000000  9252.000000  9252.000000  9252.000000   
mean      0.232624     0.206552     0.141613     0.136371    36.220818   
std       0.165465     0.152703     0.124779     0.111441     8.529719   
min      -0.131928     0.000282     0

In [185]:
# Encode the text labels stored in 'cobertura' as integer codes (the target).
le = LabelEncoder()
df['cobertura_le'] = le.fit_transform(df['cobertura'])
labels = le.classes_  # class names, indexed by their integer code
y = df['cobertura_le'].to_numpy()

# Columns left out of the feature matrix; whatever remains in df is a predictor.
columnas_excluidas = [
    'id', 'ciudad', 'cobertura', 'geometria', 'anio',
    'cobertura_le', 'ndwi', 'b8', 'b5', 'b7',
]
X = df.drop(columns=columnas_excluidas)

In [186]:
# Show the feature matrix; pandas prints a truncated preview for long frames.
print(X)

            b2        b3        b4        b6        lst      ndvi      ndbi
0     0.029048  0.058033  0.017965  0.008698  29.910431 -0.493971  0.176729
1     0.050772  0.078548  0.103930  0.290408  39.039948  0.492904 -0.026099
2     0.073185  0.100107  0.114655  0.243410  42.495575  0.409558 -0.058603
3     0.065348  0.093150  0.097220  0.010127  31.759583 -0.387866 -0.617884
4     0.085560  0.119935  0.141220  0.264998  43.469696  0.274451  0.033018
...        ...       ...       ...       ...        ...       ...       ...
9247  0.070270  0.109017  0.141082  0.393147  51.703705  0.309870  0.189693
9248  0.069307  0.093700  0.097605  0.020467  30.679474 -0.101994 -0.590670
9249  0.056823  0.082975  0.103188  0.283258  40.554138  0.437969  0.035175
9250  0.015738  0.030917  0.021320  0.035152  27.845947  0.138368  0.110313
9251  0.671420  0.700405  0.730490  0.772263  19.430786  0.058466 -0.030719

[9252 rows x 7 columns]


In [187]:
# Candidate classifiers, keyed by the name used in the reports that follow.
# Every estimator is seeded with random_state=42 so results are repeatable.
models = dict(
    RandomForest=RandomForestClassifier(
        n_estimators=200, max_features='sqrt', random_state=42, n_jobs=-1,
    ),
    GradientBoosting=GradientBoostingClassifier(
        n_estimators=250, learning_rate=0.05, max_depth=3, random_state=42,
    ),
    ExtraTrees=ExtraTreesClassifier(
        n_estimators=300, max_features='sqrt', random_state=42, n_jobs=-1,
    ),
)

In [188]:
# Stratified, shuffled k-fold split shared by all models.
k = 10
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Result containers, each keyed by model name.
confusion, run_times, reports = {}, {}, {}  # confusion matrices / seconds / detailed metrics
best_models = {}  # not filled inside this loop

for name, clf in models.items():
    # Time the out-of-fold predictions for this model.
    tic = time.perf_counter()
    with parallel_backend('threading'):
        y_pred = cross_val_predict(clf, X, y, cv=cv, n_jobs=-1)
    toc = time.perf_counter()
    run_times[name] = round(toc - tic, 2)

    # Confusion matrix labelled with the original class names.
    cm_df = pd.DataFrame(confusion_matrix(y, y_pred), index=labels, columns=labels)
    confusion[name] = cm_df

    # Classification report kept as a DataFrame so it is not truncated when shown.
    report_dict = classification_report(y, y_pred, target_names=labels, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    reports[name] = report_df

In [194]:
# Print the recorded processing time (seconds) of every model as a labelled Series.
print('================ TIEMPOS DE PROCESAMIENTO ================')
tiempos_cv = pd.Series(run_times, name='segundos')
print(tiempos_cv, '\n')

RandomForest         2.95
GradientBoosting    20.45
ExtraTrees           3.39
Name: segundos, dtype: float64 



In [195]:
# Confusion matrix and per-class metrics stored for RandomForest.
modelo = 'RandomForest'
print(f'================ {modelo} ================')
print(confusion[modelo])
print(reports[modelo])

                Agua de mar  Agua dulce  Construcciones  Nubes  Suelo desnudo  \
Agua de mar            1372          29               0      0              1   
Agua dulce               37         896               0      0              0   
Construcciones            0           0            2189      0             48   
Nubes                     0           0               0    469              0   
Suelo desnudo             0           0              28      0           1987   
Vegetación                0           7               8      0             35   

                Vegetación  
Agua de mar              1  
Agua dulce               3  
Construcciones          15  
Nubes                    0  
Suelo desnudo           44  
Vegetación            2083  
                precision    recall  f1-score     support
Agua de mar      0.973740  0.977904  0.975818  1403.00000
Agua dulce       0.961373  0.957265  0.959315   936.00000
Construcciones   0.983820  0.972025  0.977887  2252.000

In [196]:
# Confusion matrix and per-class metrics stored for GradientBoosting.
modelo = 'GradientBoosting'
print(f'================ {modelo} ================')
print(confusion[modelo])
print(reports[modelo])

                Agua de mar  Agua dulce  Construcciones  Nubes  Suelo desnudo  \
Agua de mar            1358          43               0      0              1   
Agua dulce               37         890               1      0              3   
Construcciones            0           0            2184      0             52   
Nubes                     0           0               1    467              1   
Suelo desnudo             0           0              37      0           1989   
Vegetación                1          10              10      0             33   

                Vegetación  
Agua de mar              1  
Agua dulce               5  
Construcciones          16  
Nubes                    0  
Suelo desnudo           33  
Vegetación            2079  
                precision    recall  f1-score      support
Agua de mar      0.972779  0.967926  0.970347  1403.000000
Agua dulce       0.943796  0.950855  0.947312   936.000000
Construcciones   0.978056  0.969805  0.973913  2252.

In [197]:
# Confusion matrix and per-class metrics stored for ExtraTrees.
modelo = 'ExtraTrees'
print(f'================ {modelo} ================')
print(confusion[modelo])
print(reports[modelo])

                Agua de mar  Agua dulce  Construcciones  Nubes  Suelo desnudo  \
Agua de mar            1390          11               0      0              1   
Agua dulce               20         913               0      0              0   
Construcciones            0           0            2182      0             52   
Nubes                     0           0               0    469              0   
Suelo desnudo             0           0              20      0           2005   
Vegetación                0           5               7      0             28   

                Vegetación  
Agua de mar              1  
Agua dulce               3  
Construcciones          18  
Nubes                    0  
Suelo desnudo           34  
Vegetación            2093  
                precision    recall  f1-score      support
Agua de mar      0.985816  0.990734  0.988269  1403.000000
Agua dulce       0.982777  0.975427  0.979088   936.000000
Construcciones   0.987777  0.968917  0.978256  2252.

In [198]:
# Refit every classifier on 100 % of the data and keep it in memory.
for name, clf in models.items():
    best_models[name] = clf.fit(X, y)  # fit() returns the fitted estimator itself
    print(f'{name} almacenado en best_models["{name}"]')

disponibles = list(best_models.keys())
print("\nModelos disponibles en memoria:", disponibles)

RandomForest almacenado en best_models["RandomForest"]
GradientBoosting almacenado en best_models["GradientBoosting"]
ExtraTrees almacenado en best_models["ExtraTrees"]

Modelos disponibles en memoria: ['RandomForest', 'GradientBoosting', 'ExtraTrees']


In [204]:
import rasterio
import os

def cargar_banda(path):
    """
    Open the raster at ``path`` and return ``(array, profile)``.

    ``array`` is whatever ``src.read()`` yields for the dataset (all of its
    bands) and ``profile`` is the dataset's ``profile`` attribute. The file is
    closed before returning.
    """
    with rasterio.open(path) as dataset:
        pixeles = dataset.read()
        perfil = dataset.profile
    return pixeles, perfil

def clasificar_rasters(path_raiz, best_models, le, feature_order):
    """
    Classify every raster stack found under ``path_raiz`` with each model.

    Each sub-folder of ``path_raiz`` is treated as one scene. For every name in
    ``feature_order`` the scene must contain a file whose name ends with
    ``<NAME>.TIF`` (name upper-cased, e.g. ``'b2'`` -> ``'B2.TIF'``). The
    rasters are stacked in ``feature_order`` order, flattened to one row per
    pixel and passed to ``predict`` of every model in ``best_models``. Each
    prediction is written inside the scene folder as a single-band uint8
    raster named ``<model>.TIF``.

    Parameters
    ----------
    path_raiz : str
        Directory whose sub-folders hold the per-scene rasters. Entries that
        are not directories are ignored.
    best_models : dict
        ``{model name: fitted estimator}``; every estimator must expose
        ``predict``.
    le : object
        Label encoder used at training time. Not used by this function; kept
        in the signature so existing calls keep working.
    feature_order : list of str
        Feature names in the order the models were trained with. They are used
        both to locate the band files and as the column names of the frame
        handed to ``predict``, so the names seen at prediction time match the
        ones seen at fit time. If empty or ``None``, the seven bands
        ``b2, b3, b4, b6, lst, ndvi, ndbi`` are used.

    Returns
    -------
    dict
        ``{folder name: {model name: seconds}}`` where ``seconds`` is the time
        spent predicting and writing the output raster, rounded to two
        decimals. Scenes skipped because of missing bands are not included.

    Raises
    ------
    AssertionError
        If the rasters of a scene do not share the same height and width.
    """
    # Fall back to the bands the function originally hard-coded.
    columnas = list(feature_order) if feature_order else [
        'b2', 'b3', 'b4', 'b6', 'lst', 'ndvi', 'ndbi'
    ]
    sufijos = {nombre: "{}.TIF".format(str(nombre).upper()) for nombre in columnas}
    tiempos = {}

    # sorted() makes the processing order independent of the file system.
    for folder_name in sorted(os.listdir(path_raiz)):
        folder_path = os.path.join(path_raiz, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # Locate one file per requested feature.
        rutas = {}
        for file_name in os.listdir(folder_path):
            for nombre, sufijo in sufijos.items():
                if file_name.endswith(sufijo):
                    rutas[nombre] = os.path.join(folder_path, file_name)
                    break

        # An incomplete scene is reported and skipped instead of crashing
        # inside rasterio with a path of None.
        faltantes = [nombre for nombre in columnas if nombre not in rutas]
        if faltantes:
            print("Omitido: ", folder_path, "faltan bandas:", faltantes)
            continue

        # Load the bands in feature order; the first band's profile is the
        # template for the output rasters.
        bandas = []
        profile = None
        for nombre in columnas:
            banda, perfil = cargar_banda(rutas[nombre])
            if profile is None:
                profile = perfil
            bandas.append(banda)

        referencia = bandas[0].shape[1:]
        assert all(banda.shape[1:] == referencia for banda in bandas), "Las dimensiones no coinciden."

        multiband = np.concatenate(bandas, axis=0)
        num_bands, alto, ancho = multiband.shape

        # One row per pixel, one column per feature, named as in training.
        data = multiband.reshape(num_bands, -1).T
        pixeles = pd.DataFrame(data, columns=columnas)

        # The output layout is the same for every model, so set it once.
        profile.update(dtype=rasterio.uint8, count=1, height=int(alto), width=int(ancho))

        tiempos[folder_name] = {}
        for name, model in best_models.items():
            tic = time.perf_counter()
            preds = model.predict(pixeles)
            classification = preds.reshape(alto, ancho)

            salida = os.path.join(folder_path, "{}.TIF".format(name))
            with rasterio.open(salida, "w", **profile) as dst:
                dst.write(classification.astype(np.uint8), 1)

            tiempos[folder_name][name] = round(time.perf_counter() - tic, 2)
            print("Procesado: ", folder_path, "{}.TIF".format(name))

    return tiempos

In [207]:
path = '/Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta'
feature_order = list(X.columns)    # column order of X: ['b2', 'b3', 'b4', 'b6', 'lst', 'ndvi', 'ndbi']
print(feature_order)
clasificar_rasters(
    path_raiz     = path,
    best_models   = best_models,   # RandomForest, GradientBoosting, ExtraTrees
    le            = le,           # LabelEncoder holding the class names
    feature_order = feature_order
)

['b2', 'b3', 'b4', 'b6', 'lst', 'ndvi', 'ndbi']
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta/LC08_L2SP_009052_20180204_20200902_02_T1 RandomForest.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta/LC08_L2SP_009052_20180204_20200902_02_T1 GradientBoosting.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta/LC08_L2SP_009052_20180204_20200902_02_T1 ExtraTrees.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta/LC08_L2SP_009052_20170116_20200905_02_T1 RandomForest.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta/LC08_L2SP_009052_20170116_20200905_02_T1 GradientBoosting.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta/LC08_L2SP_009052_20170116_20200905_02_T1 ExtraTrees.TIF
Procesado:  /Users/jonny.sanchez/Documents/tesis/8-clasification/santa_marta/LC08_L2SP_009052_20150401_20200909_02_T1 RandomForest.TIF
Pro

In [76]:
# Map every class name to its position in le.classes_ (its integer code).
mapping = {clase: codigo for codigo, clase in enumerate(le.classes_)}
print("Codificación de 'cobertura':", mapping)

Codificación de 'cobertura': {'Agua de mar': 0, 'Agua dulce': 1, 'Construcciones': 2, 'Nubes': 3, 'Suelo desnudo': 4, 'Vegetación': 5}
