#### OBJETIVO: Hacer pruebas de los modelos de ML para seleccionar los hiperparámetros a través de Grid Search

Comentario: las búsquedas suelen tardar bastante; el tiempo depende de la cantidad de información.
- Pregunta: ¿qué parámetros fueron iguales en ambos lenguajes?, ¿similares?, ¿muy diferentes? ¡Comparar!

In [19]:
#Librerias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, root_mean_squared_error
import joblib
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
print("librerias ok")

librerias ok


In [None]:
### -------------- SVR --------------------------
# Grid-search the hyperparameters of an RBF-kernel SVR and check the winning
# configuration on the held-out test split.
# SVR must ALWAYS be used with scaled variables.

# Input settings
estacion = "BA"
modelo = "1"

dir_base = f"D:/Josefina/Proyectos/ProyectoChile/{estacion}/modelos/ParticionDataSet/"

train_data = pd.read_csv(
    f"{dir_base}/Modelo_{modelo}/M{modelo}_train_{estacion}.csv"
)

test_data = pd.read_csv(
    f"{dir_base}/Modelo_{modelo}/M{modelo}_test_{estacion}.csv"
)

# Predictor columns -- review this list for each site
features = [
    "AOD_055", "ndvi", "BCSMASS_dia", "DUSMASS_dia",
    "SO2SMASS_dia", "SO4SMASS_dia", "SSSMASS_dia",
    "blh_mean", "sp_mean", "d2m_mean",
    "v10_mean", "u10_mean", "tp_mean",
    "DEM", "dayWeek"
]

# Split predictors and target (PM25) for both partitions
X_train = train_data[features]
y_train = train_data["PM25"]

X_test = test_data[features]
y_test = test_data["PM25"]

# Pipeline (scaling + SVR)
# A Pipeline chains the steps so they always run in the right order, without
# data leakage.
#
# In each CV fold:
# - the scaler is fitted on the training fold only
# - the validation fold is transformed
# - the SVR is trained
# - the fold is scored

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svr", SVR(kernel="rbf"))
])

# Hyperparameter grid to evaluate (4 x 4 x 4 = 64 candidates)
param_grid = {
    "svr__C": [0.1, 1, 10, 100],
    "svr__epsilon": [0.01, 0.1, 0.2, 0.5],
    "svr__gamma": [0.001, 0.01, 0.1, 1]
}

# Grid search with cross-validation: find the best hyperparameter combination
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",      # candidates are ranked by R2
    cv=5,              # similar to tune.svm
    n_jobs=-1,         # use all cores
    verbose=2
)
# Fit every candidate
grid.fit(X_train, y_train)

#############################
# Best hyperparameters
print("Mejores parametros:")
print(grid.best_params_)
best_svr = grid.best_estimator_
# For BA this took 2m9

# Test-set prediction, only to inspect the metrics of the selected
# hyperparameters; these are not the metrics of the final model.
y_pred = best_svr.predict(X_test)

r2 = r2_score(y_test, y_pred)
# Use root_mean_squared_error, which the library cell imports;
# mean_squared_error is never imported in this notebook, so the previous
# np.sqrt(mean_squared_error(...)) raised NameError on a fresh kernel.
rmse = root_mean_squared_error(y_test, y_pred)
# Mean signed error (prediction minus observation)
bias = np.mean(y_pred - y_test)

# Results
resultados_SVR = pd.DataFrame({
    "R2": [r2],
    "RMSE": [rmse],
    "Bias": [bias],
    "min": [y_pred.min()],
    "max": [y_pred.max()]
})

print(resultados_SVR)


## Save model
# output_dir = f"D:/Josefina/Proyectos/Tesis/{estacion}/modelos/"
# joblib.dump(best_svr, f"{output_dir}/01-SVR-M{modelo}-tuned-{estacion}.joblib")


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Mejores parámetros:
{'svr__C': 10, 'svr__epsilon': 0.5, 'svr__gamma': 0.1}
        R2     RMSE      Bias       min        max
0  0.47091  7.72543 -1.044734  2.136073  45.452022


In [12]:
### -------------- Extra trees --------------------------
# Grid-search an ExtraTreesRegressor and report test-set metrics for the
# refit best estimator.

# Input settings
estacion = "BA"
modelo = "1"

dir_base = f"D:/Josefina/Proyectos/ProyectoChile/{estacion}/modelos/ParticionDataSet/"

# Train / test partitions for the selected station and model
train_data = pd.read_csv(
    f"{dir_base}/Modelo_{modelo}/M{modelo}_train_{estacion}.csv"
)
test_data = pd.read_csv(
    f"{dir_base}/Modelo_{modelo}/M{modelo}_test_{estacion}.csv"
)

# Predictor columns
features = [
    "AOD_055",
    "ndvi",
    "BCSMASS_dia",
    "DUSMASS_dia",
    "SO2SMASS_dia",
    "SO4SMASS_dia",
    "SSSMASS_dia",
    "blh_mean",
    "sp_mean",
    "d2m_mean",
    "v10_mean",
    "u10_mean",
    "tp_mean",
    "t2m_mean",
    "DEM",
    "dayWeek",
]

# Predictors and target (PM25) for each partition
X_train, y_train = train_data[features], train_data["PM25"]
X_test, y_test = test_data[features], test_data["PM25"]

# Base model: 500 trees, fixed seed
et_model = ExtraTreesRegressor(n_estimators=500, n_jobs=-1, random_state=123)

# Hyperparameter grid to evaluate
param_grid = dict(
    max_features=[3, 5, 7, 9],       # mtry
    min_samples_leaf=[1, 3, 5, 10],  # min.node.size
)

# Evaluate every combination with 10-fold CV, ranked by (negated) RMSE
grid_et = GridSearchCV(
    et_model,
    param_grid,
    cv=10,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)

# Fit all candidates
grid_et.fit(X_train, y_train)

### Report the best hyperparameters
print("Mejores hiperparámetros:", grid_et.best_params_, sep="\n")

# best_score_ holds the negated RMSE, so flip the sign for display
print("RMSE CV:", -grid_et.best_score_, sep="\n")

best_et = grid_et.best_estimator_

y_pred = best_et.predict(X_test)

# Keep only strictly positive predictions (open question: do any
# non-positive ones actually occur?) and the matching observations
mask = y_pred > 0
y_test_f = y_test[mask]
y_pred = y_pred[mask]

# Metrics on the filtered test set
bias = np.mean(y_pred - y_test_f)
rmse = root_mean_squared_error(y_test_f, y_pred)
r2 = r2_score(y_test_f, y_pred)

resultados_ET = pd.DataFrame(
    [
        {
            "R2": r2,
            "RMSE": rmse,
            "Bias": bias,
            "Min_Pred": y_pred.min(),
            "Max_Pred": y_pred.max(),
        }
    ]
)
# Results
print(resultados_ET)




# joblib.dump(
#     best_et,
#     f"D:/Josefina/Proyectos/Tesis/{estacion}/modelos/01-ET-M{modelo}-tuned-{estacion}.joblib"
# )



Fitting 10 folds for each of 16 candidates, totalling 160 fits
Mejores hiperparámetros:
{'max_features': 9, 'min_samples_leaf': 1}
RMSE CV:
7.5984400502757365
         R2      RMSE      Bias  Min_Pred   Max_Pred
0  0.588821  6.810409  0.429655  4.908413  57.223313


In [None]:
## -------------- RANDOM FOREST --------------------------
# Grid-search max_features for a RandomForestRegressor, then report test-set
# metrics and feature importances for the refit best estimator.

# Input settings
estacion = "BA"
modelo = "1"

dir_base = f"D:/Josefina/Proyectos/ProyectoChile/{estacion}/modelos/ParticionDataSet/"

# Train / test partitions for the selected station and model
train_data = pd.read_csv(
    f"{dir_base}/Modelo_{modelo}/M{modelo}_train_{estacion}.csv"
)
test_data = pd.read_csv(
    f"{dir_base}/Modelo_{modelo}/M{modelo}_test_{estacion}.csv"
)

# Predictor columns (site dependent)
features = [
    "AOD_055",
    "ndvi",
    "BCSMASS_dia",
    "DUSMASS_dia",
    "t2m_mean",
    "SO2SMASS_dia",
    "SO4SMASS_dia",
    "SSSMASS_dia",
    "blh_mean",
    "d2m_mean",
    "v10_mean",
    "u10_mean",
    "tp_mean",
    "DEM",
    "dayWeek",
]

# Predictors and target (PM25) for each partition
X_train, y_train = train_data[features], train_data["PM25"]
X_test, y_test = test_data[features], test_data["PM25"]

# Base model: 500 trees, fixed seed
rf_model = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=123)

# Grid of hyperparameters; for now only max_features (mtry) is tuned
param_grid = dict(max_features=[3, 5, 7, 9])

# Evaluate each candidate with 5-fold CV, ranked by (negated) RMSE
grid_rf = GridSearchCV(
    rf_model,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)
# Fit all candidates
grid_rf.fit(X_train, y_train)

# Results
print("Mejores hiperparámetros:", grid_rf.best_params_, sep="\n")

# best_score_ holds the negated RMSE, so flip the sign for display
print("RMSE CV:", -grid_rf.best_score_, sep="\n")

best_rf = grid_rf.best_estimator_

y_pred = best_rf.predict(X_test)

# Keep only strictly positive predictions and the matching observations
mask = y_pred > 0
y_test_f = y_test[mask]
y_pred = y_pred[mask]

# Metrics on the filtered test set
bias = np.mean(y_pred - y_test_f)
rmse = root_mean_squared_error(y_test_f, y_pred)
r2 = r2_score(y_test_f, y_pred)

resultados_RF = pd.DataFrame(
    [
        {
            "R2": r2,
            "RMSE": rmse,
            "Bias": bias,
            "Min_Pred": y_pred.min(),
            "Max_Pred": y_pred.max(),
        }
    ]
)

print(resultados_RF)

# Feature importances of the best forest, highest first
importances = pd.DataFrame(
    {"Variable": features, "Importancia": best_rf.feature_importances_}
)
importances = importances.sort_values(by="Importancia", ascending=False)

print(importances)

# For BA it finished in 1.37

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Mejores hiperparámetros:
{'max_features': 3}
RMSE CV:
8.373721419788936
         R2      RMSE      Bias  Min_Pred  Max_Pred
0  0.560002  7.045036  0.222213   5.51161  53.56604
        Variable  Importancia
5   SO2SMASS_dia     0.170333
2    BCSMASS_dia     0.145680
8       blh_mean     0.086662
7    SSSMASS_dia     0.070146
4       t2m_mean     0.068570
11      u10_mean     0.062915
9       d2m_mean     0.057347
10      v10_mean     0.056743
13           DEM     0.051621
1           ndvi     0.049473
3    DUSMASS_dia     0.044418
6   SO4SMASS_dia     0.043721
0        AOD_055     0.041275
12       tp_mean     0.027314
14       dayWeek     0.023783


In [None]:
## -------------- XGB --------------------------
# Grid-search an XGBRegressor with shuffled 5-fold CV and report test-set
# RMSE and R2 for the refit best estimator.

# Input settings
# Paths
estacion = "BA"
modelo = "1"
base_dir = f"D:/Josefina/Proyectos/ProyectoChile/{estacion}/modelos/ParticionDataSet/Modelo_{modelo}/"

# Train / test partitions for the selected station and model
train_data = pd.read_csv(
    f"{base_dir}M{modelo}_train_{estacion}.csv"
)
test_data = pd.read_csv(
    f"{base_dir}M{modelo}_test_{estacion}.csv"
)

# Predictors
features = [
    "AOD_055",
    "ndvi",
    "BCSMASS_dia",
    "DUSMASS_dia",
    "SO2SMASS_dia",
    "SO4SMASS_dia",
    "SSSMASS_dia",
    "blh_mean",
    "sp_mean",
    "d2m_mean",
    "v10_mean",
    "u10_mean",
    "tp_mean",
    "t2m_mean",
    "DEM",
    "dayWeek",
]

# Predictor matrices and target variable (PM25) for each partition
X_train, y_train = train_data[features], train_data["PM25"]
X_test, y_test = test_data[features], test_data["PM25"]

# Cross-validation scheme: shuffled 5-fold with a fixed seed
cv = KFold(n_splits=5, random_state=123, shuffle=True)

# Base model
xgb_model = XGBRegressor(
    n_jobs=-1,
    random_state=123,
    objective="reg:squarederror",
)

# Original (larger) grid, kept for reference
# param_grid_xgb = {
#     "n_estimators": [500, 1000, 1500],   # nrounds
#     "max_depth": [3, 6, 9],
#     "learning_rate": [0.01, 0.1, 0.3],  # eta
#     "gamma": [0, 1, 5],
#     "colsample_bytree": [0.6, 0.8, 1],
#     "min_child_weight": [1, 3, 5],
#     "subsample": [0.6, 0.8, 1]
# }
# Reduced grid actually searched (2^5 = 32 candidates)
param_grid_xgb = dict(
    n_estimators=[300, 600],
    max_depth=[3, 6],
    learning_rate=[0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Try every hyperparameter combination, ranked by (negated) RMSE
grid_xgb = GridSearchCV(
    xgb_model,
    param_grid_xgb,
    cv=cv,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)
# Fit all candidates
grid_xgb.fit(X_train, y_train)

# Results
print("Mejores hiperparámetros:", grid_xgb.best_params_, sep="\n")


xgb_final = grid_xgb.best_estimator_

y_pred = xgb_final.predict(X_test)

# Test-set metrics of the best estimator
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")
# For BA it took 1.39

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Mejores hiperparámetros:
{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 600, 'subsample': 0.8}
RMSE: 6.81
R²: 0.588
