In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error


In [4]:
# Cargar datos de entrenamiento y prueba
estacion = "CH"
train_data = pd.read_csv(f"D:/Josefina/Proyectos/ProyectoChile/{estacion}/modelos/ParticionDataSet/Modelo_1/M1_train_{estacion}.csv")
test_data = pd.read_csv(f"D:/Josefina/Proyectos/ProyectoChile/{estacion}/modelos/ParticionDataSet/Modelo_1/M1_test_{estacion}.csv")


In [5]:
# Definir las columnas predictoras y la variable objetivo
predictors = [
    "AOD_055", "ndvi", "BCSMASS_dia", "DUSMASS_dia", "DUSMASS25_dia", "OCSMASS_dia",
    "SO2SMASS_dia", "SO4SMASS_dia", "SSSMASS_dia", "SSSMASS25_dia",
    "blh_mean", "sp_mean", "d2m_mean", "t2m_mean", "v10_mean", "u10_mean",
    "tp_mean", "DEM", "dayWeek"
]
target = "PM25"

X_train = train_data[predictors]
y_train = train_data[target]
X_test = test_data[predictors]
y_test = test_data[target]

In [10]:
X_train.shape
y_train.shape
X_test.shape
y_test.shape

(4986,)

In [11]:
# Configuración del modelo Random Forest con validación cruzada (10 pliegues)
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Validación cruzada para el conjunto de entrenamiento
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=kf, scoring="r2")


KeyboardInterrupt: 

In [None]:
rf_model.fit(X_train, y_train)

In [None]:
# Predicciones para los conjuntos de entrenamiento y prueba
predictions_train = rf_model.predict(X_train)
predictions_test = rf_model.predict(X_test)


In [None]:
# Evaluar la importancia de las variables
importances = rf_model.feature_importances_
importance_df = pd.DataFrame({"Feature": predictors, "Importance": importances}).sort_values(by="Importance", ascending=False)



In [None]:
# Evaluación del modelo
def evaluate_model(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    pearson_corr = np.corrcoef(y_true, y_pred)[0, 1]
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    mse = mean_squared_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    return {
        "R2": r2,
        "Pearson": pearson_corr,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape,
        "MSE": mse,
        "MedAE": medae
    }



In [None]:
# Evaluación en conjunto de entrenamiento
train_metrics = evaluate_model(y_train, predictions_train)
# Evaluación en conjunto de prueba
test_metrics = evaluate_model(y_test, predictions_test)


In [None]:

# Mostrar resultados
print("Resultados - Conjunto de Prueba:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.2f}")

print("\nResultados - Conjunto de Entrenamiento:")
for metric, value in train_metrics.items():
    print(f"{metric}: {value:.2f}")



In [None]:
# Importancia de las variables
print("\nImportancia de las variables:")
print(importance_df)

# Guardar el modelo entrenado
import joblib
joblib.dump(rf_model, "D:/Josefina/Proyectos/ProyectoChile/modelos/modelo/02-RF_cv_M5-251124_SP.pkl")
