In [7]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from rdflib.tools.csv2rdf import column
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [8]:
# Directory holding the cleaned CSV files. The default is the original
# author's local path; set the MICROGRID_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
folder_path = os.environ.get(
    "MICROGRID_DATA_DIR",
    r"C:\Users\Ibon\PycharmProjects\Microgrid\objective2_data_cleaned",
)

# Timestamp column: parsed as datetime on load, then dropped from the merged
# frame below.
timestamp_col = "Time Stamp (local standard time) yyyy-mm-ddThh:mm:ss"

# os.listdir() returns entries in arbitrary order. Sorting fixes the row order
# of the merged frame, so anything seeded downstream sees the same input on
# every machine and every run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
if not csv_files:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise ValueError(f"No .csv files found in {folder_path!r}")

df_list = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, parse_dates=[timestamp_col])

    # Keep a random 5% of the rows of each file; the fixed seed makes the
    # sample reproducible.
    sample_df = df.sample(frac=0.05, random_state=42)

    df_list.append(sample_df)

# Stack the per-file samples into one frame with a fresh 0..n-1 index and
# remove the timestamp column.
df_merged = pd.concat(df_list, ignore_index=True)
df_merged = df_merged.drop(columns=timestamp_col)

In [9]:
# Split the merged frame into the target series and the predictor matrix
# (every remaining column).
target_column = 'Pmp (W)'
y = df_merged[target_column]
X = df_merged.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply the same transform to
# both splits so the test rows do not influence the fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Base estimator for the search.
# n_jobs=1 on the forest: GridSearchCV below already runs candidates/folds in
# parallel on all cores, and asking for all cores at both levels nests one
# worker pool inside another. n_jobs only controls parallelism, not the fitted
# model. n_estimators=100 is just a placeholder; the grid overrides it.
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = {
    "n_estimators": [50, 100, 200],   # number of trees
    "max_depth": [10, 20, None],      # maximum tree depth (None = unlimited)
    "min_samples_split": [2, 5, 10],  # minimum samples needed to split a node
    "min_samples_leaf": [1, 2, 4]     # minimum samples required in a leaf
}

# Exhaustive search scored by R^2 with 5-fold CV: 81 candidates x 5 folds =
# 405 fits, plus one final refit of the best candidate. This cell is slow;
# verbose=1 prints the total number of fits so the cost is visible up front.
grid_search = GridSearchCV(
    rf, param_grid, cv=5, scoring="r2", n_jobs=-1, verbose=1
)
grid_search.fit(X_train_scaled, y_train)

# Best model, refit on the whole training set by GridSearchCV.
best_rf = grid_search.best_estimator_
# Re-enable multi-core execution on the selected forest so later predict()
# calls are parallel, as they were when the forest itself used n_jobs=-1.
best_rf.set_params(n_jobs=-1)

KeyboardInterrupt: 

In [6]:
# Predict the target for the held-out (scaled) test rows.
y_pred = best_rf.predict(X_test_scaled)

# Report the hyperparameters selected by the grid search.
print(f"Mejores hiperparámetros: {grid_search.best_params_}")

# Test-set error metrics, each computed and then printed with 4 decimals.
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.4f}")

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.4f}")

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
# Pair each predictor column with the importance reported by the fitted
# forest, ordered from most to least important.
importances = best_rf.feature_importances_
feature_importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": importances}
)
feature_importance = feature_importance.sort_values(
    by="Importance", ascending=False
)

# Horizontal bar chart of the importances, one bar per feature.
# NOTE(review): recent seaborn releases may warn about `palette` being passed
# without `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=feature_importance,
    x="Importance",
    y="Feature",
    palette="viridis",
    ax=ax,
)
ax.set_title("Importancia de Características en Random Forest")
plt.show()