In [1]:
# Load IPython's autoreload extension and select mode 2, so imported modules are
# reloaded before each cell executes and edits to the project package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [58]:
import os
import re
# Importaciones específicas del proyecto
from energy_consumption_architecture.utils.paths import data_dir, data_raw_dir
from energy_consumption_architecture.clustering_utils import *
from energy_consumption_architecture.dataset import load_all_series
from energy_consumption_architecture.regresion_utils import pipeline_for_clusters

## Cargar datos 

In [42]:
# Directory that holds the raw CSV files.
carpeta = data_dir("raw")
# os.listdir returns entries in arbitrary, platform-dependent order and also
# lists anything else that lives in the folder. Keep only CSV files and sort
# them so the file list is identical on every machine and every run.
# NOTE(review): presumably load_all_series numbers series_id in list order, so a
# stable order keeps series ids and cluster labels reproducible — confirm.
archivos = sorted(
    nombre for nombre in os.listdir(carpeta) if nombre.lower().endswith(".csv")
)
# Show the first few file names as a sanity check.
archivos[:5]

['RefBldgFullServiceRestaurantNew2004_v1.3_7.1_4A_USA_MD_BALTIMORE_Belleville-Scott.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_4A_USA_MD_BALTIMORE_Cahokia.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_4A_USA_MD_BALTIMORE_Carbondale-Southern.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_5A_USA_IL_CHICAGO-OHARE_Aurora.Muni.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_5A_USA_IL_CHICAGO-OHARE_Bloomington.csv']

In [43]:
# Columns to keep for the clustering stage: the timestamp plus the cooling and
# interior-equipment electricity columns.
columns_to_keep = [
    'Date/Time',
    'Cooling:Electricity [kW](Hourly)',
    'InteriorEquipment:Electricity [kW](Hourly)'
]

# Load the listed files restricted to those columns. The preview of the result
# also shows `type_building` and `series_id` columns next to the kept ones.
combined_df_filtered = load_all_series(archivos, columns_to_keep)

In [44]:
# Preview the first rows of the combined, column-filtered data.
combined_df_filtered.head()

Unnamed: 0,Date/Time,Cooling:Electricity [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),type_building,series_id
0,2004-01-01 01:00:00,0.000733,8.1892,RefBldgFullServiceRestaurantNew2004,series_1
1,2004-01-01 02:00:00,0.0,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
2,2004-01-01 03:00:00,0.0,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
3,2004-01-01 04:00:00,0.0,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
4,2004-01-01 05:00:00,0.0,7.4902,RefBldgFullServiceRestaurantNew2004,series_1


## Clustering

In [45]:
# Summarise the loaded series. The preview of the result shows a row per
# `series_id` with a mean and a standard deviation for each kept electricity column.
df_stats = calculate_statistics(combined_df_filtered)

In [46]:
# Preview the per-series summary statistics.
df_stats.head()

Unnamed: 0,series_id,Cooling:Electricity [kW](Hourly)_mean,Cooling:Electricity [kW](Hourly)_std_dev,InteriorEquipment:Electricity [kW](Hourly)_mean,InteriorEquipment:Electricity [kW](Hourly)_std_dev
0,series_1,3.073887,6.226848,18.995908,7.265027
1,series_2,3.446951,6.596764,18.995908,7.265027
2,series_3,3.97324,7.146033,18.995908,7.265027
3,series_4,2.05595,4.68873,18.995908,7.265027
4,series_5,2.114192,4.797245,18.995908,7.265027


In [47]:
# Run the automated clustering search on the per-series statistics and keep the
# evaluation metrics, the description of the winning model and the labelled data.
# NOTE(review): max_k, eps_range and min_samples_range look like search bounds
# for the candidate clustering algorithms — confirm against automated_clustering.
metrics, best_model_info, data = automated_clustering(
    df_stats,
    max_k=10,
    eps_range=(0.05, 0.2, 0.05),
    min_samples_range=(3, 12),
)



In [48]:
# Inspect the summary of the clustering model that was selected as best.
best_model_info

{'Model': 'K-Means',
 'Silhouette Score': 0.8357957522052232,
 'Davies-Bouldin Index': 0.21789734657975474,
 'Num Clusters': 4,
 'Silhouette Score Norm': 1.0,
 'Davies-Bouldin Index Norm': 1.0,
 'Combined Score': 1.0}

In [49]:
# Preview the per-series statistics together with the assigned `Cluster` label.
data.head()

Unnamed: 0,series_id,Cooling:Electricity [kW](Hourly)_mean,Cooling:Electricity [kW](Hourly)_std_dev,InteriorEquipment:Electricity [kW](Hourly)_mean,InteriorEquipment:Electricity [kW](Hourly)_std_dev,Cluster
0,series_1,3.073887,6.226848,18.995908,7.265027,0
1,series_2,3.446951,6.596764,18.995908,7.265027,0
2,series_3,3.97324,7.146033,18.995908,7.265027,0
3,series_4,2.05595,4.68873,18.995908,7.265027,0
4,series_5,2.114192,4.797245,18.995908,7.265027,0


## Regresión

In [50]:
# Columns to keep for the regression stage: the timestamp, the facility
# electricity column (used as the regression target further down) and the
# individual end-use electricity columns.
# NOTE(review): this rebinds `columns_to_keep` from the clustering stage; a
# distinct name would avoid confusion if cells are re-run out of order.
columns_to_keep = [
    'Date/Time',
    'Electricity:Facility [kW](Hourly)',
    'Fans:Electricity [kW](Hourly)',
    'Cooling:Electricity [kW](Hourly)',
    'Heating:Electricity [kW](Hourly)',
    'InteriorLights:Electricity [kW](Hourly)',
    'InteriorEquipment:Electricity [kW](Hourly)'
]
# Load the same files again, this time with the wider column set.
data_complete = load_all_series(archivos, columns_to_keep)

In [51]:
# Preview the first rows of the wider data set used for regression.
data_complete.head()

Unnamed: 0,Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Cooling:Electricity [kW](Hourly),Heating:Electricity [kW](Hourly),InteriorLights:Electricity [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),type_building,series_id
0,2004-01-01 01:00:00,22.453919,3.998243,0.000733,0.0,4.589925,8.1892,RefBldgFullServiceRestaurantNew2004,series_1
1,2004-01-01 02:00:00,14.637149,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
2,2004-01-01 03:00:00,14.651183,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
3,2004-01-01 04:00:00,14.657947,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
4,2004-01-01 05:00:00,14.80605,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1


In [59]:
# `data` (the clustering output) must carry the `Cluster` label next to `series_id`,
# and `data_complete` must carry `series_id`. Pass `data`, not `df_stats`: the
# `df_stats` preview has no `Cluster` column.
# NOTE(review): presumably the helper joins both frames on `series_id` and averages per cluster and timestamp — confirm.

average_time_series_by_cluster = calculate_average_time_series_by_cluster(data_complete, data)

In [60]:
# Preview the averaged series; rows are indexed by `Date/Time` and tagged with `Cluster`.
average_time_series_by_cluster.head()

Unnamed: 0_level_0,Cluster,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Cooling:Electricity [kW](Hourly),Heating:Electricity [kW](Hourly),InteriorLights:Electricity [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly)
Date/Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-01-01 01:00:00,0,47.210969,5.789775,3.010854,4.63763,6.918895,10.669373
2004-01-01 02:00:00,0,47.047291,5.946731,3.048947,4.450657,6.588343,10.525933
2004-01-01 03:00:00,0,45.619393,5.862989,2.883068,5.406697,5.084587,10.308935
2004-01-01 04:00:00,0,45.697195,6.186896,2.968095,4.80134,5.084587,10.297614
2004-01-01 05:00:00,0,47.030928,5.832209,2.942949,5.603789,5.182567,10.482863


In [54]:
# Name of the column passed to pipeline_for_clusters as the regression target.
target = 'Electricity:Facility [kW](Hourly)'

In [55]:
# Candidate regressors with tuned settings, keyed by the display name that the
# results tables report under "Model Name".
# NOTE(review): none of these estimator classes is imported explicitly in the
# visible import cell; they presumably arrive through the wildcard import of
# clustering_utils — confirm, and prefer explicit imports so a fresh kernel
# cannot fail with NameError.
models = {
    "Linear Regression": LinearRegression(
        fit_intercept=True,
        n_jobs=-1,
    ),
    "Tree": DecisionTreeRegressor(
        max_depth=5,
        min_samples_split=5,
        random_state=42,
    ),
    "SVM": SVR(
        kernel='rbf',
        C=1.0,
        epsilon=0.1,
    ),
    "Random Forest": RandomForestRegressor(
        n_estimators=100,
        max_depth=5,
        min_samples_split=10,
        random_state=42,
    ),
    "XGBoost": XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
}

In [56]:
# Fit and evaluate every candidate model on the averaged series; the printed log
# reports one "Best model" per cluster.
# NOTE(review): threshold_ratio=2 is an unexplained magic number — document its
# meaning from pipeline_for_clusters and consider a named constant.
metrics_df, best_models_df = pipeline_for_clusters(
    average_time_series_by_cluster,
    target,
    models,
    threshold_ratio=2,
)

Processing Cluster 0
Best model for Cluster 0:
Model         XGBRegressor(base_score=None, booster=None, ca...
Train RMSE                                             1.677753
Train MAE                                              1.242559
Train R2                                               0.996641
Test RMSE                                              2.418129
Test MAE                                               1.875946
Test R2                                                0.990515
Model Name                                              XGBoost
Name: 4, dtype: object

Processing Cluster 1
Best model for Cluster 1:
Model         LinearRegression(n_jobs=-1)
Train RMSE                      34.509758
Train MAE                        27.98159
Train R2                         0.994399
Test RMSE                       39.835457
Test MAE                        31.613485
Test R2                          0.988051
Model Name              Linear Regression
Name: 0, dtype: object

Processing

In [61]:
# Show the selected best models with their train and test metrics.
best_models_df

Unnamed: 0,Model,Train RMSE,Train MAE,Train R2,Test RMSE,Test MAE,Test R2,Model Name
4,"XGBRegressor(base_score=None, booster=None, ca...",1.677753,1.242559,0.996641,2.418129,1.875946,0.990515,XGBoost
0,LinearRegression(n_jobs=-1),34.509758,27.98159,0.994399,39.835457,31.613485,0.988051,Linear Regression
4,"XGBRegressor(base_score=None, booster=None, ca...",13.238075,9.829072,0.996658,24.246907,19.310649,0.990215,XGBoost
0,LinearRegression(n_jobs=-1),6.566537,5.603544,0.999608,6.617337,5.642924,0.998601,Linear Regression
