In [19]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from joblib import load
import os
import re

## IMPORTAR MODELOS DE REGRESION ENTRENADOS

In [20]:
# average_cluster_0=pd.read_csv("../data/interim/series_cluster_0.csv",parse_dates=["Date/Time"],index_col="Date/Time")
# average_cluster_1=pd.read_csv("../data/interim/series_cluster_1.csv",parse_dates=["Date/Time"],index_col="Date/Time")
# average_cluster_2=pd.read_csv("../data/interim/series_cluster_2.csv",parse_dates=["Date/Time"],index_col="Date/Time")
# average_cluster_3=pd.read_csv("../data/interim/series_cluster_3.csv",parse_dates=["Date/Time"],index_col="Date/Time")

In [21]:
# closest_cluster_0=pd.read_csv("../data/interim/without_closest_cluster_0.csv",parse_dates=["Date/Time"],index_col="Date/Time")
# closest_cluster_1=pd.read_csv("../data/interim/without_closest_cluster_1.csv",parse_dates=["Date/Time"],index_col="Date/Time")
# closest_cluster_2=pd.read_csv("../data/interim/without_closest_cluster_2.csv",parse_dates=["Date/Time"],index_col="Date/Time")
# closest_cluster_3=pd.read_csv("../data/interim/without_closest_cluster_3.csv",parse_dates=["Date/Time"],index_col="Date/Time")

In [22]:
modelo_average_cluster_0= load('../models/modelo_average_cluster_0.joblib')
scaler_average_cluster_0= load('../models/scaler_average_cluster_0.joblib')
modelo_average_cluster_1= load('../models/modelo_average_cluster_1.joblib')
scaler_average_cluster_1= load('../models/scaler_average_cluster_1.joblib')
modelo_average_cluster_2= load('../models/modelo_average_cluster_2.joblib')
scaler_average_cluster_2= load('../models/scaler_average_cluster_2.joblib')
modelo_average_cluster_3= load('../models/modelo_average_cluster_3.joblib')
scaler_average_cluster_3= load('../models/scaler_average_cluster_3.joblib')

In [23]:
modelo_closest_cluster_0= load('../models/modelo_closest_cluster_0.joblib')
scaler_closest_cluster_0= load('../models/scaler_closest_cluster_0.joblib')
modelo_closest_cluster_1= load('../models/modelo_closest_cluster_1.joblib')
scaler_closest_cluster_1= load('../models/scaler_closest_cluster_1.joblib')
modelo_closest_cluster_2= load('../models/modelo_closest_cluster_2.joblib')
scaler_closest_cluster_2= load('../models/scaler_closest_cluster_2.joblib')
modelo_closest_cluster_3= load('../models/modelo_closest_cluster_3.joblib')
scaler_closest_cluster_3= load('../models/scaler_closest_cluster_3.joblib')

## IMPORTAR DATOS PARA LAS PRUEBAS

In [24]:
# Especifica el directorio donde están los archivos CSV
carpeta ="../data/raw/"
# Obtén la lista de todos los archivos en la carpeta
archivos = os.listdir(carpeta)
# Define the columns to keep
columns_to_keep = [
    'Date/Time',
    'Electricity:Facility [kW](Hourly)',
    'Fans:Electricity [kW](Hourly)',
    'Cooling:Electricity [kW](Hourly)',
    'Heating:Electricity [kW](Hourly)',
    'InteriorLights:Electricity [kW](Hourly)',
    'InteriorEquipment:Electricity [kW](Hourly)'
]
# Load and concatenate data from all files
dfs = []
for file in archivos:
    df = pd.read_csv(f"../data/raw/{file}", usecols=columns_to_keep)
    df["Date/Time"] = '2004 ' + df["Date/Time"]
    date_format = '%Y %m/%d %H:%M:%S'
    df["Date/Time"] = pd.to_datetime(df["Date/Time"], format=date_format, errors='coerce')
    match = re.match(r'^[^_]+', file)
    name = match.group(0)
    df["type_building"] = name
    dfs.append(df)

In [25]:
#se extraen los datos clusterizados
data_clustered=pd.read_csv("../data/interim/clustering_pca.csv")

In [26]:
#se cargan los datos mas cercanos a los clusters
closest_points_data=pd.read_csv("../data/interim/closest_points.csv")
closest_points_data=closest_points_data.rename(columns={"Unnamed: 0":"indice_closest"})
closest_points_data

Unnamed: 0,indice_closest,PC1,PC2,PC3,Cluster_KMeans,Cluster_DBSCAN,Cluster_Hierarchical
0,67,-1.122322,-0.026253,0.075707,0,10,0
1,15,4.515147,1.414719,-1.786065,1,-1,1
2,5,5.561883,-2.633525,0.505042,2,-1,2
3,45,2.758477,2.785423,1.36678,1,-1,3


In [27]:
#OBTENER LAS SERIES MAS CERCANAS PARA CADA CLUSTER
indices_clusters=closest_points_data.indice_closest.unique().tolist()
closest_series=[]
for indice in indices_clusters:
    closest_series.append(dfs[indice])

### OBTENER DATOS DE PRUEBA PARA LOS MODELOS ENTRENADOS CON LAS SERIE PROMEDIO DE CADA CLUSTER 

In [28]:
# Agrupar dataframes por cluster
clusters = data_clustered['Cluster_Hierarchical'].unique()
average_dfs = {cluster: [] for cluster in clusters}

for i, df in enumerate(dfs):
    cluster = data_clustered.loc[i, 'Cluster_Hierarchical']
    average_dfs[cluster].append(df.reset_index())

### OBTENER DATOS DE PRUEBA PARA LOS MODELOS ENTRENADOS CON LAS SERIE MAS CERCANA DE CADA CLUSTER 

In [29]:
indices_clusters = closest_points_data.indice_closest.unique().tolist()

# Filtrar la lista de DataFrames eliminando aquellos en los índices de indices_clusters
dfs_actualizados = [df for i, df in enumerate(dfs) if i not in indices_clusters]

In [30]:
# Agrupar dataframes por cluster
clusters = data_clustered['Cluster_Hierarchical'].unique()
closest_dfs = {cluster: [] for cluster in clusters}

for i, df in enumerate(dfs_actualizados):
    cluster = data_clustered.loc[i, 'Cluster_Hierarchical']
    closest_dfs[cluster].append(df.reset_index())

# PRUEBAS CLUSTER 0

In [36]:
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []

In [37]:
closest_dfs[0][0]

Unnamed: 0,index,Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Cooling:Electricity [kW](Hourly),Heating:Electricity [kW](Hourly),InteriorLights:Electricity [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),type_building
0,0,2004-01-01 01:00:00,22.453919,3.998243,0.000733,0.0,4.589925,8.1892,RefBldgFullServiceRestaurantNew2004
1,1,2004-01-01 02:00:00,14.637149,0.000000,0.000000,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004
2,2,2004-01-01 03:00:00,14.651183,0.000000,0.000000,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004
3,3,2004-01-01 04:00:00,14.657947,0.000000,0.000000,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004
4,4,2004-01-01 05:00:00,14.806050,0.000000,0.000000,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004
...,...,...,...,...,...,...,...,...,...
8755,8755,2004-12-31 20:00:00,38.653890,3.998243,0.000000,0.0,9.179851,19.4245,RefBldgFullServiceRestaurantNew2004
8756,8756,2004-12-31 21:00:00,38.637599,3.998243,0.000000,0.0,9.179851,19.4245,RefBldgFullServiceRestaurantNew2004
8757,8757,2004-12-31 22:00:00,38.577471,3.998243,0.000000,0.0,9.179851,19.4245,RefBldgFullServiceRestaurantNew2004
8758,8758,2004-12-31 23:00:00,38.458116,3.998243,0.000000,0.0,9.179851,19.4245,RefBldgFullServiceRestaurantNew2004


In [38]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']

for df_serie in closest_dfs[0]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_closest_cluster_0.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_closest_cluster_0.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}


In [39]:
# Mostrar los resultados
print("Resultados del modelo:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo:


Unnamed: 0,Optimized Random Forest
Average R2,-0.85232
Average RMSE,76.949376


In [59]:
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []

In [60]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']
for df_serie in average_dfs[0]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    df.dropna(inplace=True)
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_average_cluster_0.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_average_cluster_0.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}


In [61]:
# Mostrar los resultados
print("Resultados del modelo:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo:


Unnamed: 0,Optimized Random Forest
Average R2,-1.356732
Average RMSE,24.688037


# PRUEBAS CLUSTER 1

In [42]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []
for df_serie in closest_dfs[1]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_closest_cluster_1.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_closest_cluster_1.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}

In [43]:
# Mostrar los resultados
print("Resultados del modelo Random Forest optimizado:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo Random Forest optimizado:


Unnamed: 0,Optimized Random Forest
Average R2,-6.284018
Average RMSE,101.841944


In [44]:
## closest

In [45]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []
for df_serie in average_dfs[1]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_average_cluster_1.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_average_cluster_1.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}

In [46]:
# Mostrar los resultados
print("Resultados del modelo Random Forest optimizado:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo Random Forest optimizado:


Unnamed: 0,Optimized Random Forest
Average R2,0.996664
Average RMSE,26.466887


# PRUEBAS CLUSTER 2

In [47]:
#average

In [48]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []
for df_serie in average_dfs[2]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_average_cluster_2.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_average_cluster_2.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}

In [49]:
# Mostrar los resultados
print("Resultados del modelo Random Forest optimizado:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo Random Forest optimizado:


Unnamed: 0,Optimized Random Forest
Average R2,0.991122
Average RMSE,23.293884


In [50]:
#closest

In [51]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []
for df_serie in closest_dfs[2]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_closest_cluster_2.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_closest_cluster_2.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}

In [52]:
# Mostrar los resultados
print("Resultados del modelo Random Forest optimizado:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo Random Forest optimizado:


Unnamed: 0,Optimized Random Forest
Average R2,-1.503866
Average RMSE,79.941561


# PRUEBAS CLUSTER 3

In [53]:
# average 

In [54]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []
for df_serie in average_dfs[3]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_average_cluster_3.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_average_cluster_3.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}

In [55]:
# Mostrar los resultados
print("Resultados del modelo Random Forest optimizado:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo Random Forest optimizado:


Unnamed: 0,Optimized Random Forest
Average R2,0.998387
Average RMSE,13.044252


In [56]:
# closest 

In [57]:
# Lista de columnas para análisis
columnas = ['Fans:Electricity [kW](Hourly)', 'Cooling:Electricity [kW](Hourly)',
       'InteriorLights:Electricity [kW](Hourly)',
       'InteriorEquipment:Electricity [kW](Hourly)', 'hour', 'day_of_week',
       'month']
# Inicializar variables para acumular resultados
rmse_list = []
r2_list = []
for df_serie in closest_dfs[3]:
    df=df_serie.copy()
    df["Date/Time"]=pd.to_datetime(df["Date/Time"])
    df.set_index("Date/Time",inplace=True)
    # Extraer características temporales
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    X = df.loc[:, columnas].copy()
    y = df.loc[:, 'Electricity:Facility [kW](Hourly)'].copy()
    
    # Escalar las características
    X_scaled = scaler_closest_cluster_3.transform(X)
    
    # Predecir y calcular métricas de error
    y_pred = modelo_closest_cluster_3.predict(X_scaled)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    
    # Almacenar resultados
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calcular promedios de las métricas de error
average_rmse = np.mean(rmse_list)
average_r2 = np.mean(r2_list)

# Resultados
results = {
    "Optimized Random Forest": {
        'Average RMSE': average_rmse,
        'Average R2': average_r2
    }
}

In [58]:
# Mostrar los resultados
print("Resultados del modelo Random Forest optimizado:")
optimized_rf_results_df=pd.DataFrame(results)
optimized_rf_results_df

Resultados del modelo Random Forest optimizado:


Unnamed: 0,Optimized Random Forest
Average R2,-4.361379
Average RMSE,45.516424
