In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
import numpy as np
import os
import re
from energy_consumption_architecture.utils.paths import data_dir


In [18]:
import pandas as pd
import re

def load_all_series(files, columns_to_keep=None):
    """
    Carga y procesa todas las series de tiempo desde múltiples archivos.

    Parámetros:
    - files: Lista de nombres de archivos a cargar.
    - columns_to_keep: Lista de columnas a mantener en cada archivo. Si es None, se cargan todas las columnas.

    Retorna:
    - combined_df: DataFrame combinado con todas las series de tiempo y un identificador único de serie.
    """
    dfs = []  # Lista para almacenar cada DataFrame cargado

    # Cargar y procesar cada archivo
    for i, file in enumerate(files):
        file_route = data_dir("raw", file)
        
        # Cargar todas las columnas si columns_to_keep es None
        if columns_to_keep:
            df = pd.read_csv(file_route, usecols=columns_to_keep)
        else:
            df = pd.read_csv(file_route)
        
        # Procesar la columna de fecha y tiempo
        df["Date/Time"] = '2004 ' + df["Date/Time"]
        date_format = '%Y %m/%d %H:%M:%S'
        df["Date/Time"] = pd.to_datetime(df["Date/Time"], format=date_format, errors='coerce')
        
        # Extraer el tipo de edificio del nombre del archivo
        match = re.match(r'^[^_]+', file)
        name = match.group(0) if match else f"building_{i + 1}"
        df["type_building"] = name

        # Agregar un identificador de serie único
        df["series_id"] = f"series_{i + 1}"

        # Añadir el DataFrame procesado a la lista
        dfs.append(df)

    # Combinar todos los DataFrames en uno solo
    combined_df = pd.concat(dfs, axis=0, ignore_index=True)

    return combined_df


In [19]:
# Especifica el directorio donde están los archivos CSV
carpeta=data_dir("raw")
# Obtén la lista de todos los archivos en la carpeta
archivos = os.listdir(carpeta)
archivos[:5]

['RefBldgFullServiceRestaurantNew2004_v1.3_7.1_4A_USA_MD_BALTIMORE_Belleville-Scott.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_4A_USA_MD_BALTIMORE_Cahokia.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_4A_USA_MD_BALTIMORE_Carbondale-Southern.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_5A_USA_IL_CHICAGO-OHARE_Aurora.Muni.csv',
 'RefBldgFullServiceRestaurantNew2004_v1.3_7.1_5A_USA_IL_CHICAGO-OHARE_Bloomington.csv']

In [20]:
# Define the columns to keep
columns_to_keep = [
    'Date/Time',
    'Electricity:Facility [kW](Hourly)',
    'Fans:Electricity [kW](Hourly)',
    'Cooling:Electricity [kW](Hourly)',
    'Heating:Electricity [kW](Hourly)',
    'InteriorLights:Electricity [kW](Hourly)',
    'InteriorEquipment:Electricity [kW](Hourly)'
]
combined_df = load_all_series(archivos, columns_to_keep)


In [23]:
combined_df.head()

Unnamed: 0,Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Cooling:Electricity [kW](Hourly),Heating:Electricity [kW](Hourly),InteriorLights:Electricity [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly),type_building,series_id
0,2004-01-01 01:00:00,22.453919,3.998243,0.000733,0.0,4.589925,8.1892,RefBldgFullServiceRestaurantNew2004,series_1
1,2004-01-01 02:00:00,14.637149,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
2,2004-01-01 03:00:00,14.651183,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
3,2004-01-01 04:00:00,14.657947,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1
4,2004-01-01 05:00:00,14.80605,0.0,0.0,0.0,1.529975,7.4902,RefBldgFullServiceRestaurantNew2004,series_1


In [16]:
#se extraen los datos clusterizados
data_clustered=pd.read_csv(data_dir("interim","clustering_all_characteristics.csv"))

In [17]:
data_clustered

Unnamed: 0,series_id,Cooling:Electricity [kW](Hourly)_mean,Cooling:Electricity [kW](Hourly)_std_dev,InteriorEquipment:Electricity [kW](Hourly)_mean,InteriorEquipment:Electricity [kW](Hourly)_std_dev,Cluster
0,series_1,3.073887,6.226848,18.995908,7.265027,0
1,series_2,3.446951,6.596764,18.995908,7.265027,0
2,series_3,3.973240,7.146033,18.995908,7.265027,0
3,series_4,2.055950,4.688730,18.995908,7.265027,0
4,series_5,2.114192,4.797245,18.995908,7.265027,0
...,...,...,...,...,...,...
75,series_76,1.006302,3.543234,3.311257,3.418924,0
76,series_77,1.046543,3.778460,3.311257,3.418924,0
77,series_78,1.181933,3.968506,3.311257,3.418924,0
78,series_79,0.590390,2.526466,3.311257,3.418924,0


## OBTENER LAS SERIES PROMEDIO PARA CADA CLUSTER ]

In [25]:
# Asegúrate de que el DataFrame `df_stats` contenga las etiquetas de cluster y el `series_id`
# Y que el DataFrame `combined_df` tenga el `series_id`

# Hacer un merge para agregar la etiqueta de cluster al DataFrame de series concatenadas
combined_df_with_clusters = combined_df.merge(data_clustered[['series_id', 'Cluster']], on='series_id', how='left')

# Ver el resultado
average_time_series_by_cluster = combined_df_with_clusters.groupby(['Cluster', 'Date/Time']).mean(numeric_only=True)
average_time_series_by_cluster = average_time_series_by_cluster.reset_index()


In [26]:
average_time_series_by_cluster

Unnamed: 0,Cluster,Date/Time,Electricity:Facility [kW](Hourly),Fans:Electricity [kW](Hourly),Cooling:Electricity [kW](Hourly),Heating:Electricity [kW](Hourly),InteriorLights:Electricity [kW](Hourly),InteriorEquipment:Electricity [kW](Hourly)
0,0,2004-01-01 01:00:00,47.210969,5.789775,3.010854,4.637630,6.918895,10.669373
1,0,2004-01-01 02:00:00,47.047291,5.946731,3.048947,4.450657,6.588343,10.525933
2,0,2004-01-01 03:00:00,45.619393,5.862989,2.883068,5.406697,5.084587,10.308935
3,0,2004-01-01 04:00:00,45.697195,6.186896,2.968095,4.801340,5.084587,10.297614
4,0,2004-01-01 05:00:00,47.030928,5.832209,2.942949,5.603789,5.182567,10.482863
...,...,...,...,...,...,...,...,...
33575,3,2004-12-31 19:00:00,122.835252,5.148283,0.000000,0.000000,43.513342,52.539744
33576,3,2004-12-31 20:00:00,156.045530,9.847229,27.667241,0.000000,43.513342,52.539744
33577,3,2004-12-31 21:00:00,159.428897,7.443050,33.373887,0.000000,43.513342,52.539744
33578,3,2004-12-31 22:00:00,167.615204,13.390776,35.738865,0.000000,43.513342,52.539744


In [27]:
average_time_series_by_cluster.to_csv(data_dir("interim","average_time_series_by_cluster.csv"),index=False)