Your first task is to explore the dataset you’ve been assigned. It’s your responsibility to complete all the to-do steps and generate the following:

- Identify the type of dataset
- Print the list of available building IDs
- Select a random building ID, and for a year within its available data range, determine and print the following: 1) Start date 2) End date 3) Latitude 4) Longitude 5) Building type 6) Plot the data from day 1 to day 3 for the selected building

Below is a screenshot of the output I generated. Your output should look similar to this. Feel free to modify the plot or adjust other features for your convenience. However, make sure to only make changes within the `# --- TODO --- #` sections. Do not edit any code within the `# --- Do Not Edit --- #` sections.

![Expected EDA output](../../Images/EDA.png)

In [1]:
# --- Do Not Edit ---  #

import os
from pathlib import Path
from buildings_bench import load_torch_dataset
from buildings_bench.transforms import TimestampTransform
import matplotlib.pyplot as plt
import numpy as np

# Root directory under which the benchmark data lives.
base_path = "/pscratch/sd/n/nrushad"

# Export both variables in one call; keyword order keeps PATH first.
# NOTE(review): this replaces the whole PATH value with base_path rather
# than extending it -- confirm that is intended.
os.environ.update(
    PATH=base_path,
    BUILDINGS_BENCH=f"{base_path}/Dataset",
)

# --- Do Not Edit ---  #

In [2]:
class EDA:
    """Exploratory helpers for a single loaded dataset.

    Holds the dataset's ``building_datasets`` mapping and provides methods
    to print a dataset overview, print a per-building summary for one year,
    and plot the load of one record of a building.
    """

    def __init__(self, dataset_name, dataset):
        """Store the dataset label and its per-building datasets.

        Args:
            dataset_name: Label printed by ``dataset_summary``.
            dataset: Object exposing a ``building_datasets`` attribute. Each
                value must be convertible with ``dict(...)`` into a
                ``{year: records}`` mapping.
        """
        self.name = dataset_name
        self.datasets = dataset.building_datasets

    @staticmethod
    def reconstruct_datetime(data, year):
        """Rebuild datetimes for one record from its encoded time features.

        Args:
            data: Mapping with ``'day_of_year'``, ``'day_of_week'`` and
                ``'hour_of_day'`` arrays that can be concatenated along the
                last axis.
            year: Calendar year used as the base date (January 1st).

        Returns:
            A ``(dt, timestamp)`` tuple: ``dt`` is the array of
            ``numpy.datetime64`` values and ``timestamp`` is the output of
            ``TimestampTransform().undo_transform``.
        """
        timestamp = np.concatenate([
            data['day_of_year'],
            data['day_of_week'],
            data['hour_of_day']
        ], axis=-1)

        # NOTE(review): presumably this maps the normalized features back to
        # raw day-of-year / day-of-week / hour-of-day values -- confirm
        # against the buildings_bench documentation.
        timestamp = TimestampTransform().undo_transform(timestamp)

        # First column is treated as a 1-based day offset from January 1st,
        # last column as an hour offset within that day.
        dt = (
            np.datetime64(f'{year}-01-01')
            + np.timedelta64(1, 'D') * (timestamp[..., 0] - 1)
            + np.timedelta64(1, 'h') * timestamp[..., -1]
        )

        return dt, timestamp

    def dataset_summary(self):
        """Print the dataset name, the container type and all building IDs."""
        print(f"Dataset: {self.name}")
        print(f"Type: {type(self.datasets)}")
        print(f"Building IDs: {list(self.datasets.keys())}")

    def building_summary(self, building_id, year):
        """Print date range, location and type of a building for one year.

        Args:
            building_id: Key into the stored building datasets.
            year: Key into the building's ``{year: records}`` mapping.

        Raises:
            KeyError: If ``building_id`` or ``year`` is not available.
            IndexError: If the selected year holds no records.
        """
        ds = dict(self.datasets[building_id])
        records = ds[year]
        n_records = len(records)

        data_start = records[0]
        # The last valid position is n_records - 1; indexing with n_records
        # itself runs one past the end. An explicit index is used instead of
        # -1 because the records object may not support negative indexing.
        data_end = records[n_records - 1]

        dt_start, _ = self.reconstruct_datetime(data_start, year)
        dt_end, _ = self.reconstruct_datetime(data_end, year)

        lat = float(data_start['latitude'][0][0])
        lon = float(data_start['longitude'][0][0])
        btype = int(data_start['building_type'][0][0])

        print(f"Building ID: {building_id}")
        print(f"Year: {year}")
        print(f"Number of records in {building_id}: {n_records}")
        print(f"Start date: {dt_start[0]}")
        print(f"End date: {dt_end[-1]}")
        print(f"Latitude: {lat:.4f}")
        print(f"Longitude: {lon:.4f}")
        print(f"Building Type: {btype}")

    def plot_load(self, building_id, year, day):
        """Plot the load of one record of a building against its datetimes.

        Prints a message and returns without plotting when ``day`` is
        outside ``0 .. len(records) - 1``.

        Args:
            building_id: Key into the stored building datasets.
            year: Key into the building's ``{year: records}`` mapping.
            day: Zero-based index of the record to plot.
        """
        ds = dict(self.datasets[building_id])

        if day < 0 or day >= len(ds[year]):
            print(f"Invalid day index: {day}. Valid range is 0 to {len(ds[year])-1}.")
            return

        data = ds[year][day]
        dt, _ = self.reconstruct_datetime(data, year)
        load = data['load'].squeeze()

        lat = float(data['latitude'][0][0])
        lon = float(data['longitude'][0][0])
        btype = int(data['building_type'][0][0])

        plt.figure(figsize=(12, 4))
        # Minute resolution is enough for the x axis labels.
        plt.plot(dt.astype('datetime64[m]'), load)
        plt.xlabel('Datetime')
        plt.ylabel('Load (kW)')
        plt.title(f"{building_id} | {year} Day Index {day} | Lat: {lat:.4f}, Lon: {lon:.4f}, Type: {btype}")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.grid(True)
        plt.show()
               

# ["ideal", "electricity", "lcl", "sceaux", "borealis"]
# Ideal (2017-2018) | Example ID: home241
# Electricity (2011-2014) | Example ID: MT_061
# lcl (2012-2013) | Example ID: MAC003538
# sceaux (2007–2010) | Example ID: Sceaux
# borealis (2011) | Example ID: home25

# dataset_name = "ideal"
# dataset = load_torch_dataset(dataset_name, apply_scaler_transform='')
# eda = EDA(dataset_name, dataset)
# eda.dataset_summary()
# # 2017-2018
# eda.building_summary('home241', 2018)
# eda.plot_load('home241', 2018, day=173)

# dataset_name = "electricity"
# dataset = load_torch_dataset(dataset_name, apply_scaler_transform='')
# eda = EDA(dataset_name, dataset)
# eda.dataset_summary()
# # 2012-2014
# eda.building_summary('MT_061', 2014)
# eda.plot_load('MT_061', 2014, day=5)

# dataset_name = "lcl"
# dataset = load_torch_dataset(dataset_name, apply_scaler_transform='')
# eda = EDA(dataset_name, dataset)
# eda.dataset_summary()
# # 2012-2013
# eda.building_summary('MAC003538', 2012)
# eda.plot_load('MAC003538', 2012, day=5)

# dataset_name = "sceaux"
# dataset = load_torch_dataset(dataset_name, apply_scaler_transform='')
# eda = EDA(dataset_name, dataset)
# eda.dataset_summary()
# # 2007-2010
# eda.building_summary('Sceaux', 2008)
# eda.plot_load('Sceaux', 2008, day=15)

# dataset_name = "borealis"
# dataset = load_torch_dataset(dataset_name, apply_scaler_transform='')
# eda = EDA(dataset_name, dataset)
# eda.dataset_summary()
# # 2011
# eda.building_summary('home25', 2011)
# eda.plot_load('home25', 2011, day=15)