In [2]:
import pandas as pd
from pathlib import Path

# Destination for derived artefacts. The folder is created eagerly so that
# writing results into it later does not fail on a missing directory.
PROCESSED = Path("../data/processed")
PROCESSED.mkdir(parents=True, exist_ok=True)

# Raw input locations, relative to the notebook's working directory.
ASHRAE = Path("../data/raw/ashrae-energy-prediction")
BDG2 = Path("../data/raw/buildingdatagenomeproject2")
BDG2_META = BDG2.joinpath("metadata.csv")


In [3]:

def load_ashrae(data_dir=None):
    """Load the ASHRAE training readings in the notebook's common long format.

    Reads ``train.csv`` (only the ``timestamp``, ``building_id``, ``meter`` and
    ``meter_reading`` columns), parses the timestamps, renames ``meter`` to
    ``meter_type`` and ``meter_reading`` to ``value``, tags every row with
    ``source == "ASHRAE"`` and, when ``building_metadata.csv`` is present in
    the same folder, left-joins it on ``building_id``.

    Parameters
    ----------
    data_dir : str or pathlib.Path, optional
        Folder containing ``train.csv`` (and optionally
        ``building_metadata.csv``). Defaults to the module-level ``ASHRAE``
        path.

    Returns
    -------
    pandas.DataFrame
        One row per reading. ``building_id`` and ``meter_type`` are strings;
        ``meter_type`` holds the meter name for the known ASHRAE codes 0-3 and
        the stringified code otherwise.

    Raises
    ------
    SystemExit
        With exit code 1 if anything fails. The error message is printed first
        and the original exception is chained as ``__cause__``.
    """
    # ASHRAE stores the meter as an integer code (per the competition's data
    # description); translating it to names keeps ``meter_type`` comparable
    # with the name-based values produced by the BDG2 loader.
    meter_names = {
        "0": "electricity",
        "1": "chilledwater",
        "2": "steam",
        "3": "hotwater",
    }
    try:
        base_dir = ASHRAE if data_dir is None else Path(data_dir)

        print("Loading ASHRAE dataset...")
        df = pd.read_csv(
            base_dir / "train.csv",
            usecols=["timestamp", "building_id", "meter", "meter_reading"],
        )
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        df = df.rename(columns={
            "meter": "meter_type",
            "meter_reading": "value"
        })
        df["building_id"] = df["building_id"].astype(str)
        # Codes outside the mapping are left as their string form.
        df["meter_type"] = df["meter_type"].astype(str).replace(meter_names)
        df["source"] = "ASHRAE"

        meta_path = base_dir / "building_metadata.csv"
        if meta_path.exists():
            meta = pd.read_csv(meta_path)
            meta["building_id"] = meta["building_id"].astype(str)
            if "site_id" in meta.columns:
                # Identifiers are kept as text, like building_id. A column that
                # mixes ints and strings after concatenation with another
                # source cannot be written to parquet.
                # NOTE(review): BDG2 site ids are understood to be names - confirm.
                meta["site_id"] = meta["site_id"].astype(str)
            df = pd.merge(df, meta, on="building_id", how="left")
            print("ASHRAE metadata merged.")
        else:
            print("No ASHRAE metadata found.")

        return df

    except Exception as e:
        print("Error loading ASHRAE data:", e)
        # Equivalent to sys.exit(1) but needs no import and keeps the cause.
        raise SystemExit(1) from e

In [4]:

def load_bdg2(data_dir=None):
    """Load the BDG2 meter files in the notebook's common long format.

    Each meter file is read as a wide table: one timestamp column (named
    ``timestamp`` or ``local_15min``) and one column per building. Every file
    is melted to one row per (timestamp, building) with the reading in
    ``value``, tagged with its ``meter_type`` and ``source == "BDG2"``, and the
    per-meter frames are concatenated. When a metadata CSV is present it is
    left-joined on ``building_id``.

    Meter files that do not exist are reported and skipped.

    Parameters
    ----------
    data_dir : str or pathlib.Path, optional
        Folder containing the meter CSVs and ``metadata.csv``. Defaults to the
        module-level ``BDG2`` folder with ``BDG2_META`` as the metadata file.

    Returns
    -------
    pandas.DataFrame
        Long-format readings. Rows with missing ``value`` are kept.

    Raises
    ------
    SystemExit
        With exit code 1 if no meter file could be loaded or any other error
        occurs. The message is printed first and the original exception is
        chained as ``__cause__``.
    """
    meter_files = {
        "electricity": "electricity.csv",
        "gas": "gas.csv",
        "chilledwater": "chilledwater.csv",
        "hotwater": "hotwater.csv",
        "steam": "steam.csv"
    }
    try:
        print("Loading BDG2 dataset...")
        if data_dir is None:
            base_dir, meta_path = BDG2, BDG2_META
        else:
            base_dir = Path(data_dir)
            meta_path = base_dir / "metadata.csv"

        all_data = []

        for meter_type, filename in meter_files.items():
            file_path = base_dir / filename
            if not file_path.exists():
                print(f"File not found: {filename}")
                continue

            print(f"Reading {filename}...")
            df = pd.read_csv(file_path, low_memory=False)
            # Some exports name the time column "local_15min"; the rename is a
            # no-op when the column is already called "timestamp".
            df = df.rename(columns={"local_15min": "timestamp"})
            df["timestamp"] = pd.to_datetime(df["timestamp"])
            # Wide -> long: former column headers become building ids.
            df = df.melt(id_vars=["timestamp"], var_name="building_id", value_name="value")
            df["building_id"] = df["building_id"].astype(str)
            df["meter_type"] = meter_type
            df["source"] = "BDG2"
            all_data.append(df)

        if not all_data:
            raise ValueError("No BDG2 files were loaded successfully.")

        df_all = pd.concat(all_data, ignore_index=True)

        # Merge the uploaded metadata.
        if meta_path.exists():
            meta = pd.read_csv(meta_path)
            meta["building_id"] = meta["building_id"].astype(str)
            df_all = pd.merge(df_all, meta, on="building_id", how="left")
            print("BDG2 metadata merged.")
        else:
            print("BDG2 metadata.csv not found. Skipping metadata merge.")

        return df_all

    except Exception as e:
        print("Error loading BDG2 data:", e)
        # Equivalent to sys.exit(1) but needs no import and keeps the cause.
        raise SystemExit(1) from e


In [None]:
def merge_datasets():
    """Build the combined energy dataset and write it to parquet.

    Loads both sources via ``load_ashrae()`` and ``load_bdg2()``, stacks them
    into one frame, drops rows whose ``value`` is missing, and saves the result
    as ``merged_energy.parquet`` inside ``PROCESSED`` (without the index).
    Progress messages and the final shape are printed along the way.

    Returns
    -------
    None

    Raises
    ------
    SystemExit
        With exit code 1 if the parquet file cannot be written. The message is
        printed first and the original exception is chained as ``__cause__``.
        The two loaders also raise ``SystemExit`` on failure.
    """
    print("Merging datasets...")
    ashrae_readings = load_ashrae()
    bdg2_readings = load_bdg2()

    print("Concatenating datasets...")
    merged = pd.concat([ashrae_readings, bdg2_readings], ignore_index=True)

    print("Dropping nulls in value column...")
    # Rebinding instead of inplace=True keeps the step idempotent on re-run.
    merged = merged.dropna(subset=["value"])

    print("Saving to parquet file...")
    output_path = PROCESSED / "merged_energy.parquet"
    try:
        merged.to_parquet(output_path, index=False)
        print(f"Done. File saved to {output_path}")
        print("Final shape:", merged.shape)
    except Exception as e:
        print("Failed to save parquet:", e)
        # Equivalent to sys.exit(1) but needs no import and keeps the cause.
        raise SystemExit(1) from e

# Run the whole pipeline when this code is the program entry point (a notebook
# cell or a directly executed script); importing it as a module does nothing.
if __name__ == "__main__":
    merge_datasets()


Merging datasets...
Loading ASHRAE dataset...
ASHRAE metadata merged.
Loading BDG2 dataset...
Reading electricity.csv...
Reading gas.csv...
Reading chilledwater.csv...
Reading hotwater.csv...
Reading steam.csv...
