In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

# 1. Load merged dataset
df = pd.read_parquet("../data/processed/merged_energy.parquet")
print("Original shape:", df.shape)


Original shape: (67455298, 5)


In [2]:

# 2. Filter: office 건물만 (ASHRAE: 'primary_use', BDG2: 'building_type')
if 'primary_use' in df.columns:
    df = df[df['primary_use'].str.lower() == 'office']
elif 'building_type' in df.columns:
    df = df[df['building_type'].str.lower() == 'office']
print("Filtered to office buildings:", df.shape)


Filtered to office buildings: (67455298, 5)


In [3]:

# 3. Drop rows with essential nulls and sort
df = df.dropna(subset=['value', 'timestamp', 'building_id'])
df = df.sort_values('timestamp')


In [4]:

# 4. 파생 변수 생성
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['dayofweek'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['dayofweek'] >= 5
df['holiday'] = df['timestamp'].dt.strftime('%Y-%m-%d').isin([
    # 필요 시 휴일 리스트 입력
]).astype(int)


In [5]:
# 5. Feature 선택 (base + optional)
base_features = [
    'hour', 'dayofweek', 'is_weekend', 'holiday',
    'air_temperature', 'cloud_coverage', 'wind_speed',
    'dew_temperature', 'precip_depth_1_hr',
    'square_feet', 'year_built', 'floor_count',
    # 'indoor_temperature', 'humidity', 'occupancy'  # 시뮬레이션 필요
]

optional_features = [
    col for col in df.columns
    if col not in base_features + ['timestamp', 'building_id', 'site_id', 'source', 'primary_use', 'building_type']
    and pd.api.types.is_numeric_dtype(df[col])
]

feature_columns = base_features + optional_features
print("Number of features used:", len(feature_columns))
print("Feature columns:", feature_columns)

Number of features used: 13
Feature columns: ['hour', 'dayofweek', 'is_weekend', 'holiday', 'air_temperature', 'cloud_coverage', 'wind_speed', 'dew_temperature', 'precip_depth_1_hr', 'square_feet', 'year_built', 'floor_count', 'value']


In [6]:

# 7. 정규화
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

# 8. Save
os.makedirs("../data/processed", exist_ok=True)
df_scaled.to_parquet("../data/processed/office_data_merged.parquet", index=False)
print("Saved to: office_data_merged.parquet")


KeyError: "['air_temperature', 'cloud_coverage', 'wind_speed', 'dew_temperature', 'precip_depth_1_hr', 'square_feet', 'year_built', 'floor_count'] not in index"