In [1]:
import os

import pandas as pd
import numpy as np

import pyarrow.parquet as pq

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
class ResstockDataset(Dataset):
    """Map-style dataset that exposes several parquet files as one long table.

    A global sample index is mapped to a ``(file, row)`` pair using the
    cumulative row counts taken from each file's parquet metadata. Only one
    file's contents are held in memory at a time; requesting a sample that
    lives in a different file reloads that file from disk.

    Every row is cast to ``float32`` and sliced by column position:

    ========== ==========================================================
    columns    meaning
    ========== ==========================================================
    ``3:6``    time features
    ``6``      state index, one-hot encoded to ``NUM_STATES`` classes
    ``7``      building type index, one-hot to ``NUM_BUILDING_TYPES``
    ``8:13``   rest of the house features
    ``13:109`` aggregate load
    ``109:205`` temperature
    ``205:``   individual load profiles, reshaped to ``(-1, 96)``
    ========== ==========================================================

    Columns ``0:3`` are not returned.
    """

    NUM_STATES = 49          # number of classes for the state one-hot
    NUM_BUILDING_TYPES = 5   # number of classes for the building type one-hot
    PROFILE_LENGTH = 96      # samples per load/temperature profile

    def __init__(self, basedir: str, filenames: list):
        """Index the given parquet files without loading their row data.

        Args:
            basedir: Directory that contains the parquet files.
            filenames: File names (relative to ``basedir``), in the order in
                which their rows should appear in the dataset.
        """
        self.basedir = basedir
        self.filenames = filenames
        self.filepaths = [os.path.join(basedir, filename) for filename in filenames]

        # Only the parquet metadata is read here; row data is loaded lazily.
        self.metadatas = [pq.read_metadata(p) for p in self.filepaths]

        self.lengths = [metadata.num_rows for metadata in self.metadatas]
        # lengths_cumsum[i] is the global index of the first row of file i+1.
        self.lengths_cumsum = np.cumsum(self.lengths)

        # Column names are taken from the first file; empty when no files given.
        self.col_names = (
            [c.name for c in self.metadatas[0].schema] if self.metadatas else []
        )

        self.total_length = sum(self.lengths)

        # Single-file cache: index of the file currently held in memory.
        self.current_file_idx = None
        self.current_table = None

    def __len__(self):
        """Return the total number of rows across all files."""
        return self.total_length

    def __getitem__(self, idx):
        """Return the sample at global index ``idx``.

        Args:
            idx: Integer index in ``[-len(self), len(self))``.

        Returns:
            Tuple ``(t, h, x, T, y)`` of ``float32`` tensors:
            ``t`` (3,) time features, ``h`` (59,) house features with the
            state and building type one-hot encoded, ``x`` (96,) aggregate
            load, ``T`` (96,) temperature, ``y`` (-1, 96) individual load
            profiles.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        requested = idx
        # Accept negative indices the same way Python sequences do.
        if idx < 0:
            idx = idx + self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError(
                f'index {requested} is out of range for dataset of length {self.total_length}'
            )

        # side='right' is required: an index equal to a cumulative boundary is
        # the FIRST row of the next file, not one past the end of the previous.
        file_idx = int(np.searchsorted(self.lengths_cumsum, idx, side='right'))

        if file_idx == 0:
            row_idx = int(idx)
        else:
            row_idx = int(idx - self.lengths_cumsum[file_idx - 1])

        # Reload only when the requested row lives in a different file.
        if file_idx != self.current_file_idx:
            self.current_file_idx = file_idx
            self.current_table = pd.read_parquet(self.filepaths[file_idx]).values.astype(np.float32)

        row = torch.from_numpy(self.current_table[row_idx, :])

        t = row[3:6]  # time features

        h = torch.concat(
            (
                # one_hot yields int64; cast explicitly so the concat is float32
                F.one_hot(row[6].long(), num_classes=self.NUM_STATES).to(row.dtype),  # states one-hot
                F.one_hot(row[7].long(), num_classes=self.NUM_BUILDING_TYPES).to(row.dtype),  # building type one-hot
                row[8:13],  # rest of house features
            ),
            dim=0
        )

        x = row[13:109]   # aggregate load
        T = row[109:205]  # temperature

        y = row[205:].reshape(-1, self.PROFILE_LENGTH)  # individual load profiles (11, 96)

        return t, h, x, T, y

In [3]:
# Directory holding the staged parquet files. Set the RESSTOCK_BASEDIR
# environment variable to run on another machine; the fallback is the
# original hard-coded local path.
BASEDIR = os.environ.get(
    'RESSTOCK_BASEDIR',
    '/Users/darwish/Documents/Berkeley_Offline/W210/capstone/data/resstock/staged/20240227221540',
)
BATCH_SIZE = 10

# Files 0-3 are used for training; file 4 is held out for validation.
train_dataset = ResstockDataset(BASEDIR, ['0.parquet', '1.parquet', '2.parquet', '3.parquet'])
# NOTE(review): ResstockDataset caches a single file at a time, so with
# shuffle=True consecutive samples that fall in different files each trigger
# a full parquet re-read. Consider a per-file sampler if loading is slow.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = ResstockDataset(BASEDIR, ['4.parquet'])
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

t torch.Size([10, 3])
h torch.Size([10, 59])
x torch.Size([10, 96])
T torch.Size([10, 96])

y torch.Size([10, 11, 96])


In [None]:
# Pull a single batch from the training loader and report each tensor's shape.
first_batch = next(iter(train_dataloader))
t, h, x, T, y = first_batch

# Inputs first, then a blank line, then the target.
for label, tensor in zip(('t', 'h', 'x', 'T'), (t, h, x, T)):
    print(label, tensor.shape)
print()
print('y', y.shape)