In [2]:
import os
import pandas as pd
import numpy as np
import h5py
from pytz import timezone
import re
from datetime import datetime
import random
from time import time

# Square feet in one square metre, kept as an exact ratio:
# 1562500 == 1250 ** 2 and 145161 == 381 ** 2 (source: Wolfram Alpha).
one_sqm_in_sqft = float(1562500) / 145161

In [3]:
def apply_func(f):
    '''Open the shared HDF5 store, run ``f`` on it, and return ``f``'s result.

    Parameters
    ----------
    f : callable
        Called with the open ``h5py.File`` handle for ``meta/meta.hdf5``.
        The file is opened in ``'r+'`` mode.

    Returns
    -------
    Whatever ``f`` returns. The file is already closed by the time the
    caller receives the value.
    '''
    file = 'meta/meta.hdf5'
    # The context manager closes the file even when ``f`` raises, so a failed
    # cell does not leave the store open in the kernel.
    with h5py.File(file, 'r+') as buildings_data:
        return f(buildings_data)

In [4]:
class Building:
    '''Energy readings of one building, loaded from a CSV file.

    Attributes set by ``__init__``:
        df    -- the full DataFrame read from ``path``.
        mat   -- 2-column array holding the ``timestamp`` and ``kWh`` columns.
        years -- array with the first four characters of every timestamp
                 (assumes timestamps are strings that start with the year
                 -- TODO confirm against the CSV files).
        id    -- file name of ``path`` without directory and ``.csv`` suffix.
    '''

    def __init__(self, path):
        '''Read the CSV at ``path`` and derive ``mat``, ``years`` and ``id``.'''
        self.df = pd.read_csv(path)
        # ``DataFrame.as_matrix`` was removed in pandas 1.0; ``.values`` yields
        # the same array and also exists in older pandas releases.
        self.mat = self.df[['timestamp', 'kWh']].values
        self.years = np.array([date[:4] for date in self.mat[:, 0]])
        # Raw string so that ``\.`` is a regex escape, not a string escape.
        match = re.search(r'(?<=/)(.(?!/))*(?=\.csv)', path)
        if match:
            self.id = match.group(0)
        else:
            # The pattern needs a '/' before the file name; fall back to the
            # plain file stem for bare file names instead of failing on None.
            self.id = os.path.splitext(os.path.basename(path))[0]

    def split_years(self):
        '''Return a dict mapping each year to the rows of ``mat`` from that year.

        Keys are inserted in ascending order of the year strings.
        '''
        return {year: self.mat[self.years == year]
                for year in sorted(set(self.years))}

    def get_id(self):
        '''Return the building id derived from the file name.'''
        return self.id

In [5]:
def store_numerical(buildings_data):
    '''Rebuild the ``washington`` group from the per-building CSV files.

    Any existing ``washington`` group is deleted first. Every ``.csv`` file in
    the energy directory becomes one sub-group named after the building id,
    holding one dataset per year. Each dataset stores the timestamp / kWh rows
    of that year as fixed-width ``|S32`` byte strings.

    Parameters
    ----------
    buildings_data : h5py.File
        Open, writable handle of the HDF5 store.
    '''
    if 'washington' in buildings_data.keys():
        del buildings_data['washington']
    washington = buildings_data.create_group('washington')

    root = './meta/DGS_ENERGY_AND_METADATA/energy/dgs_15min_api_1hr_agr_filled_and_filtered/'
    # Sorted for a reproducible processing order between runs.
    for filename in sorted(os.listdir(root)):
        # Skip anything that is not a CSV (hidden files, checkpoints, ...);
        # such entries would otherwise abort the whole import.
        if not filename.endswith('.csv'):
            continue
        building = Building(os.path.join(root, filename))
        group = washington.create_group(building.get_id())
        for year, readings in building.split_years().items():
            # Shape and dtype are taken from the converted array.
            group.create_dataset(year, data=readings.astype('|S32'))

In [6]:
# Run the numerical import against the HDF5 store and report how long it took.
start = time()
apply_func(store_numerical)
elapsed = time() - start
print('Time taken: %.2f' % elapsed)

Time taken: 69.28


In [7]:
# Load the building metadata table that is turned into HDF5 attributes below.
meta_df = pd.read_csv(
    './meta/DGS_ENERGY_AND_METADATA/metadata/dgs_metadata_filtered_and_cleaned_final.csv'
)

In [8]:
# Maps a building id (as a string) to its dict of metadata fields.
meta_dict = dict()

In [9]:
# Collect one metadata record per row of ``meta_df``, keyed by the row's id
# converted to a string. Climate zone and time zone are the same constant
# for every building.
for row_idx, meta_row in meta_df.iterrows():
    building_key = str(meta_row.id)
    building_meta = {
        'PSU': meta_row.PSU,
        'Industry': meta_row.Industry,
        'Subindustry': meta_row.subindustry,
        'Sqft': meta_row.square_feet,
        'Climatezone': '4',
        'Timezone': 'America/New_York'
    }
    meta_dict[building_key] = building_meta

In [10]:
def store_meta(buildings_data, meta=None):
    '''Copy per-building metadata onto the matching ``washington`` groups.

    Parameters
    ----------
    buildings_data : mapping
        Open HDF5 store; must contain a ``'washington'`` group.
    meta : dict, optional
        Maps building id to a dict of ``{field: value}``. Defaults to the
        notebook-level ``meta_dict`` when omitted.

    Buildings present in ``meta`` but absent from the ``washington`` group are
    skipped. Every field of a matching building is written to the ``attrs``
    of that building's group.
    '''
    if meta is None:
        meta = meta_dict
    washington = buildings_data['washington']
    for building_id, fields in meta.items():
        # Membership does not depend on the field, so test it once per building.
        if building_id not in washington:
            continue
        attrs = washington[building_id].attrs
        for field, value in fields.items():
            attrs[field] = value

# Write the collected metadata onto the matching building groups in the HDF5 store.
apply_func(store_meta)

### Deprecated

In [1]:
def reduce_size(buildings_data):
    '''Rewrite every yearly dataset with the narrower ``|S16`` string dtype.

    Walks every top-level group, every building in it and every yearly dataset
    of that building. The second column is round-tripped through ``float``
    before the whole array is cast to ``|S16``, and the dataset is then
    replaced under the same name.

    NOTE(review): the cast truncates any value longer than 16 bytes -- confirm
    that the stored timestamps fit before running this on real data.

    Parameters
    ----------
    buildings_data : mapping
        Open, writable HDF5 store.

    Failures are reported per dataset and do not stop the remaining work.
    '''
    for dset_name in buildings_data.keys():
        dset = buildings_data[dset_name]
        for building_name in dset:
            building = dset[building_name]
            # Snapshot the member names: datasets are deleted and re-created
            # inside the loop, which must not disturb the iteration.
            for year in list(building):
                try:
                    # Work on an in-memory copy and build the replacement
                    # before deleting anything, so a conversion error leaves
                    # the stored dataset untouched.
                    values = building[year][:]
                    values[:, 1] = values[:, 1].astype('float').astype('|S16')
                    compact = values.astype('|S16')
                    del building[year]
                    # Re-create on the parent group; the deleted member can no
                    # longer be looked up through ``building[year]``.
                    building.create_dataset(year, data=compact)
                except Exception as e:
                    print('%s/%s/%s: %s' % (dset_name, building_name, year, e))

# Run the dtype reduction against the HDF5 store and report how long it took.
start = time()
apply_func(reduce_size)
elapsed = time() - start
print('Time taken: %.2f' % elapsed)

In [None]:
def check_data(buildings_data):
    '''Print the full contents of ``genome/Office_Abbey/2015`` from the store.'''
    genome = buildings_data['genome']
    office = genome['Office_Abbey']
    readings_2015 = office['2015']
    print(readings_2015[:])

# Print the sample dataset selected by check_data from the HDF5 store.
apply_func(check_data)