Code to analyze metadata from Pecan Street's Dataport and select house-resident IDs for further analysis

In [1]:
import pandas as pd
import numpy as np

In [37]:
# Load the Dataport metadata. The first row after the header (index label 0)
# carries column descriptions rather than a house record, so it is dropped
# before dataid becomes the index. A method chain replaces the earlier
# inplace=True mutations.
metadata_df = (
    pd.read_csv('metadata.csv')
    .drop(labels=0, axis=0)
    .set_index('dataid')
)

# Keep only relevant columns and drop all others. The explicit .copy() makes
# the result an independent frame, so the column assignment below does not
# trigger SettingWithCopyWarning.
metadata_df = metadata_df[[
    'city', 'state', 'egauge_1min_min_time', 'egauge_1min_max_time',
    'egauge_1min_data_availability', 'battery1', 'grid', 'solar', 'pv',
    'total_amount_of_pv',
]].copy()

# Convert data availability from percent strings (trailing '%') to fractions
metadata_df['egauge_1min_data_availability'] = (
    metadata_df['egauge_1min_data_availability'].str.rstrip('%').astype('float') / 100.0
)

# Only keep rows with at least 95% data availability (assuming we can impute values later)
metadata_df = metadata_df.loc[metadata_df['egauge_1min_data_availability'] >= 0.95]

# Only keep rows with grid data (the minimum amount of data that we need!)
metadata_df = metadata_df.loc[metadata_df['grid'] == 'yes']


In [44]:
# Texas subset: rows whose state is 'Texas' and whose battery1 flag is 'yes'
tx_df = metadata_df[
    metadata_df['state'].eq('Texas')
    & metadata_df['battery1'].eq('yes')
]

In [42]:
# California subset: rows whose state is 'California' and whose grid and solar flags are both 'yes'
# NOTE(review): the frame also carries a 'pv' column; confirm 'solar' is the intended PV flag.
ca_df = metadata_df[
    metadata_df['state'].eq('California')
    & metadata_df['grid'].eq('yes')
    & metadata_df['solar'].eq('yes')
]

In [65]:
# Save IDs to csv to retrieve 15-minute data from.
# The arrays stay float in memory, but fmt="%d" writes each ID as a plain
# integer (e.g. 8707). np.savetxt's default format would instead write
# scientific notation (8.707000000000000000e+03), which is awkward for IDs.
tx_ids = tx_df.index.to_numpy(dtype=float)
ca_ids = ca_df.index.to_numpy(dtype=float)
np.savetxt("tx_ids.csv", tx_ids, fmt="%d", delimiter=",")
np.savetxt("ca_ids.csv", ca_ids, fmt="%d", delimiter=",")

In [66]:
# Display the IDs of the selected Texas houses (index values of tx_df)
tx_ids

array([8707.,  974., 2925., 6836., 5403.])

In [67]:
# Display the IDs of the selected California houses (index values of ca_df)
ca_ids

array([1.1896e+04, 2.3540e+03, 4.5090e+03, 4.3150e+03, 1.1000e+01,
       4.6220e+03, 9.4770e+03, 8.2770e+03, 1.1478e+04, 8.9080e+03,
       9.2210e+03, 8.6400e+02, 3.6370e+03])