In [38]:
import pandas as pd

from util import Pipeline

# Shared Pipeline instance used by the cells below for get_table() and settings lookups.
# NOTE(review): 'configs' is presumably a configuration directory — confirm against util.Pipeline.
p = Pipeline('configs')

In [39]:
# Open the store read-only. HDFStore defaults to mode='a', which needs write
# access and creates an empty file when the path does not exist; this cell only
# reads, so mode='r' makes a missing file fail loudly instead.
with pd.HDFStore('data/pipeline.h5', mode='r') as store:
    # Pull the table saved under '/adjusted_units_change_targets' for inspection.
    test = store['/adjusted_units_change_targets']

In [41]:
# Preview the first rows of the table read from the HDF store.
test.head()

Unnamed: 0,target_id,start,units_chg,units_chg_adj,RGID,county_id,dec_total_pop,dec_units,dec_hh,dec_gq,...,hh_2044,hhsz,hhpop_initial_2044,initial_hhpop_2044_sum_by_rgid,hhpop_rgid_factored_2044,hhpop_factor,hhpop_factored_2044,dec_gq_pct,gq_2044,totalpop_2044
0,1,2019,35000,34580,1,53033,151854,64688,60953,1440,...,94602,2.340073,221376,1104682,1152938,1.043683,231046,0.020391,2043,233089
1,2,2019,112000,109453,1,53033,737015,368308,345627,29918,...,455306,1.940026,883306,1104682,1152938,1.043683,921892,0.42366,42454,964346
2,3,2019,12000,11996,2,53033,77245,28050,27057,916,...,38284,2.709411,103727,990961,982441,0.991402,102835,0.012971,1300,104135
3,4,2019,5800,5457,2,53033,28956,12682,12006,289,...,17341,2.293237,39767,990961,982441,0.991402,39425,0.004092,410,39835
4,5,2019,7500,7428,2,53033,52066,20785,19874,518,...,26972,2.491102,67190,990961,982441,0.991402,66612,0.007335,735,67347


In [33]:
def load_input_tables(pipeline):
    """Assemble the target-level input frame with decennial household-size ratios.

    Decennial values from 'decennial_by_control_area' are summed up to target
    areas via 'control_target_lookup', joined onto
    'adjusted_total_pop_change_targets', and extended with household-size
    columns at both the RGID level and the individual target level.

    Parameters
    ----------
    pipeline
        Object exposing ``get_table(name)``; the returned tables are used as
        pandas DataFrames (merge / groupby).

    Returns
    -------
    DataFrame
        The adjusted targets with the summed decennial columns plus
        ``dec_hhpop_by_rgid``, ``dec_hh_by_rgid``, ``dec_hhsz_by_rgid`` and
        ``dec_hhsz``.
    """
    # control area -> target area lookup
    lookup = pipeline.get_table('control_target_lookup')

    # roll the decennial control-area rows up to target areas; the sum covers
    # every non-key column, so 'control_id' and 'name' are discarded afterwards
    decennial = pipeline.get_table('decennial_by_control_area')
    joined = decennial.merge(lookup, on='control_id', how='left')
    summed = joined.groupby(['target_id', 'RGID', 'county_id']).sum()
    dec_by_target = summed.reset_index().drop(columns=['control_id', 'name'])

    # attach the decennial totals to the adjusted targets
    targets = pipeline.get_table('adjusted_total_pop_change_targets')
    merged = targets.merge(dec_by_target, on='target_id', how='left')

    # RGID-level totals, broadcast back onto every target row
    by_rgid = merged.groupby('RGID')
    hhpop_rgid = by_rgid['dec_hhpop'].transform('sum')
    hh_rgid = by_rgid['dec_hh'].transform('sum')

    return merged.assign(
        dec_hhpop_by_rgid=hhpop_rgid,
        dec_hh_by_rgid=hh_rgid,
        # household size for the whole RGID
        dec_hhsz_by_rgid=hhpop_rgid / hh_rgid,
        # household size for the individual target
        dec_hhsz=merged['dec_hhpop'] / merged['dec_hh'],
    )

In [34]:
# Build the merged targets/decennial input frame from the shared pipeline `p`.
df = load_input_tables(p)

In [36]:
def create_horizon_year_column_names(pipeline):
    """Return the horizon-year column names for units, households and household population.

    Reads ``pipeline.settings['target_horizon_year']`` and appends it to the
    ``units``, ``hh`` and ``hhpop`` prefixes.

    Returns
    -------
    tuple of str
        ``(units_<year>, hh_<year>, hhpop_<year>)``, in that order.
    """
    horizon = pipeline.settings['target_horizon_year']
    return tuple(f'{prefix}_{horizon}' for prefix in ('units', 'hh', 'hhpop'))

In [37]:
# Unpack the three horizon-year column names (units, households, household population) for later cells.
units_horizon_col, households_horizon_col, hhpop_horizon_col = create_horizon_year_column_names(p)