# Generating Subsets for testing


1. Create state-level subsets for NHGIS base crosswalks
1. Create state-level subsets for NHGIS base tabular data
1. Record unit test values for posterity


***This is currently only intended for use with block-level data.***


**James Gaboardi** **(<jgaboardi@gmail.com>), 2020-05**

In [None]:
%load_ext watermark
%watermark

In [None]:
import inspect
import nhgisxwalk
import numpy
import pandas

%load_ext autoreload
%autoreload 2
%watermark -w
%watermark -iv

In [None]:
def build_subset(
    stfips,
    src_year,
    src_geog,
    trg_year,
    trg_geog,
    tabular,
    tabular_dir="",
    xwalk_dir="",
    code_type="GJOIN",
    out_dir="../testing_data_subsets",
    ur_path=None
):
    """Create and write out state-level subsets for NHGIS base crosswalks
    and the associated base tabular data. This is currently only intended
    for use with block-level data.

    Parameters
    ----------

    stfips : str
        State FIPS code for subset (target year).

    src_year : str
        Source year.

    src_geog : str
        Source geography.

    trg_year : str
        Target year.

    trg_geog : str
        Target geography.

    tabular : str
        Tabular geography type (as a base for file name).

    tabular_dir : str
        Directory holding ``<src_year>_<tabular>/<src_year>_<tabular>.csv``.
        Default is '' (relative to the current working directory).

    xwalk_dir : str
        Directory holding the base crosswalk ``.csv.zip`` file.
        Default is '' (relative to the current working directory).

    code_type : str
        Code type/format. Default is 'GJOIN'.

    out_dir : str
        Existing directory the two subsets are written to.
        Default is '../testing_data_subsets'.

    ur_path : str
        Path to Urban/Rural code data for 2000 blocks. Only used
        when ``src_year`` is '2000'. Default is ``None``.

    Returns
    -------

    xwalk : pandas.DataFrame
        Subset of the base crosswalk.

    tab : pandas.DataFrame
        Subset of the base tabular data.

    """

    # Function-scope import keeps this cell self-contained.
    import os

    def blk_id(df, order, cname="GISJOIN", tzero=("STATE", "COUNTY")):
        """Recreate BLK GISJOIN ---- Used to extract 2000 block UR codes.

        The ID is 'G' followed by the ``order`` columns concatenated in
        sequence, with a '0' appended after every column named in ``tzero``.
        """
        id_str = "G"
        for o in order:
            # column-wise string concatenation instead of a per-row loop
            id_str = id_str + df[o]
            if o in tzero:
                id_str = id_str + "0"
        df[cname] = id_str
        return df

    # 1. read in base_xwalk
    src_col = "%s%s" % (code_type, src_year)
    trg_col = "%s%s" % (code_type, trg_year)
    ct = code_type.lower()[:2]
    f_xwalk = "nhgis_%s%s_%s%s_%s.csv.zip" % (
        src_geog, src_year, trg_geog, trg_year, ct
    )
    xwalk = pandas.read_csv(
        os.path.join(xwalk_dir, f_xwalk),
        index_col=0,
        dtype={src_col: str, trg_col: str},
    )

    # 2. flag records whose target ID carries the requested state FIPS
    #    (characters 1-2 of the ID, directly after the leading 'G');
    #    records with a missing target ID simply never match
    in_state = xwalk[trg_col].str[1:3] == stfips

    # 3. subset base_xwalk, reset index, and write out
    xwalk = xwalk[in_state].reset_index(drop=True)
    xwalk.to_csv(os.path.join(out_dir, f_xwalk))

    # 4. read in base_tabular (identifier-like columns are kept as strings
    #    so that leading zeros survive)
    str_cols = ["GISJOIN", "YEAR", "STATE", "STATEA", "COUNTY", "COUNTYA"]
    str_cols += ["CTY_SUB", "CTY_SUBA", "PLACE", "PLACEA", "TRACTA"]
    str_cols += ["BLCK_GRPA", "AIANHHA", "URBRURALA", "NAME"]
    f_tab = "%s_%s" % (src_year, tabular)
    tab = pandas.read_csv(
        os.path.join(tabular_dir, f_tab, "%s.csv" % f_tab),
        index_col=0,
        dtype={c: str for c in str_cols},
    )

    # 5. subset base_tabular by stfips and extract UR code for 2000 blocks
    #    (explicit copies so the column assignments below act on
    #    independent frames rather than on views of the full tables)
    tab = tab[tab["STATEA"] == stfips].copy()
    if src_year == "2000" and ur_path:
        # Special case for 2000 blocks (of 2000 bgp) -- needs Urban/Rural code
        # For more details see:
        # https://gist.github.com/jGaboardi/36c7640af1f228cdc8a691505262e543
        ur_df = pandas.read_csv(ur_path, index_col=0, dtype=str)
        ur_df = ur_df[ur_df["STATE"] == stfips].copy()
        ur_df = blk_id(ur_df, ["STATE", "COUNTY", "TRACT", "BLOCK"])
        ur_lookup = dict(zip(ur_df["GISJOIN"], ur_df["UR"]))
        tab["URBRURALA"] = tab.index.map(ur_lookup)
        # place URBRURALA at column position 11 (or last when the table is
        # narrower); selecting it by name avoids duplicating it or moving
        # an unrelated column when it was not the last column
        cols = [c for c in tab.columns if c != "URBRURALA"]
        cols.insert(11, "URBRURALA")
        tab = tab[cols]

    # 6. write out
    tab.to_csv(os.path.join(out_dir, "%s.csv.zip" % f_tab))

    return xwalk, tab

## Generate subset

In [None]:
# State FIPS code of the subset to generate; leave exactly one line active.
subset_state = "10" # Delaware
#subset_state = "11" # DC
#subset_state = "56" # Wyoming

# Source/target years of the base crosswalk; leave exactly one pair active.
#source_year, target_year = "1990", "2010"
source_year, target_year = "2000", "2010"

In [None]:
# Build the block-level subset once for any crosswalk involving 1990 or 2000.
# The Urban/Rural lookup is passed whenever 2000 is one of the years;
# `build_subset` only reads it when the *source* year is 2000, so passing it
# for a 2000 target is harmless.
if "1990" in (source_year, target_year) or "2000" in (source_year, target_year):
    xwalk_df, tab_df = build_subset(
        subset_state,
        source_year,
        "blk",
        target_year,
        "blk",
        "block",
        tabular_dir="../../tabular_data",
        xwalk_dir="../../crosswalks",
        ur_path=(
            "../../missing_UR_codes/2000_block_UR.csv.zip"
            if "2000" in (source_year, target_year)
            else None
        )
    )

In [None]:
xwalk_df.head()

In [None]:
tab_df.head()

## Test crosswalk build with generated subset

In [None]:
# Read back the state-level crosswalk subset from the testing data directory.
subset_data_dir = "../testing_data_subsets"
base_xwalk_name = "/nhgis_blk%s_blk%s_gj.csv.zip" % (source_year, target_year)
base_xwalk_file = subset_data_dir + base_xwalk_name
# NOTE(review): `str_types` presumably returns a {column: str} dtype mapping
# so the GJOIN identifiers keep their leading zeros -- confirm in nhgisxwalk.
data_types = nhgisxwalk.str_types(["GJOIN%s"%source_year, "GJOIN%s"%target_year])
base_xwalk = pandas.read_csv(base_xwalk_file, index_col=0, dtype=data_types)
base_xwalk.head()

In [None]:
# Variable lookup for the source year; switch the active line together with
# `source_year`.
#invc = nhgisxwalk.desc_code_1990
invc = nhgisxwalk.desc_code_2000_SF1b

# Look up the "Total" entry of each variable of interest.
input_var_names = ["Persons", "Families", "Households", "Housing Units"]
input_var_codes = [invc[name]["Total"] for name in input_var_names]
input_var_codes

In [None]:
input_var_tags = ["pop", "fam", "hh", "hu"]

In [None]:
# The 1990 build is disabled by wrapping it in a string literal; to run it,
# remove the surrounding triple quotes and disable the 2000 build instead.
'''
bgp1990_to_trt2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    source_year=source_year,
    target_year=target_year,
    source_geo="bgp",
    target_geo="trt",
    base_source_table=subset_data_dir+"/1990_block.csv.zip",
    input_var=input_var_codes,
    weight_var=input_var_tags,
    stfips=subset_state
)
'''

# Build the bgp (2000) -> trt (2010) crosswalk from the subset base
# crosswalk and the subset 2000 block table.
bgp2000_to_trt2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    source_year=source_year,
    target_year=target_year,
    source_geo="bgp",
    target_geo="trt",
    base_source_table=subset_data_dir+"/2000_block.csv.zip",
    input_var=input_var_codes,
    weight_var=input_var_tags,
    stfips=subset_state
)

-----------------------

## Data for unittests and doctests
### Delaware
**base_xwalk_blk1990_blk2010**

In [None]:
# Column selections applied to the `.xwalk` table in the checks below:
# the weight columns and the source/target identifier columns.
wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]
id_cols = ["bgp2000", "trt2010"]

In [None]:
#ix1, ix2 = 677, 681

#ix1, ix2 = 1025, 1029 # bgp1990_to_trt2010
#bgp1990_to_trt2010.xwalk[ix1:ix2]

**base_xwalk_blk2000_blk2010**

In [None]:
bgp2000_to_trt2010.xwalk.head()

In [None]:
# Positional row window [ix1, ix2) of the crosswalk that is compared against
# the recorded known values.
ix1, ix2 = 1025, 1029 # bgp2000_to_trt2010
bgp2000_to_trt2010.xwalk[ix1:ix2]

In [None]:
#bgp2000_to_trt2010.xwalk[id_cols][ix1:ix2].values

In [None]:
# Recorded identifier pairs, one [source, target] row per crosswalk record
# in the row window.
knw_str_vals = numpy.array(
    [
        ["G10000509355299999051303U1", "G1000050051303"],
        ["G10000509355299999051304R1", "G1000050051305"],
        ["G10000509355299999051304R1", "G1000050051306"],
        ["G10000509355299999051304R1", "G1000050051400"]
    ]
)
knw_str_vals

In [None]:
# True only if every identifier in the row window equals its recorded value.
numpy.equal(
    bgp2000_to_trt2010.xwalk[id_cols][ix1:ix2].values,
    knw_str_vals
).all()

In [None]:
#bgp2000_to_trt2010.xwalk[wgt_cols][ix1:ix2].values

In [None]:
# Recorded weights for the same row window; one row per crosswalk record,
# one value per column in `wgt_cols`.
knw_num_vals = numpy.array(
    [
        [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
        [6.80605382e-01, 6.33909150e-01, 6.57366450e-01, 6.59501671e-01],
        [3.19167389e-01, 3.65781711e-01, 3.42281879e-01, 3.40110906e-01],
        [2.27229039e-04, 3.09138740e-04, 3.51671251e-04, 3.87423412e-04]
    ]
)
knw_num_vals

In [None]:
# Compare the weights with numpy's default tolerances rather than exact
# equality, since the recorded values are rounded.
numpy.allclose(
    bgp2000_to_trt2010.xwalk[wgt_cols][ix1:ix2].values,
    knw_num_vals
)

-------------------------

### DC
**base_xwalk_blk1990_blk2010**

In [None]:
#ix1, ix2 = 688, 692

#knw_str_vals = numpy.array(
#    [
#        ["G11000105000050000009806989999999884011", "G1100010009811"],
#        ["G11000105000050000009806989999999884012", "G1100010009810"],
#        ["G11000105000050000009806989999999884012", "G1100010009811"],
#        ["G11000105000050000009807989999999884011", "G1100010009807"],
#    ]
#)
#knw_num_vals = numpy.array(
#    [
#        [1.0, 1.0, 1.0, 1.0],
#        [0.41477113, 0.41545353, 0.39687267, 0.39506995],
#        [0.58522887, 0.58454647, 0.60312733, 0.60493005],
#        [1.0, 1.0, 1.0, 1.0],
#    ]
#)
#obs_xwalk = nhgisxwalk.GeoCrossWalk(
#    base_xwalk_blk1990_blk2010,
#    source_year=_90,
#    target_year=_10,
#    source_geo=bgp,
#    target_geo=trt,
#    base_source_table=tab_data_path_1990,
#    input_var=input_vars_1990,
#    weight_var=input_var_tags,
#)
#ix1, ix2 = 688, 692
#id_cols = ["bgp1990", "trt2010"]
#obs_str_vals = obs_xwalk.xwalk[id_cols][ix1:ix2].values
#wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]
#obs_num_vals = obs_xwalk.xwalk[wgt_cols][ix1:ix2].values
#numpy.testing.assert_equal(knw_str_vals, obs_str_vals)
#numpy.testing.assert_allclose(knw_num_vals, obs_num_vals)

**base_xwalk_blk2000_blk2010**

In [None]:
#ix1, ix2 = 677, 681

#knw_str_vals = numpy.array(
#    [
#        ["G1101050000500009806R2", "G1100010009810"],
#        ["G1101050000500009806U1", "G1100010009811"],
#        ["G1101050000500009806U2", "G1100010009810"],
#        ["G1101050000500009806U2", "G1100010009811"],
#    ]
#)
#knw_num_vals = numpy.array(
#    [
#        [0.0, 0.0, 0.0, 0.0],
#        [1.0, 1.0, 1.0, 1.0],
#        [0.4234478601567, 0.4310747663551, 0.404344193817, 0.4043715846994],
#        [0.5765521398432, 0.5689252336448, 0.595655806182, 0.5956284153005],
#    ]
#)
#obs_xwalk = nhgisxwalk.GeoCrossWalk(
#    base_xwalk_blk2000_blk2010,
#    source_year=_00,
#    target_year=_10,
#    source_geo=bgp,
#    target_geo=trt,
#    base_source_table=tab_data_path_2000,
#    input_var=input_vars_2000_SF1b,
#    weight_var=input_var_tags,
#)
#ix1, ix2 = 677, 681
#id_cols = ["bgp2000", "trt2010"]
#obs_str_vals = obs_xwalk.xwalk[id_cols][ix1:ix2].values
#wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]
#obs_num_vals = obs_xwalk.xwalk[wgt_cols][ix1:ix2].values
#numpy.testing.assert_equal(knw_str_vals, obs_str_vals)
#numpy.testing.assert_allclose(knw_num_vals, obs_num_vals)

### Wyoming
**base_xwalk_blk1990_blk2010**

**base_xwalk_blk2000_blk2010**

-----------------