# Generating Subsets for testing


1. Create state-level subsets for NHGIS base crosswalks
1. Create state-level subsets for NHGIS base tabular data
1. Record unit test values for posterity


***This is currently only intended for use with block-level data.***


**James Gaboardi** **(<jgaboardi@gmail.com>), 2020-05**

In [1]:
%load_ext watermark
%watermark

2020-05-19T12:01:06-04:00

CPython 3.7.6
IPython 7.13.0

compiler   : Clang 9.0.1 
system     : Darwin
release    : 19.4.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


In [2]:
import inspect
import nhgisxwalk
import numpy
import pandas

%load_ext autoreload
%autoreload 2
%watermark -w
%watermark -iv

watermark 2.0.2
numpy      1.18.1
nhgisxwalk 0.0.2
pandas     1.0.3



In [3]:
def build_subset(
    stfips,
    src_year,
    src_geog,
    trg_year,
    trg_geog,
    tabular,
    tabular_dir="",
    xwalk_dir="",
    code_type="GJOIN",
    out_dir="../testing_data_subsets",
    ur_path=None
):
    """Create and write out state-level subsets for NHGIS base crosswalks
    and the associated base tabular data. This is currently only intended
    for use with block-level data.

    Parameters
    ----------

    stfips : str
        State FIPS code for subset (target year). It is compared against
        characters 1-2 of the target-year code in the crosswalk and
        against the ``STATEA`` column of the tabular data.

    src_year : str
        Source year.

    src_geog : str
        Source geography.

    trg_year : str
        Target year.

    trg_geog : str
        Target geography.

    tabular : str
        Tabular geography type (as a base for file name).

    tabular_dir : str
        Directory holding ``<src_year>_<tabular>/<src_year>_<tabular>.csv``.
        Default is '' (relative to the current working directory).

    xwalk_dir : str
        Directory holding the zipped base crosswalk file.
        Default is '' (relative to the current working directory).

    code_type : str
        Code type/format. Default is 'GJOIN'.

    out_dir : str
        Directory the two subsets are written to.
        Default is '../testing_data_subsets'.

    ur_path : str
        Path to Urban/Rural code data for 2000 blocks. Only used when
        ``src_year`` is '2000'. Default is ``None``.

    Returns
    -------

    xwalk : pandas.DataFrame
        Subset of the base crosswalk.

    tab : pandas.DataFrame
        Subset of the base tabular data.

    """

    # Local import so the function stays self-contained within the notebook.
    import os

    # 1. read in base_xwalk (ID columns are forced to str so that the codes
    #    are not parsed as numbers)
    src_col = "%s%s" % (code_type, src_year)
    trg_col = "%s%s" % (code_type, trg_year)
    id_dtypes = {col: str for col in (src_col, trg_col)}
    ct = code_type.lower()[:2]
    f_xwalk = "nhgis_%s%s_%s%s_%s.csv.zip" % (
        src_geog, src_year, trg_geog, trg_year, ct
    )
    # os.path.join keeps the default '' directory relative instead of
    # silently pointing at the filesystem root ('' + '/name' == '/name').
    xwalk = pandas.read_csv(
        os.path.join(xwalk_dir, f_xwalk), index_col=0, dtype=id_dtypes
    )

    # 2. flag the records whose target-year code belongs to stfips; the
    #    vectorized accessor leaves missing target codes as NaN (-> False)
    #    instead of raising on a non-string value
    in_state = xwalk[trg_col].str[1:3] == stfips

    # 3. subset base_xwalk, reset index, and write out
    xwalk = xwalk[in_state].reset_index(drop=True)
    xwalk.to_csv(os.path.join(out_dir, f_xwalk))

    # 4. read in base_tabular (code/label columns are kept as str)
    str_cols = ["GISJOIN", "YEAR", "STATE", "STATEA", "COUNTY", "COUNTYA"]
    str_cols += ["CTY_SUB", "CTY_SUBA", "PLACE", "PLACEA", "TRACTA"]
    str_cols += ["BLCK_GRPA", "AIANHHA", "URBRURALA", "NAME"]
    str_cols += ["URB_AREAA", "CDA", "RES_TRSTA", "ANRCA"]
    tab_dtypes = {col: str for col in str_cols}
    f_tab = "%s_%s" % (src_year, tabular)
    tab_path = os.path.join(tabular_dir, f_tab, "%s.csv" % f_tab)
    tab = pandas.read_csv(tab_path, index_col=0, dtype=tab_dtypes)

    # 5. subset base_tabular by stfips and extract UR code for 2000 blocks
    #    (copy so the column assignment below acts on an independent frame)
    tab = tab[tab["STATEA"] == stfips].copy()
    if src_year == "2000" and ur_path:
        # Special case for 2000 blocks (of 2000 bgp)-- needs Urban/Rural code
        # For more details see:
        # https://gist.github.com/jGaboardi/36c7640af1f228cdc8a691505262e543
        ur_df = pandas.read_csv(ur_path, index_col=0, dtype=str)
        ur_df = ur_df[ur_df["STATE"] == stfips].copy()
        blk_cols = ["STATE", "COUNTY", "TRACT", "BLOCK"]
        # presumably adds a 'GISJOIN' block ID built from blk_cols -- the
        # lookup below relies on that column being present
        ur_df = nhgisxwalk.id_codes.blk_id(ur_df, blk_cols)
        ur_codes = tab.index.map(dict(ur_df[["GISJOIN", "UR"]].values))
        if "URBRURALA" in tab.columns:
            # column already present: fill it where it is rather than
            # shuffling an unrelated trailing column forward
            tab["URBRURALA"] = ur_codes
        else:
            # place the new column at zero-based position 11 (or at the end
            # of a narrower table) without duplicating any column
            tab.insert(min(11, tab.shape[1]), "URBRURALA", ur_codes)

    # 6. write out
    tab.to_csv(os.path.join(out_dir, "%s.csv.zip" % f_tab))

    return xwalk, tab

## Generate subset

In [4]:
# State FIPS code of the subset to generate; keep exactly one line active.
subset_state = "10" # Delaware
#subset_state = "11" # DC
#subset_state = "56" # Wyoming

# Source/target year pair of the block crosswalk; keep exactly one line active.
source_year, target_year = "1990", "2010"
#source_year, target_year = "2000", "2010"

In [5]:
# Input directories shared by every block-level subset build.
subset_dirs = dict(
    tabular_dir="../../tabular_data",
    xwalk_dir="../../crosswalks"
)
subset_years = (source_year, target_year)

# A crosswalk involving 1990 is built without supplementary data.
if "1990" in subset_years:
    xwalk_df, tab_df = build_subset(
        subset_state, source_year, "blk", target_year, "blk", "block",
        **subset_dirs
    )

# A crosswalk involving 2000 additionally passes the Urban/Rural code file.
if "2000" in subset_years:
    xwalk_df, tab_df = build_subset(
        subset_state, source_year, "blk", target_year, "blk", "block",
        ur_path="../../missing_UR_codes/2000_block_UR.csv.zip",
        **subset_dirs
    )

  mask |= (ar1 == a)


In [6]:
xwalk_df.head()
#xwalk_df[145:155]

Unnamed: 0,GJOIN1990,GJOIN2010,WEIGHT,PAREA_VIA_BLK00
0,,G10000100432021078,0.0,0.0
1,,G10000100432023014,0.0,0.0
2,,G10000100432023015,0.0,0.0
3,,G10000109900000011,0.0,0.0
4,,G10000109900000012,0.0,0.0


In [7]:
tab_df.head()

Unnamed: 0_level_0,YEAR,ANRCA,AIANHHA,RES_ONLYA,TRUSTA,RES_TRSTA,BLOCKA,BLCK_GRPA,TRACTA,CDA,...,STATE,STATEA,URBRURALA,URB_AREAA,CD103A,ANPSADPI,ET1001,EUD001,EUO001,ESA001
GISJOIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G10000100401101,1990,99,9999,9999,9999,9,101,1,401,0,...,Delaware,10,2,9999,,Block 101,24,6,8,9
G10000100401102,1990,99,9999,9999,9999,9,102,1,401,0,...,Delaware,10,2,9999,,Block 102,145,39,50,58
G10000100401103,1990,99,9999,9999,9999,9,103,1,401,0,...,Delaware,10,2,9999,,Block 103,75,23,27,29
G10000100401104,1990,99,9999,9999,9999,9,104,1,401,0,...,Delaware,10,2,9999,,Block 104,69,19,21,22
G10000100401105,1990,99,9999,9999,9999,9,105,1,401,0,...,Delaware,10,2,9999,,Block 105,2,0,2,3


## Test crosswalk build with generated subset

In [11]:
# Directory and file name of the state-level block crosswalk subset
# for the selected source/target years.
subset_data_dir = "../testing_data_subsets"
base_xwalk_name = "/nhgis_blk%s_blk%s_gj.csv.zip" % (source_year, target_year)
base_xwalk_file = subset_data_dir + base_xwalk_name
# presumably a dtype mapping that keeps both ID columns as strings -- TODO confirm
data_types = nhgisxwalk.str_types(["GJOIN%s"%source_year, "GJOIN%s"%target_year])
# The first CSV column is the saved row index.
base_xwalk = pandas.read_csv(base_xwalk_file, index_col=0, dtype=data_types)
base_xwalk.head()

Unnamed: 0,GJOIN2000,GJOIN2010,WEIGHT,PAREA
0,G10000100401001000,G10000100401001000,1.0,1.0
1,G10000100401001001,G10000100401001001,0.999981,0.999988
2,G10000100401001001,G10000100401001003,1.9e-05,1.2e-05
3,G10000100401001002,G10000100401001002,1.0,1.0
4,G10000100401001003,G10000100401001003,1.0,1.0


In [17]:
# Variable-code lookup; presumably the commented line is the counterpart to
# use when the source year is 2000 -- keep exactly one line active.
invc = nhgisxwalk.desc_code_1990
#invc = nhgisxwalk.desc_code_2000_SF1b

# Look up the "Total" code of each input variable, in this order.
input_var_names = ["Persons", "Families", "Households", "Housing Units"]
input_var_codes = [invc[name]["Total"] for name in input_var_names]
input_var_codes

['FXS001', 'F2V001', 'FY4001', 'FV5001']

In [18]:
input_var_tags = ["pop", "fam", "hh", "hu"]

In [19]:
# Toggle between the two builds below: the block whose triple-quote
# delimiters are commented out (#''') is executed, while the block wrapped
# in plain ''' ... ''' is a string literal and therefore skipped.
# Keep the final `.xwalk.head()` line in sync with the active block.
#'''
bgp1990_to_trt2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    source_year=source_year,
    target_year=target_year,
    source_geo="bgp",
    target_geo="trt",
    base_source_table=subset_data_dir+"/1990_block.csv.zip",
    input_var=input_var_codes,
    weight_var=input_var_tags,
    stfips=subset_state
)

#'''
'''
bgp2000_to_trt2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    source_year=source_year,
    target_year=target_year,
    source_geo="bgp",
    target_geo="trt",
    base_source_table=subset_data_dir+"/2000_block.csv.zip",
    input_var=input_var_codes,
    weight_var=input_var_tags,
    stfips=subset_state
)
'''
bgp1990_to_trt2010.xwalk.head()
#bgp2000_to_trt2010.xwalk.head()

Unnamed: 0,bgp2000,trt2010,wt_pop,wt_fam,wt_hh,wt_hu
0,G10000109044444430042202U1,G1000010042202,1.0,1.0,1.0,1.0
1,G10000109044461265042201R1,G1000010042201,1.0,1.0,1.0,1.0
2,G10000109044461265042201U1,G1000010042201,1.0,1.0,1.0,1.0
3,G10000109044461265042201U2,G1000010042201,1.0,1.0,1.0,1.0
4,G10000109044461480042202R2,G1000010042202,1.0,1.0,1.0,1.0


-----------------------

## Data for unittests and doctests
### Delaware
**base_xwalk_blk1990_blk2010**

In [23]:
# Weight columns of the generated crosswalk (one per input variable tag).
wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]
# Source/target ID columns used by the 2000 -> 2010 checks below.
id_cols = ["bgp2000", "trt2010"]

In [24]:
#ix1, ix2 = 677, 681

#ix1, ix2 = 1025, 1029 # bgp1990_to_trt2010
#bgp1990_to_trt2010.xwalk[ix1:ix2]

**base_xwalk_blk2000_blk2010**

In [26]:
# doctest
print(bgp2000_to_trt2010.xwalk[id_cols+wgt_cols[:2]][1020:1031])

                         bgp2000         trt2010    wt_pop    wt_fam
1020  G10000509355299999051302R1  G1000050051302  1.000000  1.000000
1021  G10000509355299999051302R2  G1000050051302  1.000000  1.000000
1022  G10000509355299999051302U1  G1000050051302  1.000000  1.000000
1023  G10000509355299999051303R1  G1000050051303  1.000000  1.000000
1024  G10000509355299999051303U1  G1000050051303  1.000000  1.000000
1025  G10000509355299999051304R1  G1000050051305  0.680605  0.633909
1026  G10000509355299999051304R1  G1000050051306  0.319167  0.365782
1027  G10000509355299999051304R1  G1000050051400  0.000227  0.000309
1028  G10000509355299999051304R2  G1000050051305  0.802661  0.817568
1029  G10000509355299999051304R2  G1000050051306  0.197339  0.182432
1030  G10000509355299999051304U2  G1000050051305  0.530658  0.557464


In [27]:
bgp2000_to_trt2010.xwalk.head()

Unnamed: 0,bgp2000,trt2010,wt_pop,wt_fam,wt_hh,wt_hu
0,G10000109044444430042202U1,G1000010042202,1.0,1.0,1.0,1.0
1,G10000109044461265042201R1,G1000010042201,1.0,1.0,1.0,1.0
2,G10000109044461265042201U1,G1000010042201,1.0,1.0,1.0,1.0
3,G10000109044461265042201U2,G1000010042201,1.0,1.0,1.0,1.0
4,G10000109044461480042202R2,G1000010042202,1.0,1.0,1.0,1.0


In [28]:
ix1, ix2 = 1025, 1029 # bgp2000_to_trt2010
bgp2000_to_trt2010.xwalk[ix1:ix2]

Unnamed: 0,bgp2000,trt2010,wt_pop,wt_fam,wt_hh,wt_hu
1025,G10000509355299999051304R1,G1000050051305,0.680605,0.633909,0.657366,0.659502
1026,G10000509355299999051304R1,G1000050051306,0.319167,0.365782,0.342282,0.340111
1027,G10000509355299999051304R1,G1000050051400,0.000227,0.000309,0.000352,0.000387
1028,G10000509355299999051304R2,G1000050051305,0.802661,0.817568,0.820896,0.836237


In [29]:
bgp2000_to_trt2010.xwalk[id_cols][ix1:ix2].values

array([['G10000509355299999051304R1', 'G1000050051305'],
       ['G10000509355299999051304R1', 'G1000050051306'],
       ['G10000509355299999051304R1', 'G1000050051400'],
       ['G10000509355299999051304R2', 'G1000050051305']], dtype=object)

In [31]:
# Known source/target ID pairs, hard-coded for reuse in the unit tests.
knw_str_vals = numpy.array(
    [
        ["G10000509355299999051304R1", "G1000050051305"],
        ["G10000509355299999051304R1", "G1000050051306"],
        ["G10000509355299999051304R1", "G1000050051400"],
        ["G10000509355299999051304R2", "G1000050051305"]
    ]
)
knw_str_vals

array([['G10000509355299999051304R1', 'G1000050051305'],
       ['G10000509355299999051304R1', 'G1000050051306'],
       ['G10000509355299999051304R1', 'G1000050051400'],
       ['G10000509355299999051304R2', 'G1000050051305']], dtype='<U26')

In [32]:
# Element-wise comparison of the observed ID pairs (rows ix1:ix2) with the
# known values; True only if every element matches.
numpy.equal(
    bgp2000_to_trt2010.xwalk[id_cols][ix1:ix2].values,
    knw_str_vals
).all()

True

In [33]:
bgp2000_to_trt2010.xwalk[wgt_cols][ix1:ix2].values

array([[6.80605382e-01, 6.33909150e-01, 6.57366450e-01, 6.59501671e-01],
       [3.19167389e-01, 3.65781711e-01, 3.42281879e-01, 3.40110906e-01],
       [2.27229039e-04, 3.09138740e-04, 3.51671251e-04, 3.87423412e-04],
       [8.02660754e-01, 8.17567568e-01, 8.20895522e-01, 8.36236934e-01]])

In [34]:
# Known weights (columns ordered as in wgt_cols), hard-coded for reuse in
# the unit tests.
knw_num_vals = numpy.array(
    [
        [6.80605382e-01, 6.33909150e-01, 6.57366450e-01, 6.59501671e-01],
        [3.19167389e-01, 3.65781711e-01, 3.42281879e-01, 3.40110906e-01],
        [2.27229039e-04, 3.09138740e-04, 3.51671251e-04, 3.87423412e-04],
        [8.02660754e-01, 8.17567568e-01, 8.20895522e-01, 8.36236934e-01]
    ]
)
knw_num_vals

array([[6.80605382e-01, 6.33909150e-01, 6.57366450e-01, 6.59501671e-01],
       [3.19167389e-01, 3.65781711e-01, 3.42281879e-01, 3.40110906e-01],
       [2.27229039e-04, 3.09138740e-04, 3.51671251e-04, 3.87423412e-04],
       [8.02660754e-01, 8.17567568e-01, 8.20895522e-01, 8.36236934e-01]])

In [35]:
# Tolerance-based comparison of the observed weights (rows ix1:ix2) with the
# known values; exact equality is not expected for floats.
numpy.allclose(
    bgp2000_to_trt2010.xwalk[wgt_cols][ix1:ix2].values,
    knw_num_vals
)

True

-------------------------

### DC
**base_xwalk_blk1990_blk2010**

In [None]:
#ix1, ix2 = 688, 692

#knw_str_vals = numpy.array(
#    [
#        ["G11000105000050000009806989999999884011", "G1100010009811"],
#        ["G11000105000050000009806989999999884012", "G1100010009810"],
#        ["G11000105000050000009806989999999884012", "G1100010009811"],
#        ["G11000105000050000009807989999999884011", "G1100010009807"],
#    ]
#)
#knw_num_vals = numpy.array(
#    [
#        [1.0, 1.0, 1.0, 1.0],
#        [0.41477113, 0.41545353, 0.39687267, 0.39506995],
#        [0.58522887, 0.58454647, 0.60312733, 0.60493005],
#        [1.0, 1.0, 1.0, 1.0],
#    ]
#)
#obs_xwalk = nhgisxwalk.GeoCrossWalk(
#    base_xwalk_blk1990_blk2010,
#    source_year=_90,
#    target_year=_10,
#    source_geo=bgp,
#    target_geo=trt,
#    base_source_table=tab_data_path_1990,
#    input_var=input_vars_1990,
#    weight_var=input_var_tags,
#)
#ix1, ix2 = 688, 692
#id_cols = ["bgp1990", "trt2010"]
#obs_str_vals = obs_xwalk.xwalk[id_cols][ix1:ix2].values
#wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]
#obs_num_vals = obs_xwalk.xwalk[wgt_cols][ix1:ix2].values
#numpy.testing.assert_equal(knw_str_vals, obs_str_vals)
#numpy.testing.assert_allclose(knw_num_vals, obs_num_vals)

**base_xwalk_blk2000_blk2010**

In [None]:
#ix1, ix2 = 677, 681

#knw_str_vals = numpy.array(
#    [
#        ["G1101050000500009806R2", "G1100010009810"],
#        ["G1101050000500009806U1", "G1100010009811"],
#        ["G1101050000500009806U2", "G1100010009810"],
#        ["G1101050000500009806U2", "G1100010009811"],
#    ]
#)
#knw_num_vals = numpy.array(
#    [
#        [0.0, 0.0, 0.0, 0.0],
#        [1.0, 1.0, 1.0, 1.0],
#        [0.4234478601567, 0.4310747663551, 0.404344193817, 0.4043715846994],
#        [0.5765521398432, 0.5689252336448, 0.595655806182, 0.5956284153005],
#    ]
#)
#obs_xwalk = nhgisxwalk.GeoCrossWalk(
#    base_xwalk_blk2000_blk2010,
#    source_year=_00,
#    target_year=_10,
#    source_geo=bgp,
#    target_geo=trt,
#    base_source_table=tab_data_path_2000,
#    input_var=input_vars_2000_SF1b,
#    weight_var=input_var_tags,
#)
#ix1, ix2 = 677, 681
#id_cols = ["bgp2000", "trt2010"]
#obs_str_vals = obs_xwalk.xwalk[id_cols][ix1:ix2].values
#wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]
#obs_num_vals = obs_xwalk.xwalk[wgt_cols][ix1:ix2].values
#numpy.testing.assert_equal(knw_str_vals, obs_str_vals)
#numpy.testing.assert_allclose(knw_num_vals, obs_num_vals)

### Wyoming
**base_xwalk_blk1990_blk2010**

**base_xwalk_blk2000_blk2010**

-----------------