# Test Subsets for testing: 2000
## blocks, block group parts, and tracts


1. From a national crosswalk: 
 1. Create target state-level subsets for NHGIS base crosswalks
 1. Create target state-level subsets for NHGIS base tabular data
 1. Record unit test values for posterity



**This is currently only intended for use with block-level data as base units.**


**James Gaboardi** **(<jgaboardi@gmail.com>), 2020-05**

In [1]:
# Load the watermark extension and print a snapshot of the execution
# environment (timestamp, Python/IPython versions, and machine details).
%load_ext watermark
%watermark

2020-07-29T16:10:24-04:00

CPython 3.7.6
IPython 7.15.0

compiler   : Clang 9.0.1 
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit


In [2]:
import inspect
import nhgisxwalk
import numpy
import pandas

# Enable autoreload so that edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2
# Report the watermark version and the versions of the imported packages.
%watermark -w
%watermark -iv

watermark 2.0.2
nhgisxwalk 0.0.6
pandas     1.0.4
numpy      1.18.5



### Set the state (for subsetting), source & target, and year & geography

In [3]:
# State FIPS code used to subset the crosswalk.
subset_state = "10"

# Source and target census years of the crosswalk.
source_year = "2000"
target_year = "2010"

# Names of the GISJOIN ID columns for the source and target years.
gj_src = "GJOIN%s" % source_year
gj_trg = "GJOIN%s" % target_year

### Set the base-level crosswalk file name

In [4]:
# Archive name of the block-to-block crosswalk for the chosen pair of years.
base_xwalk_name = "nhgis_blk%s_blk%s_gj.zip" % (
    source_year,
    target_year,
)
# Relative path of that archive inside the testing-subsets directory.
base_xwalk_file = "../testing_data_subsets/%s" % (base_xwalk_name,)
base_xwalk_file

'../testing_data_subsets/nhgis_blk2000_blk2010_gj.zip'

### Set the base (source) summary file name

In [5]:
# Archive name of the block-level summary table for the source year.
base_source_name = "%s_block.csv.zip" % (source_year,)
# Relative path of that archive inside the testing-subsets directory.
base_source_file = "../testing_data_subsets/%s" % (base_source_name,)
base_source_file

'../testing_data_subsets/2000_block.csv.zip'

### Read in the national base-level crosswalk

In [6]:
# dtype specification for the two GISJOIN ID columns
# (presumably forces them to be read as strings -- confirm against
# ``nhgisxwalk.str_types``).
data_types = nhgisxwalk.str_types([gj_src, gj_trg])

# Load the crosswalk, using the first CSV column as the index.
base_xwalk = pandas.read_csv(
    base_xwalk_file,
    dtype=data_types,
    index_col=0,
)
base_xwalk

Unnamed: 0,GJOIN2000,GJOIN2010,WEIGHT,PAREA
0,G10000100401001000,G10000100401001000,1.000000,1.000000
1,G10000100401001001,G10000100401001001,0.999981,0.999988
2,G10000100401001001,G10000100401001003,0.000019,0.000012
3,G10000100401001002,G10000100401001002,1.000000,1.000000
4,G10000100401001003,G10000100401001003,1.000000,1.000000
...,...,...,...,...
28471,G10000500519002095,G10000500519002075,1.000000,1.000000
28472,G10000500519002096,G10000500519002131,1.000000,1.000000
28473,G10000500519002097,G10000500519002130,1.000000,1.000000
28474,G10000500519002098,G10000500519002079,1.000000,1.000000


In [7]:
# Preview the first five records of the base crosswalk.
base_xwalk.head(n=5)

Unnamed: 0,GJOIN2000,GJOIN2010,WEIGHT,PAREA
0,G10000100401001000,G10000100401001000,1.0,1.0
1,G10000100401001001,G10000100401001001,0.999981,0.999988
2,G10000100401001001,G10000100401001003,1.9e-05,1.2e-05
3,G10000100401001002,G10000100401001002,1.0,1.0
4,G10000100401001003,G10000100401001003,1.0,1.0


### Declare input variables
**not needed for creating a subset per se, but should be done regardless**

In [8]:
# Look up the "Total" variable code for each universe in the 2000 SF1b
# description-to-code mapping, in the same order as the tags below.
input_vars = [
    nhgisxwalk.desc_code_2000_SF1b[universe]["Total"]
    for universe in ("Persons", "Families", "Households", "Housing Units")
]
# Short tags, one per input variable, passed on as ``weight_var``.
input_var_tags = ["pop", "fam", "hh", "hu"]

### Generate the desired crosswalk and subset down to the target state

In [9]:
# Build the "bgp" (source) -> "tr" (target) crosswalk from the base
# crosswalk, restricted to the state given by ``subset_state``.
state_bgp2000trt2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    # geographies and years being linked
    source_geo="bgp",
    target_geo="tr",
    source_year=source_year,
    target_year=target_year,
    # base tabular data and the variables/tags used for the weights
    base_source_table=base_source_file,
    input_var=input_vars,
    weight_var=input_var_tags,
    # state subset and output options
    stfips=subset_state,
    add_geoid=True,
    keep_base=True,
)
# ``base_xwalk`` is not deleted here, so it stays available in the namespace.
state_bgp2000trt2010.xwalk

Unnamed: 0,bgp2000gj,tr2010gj,tr2010ge,wt_pop,wt_fam,wt_hh,wt_hu
0,G10000109044444430042202U1,G1000010042202,10001042202,1.0,1.0,1.0,1.0
1,G10000109044461265042201R1,G1000010042201,10001042201,1.0,1.0,1.0,1.0
2,G10000109044461265042201U1,G1000010042201,10001042201,1.0,1.0,1.0,1.0
3,G10000109044461265042201U2,G1000010042201,10001042201,1.0,1.0,1.0,1.0
4,G10000109044461480042202R2,G1000010042202,10001042202,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
1038,G10000509355299999051500R4,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1039,G10000509355299999051500U1,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1040,G10000509355299999051500U3,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1041,G10000509355299999051500U4,G1000050051500,10005051500,1.0,1.0,1.0,1.0


### docstring test

In [10]:
# Rows shown in the docstring example, without the household and
# housing-unit weight columns.
(
    state_bgp2000trt2010.xwalk
    .iloc[1020:1031]
    .drop(["wt_hh", "wt_hu"], axis="columns")
)

Unnamed: 0,bgp2000gj,tr2010gj,tr2010ge,wt_pop,wt_fam
1020,G10000509355299999051302R1,G1000050051302,10005051302,1.0,1.0
1021,G10000509355299999051302R2,G1000050051302,10005051302,1.0,1.0
1022,G10000509355299999051302U1,G1000050051302,10005051302,1.0,1.0
1023,G10000509355299999051303R1,G1000050051303,10005051303,1.0,1.0
1024,G10000509355299999051303U1,G1000050051303,10005051303,1.0,1.0
1025,G10000509355299999051304R1,G1000050051305,10005051305,0.680605,0.633909
1026,G10000509355299999051304R1,G1000050051306,10005051306,0.319167,0.365782
1027,G10000509355299999051304R1,G1000050051400,10005051400,0.000227,0.000309
1028,G10000509355299999051304R2,G1000050051305,10005051305,0.802661,0.817568
1029,G10000509355299999051304R2,G1000050051306,10005051306,0.197339,0.182432


### unittests

In [11]:
# Keep the first record of each (source ID, target ID) pair.
state_bgp2000trt2010.xwalk.drop_duplicates(
    subset=["bgp2000gj", "tr2010gj"],
    keep="first",
)

Unnamed: 0,bgp2000gj,tr2010gj,tr2010ge,wt_pop,wt_fam,wt_hh,wt_hu
0,G10000109044444430042202U1,G1000010042202,10001042202,1.0,1.0,1.0,1.0
1,G10000109044461265042201R1,G1000010042201,10001042201,1.0,1.0,1.0,1.0
2,G10000109044461265042201U1,G1000010042201,10001042201,1.0,1.0,1.0,1.0
3,G10000109044461265042201U2,G1000010042201,10001042201,1.0,1.0,1.0,1.0
4,G10000109044461480042202R2,G1000010042202,10001042202,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
1038,G10000509355299999051500R4,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1039,G10000509355299999051500U1,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1040,G10000509355299999051500U3,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1041,G10000509355299999051500U4,G1000050051500,10005051500,1.0,1.0,1.0,1.0


In [12]:
# Row window [ix1, ix2) recorded for the unit tests.
ix1 = 1025
ix2 = 1029

# String-valued ID columns of the crosswalk.
id_cols = ["bgp2000gj", "tr2010gj", "tr2010ge"]

# Observed ID values for the window, as a NumPy array.
obs_str_vals = state_bgp2000trt2010.xwalk.iloc[ix1:ix2][id_cols].to_numpy()
obs_str_vals

array([['G10000509355299999051304R1', 'G1000050051305', '10005051305'],
       ['G10000509355299999051304R1', 'G1000050051306', '10005051306'],
       ['G10000509355299999051304R1', 'G1000050051400', '10005051400'],
       ['G10000509355299999051304R2', 'G1000050051305', '10005051305']],
      dtype=object)

In [13]:
# Weight columns, one per input-variable tag.
wgt_cols = ["wt_pop", "wt_fam", "wt_hh", "wt_hu"]

# Observed weights for the same row window, as a NumPy array.
obs_num_vals = state_bgp2000trt2010.xwalk.iloc[ix1:ix2][wgt_cols].to_numpy()
obs_num_vals

array([[6.80605382e-01, 6.33909150e-01, 6.57366450e-01, 6.59501671e-01],
       [3.19167389e-01, 3.65781711e-01, 3.42281879e-01, 3.40110906e-01],
       [2.27229000e-04, 3.09138700e-04, 3.51671300e-04, 3.87423400e-04],
       [8.02660754e-01, 8.17567568e-01, 8.20895522e-01, 8.36236934e-01]])

-----------------