# Generating Subsets for testing: 1990
##  blocks, block group parts, and blocks


1. From a national crosswalk: 
 1. Create target state-level subsets for NHGIS base crosswalks
 1. Create target state-level subsets for NHGIS base tabular data
 1. Record unit test values for posterity



**This is currently only intended for use with block-level data as base units.**


**James Gaboardi** **(<jgaboardi@gmail.com>), 2020-05**

In [1]:
# Load the `watermark` extension and print a timestamp plus interpreter and
# system details, so the environment that produced the outputs is recorded.
%load_ext watermark
%watermark

2020-08-10T15:49:29-04:00

CPython 3.8.5
IPython 7.16.1

compiler   : Clang 10.0.1 
system     : Darwin
release    : 19.6.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit


In [2]:
import inspect
import nhgisxwalk
import numpy
import pandas

# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Report the watermark version (-w) and the versions of the imported packages (-iv).
%watermark -w
%watermark -iv

watermark 2.0.2
nhgisxwalk 0.0.8
numpy      1.19.1
pandas     1.1.0



### Set the state (for subsetting), source & target, and year & geography

In [3]:
# State code used for subsetting; swap in one of the alternatives to retarget.
subset_state = "10"  # Delaware
# subset_state = "11"  # DC
# subset_state = "15"  # Hawaii

# Source/target years and the matching ID column names of the base crosswalk.
source_year = "1990"
target_year = "2010"
gj_src = f"GJOIN{source_year}"
gj_trg = f"GJOIN{target_year}"

In [4]:
# Relative directories used throughout the notebook:
#   data_in  -- crosswalk files that are read
#   data_out -- testing subsets that are written
#   data_tab -- tabular summary files that are read
data_in, data_out, data_tab = (
    "../../crosswalks/",
    "../testing_data_subsets/",
    "../../tabular_data/",
)

### Set the base-level crosswalk file name

In [5]:
# File stem of the block-to-block crosswalk for the configured source/target years.
base_xwalk_name = f"nhgis_blk{source_year}_blk{target_year}_gj"
base_xwalk_name

'nhgis_blk1990_blk2010_gj'

### Set the base (source) summary file name

In [6]:
# The block-level summary file sits in a directory that shares its file stem.
base_source_name = f"{source_year}_block/{source_year}_block.csv"
base_source_file = f"{data_tab}{base_source_name}"
base_source_file

'../../tabular_data/1990_block/1990_block.csv'

### Set the supplementary summary file name

In [7]:
# The supplementary (block group part) summary file follows the same
# "<stem>/<stem>.csv" layout as the block-level file.
supp_source_name = (
    f"{source_year}_blck_grp_598_103/{source_year}_blck_grp_598_103.csv"
)
supp_source_file = f"{data_tab}{supp_source_name}"
supp_source_file

'../../tabular_data/1990_blck_grp_598_103/1990_blck_grp_598_103.csv'

### Read in the national base-level crosswalk

In [8]:
# dtype specification for the two ID columns, forwarded as the `dtype` keyword
# (presumably forces them to be read as strings -- confirm against `str_types`).
data_types = nhgisxwalk.str_types([gj_src, gj_trg])

# Keyword groups are kept separate: one describes where/how the archive is
# located and unpacked, the other carries the dtype specification.
from_csv_kws = dict(path=data_in, archived=True, remove_unpacked=True)
read_csv_kws = dict(dtype=data_types)

base_xwalk = nhgisxwalk.xwalk_df_from_csv(
    base_xwalk_name, **from_csv_kws, **read_csv_kws
)
base_xwalk.head()

Unnamed: 0,GJOIN1990,GJOIN2010,WEIGHT,PAREA_VIA_BLK00
0,G01000100201101A,G01000100201002004,0.000753,0.014284
1,G01000100201101A,G01000100201002005,0.04202,0.109618
2,G01000100201101A,G01000100201002006,0.262146,0.498133
3,G01000100201101A,G01000100201002016,0.237187,0.218109
4,G01000100201101A,G01000100201002023,0.099097,0.012864


### Create the state subset of the base-level crosswalk (for use in GH testing)

In [9]:
# Keep only the crosswalk rows whose target-year block ID carries the subset
# state's code in characters 1-2 (right after the leading "G").
# The vectorized `.str` slice replaces a per-row lambda and tolerates missing
# IDs: a NaN compares as False instead of raising a TypeError.
# `gj_trg` is the configured target ID column ("GJOIN2010" for a 2010 target).
in_subset_state = base_xwalk[gj_trg].str[1:3] == subset_state
# Boolean indexing already returns a new frame; renumber its rows from zero.
ss_base = base_xwalk[in_subset_state].reset_index(drop=True)

### Declare input variables
**not needed for creating a subset per se, but should be done regardless**

In [10]:
# Look up the 1990 "Total" code for each universe; the order must line up
# position-by-position with the short tags in `input_var_tags`.
input_vars = [
    nhgisxwalk.desc_code_1990[universe]["Total"]
    for universe in ("Persons", "Families", "Households", "Housing Units")
]
input_var_tags = ["pop", "fam", "hh", "hu"]

### Generate the desired crosswalk and subset down to the target state

In [11]:
# Build the crosswalk from 1990 block group parts ("bgp") to 2010 "tr" units
# from the national block-level crosswalk and the two summary files.
# NOTE(review): `stfips` presumably restricts the result to the subset state,
# and `keep_base`/`add_geoid` are passed through as-is -- confirm their
# semantics against the GeoCrossWalk documentation.
state_bgp1990tr2010 = nhgisxwalk.GeoCrossWalk(
    base_xwalk,
    source_year=source_year,
    target_year=target_year,
    source_geo="bgp",
    target_geo="tr",
    base_source_table=base_source_file,
    supp_source_table=supp_source_file,
    input_var=input_vars,
    weight_var=input_var_tags,
    keep_base=True,
    add_geoid=True,
    stfips=subset_state
)
# Release the national crosswalk, which is no longer referenced below.
# NOTE(review): re-running this cell on its own raises a NameError until the
# cell that reads `base_xwalk` has been run again.
del base_xwalk
state_bgp1990tr2010.xwalk

Unnamed: 0,bgp1990gj,tr2010gj,tr2010ge,wt_pop,wt_fam,wt_hh,wt_hu
0,G100001090444072500423009999999999921,G1000010043202,10001043202,1.0,1.0,1.0,1.0
1,G100001090444444300422009999999999926,G1000010042202,10001042202,1.0,1.0,1.0,1.0
2,G100001090444612650422009999999219011,G1000010041200,10001041200,0.0,0.0,0.0,0.0
3,G100001090444612650422009999999219011,G1000010042201,10001042201,1.0,1.0,1.0,1.0
4,G100001090444612650422009999999219012,G1000010042201,10001042201,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
1058,G100005093552999990515009999999999923,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1059,G100005093552999990515009999999999924,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1060,G100005093552999990516009999999999921,G1000050051702,10005051702,1.0,1.0,1.0,1.0
1061,G340033010610106000204029999999916014,G1000030990100,10003990100,0.0,0.0,0.0,0.0


In [12]:
# Display the crosswalk with repeated (source ID, target ID) pairs removed;
# the result is only shown, not assigned.
state_bgp1990tr2010.xwalk.drop_duplicates(subset=["bgp1990gj", "tr2010gj"])

Unnamed: 0,bgp1990gj,tr2010gj,tr2010ge,wt_pop,wt_fam,wt_hh,wt_hu
0,G100001090444072500423009999999999921,G1000010043202,10001043202,1.0,1.0,1.0,1.0
1,G100001090444444300422009999999999926,G1000010042202,10001042202,1.0,1.0,1.0,1.0
2,G100001090444612650422009999999219011,G1000010041200,10001041200,0.0,0.0,0.0,0.0
3,G100001090444612650422009999999219011,G1000010042201,10001042201,1.0,1.0,1.0,1.0
4,G100001090444612650422009999999219012,G1000010042201,10001042201,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
1058,G100005093552999990515009999999999923,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1059,G100005093552999990515009999999999924,G1000050051500,10005051500,1.0,1.0,1.0,1.0
1060,G100005093552999990516009999999999921,G1000050051702,10005051702,1.0,1.0,1.0,1.0
1061,G340033010610106000204029999999916014,G1000030990100,10003990100,0.0,0.0,0.0,0.0


In [13]:
# Number of distinct source (1990 block group part) IDs in the crosswalk.
state_bgp1990tr2010.xwalk["bgp1990gj"].nunique()

777

In [14]:
# Number of distinct target (2010) IDs in the crosswalk.
state_bgp1990tr2010.xwalk["tr2010gj"].nunique()

218

### Write out the state subset of the base-level crosswalk (for use in GH testing)

In [15]:
# Output location: the testing-data directory plus the crosswalk's file stem.
out_path = f"{data_out}{base_xwalk_name}"
# Hand the state subset to the packaging helper.
# NOTE(review): confirm what `remove=True` deletes after packaging.
nhgisxwalk.prepare_data_product(ss_base, base_xwalk_name, out_path, remove=True)

In [16]:
# Inspect the state subset of the base crosswalk (pandas truncates the display).
ss_base

Unnamed: 0,GJOIN1990,GJOIN2010,WEIGHT,PAREA_VIA_BLK00
0,G10000100401101,G10000100401001000,1.000000,1.000000
1,G10000100401102,G10000100401001001,0.921750,0.976774
2,G10000100401102,G10000100401001002,0.078219,0.023215
3,G10000100401102,G10000100401001003,0.000031,0.000012
4,G10000100401103,G10000100401001003,1.000000,1.000000
...,...,...,...,...
38292,,G10000509900000019,0.000000,0.000000
38293,,G10000509900000020,0.000000,0.000000
38294,,G10000509900000021,0.000000,0.000000
38295,,G10000509900000022,0.000000,0.000000


### Record, subset, and write out the 1990 BLKs (sf1) needed to create this subset

In [17]:
# Source block IDs present in the state subset; rows without a source ID are
# dropped while the original row labels are kept.
blk1990 = ss_base["GJOIN1990"].dropna()
blk1990

0         G10000100401101
1         G10000100401102
2         G10000100401102
3         G10000100401102
4         G10000100401103
               ...       
38143     G10000500519289
38144    G34003300204401A
38145     G34003300204418
38146     G34003300204419
38147     G34003300204420
Name: GJOIN1990, Length: 38148, dtype: object

In [18]:
# Number of crosswalk rows that carry a source block ID (repeats included).
blk1990.shape

(38148,)

In [19]:
# Number of distinct source block IDs among those rows.
blk1990.nunique()

15538

In [20]:
# read in base source file
base_source_df = pandas.read_csv(base_source_file, dtype=str)
base_source_df

Unnamed: 0,GISJOIN,YEAR,ANRCA,AIANHHA,RES_ONLYA,TRUSTA,RES_TRSTA,BLOCKA,BLCK_GRPA,TRACTA,...,STATE,STATEA,URBRURALA,URB_AREAA,CD103A,ANPSADPI,ET1001,EUD001,EUO001,ESA001
0,G01000100201101A,1990,99,9999,9999,9999,9,101A,1,0201,...,Alabama,01,1,5240,,Block 101A,332,91,107,112
1,G01000100201101B,1990,99,9999,9999,9999,9,101B,1,0201,...,Alabama,01,2,9999,,Block 101B,14,3,6,7
2,G01000100201102A,1990,99,9999,9999,9999,9,102A,1,0201,...,Alabama,01,1,5240,,Block 102A,248,74,88,89
3,G01000100201103,1990,99,9999,9999,9999,9,103,1,0201,...,Alabama,01,1,5240,,Block 103,49,15,15,16
4,G01000100201104,1990,99,9999,9999,9999,9,104,1,0201,...,Alabama,01,1,5240,,Block 104,12,3,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4934102,G56004509513577B,1990,99,9999,9999,9999,9,577B,5,9513,...,Wyoming,56,2,9999,,Block 577B,5,1,3,3
4934103,G56004509513578,1990,99,9999,9999,9999,9,578,5,9513,...,Wyoming,56,1,9999,,Block 578,59,20,21,22
4934104,G56004509513579,1990,99,9999,9999,9999,9,579,5,9513,...,Wyoming,56,1,9999,,Block 579,30,8,10,11
4934105,G56004509513580,1990,99,9999,9999,9999,9,580,5,9513,...,Wyoming,56,2,9999,,Block 580,35,9,11,11


In [21]:
# Retain only the block records referenced by the state subset and renumber
# the rows from zero.
base_source_df = base_source_df.loc[
    base_source_df["GISJOIN"].isin(blk1990.unique())
].reset_index(drop=True)
base_source_df

Unnamed: 0,GISJOIN,YEAR,ANRCA,AIANHHA,RES_ONLYA,TRUSTA,RES_TRSTA,BLOCKA,BLCK_GRPA,TRACTA,...,STATE,STATEA,URBRURALA,URB_AREAA,CD103A,ANPSADPI,ET1001,EUD001,EUO001,ESA001
0,G10000100401101,1990,99,9999,9999,9999,9,101,1,0401,...,Delaware,10,2,9999,,Block 101,24,6,8,9
1,G10000100401102,1990,99,9999,9999,9999,9,102,1,0401,...,Delaware,10,2,9999,,Block 102,145,39,50,58
2,G10000100401103,1990,99,9999,9999,9999,9,103,1,0401,...,Delaware,10,2,9999,,Block 103,75,23,27,29
3,G10000100401104,1990,99,9999,9999,9999,9,104,1,0401,...,Delaware,10,2,9999,,Block 104,69,19,21,22
4,G10000100401105,1990,99,9999,9999,9999,9,105,1,0401,...,Delaware,10,2,9999,,Block 105,2,0,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11935,G10000500519289,1990,99,9999,9999,9999,9,289,2,0519,...,Delaware,10,2,9999,,Block 289,8,1,6,6
11936,G34003300204401A,1990,99,9999,9999,9999,9,401A,4,0204,...,New Jersey,34,1,9160,,Block 401A,122,31,51,52
11937,G34003300204418,1990,99,9999,9999,9999,9,418,4,0204,...,New Jersey,34,1,9160,,Block 418,86,25,35,36
11938,G34003300204419,1990,99,9999,9999,9999,9,419,4,0204,...,New Jersey,34,1,9160,,Block 419,207,50,114,123


In [22]:
# Write the block subset as a zipped CSV (compression is inferred from the
# ".zip" suffix). `index=False` keeps the row index from being stored as an
# extra unnamed leading column, so the subset has the same columns as the
# table it was read from.
base_source_df.to_csv(
    "%s%s_block.csv.zip" % (data_out, source_year), index=False
)

### Record, subset, and write out the 1990 BGPs (sf1) needed to create this subset

In [23]:
# Source block group part IDs present in the state crosswalk; rows without a
# source ID are dropped while the original row labels are kept.
bgp1990 = state_bgp1990tr2010.xwalk["bgp1990gj"].dropna()
bgp1990

0       G100001090444072500423009999999999921
1       G100001090444444300422009999999999926
2       G100001090444612650422009999999219011
3       G100001090444612650422009999999219011
4       G100001090444612650422009999999219012
                        ...                  
1057    G100005093552999990515009999999999922
1058    G100005093552999990515009999999999923
1059    G100005093552999990515009999999999924
1060    G100005093552999990516009999999999921
1061    G340033010610106000204029999999916014
Name: bgp1990gj, Length: 1062, dtype: object

In [24]:
# read in supplement source file
supp_source_df = pandas.read_csv(supp_source_file, dtype=str)
supp_source_df

Unnamed: 0,GISJOIN,YEAR,ANRCA,AIANHH,AIANHHA,RES_ONLYA,TRUSTA,RES_TRSTA,BLOCKA,BLCK_GRPA,...,STATEA,URBRURALA,URB_AREA,URB_AREAA,CD103A,ANPSADPI,ET1001,EUD001,EUO001,ESA001
0,G0100010901710322002119999999999922,1990,99,,9999,9999,9999,9,,2,...,01,2,,9999,,BG 2 (pt.),402,105,144,167
1,G0100010901710322002119999999999923,1990,99,,9999,9999,9999,9,,3,...,01,2,,9999,,BG 3 (pt.),279,63,88,96
2,G0100010901719999902119999999999921,1990,99,,9999,9999,9999,9,,1,...,01,2,,9999,,BG 1,611,150,179,213
3,G0100010901719999902119999999999922,1990,99,,9999,9999,9999,9,,2,...,01,2,,9999,,BG 2 (pt.),1244,318,410,444
4,G0100010901719999902119999999999923,1990,99,,9999,9999,9999,9,,3,...,01,2,,9999,,BG 3 (pt.),447,126,166,197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367248,G5600450935209999995129999999999923,1990,99,,9999,9999,9999,9,,3,...,56,2,,9999,,BG 3,44,12,13,21
367249,G5600450935209999995129999999999924,1990,99,,9999,9999,9999,9,,4,...,56,2,,9999,,BG 4,42,12,18,22
367250,G5600450935209999995129999999999925,1990,99,,9999,9999,9999,9,,5,...,56,2,,9999,,BG 5,29,6,9,15
367251,G5600450935209999995129999999999926,1990,99,,9999,9999,9999,9,,6,...,56,2,,9999,,BG 6 (pt.),30,8,10,15


In [25]:
# GISJOIN ID correction
bgpidcols = nhgisxwalk.id_codes.code_cols("bgp", "1990")
supp_source_df = nhgisxwalk.id_codes.bgp_gj(supp_source_df, bgpidcols, cname="__GISJOIN")
supp_source_df

Unnamed: 0,GISJOIN,YEAR,ANRCA,AIANHH,AIANHHA,RES_ONLYA,TRUSTA,RES_TRSTA,BLOCKA,BLCK_GRPA,...,URBRURALA,URB_AREA,URB_AREAA,CD103A,ANPSADPI,ET1001,EUD001,EUO001,ESA001,__GISJOIN
0,G0100010901710322002119999999999922,1990,99,,9999,9999,9999,9,,2,...,2,,9999,,BG 2 (pt.),402,105,144,167,G010001090171032200211039999999999922
1,G0100010901710322002119999999999923,1990,99,,9999,9999,9999,9,,3,...,2,,9999,,BG 3 (pt.),279,63,88,96,G010001090171032200211039999999999923
2,G0100010901719999902119999999999921,1990,99,,9999,9999,9999,9,,1,...,2,,9999,,BG 1,611,150,179,213,G010001090171999990211039999999999921
3,G0100010901719999902119999999999922,1990,99,,9999,9999,9999,9,,2,...,2,,9999,,BG 2 (pt.),1244,318,410,444,G010001090171999990211039999999999922
4,G0100010901719999902119999999999923,1990,99,,9999,9999,9999,9,,3,...,2,,9999,,BG 3 (pt.),447,126,166,197,G010001090171999990211039999999999923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367248,G5600450935209999995129999999999923,1990,99,,9999,9999,9999,9,,3,...,2,,9999,,BG 3,44,12,13,21,G560045093520999999512009999999999923
367249,G5600450935209999995129999999999924,1990,99,,9999,9999,9999,9,,4,...,2,,9999,,BG 4,42,12,18,22,G560045093520999999512009999999999924
367250,G5600450935209999995129999999999925,1990,99,,9999,9999,9999,9,,5,...,2,,9999,,BG 5,29,6,9,15,G560045093520999999512009999999999925
367251,G5600450935209999995129999999999926,1990,99,,9999,9999,9999,9,,6,...,2,,9999,,BG 6 (pt.),30,8,10,15,G560045093520999999512009999999999926


In [26]:
# Retain only the block group part records whose corrected ID occurs in the
# state crosswalk; copy so later edits do not act on a view of the full table.
supp_source_df = supp_source_df.loc[
    supp_source_df["__GISJOIN"].isin(bgp1990.unique())
].copy()
supp_source_df

Unnamed: 0,GISJOIN,YEAR,ANRCA,AIANHH,AIANHHA,RES_ONLYA,TRUSTA,RES_TRSTA,BLOCKA,BLCK_GRPA,...,URBRURALA,URB_AREA,URB_AREAA,CD103A,ANPSADPI,ET1001,EUD001,EUO001,ESA001,__GISJOIN
57044,G1000010904440725004239999999999921,1990,99,,9999,9999,9999,9,,1,...,2,,9999,,BG 1 (pt.),179,48,83,168,G100001090444072500423009999999999921
57045,G1000010904444443004229999999999926,1990,99,,9999,9999,9999,9,,6,...,2,,9999,,BG 6 (pt.),211,62,81,86,G100001090444444300422009999999999926
57046,G1000010904446126504229999999219011,1990,99,,9999,9999,9999,9,,1,...,1,"Dover, DE",2190,,BG 1 (pt.),1848,490,615,694,G100001090444612650422009999999219011
57047,G1000010904446126504229999999219012,1990,99,,9999,9999,9999,9,,2,...,1,"Dover, DE",2190,,BG 2 (pt.),67,17,21,22,G100001090444612650422009999999219012
57048,G1000010904446148004229999999999924,1990,99,,9999,9999,9999,9,,4,...,2,,9999,,BG 4 (pt.),924,261,307,315,G100001090444614800422009999999999924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57816,G1000050935529999905159999999999922,1990,99,,9999,9999,9999,9,,2,...,2,,9999,,BG 2 (pt.),1127,316,399,446,G100005093552999990515009999999999922
57817,G1000050935529999905159999999999923,1990,99,,9999,9999,9999,9,,3,...,2,,9999,,BG 3 (pt.),228,74,99,108,G100005093552999990515009999999999923
57818,G1000050935529999905159999999999924,1990,99,,9999,9999,9999,9,,4,...,2,,9999,,BG 4,808,214,269,310,G100005093552999990515009999999999924
57819,G1000050935529999905169999999999921,1990,99,,9999,9999,9999,9,,1,...,2,,9999,,BG 1,1100,323,401,445,G100005093552999990516009999999999921


In [27]:
# Remove the temporary matching column and renumber the rows from zero, so
# the subset keeps the columns of the original file.
supp_source_df = (
    supp_source_df
    .drop(columns=["__GISJOIN"])
    .reset_index(drop=True)
)
supp_source_df

Unnamed: 0,GISJOIN,YEAR,ANRCA,AIANHH,AIANHHA,RES_ONLYA,TRUSTA,RES_TRSTA,BLOCKA,BLCK_GRPA,...,STATEA,URBRURALA,URB_AREA,URB_AREAA,CD103A,ANPSADPI,ET1001,EUD001,EUO001,ESA001
0,G1000010904440725004239999999999921,1990,99,,9999,9999,9999,9,,1,...,10,2,,9999,,BG 1 (pt.),179,48,83,168
1,G1000010904444443004229999999999926,1990,99,,9999,9999,9999,9,,6,...,10,2,,9999,,BG 6 (pt.),211,62,81,86
2,G1000010904446126504229999999219011,1990,99,,9999,9999,9999,9,,1,...,10,1,"Dover, DE",2190,,BG 1 (pt.),1848,490,615,694
3,G1000010904446126504229999999219012,1990,99,,9999,9999,9999,9,,2,...,10,1,"Dover, DE",2190,,BG 2 (pt.),67,17,21,22
4,G1000010904446148004229999999999924,1990,99,,9999,9999,9999,9,,4,...,10,2,,9999,,BG 4 (pt.),924,261,307,315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,G1000050935529999905159999999999922,1990,99,,9999,9999,9999,9,,2,...,10,2,,9999,,BG 2 (pt.),1127,316,399,446
773,G1000050935529999905159999999999923,1990,99,,9999,9999,9999,9,,3,...,10,2,,9999,,BG 3 (pt.),228,74,99,108
774,G1000050935529999905159999999999924,1990,99,,9999,9999,9999,9,,4,...,10,2,,9999,,BG 4,808,214,269,310
775,G1000050935529999905169999999999921,1990,99,,9999,9999,9999,9,,1,...,10,2,,9999,,BG 1,1100,323,401,445


In [28]:
# Write the block group part subset as a zipped CSV (compression is inferred
# from the ".zip" suffix). `index=False` keeps the row index from being stored
# as an extra unnamed leading column, so the subset has the same columns as
# the table it was read from.
supp_source_df.to_csv(
    "%s%s_blck_grp_598_103.csv.zip" % (data_out, source_year), index=False
)

-----------------