In [1]:
import pandas as pd
import numpy as np

# Notes about the Data
* Data is from the Geocorr 2018 tool provided by the Missouri Census Data Center (MCDC): https://mcdc.missouri.edu/applications/geocorr2018.html


Data columns:
* zcta5 = 5-digit ZCTA (ZIP Code Tabulation Area), used here as the zip code
* state = state code
* puma12 = PUMA (Public Use Microdata Area) code, using the 2012 PUMA delineation (for use with 2010+ Census data)
* intptlon = Wtd centroid W longitude, degrees
* intptlat = Wtd centroid latitude, degrees
* hus10 = # of housing units in the region as defined by 2010 census
* afact = decimal portion of the source area contained in the target area (i.e. how much of the zip is in that puma)
* AFACT2 = decimal portion of the target area contained in the source area (i.e. how much of the puma is in that zip); note that this column name is upper-case in the file

In [4]:
# Load the zip (zcta5) -> PUMA crosswalk.
# * zcta5 is passed through `str` so the codes are kept as text instead of
#   being parsed as numbers.
# * skiprows=[1] drops the line directly below the header -- presumably the
#   descriptive label row that Geocorr exports; TODO confirm against the CSV.
pumas = pd.read_csv(
    "../data/zips_to_pumas.csv",
    converters={"zcta5": str},
    skiprows=[1],
)
pumas

Unnamed: 0,zcta5,state,puma12,stab,zipname,PUMAname,intptlon,intptlat,hus10,afact,AFACT2
0,38769,5,1800,AR,"Rosedale, MS",Southeast Arkansas,-91.024254,33.794609,1,1.000000,0.000020
1,65729,5,300,AR,"Pontiac, MO","Baxter, Boone, Carroll, Marion, Madison, Newto...",-92.614367,36.481570,45,1.000000,0.000567
2,65733,5,300,AR,"Protem, MO","Baxter, Boone, Carroll, Marion, Madison, Newto...",-92.820994,36.481048,297,1.000000,0.003742
3,65761,5,300,AR,"Theodosia, MO","Baxter, Boone, Carroll, Marion, Madison, Newto...",-92.691525,36.483711,99,1.000000,0.001247
4,71601,5,1700,AR,"Pine Bluff, AR","Jefferson, Grant & Arkansas (Northwest) Counties",-91.991864,34.210894,7411,1.000000,0.160432
...,...,...,...,...,...,...,...,...,...,...,...
705,99999,5,700,AR,99999,"St. Francis, Poinsett, Phillips, Cross, Lee & ...",-91.030931,34.219337,1,0.038462,0.000021
706,99999,5,1500,AR,99999,"Logan, Polk, Franklin, Sevier, Howard & Scott ...",-94.080488,34.345128,6,0.230769,0.000129
707,99999,5,1800,AR,99999,Southeast Arkansas,-91.620018,33.460740,2,0.076923,0.000041
708,99999,5,1900,AR,99999,South Central Arkansas,-92.575733,33.682207,1,0.038462,0.000019


In [5]:
# Load the zip (zcta5) -> census tract / block group crosswalk.
# * zcta5 is passed through `str` so the codes are kept as text instead of
#   being parsed as numbers.
# * skiprows=[1] drops the line directly below the header -- presumably the
#   descriptive label row that Geocorr exports; TODO confirm against the CSV.
tracts = pd.read_csv(
    "../data/zips_to_census.csv",
    converters={"zcta5": str},
    skiprows=[1],
)
tracts

Unnamed: 0,zcta5,county,tract,bg,cntyname,zipname,pop10,afact
0,38769,5041,9501.00,1,Desha AR,"Rosedale, MS",2,1.0000
1,65729,5089,9603.00,1,Marion AR,"Pontiac, MO",36,1.0000
2,65733,5009,7902.00,2,Boone AR,"Protem, MO",5,0.0230
3,65733,5089,9603.00,4,Marion AR,"Protem, MO",212,0.9770
4,65761,5089,9603.00,4,Marion AR,"Theodosia, MO",93,1.0000
...,...,...,...,...,...,...,...,...
3645,72959,5143,111.03,2,Washington AR,"Winslow, AR",610,0.2190
3646,75556,5091,210.00,8,Miller AR,"Bloomburg, TX",26,1.0000
3647,99999,5093,110.00,4,Mississippi AR,99999,3,0.5000
3648,99999,5093,112.00,2,Mississippi AR,99999,2,0.3333


## Exploring # of zips in PUMA vs. Tracts dataset

In [8]:
# Number of distinct zip codes (zcta5) in the PUMA crosswalk: 592
np.unique(pumas["zcta5"]).shape

(592,)

In [35]:
# Number of distinct PUMA codes (puma12) in the crosswalk: only 20
np.unique(pumas["puma12"]).shape

(20,)

In [37]:
# Number of distinct zip codes (zcta5) in the tract crosswalk
# (596 here, versus 592 in the PUMA crosswalk)
np.unique(tracts["zcta5"]).shape

(596,)

In [38]:
# Number of distinct values in the `tract` column.
# NOTE(review): census tract codes are normally unique only within a county,
# so this may undercount actual tracts -- confirm whether (county, tract)
# pairs should be counted instead.
np.unique(tracts["tract"]).shape

(425,)

In [31]:
def set_dcount(row):
    """Return the number of elements in ``row``.

    Despite the parameter name, the notebook applies this to a single cell
    value (the array of unique codes for one zip code), not to a DataFrame
    row, so the result is the distinct count for that zip.

    Parameters
    ----------
    row : any object supporting ``len()``

    Returns
    -------
    int
        ``len(row)``.
    """
    n_items = len(row)
    return n_items

In [33]:
## How many distinct PUMAs does each zip code fall into?

## One row per zip code (index = zcta5), holding the array of distinct
## PUMA codes that the zip maps to
counts = pumas.groupby("zcta5")["puma12"].unique().to_frame()

## Move zcta5 out of the index so it becomes a regular column
puma_map = counts.reset_index()

## The length of each array is the number of distinct PUMAs for that zip
puma_map["dcount_pumas"] = puma_map["puma12"].map(len)

## Distribution: how many zip codes map to 1, 2, 3, ... PUMAs
puma_map.dcount_pumas.value_counts()

1    490
2     90
3     10
4      1
6      1
Name: dcount_pumas, dtype: int64

* Most of the zip codes map to only one PUMA (490)
* 90 of the zip codes map to 2 PUMAs
* 10 of the zip codes map to 3 PUMAs
* 1 zip code maps to 4 PUMAs and 1 zip code maps to 6 PUMAs


In [34]:
# Zip codes that span two or more PUMAs
puma_map.loc[puma_map["dcount_pumas"] >= 2]

Unnamed: 0,zcta5,puma12,dcount_pumas
6,71603,"[1700, 1800]",2
15,71644,"[1700, 1800]",2
60,71764,"[1900, 2000]",2
63,71770,"[1900, 2000]",2
83,71846,"[1500, 2000]",2
...,...,...,...
577,72944,"[1400, 1500]",2
580,72947,"[1400, 1500]",2
582,72949,"[1300, 1500]",2
589,72959,"[200, 1400, 1500]",3


In [39]:
## For each zip code, how many census tracts map to it?
## NOTE(review): this counts distinct values of the `tract` column only.
## Tract codes are normally unique only within a county, so a zip that spans
## counties sharing a tract code would be undercounted -- confirm whether the
## count should be over (county, tract) pairs instead.

## One row per zip code (index = zcta5), holding the array of distinct
## tract codes that the zip maps to
counts = tracts.groupby("zcta5")["tract"].unique().to_frame()

## Move zcta5 out of the index so it becomes a regular column
tract_map = counts.reset_index()

## The `tract` column now contains an array of unique tract codes per zip;
## its length is the number of distinct tract codes for that zip
tract_map["dcount_tracts"] = tract_map["tract"].map(len)

## Distribution: how many zip codes map to 1, 2, 3, ... tract codes
tract_map.dcount_tracts.value_counts()

1     207
2     151
3      76
4      56
5      31
7      14
6      13
8      13
10     12
12      8
11      4
9       3
16      3
13      2
15      2
20      1
Name: dcount_tracts, dtype: int64