## Advanced GIS: Interactive Web Mapping
#### Final Project | 3/31/2022
**Purpose**: clean and combine housing choice voucher data and neighborhood tabulation geographies for visualization

In [9]:
# Packages and custom functions
import numpy as np
import pandas as pd
import re
import os
import geojson
import geopandas as gpd
import requests as r

def get_county(x):
    c = re.findall('NY New York [\d]{3} (.* County)',x)
    if len(c) > 0:
        return(c[0])
    else:
        return(None)

**Source**: https://www.huduser.gov/portal/datasets/assthsg.html#2009-2021_data, 2021 data

**Documentation**: https://www.huduser.gov/portal/datasets/pictures/dictionary_2021.pdf

**Definition of Missing values**
Some cell entries across variables report no data or are suppressed. In such cases
one of the following codes will apply to such missing values in the downloadable file
"NA" = Not applicable
"-1" = Missing
"-4" = Suppressed (where the cell entry is less than 11 for reported families)
"-5" = Non-reporting (where reporting rates--see % Reported--are less than 50%) 

In [None]:
# Read in voucher data
dat = pd.read_excel('TRACT_MO_WY_2021.xlsx')

# Add fields for filtering, joining, and checking accuracy

## Filter to NY State
dat = dat.loc[dat.states=='NY New York']

## Create county, census tract, and boro fields
dat['county'] = dat['entities'].apply(get_county)
dat['census_tract'] = dat['code'].apply(lambda x: int(x[5:]) if re.match('\d{5}',x) else None)
boros = {"Kings County":3,
        "Queens County":4,
        "Bronx County":2,
        "New York County":1,
        "Richmond County":5}

dat["borocode"] = dat["county"].replace(boros)

## Create aggregate fields for units and occupied for quality checks
dat['total_occupied'] = dat['total_units']*(dat['pct_occupied']/100)
dat['diff_occupied_reported'] = dat['total_occupied'] - dat['number_reported']

In [148]:
## Filter to just NYC
cut = dat.loc[dat.county.isin([
    'Kings County',
    'Queens County',
    'Bronx County',
    'Richmond County',
    'New York County'
])]

# Filter to just HCV
hcv = cut.loc[cut.program_label=='Housing Choice Vouchers',
              ['program_label','county','borocode','census_tract','number_reported','people_total']]

# Replace -4 missing data code with None
hcv.replace(to_replace = -4, value = None, inplace = True)

# Make sure HCV cut is unique on borough and census tract
check = hcv.groupby(['borocode','census_tract']).aggregate({'program_label':'count'})
assert len(check.loc[check.program_label>1]) == 0, "Error! Data is not unique on borough and census tract"

# Group HCV by borough and census tract
hcv = hcv.groupby(['program_label','borocode','county','census_tract']).\
    aggregate({'number_reported':'max','people_total':'max'}).\
    reset_index()

hcv.head()

Unnamed: 0,program_label,borocode,county,census_tract,number_reported,people_total
0,Housing Choice Vouchers,1,New York County,201.0,26.0,34.0
1,Housing Choice Vouchers,1,New York County,202.0,92.0,103.0
2,Housing Choice Vouchers,1,New York County,600.0,234.0,492.0
3,Housing Choice Vouchers,1,New York County,700.0,,
4,Housing Choice Vouchers,1,New York County,800.0,96.0,126.0


In [None]:
# Check data accuracy
assert len(hcv.loc[hcv.total_units<hcv.total_occupied])==0, "Error! Total occupied units greater than total available units"
assert len(hcv.loc[hcv.total_occupied<hcv.number_reported])==0, "Error! Total occupied units less than reported units"
hcv.loc[hcv.avg_hh_size!=hcv.people_per_unit,['number_reported','people_total','avg_hh_size','people_per_unit']]

**Source and Documentation**: https://github.com/vr00n/NYC-LocalGeo-CrossWalk

**Data Source Definition** Crosswalk of all NYC geographies for mapping census tract to community district. Note: last update was 2017, suggesting these are 2010 census definitions--however, this might need to be updated in the future.

In [48]:
# Read census tract to CD mapping table
xwalk = pd.read_csv("https://github.com/vr00n/NYC-LocalGeo-CrossWalk/raw/master/MASTER-CROSSWALK-NAD83.csv")

# Combine and aggregate hcv data at community district level
hcv_cd = hcv.merge(xwalk,how='left',left_on='census_tract', right_on='CT2010')
hcv_cd_g = hcv_cd.groupby(["program_label","BoroName","BoroCD","NTAName"]).aggregate({
    "number_reported":'sum',
    'people_total':'sum'}).\
    sort_values('number_reported',ascending=False).\
    reset_index()

hcv_cd_g['avg_hh_size'] = hcv_cd_g.apply(lambda x: round(x['people_total']/x['number_reported'],1) if x['number_reported'] >0 else None, axis=1)

hcv_cd_g.to_csv('grouped_hcv_data.csv')

In [188]:
# Get records that are in HCV dataset and match to multiple CDs in xwalk
check = xwalk.groupby(["BoroCode","CT2010"]).\
    aggregate({"BoroCD":"nunique"})

check.loc[check.BoroCD>1].\
    reset_index().merge(
        hcv[['census_tract','borocode']],
        how='inner',
        left_on=['BoroCode','CT2010'],
        right_on=['borocode','census_tract']).\
    drop_duplicates()

Unnamed: 0,BoroCode,CT2010,BoroCD,census_tract,borocode
0,1,202,2,202.0,1
1,1,700,2,700.0,1
2,1,900,2,900.0,1
3,1,1501,2,1501.0,1
4,1,1502,2,1502.0,1
...,...,...,...,...,...
437,5,18902,2,18902.0,5
438,5,27301,2,27301.0,5
439,5,27900,2,27900.0,5
440,5,29102,2,29102.0,5


In [172]:
# Example problem record
xwalk.loc[xwalk.CT2010==38302,["BoroCode","CT2010","BoroCD"]].drop_duplicates().sort_values('BoroCode')
hcv.loc[hcv.census_tract==38302]

Unnamed: 0,program_label,borocode,county,census_tract,number_reported,people_total
484,Housing Choice Vouchers,2,Bronx County,38302.0,593.0,1449.0


In [173]:
pumas = pd.read_csv("2010_Census_Tract_to_2010_PUMA.csv")

In [183]:
nypumas = pumas.loc[(pumas.STATEFP==36)&
                    (pumas.COUNTYFP.isin([5,47,61,81,85]))]

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,PUMA5CE
43283,36,5,26701,3701
43284,36,5,26702,3701
43285,36,5,27300,3701
43286,36,5,27700,3701
43287,36,5,27900,3701
...,...,...,...,...
47145,36,85,30301,3903
47146,36,85,30302,3903
47147,36,85,31901,3903
47148,36,85,31902,3903


In [38]:
# Read census tract to NTA mapping table
tabs = pd.read_excel("nyc2010census_tabulation_equiv.xlsx",header=3)

# Combine and aggregate hcv data at community district level
hcv_nta = hcv.merge(tabs,how='left',left_on='census_tract', right_on='2010 Census Tract')
hcv_nta_g = hcv_nta.groupby(["program_label","census_tract","Name","Borough"]).aggregate({
    "number_reported":'sum',
    'people_total':'sum'}).\
    sort_values('number_reported',ascending=False).\
    reset_index()

hcv_nta_g['avg_hh_size'] = hcv_nta_g.apply(lambda x: round(x['people_total']/x['number_reported'],1) if x['number_reported'] >0 else None, axis=1)

In [47]:
print(len(hcv),len(hcv_nta_g))
hcv_nta_g.loc[hcv_nta_g['Name']=="Longwood"]

1882 2018


Unnamed: 0,program_label,census_tract,Name,Borough,number_reported,people_total,avg_hh_size
140,Housing Choice Vouchers,8700.0,Longwood,Bronx,431,768,1.8
313,Housing Choice Vouchers,13100.0,Longwood,Bronx,281,531,1.9
383,Housing Choice Vouchers,8300.0,Longwood,Bronx,241,504,2.1
435,Housing Choice Vouchers,8500.0,Longwood,Bronx,215,369,1.7
471,Housing Choice Vouchers,12901.0,Longwood,Bronx,197,409,2.1


**Source and documentation**: https://www1.nyc.gov/site/planning/data-maps/open-data/census-download-metadata.page

**Source and Documentation**: https://data.cityofnewyork.us/City-Government/Community-Districts/yfnk-k7r4

**Data Source Definition:** NYC Open Data Portal GeoJSON file for community districts, including boroughCD id for mapping to crosswalk

In [6]:
# Read in and inspect data
cd_json = gpd.read_file("https://data.cityofnewyork.us/api/geospatial/yfnk-k7r4?method=export&format=GeoJSON")
cd_json.head()

Unnamed: 0,boro_cd,shape_area,shape_leng,geometry
0,206,42664311.5086,35875.7117328,"MULTIPOLYGON (((-73.87185 40.84376, -73.87192 ..."
1,404,65739661.969,37018.3738392,"MULTIPOLYGON (((-73.84751 40.73901, -73.84801 ..."
2,304,56662613.1611,37007.806599,"MULTIPOLYGON (((-73.89647 40.68234, -73.89653 ..."
3,205,38316975.1292,29443.0481287,"MULTIPOLYGON (((-73.89138 40.86170, -73.89142 ..."
4,207,53311689.1,44812.1474256,"MULTIPOLYGON (((-73.87519 40.87158, -73.87619 ..."


In [16]:
# Read in and inspect data
nta_json = gpd.read_file("https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_Neighborhood_Tabulation_Areas_2010/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=pgeojson")
nta_json.head()

Unnamed: 0,OBJECTID,BoroCode,BoroName,CountyFIPS,NTACode,NTAName,Shape__Area,Shape__Length,geometry
0,1,4,Queens,81,QN51,Murray Hill,52488280.0,33266.904856,"POLYGON ((-73.80379 40.77562, -73.80099 40.775..."
1,2,4,Queens,81,QN27,East Elmhurst,19726950.0,19816.684463,"POLYGON ((-73.86110 40.76367, -73.85993 40.762..."
2,3,4,Queens,81,QN41,Fresh Meadows-Utopia,27774850.0,22106.431272,"POLYGON ((-73.77758 40.73020, -73.77849 40.729..."
3,4,4,Queens,81,QN08,St. Albans,77412760.0,45401.225138,"POLYGON ((-73.75205 40.70524, -73.75175 40.704..."
4,5,3,Brooklyn,47,BK69,Clinton Hill,20528220.0,23971.451296,"POLYGON ((-73.95337 40.68065, -73.95328 40.680..."


In [20]:
# Merge with HCV attributes and write to geojson
viz = nta_json.merge(hcv_nta_g,
               how='inner',
               left_on='NTAName',
               right_on='Name')

viz = viz.loc[viz.number_reported > 0,
              ['BoroName','NTAName','geometry','number_reported','people_total','avg_hh_size']]

viz.to_file("hcv_dat.geojson")

  pd.Int64Index,
