In [1]:
import sys
sys.path.append("../")

import csv
import pandas as pd
import numpy as np
import json
from itertools import repeat, chain
import utils
from utils import pad_with_zeros, zeros

## Running to Collect ACS Data using Census API

In [2]:
# Lets set up our requests session. You can use teh creaet session utility or just pass the requests object
session = utils.create_session()


In [3]:
alameda = np.array([[6, 1]]) #6 is State of California, 1 is Alameda county
alameda

array([[6, 1]])

In [4]:
## These codes and endpoints are from viewing the Census API documentation
## 

codes = {'acs/acs5':[ "B03002_001E","B03002_002E","B03002_003E","B03002_004E","B03002_005E", "B03002_006E",
                    "B03002_007E", "B03002_008E", "B03002_009E", "B03002_010E", "B03002_011E", "B03002_012E",
                    "B03002_013E", "B03002_014E"],
        'acs/acs5/subject' : ["S0601_C01_047E", "S1501_C01_006E", "S1501_C01_014E", "S1501_C01_015E", 
                              "S2801_C01_001E", "S2801_C01_002E", "S2801_C01_012E"]}
acs_endpoints = ['acs/acs5', 'acs/acs5/subject'] 


df = utils.gather_results(session, acs_endpoints, utils.config, alameda, codes, dfs=[], start=0)


Status: 100.00%


In [5]:
df.columns

Index(['NAME_x', 'B03002_001E', 'B03002_002E', 'B03002_003E', 'B03002_004E',
       'B03002_005E', 'B03002_006E', 'B03002_007E', 'B03002_008E',
       'B03002_009E', 'B03002_010E', 'B03002_011E', 'B03002_012E',
       'B03002_013E', 'B03002_014E', 'state', 'county', 'tract', 'block group',
       'NAME_y', 'S0601_C01_047E', 'S1501_C01_006E', 'S1501_C01_014E',
       'S1501_C01_015E', 'S2801_C01_001E', 'S2801_C01_002E', 'S2801_C01_012E'],
      dtype='object')

In [6]:
df

Unnamed: 0,NAME_x,B03002_001E,B03002_002E,B03002_003E,B03002_004E,B03002_005E,B03002_006E,B03002_007E,B03002_008E,B03002_009E,...,tract,block group,NAME_y,S0601_C01_047E,S1501_C01_006E,S1501_C01_014E,S1501_C01_015E,S2801_C01_001E,S2801_C01_002E,S2801_C01_012E
0,"Block Group 1, Census Tract 4421, Alameda Coun...",2188,2154,212,78,5,1790,5,0,64,...,442100,1,"Census Tract 4421, Alameda County, California",92228,3836,3590,2868,1630,1564,1550
1,"Block Group 2, Census Tract 4421, Alameda Coun...",3274,3254,519,0,24,2609,25,11,66,...,442100,2,"Census Tract 4421, Alameda County, California",92228,3836,3590,2868,1630,1564,1550
2,"Block Group 4, Census Tract 4422, Alameda Coun...",1201,1109,384,57,18,600,9,0,41,...,442200,4,"Census Tract 4422, Alameda County, California",68202,5249,5072,3490,2267,2240,2157
3,"Block Group 3, Census Tract 4422, Alameda Coun...",1544,1524,344,0,0,1171,0,0,9,...,442200,3,"Census Tract 4422, Alameda County, California",68202,5249,5072,3490,2267,2240,2157
4,"Block Group 2, Census Tract 4422, Alameda Coun...",2296,2204,488,0,0,1656,0,0,60,...,442200,2,"Census Tract 4422, Alameda County, California",68202,5249,5072,3490,2267,2240,2157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042,"Block Group 1, Census Tract 4501.01, Alameda C...",5750,5286,1837,255,22,2715,10,0,447,...,450101,1,"Census Tract 4501.01, Alameda County, California",85547,3958,3817,2996,2168,2168,2073
1043,"Block Group 1, Census Tract 4226, Alameda Coun...",1043,914,403,32,0,459,0,0,20,...,422600,1,"Census Tract 4226, Alameda County, California",4831,282,282,268,26,21,17
1044,"Block Group 1, Census Tract 4301.02, Alameda C...",2424,2126,1402,41,0,469,17,20,177,...,430102,1,"Census Tract 4301.02, Alameda County, California",68049,1908,1842,1045,900,895,877
1045,"Block Group 0, Census Tract 9900, Alameda Coun...",0,0,0,0,0,0,0,0,0,...,990000,0,"Census Tract 9900, Alameda County, California",-666666666,0,0,0,0,0,0


## Cleaning up Census Data

1. Identify missing values and change to NA
2. Calculate the percentages from the total variables
3. Merge with the lat/long/addresses dataset
4. Merge with redlining data from : https://github.com/americanpanorama/Census_HOLC_Research/tree/main/2010_Census_Tracts

In [7]:
#Identify missing values and change to NA
for name in df.columns:
    for row in range(len(df[name])):
        if str(df.loc[row, name])==str(-666666666):
            print(df.loc[row, name])
            df.loc[row, name] = np.nan
            


-666666666
-666666666


In [8]:
#Remove unnecessary columns
df.drop(columns=['NAME_y','state','county'],inplace=True)

In [10]:
def safe_division(x, y):
    try:
        return x / y
    except:
        return 0

In [16]:
##Calculating percentages from totals Census data
##Following variable codes noted here: https://docs.google.com/document/d/1xjMTwoRWXRhH0ncSYHBVcabCEi7ZDx-vF32qZtZl5zU/edit

#Columns to be created
df["pct_white"] = df.apply(lambda x: safe_division(float(x['B03002_003E']),float(x["B03002_001E"])), axis=1)
df['pct_black'] = df.apply(lambda x: safe_division(float(x['B03002_004E']),float(x["B03002_001E"])), axis=1)
df['pct_native'] = df.apply(lambda x: safe_division(float(x['B03002_005E']),float(x["B03002_001E"])), axis=1)
df['pct_asian'] = df.apply(lambda x: safe_division(float(x['B03002_006E']),float(x["B03002_001E"])), axis=1)
df['pct_hisp'] = df.apply(lambda x: safe_division(float(x['B03002_012E']),float(x["B03002_001E"])), axis=1)
df['pct_other'] = df.apply(lambda x: safe_division(float(x['B03002_007E'])+float(x['B03002_008E'])+float(x['B03002_009E'])+float(x['B03002_010E'])+float(x['B03002_011E']),float(x["B03002_001E"])), axis=1)

df['median_income'] = df['S0601_C01_047E']

df["pct_college"] = df.apply(lambda x: safe_division(float(x['S1501_C01_015E']),float(x["S1501_C01_006E"])), axis=1)
df["pct_hs_grad"] = df.apply(lambda x: safe_division(float(x['S1501_C01_014E']),float(x["S1501_C01_006E"])), axis=1)

df['pct_device_access'] = df.apply(lambda x: safe_division(float(x['S2801_C01_002E']),float(x["S2801_C01_001E"])), axis=1)
df['pct_internet_access'] = df.apply(lambda x: safe_division(float(x['S2801_C01_012E']),float(x["S2801_C01_001E"])), axis=1)


In [17]:
df.columns

Index(['NAME_x', 'B03002_001E', 'B03002_002E', 'B03002_003E', 'B03002_004E',
       'B03002_005E', 'B03002_006E', 'B03002_007E', 'B03002_008E',
       'B03002_009E', 'B03002_010E', 'B03002_011E', 'B03002_012E',
       'B03002_013E', 'B03002_014E', 'tract', 'block group', 'S0601_C01_047E',
       'S1501_C01_006E', 'S1501_C01_014E', 'S1501_C01_015E', 'S2801_C01_001E',
       'S2801_C01_002E', 'S2801_C01_012E', 'pct_white', 'pct_black',
       'pct_native', 'pct_asian', 'pct_hisp', 'pct_other', 'median_income',
       'pct_college', 'pct_hs_grad', 'pct_device_access',
       'pct_internet_access'],
      dtype='object')

### Creating a GEOID column

In [18]:
#Create a geoid column

#Some of the data needs a geoid
#GEOID is defined as: STATE+COUNTY+TRACT+BlOCKGP
#See: https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html
def create_geoid(row):
    geoid = "06" + "001" + str(utils.pad_tract_with_zero(row["tract"]))+str(row["block group"])
    return int(geoid)

df["GEOID"] = df.apply(create_geoid, axis=1)

In [26]:
def create_geoid_tract(row):
    geoid = "06" + "001" + str(utils.pad_tract_with_zero(row["tract"]))
    return int(geoid)

df["GEOID_tract"] = df.apply(create_geoid_tract, axis=1)

In [27]:
df["GEOID_tract"]

0       6001442100
1       6001442100
2       6001442200
3       6001442200
4       6001442200
           ...    
1042    6001450101
1043    6001422600
1044    6001430102
1045    6001990000
1046    6001402900
Name: GEOID_tract, Length: 1047, dtype: int64

In [28]:
df.to_csv("acs_demographic_data.csv")

### Merging the Redlining Data