In [1]:
import sys
sys.path.append("../")

import csv
import pandas as pd
import numpy as np
import json
from itertools import repeat, chain
import utils
from utils import pad_with_zeros, zeros

In [2]:
## Update this to wherever your data directory is that contains that downloaded files referenced in the README
## of this repo
data_directory = "data"

## Prep - Scoping to just most populous counties

The following file of county populations is from this site: https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/

In [3]:
# Read the recent populations estimates for all counties in the US
population = pd.read_csv(f"{data_directory}/co-est2019-alldata.csv", header=0, encoding='ISO-8859-1')

In [4]:
# Get counties with population greater than 100K - this can be changed and is just to narrow our initial scope of data
pop = population[population['POPESTIMATE2019'] > 100000]

# Filter out entries that are not cities
pop =  pop[pop['CTYNAME'].str.contains('County')]

# Create unique state and place code for merge
pop['STATE_COUNTY'] = pop.apply(lambda x: f"{x['STATE']}:{x['COUNTY']}", axis=1)

# Generate a list of valid states and place
filter_pop = pop['STATE_COUNTY'].to_list()

In [5]:
# Save the unique state and county pairs that we will be querying from the ACS
with open(f'{data_directory}/state_county.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(list(map(lambda x: x.split(":"), filter_pop)))

## Running to Collect ACS Data using Census API

In [6]:
# Lets set up our requests session. You can use teh creaet session utility or just pass the requests object
session = utils.create_session()

states_and_counties_fp = "{}/state_county.csv".format(data_directory)

# Read in the states and places we will be querying
states_and_counties = pd.read_csv(states_and_counties_fp).values

In [7]:
## These codes and endpoints are from viewing the Census API documentation

## B03002_001E - Total population
## B03002_012E - Total population Hispanic origin
## B03002_003E - Total population white and not Hispanic
## B03002_004E - Total population black and not Hispanic
## B03002_005E - Total population Native American and not Hispanic
## B03002_006E - Total population Asian and not Hispanic

## B01002_001E - Median Age
## B01002_002E - Median Male Age

## DP03_0062E - Median Income
## DP04_0037E - Median Rooms
## DP02_0001E - # of houses

## B06009_001E - Total population 
## B06009_002E - Less than high school graduate
## B06009_003E - Just high school graduate
## B06009_005E - bachelors
## B06009_006E - graduate studies +


codes = {'acs/acs5':[ "B01003_001E", "B01002_002E", "B01002_001E", "B02001_002E", "B06009_001E",
                    "B06009_002E", "B06009_003E", "B06009_005E", "B06009_006E", "B03002_001E", "B03002_012E", 
                     "B03002_004E", "B03002_003E", "B03002_006E", "B03002_005E", "B03002_002E"],
        'acs/acs5/profile' : ["DP02_0001E", "DP04_0037E", "DP03_0062E"]} 
acs_endpoints = ['acs/acs5', 'acs/acs5/profile']


df = utils.gather_results(session, acs_endpoints, utils.config, states_and_counties, codes, dfs=[], start=0)


1 15
Status: 0.17%
1 55
Status: 0.35%
1 69
Status: 0.52%
1 73
Status: 0.70%
1 81
Status: 0.87%
1 89
Status: 1.05%
1 97
Status: 1.22%
1 101
Status: 1.40%
1 103
Status: 1.57%
1 117
Status: 1.75%
1 125
Status: 1.92%
4 3
Status: 2.10%
4 5
Status: 2.27%
4 13
Status: 2.45%
4 15
Status: 2.62%
4 17
Status: 2.80%
4 19
Status: 2.97%
4 21
Status: 3.15%
4 25
Status: 3.32%
4 27
Status: 3.50%
5 7
Status: 3.67%
5 31
Status: 3.85%
5 45
Status: 4.02%
5 119
Status: 4.20%
5 125
Status: 4.37%
5 131
Status: 4.55%
5 143
Status: 4.72%
6 1
Status: 4.90%
6 7
Status: 5.07%
6 13
Status: 5.24%
6 17
Status: 5.42%
6 19
Status: 5.59%
6 23
Status: 5.77%
6 25
Status: 5.94%
6 29
Status: 6.12%
6 31
Status: 6.29%
6 37
Status: 6.47%
6 39
Status: 6.64%
6 41
Status: 6.82%
6 47
Status: 6.99%
6 53
Status: 7.17%
6 55
Status: 7.34%
6 59
Status: 7.52%
6 61
Status: 7.69%
6 65
Status: 7.87%
6 67
Status: 8.04%
6 71
Status: 8.22%
6 73
Status: 8.39%
6 75
Status: 8.57%
6 77
Status: 8.74%
6 79
Status: 8.92%
6 81
Status: 9.09%
6 83
Stat

Status: 69.06%
39 109
Status: 69.23%
39 113
Status: 69.41%
39 133
Status: 69.58%
39 139
Status: 69.76%
39 151
Status: 69.93%
39 153
Status: 70.10%
39 155
Status: 70.28%
39 165
Status: 70.45%
39 169
Status: 70.63%
39 173
Status: 70.80%
40 17
Status: 70.98%
40 27
Status: 71.15%
40 31
Status: 71.33%
40 109
Status: 71.50%
40 143
Status: 71.68%
41 5
Status: 71.85%
41 17
Status: 72.03%
41 19
Status: 72.20%
41 29
Status: 72.38%
41 39
Status: 72.55%
41 43
Status: 72.73%
41 47
Status: 72.90%
41 51
Status: 73.08%
41 67
Status: 73.25%
41 71
Status: 73.43%
42 1
Status: 73.60%
42 3
Status: 73.78%
42 7
Status: 73.95%
42 11
Status: 74.13%
42 13
Status: 74.30%
42 17
Status: 74.48%
42 19
Status: 74.65%
42 21
Status: 74.83%
42 27
Status: 75.00%
42 29
Status: 75.17%
42 41
Status: 75.35%
42 43
Status: 75.52%
42 45
Status: 75.70%
42 49
Status: 75.87%
42 51
Status: 76.05%
42 55
Status: 76.22%
42 69
Status: 76.40%
42 71
Status: 76.57%
42 75
Status: 76.75%
42 77
Status: 76.92%
42 79
Status: 77.10%
42 81
Statu

In [8]:
df.columns

Index(['NAME', 'B01003_001E', 'B01002_002E', 'B01002_001E', 'B02001_002E',
       'B06009_001E', 'B06009_002E', 'B06009_003E', 'B06009_005E',
       'B06009_006E', 'B03002_001E', 'B03002_012E', 'B03002_004E',
       'B03002_003E', 'state', 'county', 'tract', 'DP02_0001E', 'DP04_0037E',
       'DP03_0062E'],
      dtype='object')

In [8]:
df.to_csv("census_tracts_hisp.csv", index= False)

In [9]:
df.shape

(54450, 23)

## Merging Data by Tract - Start here if already collected ACS data

In [10]:
census_tracts = pd.read_csv("census_tracts_hisp.csv")
census_tracts

Unnamed: 0,NAME,B01003_001E,B01002_002E,B01002_001E,B02001_002E,B06009_001E,B06009_002E,B06009_003E,B06009_005E,B06009_006E,...,B03002_003E,B03002_006E,B03002_005E,B03002_002E,state,county,tract,DP02_0001E,DP04_0037E,DP03_0062E
0,"Census Tract 20, Calhoun County, Alabama",7331,39.2,39.8,6926,5131,387,1758,598,481,...,6806,32,17,7211,1,15,2000,2735,6.0,66217
1,"Census Tract 8, Calhoun County, Alabama",981,39.2,41.7,419,752,204,278,43,35,...,418,2,0,980,1,15,800,423,5.1,27656
2,"Census Tract 9, Calhoun County, Alabama",3617,40.9,44.3,2476,2610,266,638,459,448,...,2464,9,7,3593,1,15,900,1528,6.4,50810
3,"Census Tract 14, Calhoun County, Alabama",3105,43.7,47.1,2502,2343,618,922,46,54,...,2480,0,0,3054,1,15,1400,1178,5.7,42083
4,"Census Tract 21.02, Calhoun County, Alabama",3350,39.4,34.8,2576,2320,80,382,521,629,...,2563,0,0,3310,1,15,2102,1312,6.5,69875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54445,"Census Tract 14, Winnebago County, Wisconsin",3928,41.6,44.2,3661,2864,232,1234,326,123,...,3581,8,0,3828,55,139,1400,1827,5.4,47156
54446,"Census Tract 34, Winnebago County, Wisconsin",4606,34.5,35.2,3913,2987,211,1277,426,135,...,3861,54,25,4370,55,139,3400,1819,5.2,47628
54447,"Census Tract 11, Winnebago County, Wisconsin",4200,37.0,43.4,3687,2871,162,1140,458,153,...,3614,343,7,4127,55,139,1100,1990,4.7,35446
54448,"Census Tract 20, Winnebago County, Wisconsin",7192,41.8,42.4,7091,4909,220,1715,1080,338,...,6883,69,6,6978,55,139,2000,2886,6.0,68457


In [11]:
#Some of the data needs a geoid
#GEOID is defined as: STATE+COUNTY+TRACT
#See: https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html
def create_geoid(row):
    geoid = str(utils.pad_with_zero(row["state"])) + str(utils.pad_place_with_zero(row["county"])) + str(utils.pad_tract_with_zero(row["tract"]))
    return int(geoid)

census_tracts["GEOID"] = census_tracts.apply(create_geoid, axis=1)

### Other data files

In [12]:
census_tracts.shape

(54450, 24)

In [13]:
land_area = pd.read_csv("data/2019_Gaz_tracts_national.txt", sep="\t")
land_area

Unnamed: 0,USPS,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
0,AL,1001020100,9817813,28435,3.791,0.011,32.481959,-86.491338
1,AL,1001020200,3325680,5669,1.284,0.002,32.475758,-86.472468
2,AL,1001020300,5349273,9054,2.065,0.003,32.474024,-86.459703
3,AL,1001020400,6384276,8408,2.465,0.003,32.471030,-86.444835
4,AL,1001020500,11408866,43534,4.405,0.017,32.458922,-86.421826
...,...,...,...,...,...,...,...,...
73996,PR,72153750501,1820185,0,0.703,0.000,18.031211,-66.867347
73997,PR,72153750502,689930,0,0.266,0.000,18.024746,-66.860442
73998,PR,72153750503,3298433,1952,1.274,0.001,18.023148,-66.876603
73999,PR,72153750601,10987037,4527,4.242,0.002,18.017809,-66.839070


In [14]:
land_area.columns = ['USPS', 'GEOID', 'ALAND', 'AWATER', 'ALAND_SQMI', 'AWATER_SQMI','INTPTLAT','INTPTLONG']                                                                                              

In [15]:
land_area.columns

Index(['USPS', 'GEOID', 'ALAND', 'AWATER', 'ALAND_SQMI', 'AWATER_SQMI',
       'INTPTLAT', 'INTPTLONG'],
      dtype='object')

In [16]:
census_tracts["GEOID"]

0         1015002000
1         1015000800
2         1015000900
3         1015001400
4         1015002102
            ...     
54445    55139001400
54446    55139003400
54447    55139001100
54448    55139002000
54449    55139002100
Name: GEOID, Length: 54450, dtype: int64

In [17]:
tracts_with_land = census_tracts.merge(land_area[["GEOID","ALAND", "ALAND_SQMI", "INTPTLAT", "INTPTLONG"]], how = "left", on = "GEOID")
tracts_with_land.shape

(54450, 28)

In [18]:
#This is by block group so may need to align on whether to average or not?
epa = pd.read_csv("data/EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv")
epa

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,4.811300e+11,4.811300e+11,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.000000,3110.360820,2.978361e+05
1,2,4.811300e+11,4.811300e+11,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.469110,4.849451e+05
2,3,4.811300e+11,4.811300e+11,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,1.067059e+05
3,4,4.811300e+11,4.811300e+11,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,4.818284e+05
4,5,4.811300e+11,4.811300e+11,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,6.876848e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220735,220736,7.803100e+11,7.803100e+11,78,30,961000,2,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,20,1,7.333333,3414.446949,3.355857e+05
220736,220737,7.803100e+11,7.803100e+11,78,30,961000,3,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,20,1,7.333333,2421.025608,2.924305e+05
220737,220738,7.803100e+11,7.803100e+11,78,30,961000,5,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,20,1,7.333333,1955.909418,1.619395e+05
220738,220739,7.803100e+11,7.803100e+11,78,30,960700,3,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,10,1,4.000000,16896.768870,1.038966e+07


In [19]:
epa["TRACTCE"].value_counts()

950100    1199
950200    1164
950300    1107
200       1049
300       1042
          ... 
864307       1
45116        1
188700       1
16712        1
212305       1
Name: TRACTCE, Length: 23969, dtype: int64

In [20]:
#EPA only has the 10 & 20 digit geoids and I want an 11 digit geoid
#Let's use the same process as earlier in the notebook
#Copying the comments from there below here too:

#Some of the data needs a geoid
#GEOID is defined as: STATE+COUNTY+TRACT
#See: https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html

def create_geoid_epa(row):
    geoid = str(utils.pad_with_zero(row["STATEFP"])) + str(utils.pad_place_with_zero(row["COUNTYFP"])) + str(utils.pad_tract_with_zero(row["TRACTCE"]))
    return int(geoid)

epa["GEOID"] = epa.apply(create_geoid_epa, axis=1)
epa

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area,GEOID
0,1,4.811300e+11,4.811300e+11,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000476,0.137707,6,14,15,17,14.000000,3110.360820,2.978361e+05,48113007825
1,2,4.811300e+11,4.811300e+11,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000801,0.231868,3,10,12,14,10.833333,3519.469110,4.849451e+05,48113007825
2,3,4.811300e+11,4.811300e+11,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,1.067059e+05,48113007825
3,4,4.811300e+11,4.811300e+11,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,4.818284e+05,48113007824
4,5,4.811300e+11,4.811300e+11,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,6.876848e+05,48113007824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220735,220736,7.803100e+11,7.803100e+11,78,30,961000,2,,,,...,-99999.000000,-99999.000000,1,1,20,1,7.333333,3414.446949,3.355857e+05,78030961000
220736,220737,7.803100e+11,7.803100e+11,78,30,961000,3,,,,...,-99999.000000,-99999.000000,1,1,20,1,7.333333,2421.025608,2.924305e+05,78030961000
220737,220738,7.803100e+11,7.803100e+11,78,30,961000,5,,,,...,-99999.000000,-99999.000000,1,1,20,1,7.333333,1955.909418,1.619395e+05,78030961000
220738,220739,7.803100e+11,7.803100e+11,78,30,960700,3,,,,...,-99999.000000,-99999.000000,1,1,10,1,4.000000,16896.768870,1.038966e+07,78030960700


In [21]:
#Merge epa with census_tracts
#TODO: How do we want to define walkability? It's currently defined by blockgroup? Do we find average?
tracts_with_epa= census_tracts.merge(epa[["GEOID","NatWalkInd"]], how = "inner", on = "GEOID")
print("After merge, the shape is:", tracts_with_epa.shape)

# The shape includes additional rows since the epa data is provided by census block group (there are sometimes a few
# block groups within a tract)

# If we groupby GEOID after the merge and then take the mean of the walkability index, that should give us a column
tracts_with_epa.groupby("GEOID")["NatWalkInd"].agg("mean")

After merge, the shape is: (159160, 25)


GEOID
1015000200      8.250000
1015000300      9.750000
1015000400      5.277778
1015000500      7.833333
1015000600      6.416667
                 ...    
55139003400    11.625000
55139003500     8.000000
55139003600     8.611111
55139003701     7.944444
55139003702     6.472222
Name: NatWalkInd, Length: 54450, dtype: float64

In [22]:
#Let's update the tracts_with_land df with this new index
tracts_with_land_walk = tracts_with_land.merge(tracts_with_epa.groupby("GEOID")["NatWalkInd"].agg("mean"), on="GEOID")
tracts_with_land_walk

Unnamed: 0,NAME,B01003_001E,B01002_002E,B01002_001E,B02001_002E,B06009_001E,B06009_002E,B06009_003E,B06009_005E,B06009_006E,...,tract,DP02_0001E,DP04_0037E,DP03_0062E,GEOID,ALAND,ALAND_SQMI,INTPTLAT,INTPTLONG,NatWalkInd
0,"Census Tract 20, Calhoun County, Alabama",7331,39.2,39.8,6926,5131,387,1758,598,481,...,2000,2735,6.0,66217,1015002000,239320450,92.402,33.738267,-85.681293,3.066667
1,"Census Tract 8, Calhoun County, Alabama",981,39.2,41.7,419,752,204,278,43,35,...,800,423,5.1,27656,1015000800,3607490,1.393,33.645722,-85.827015,11.500000
2,"Census Tract 9, Calhoun County, Alabama",3617,40.9,44.3,2476,2610,266,638,459,448,...,900,1528,6.4,50810,1015000900,12114697,4.678,33.644908,-85.796520,5.166667
3,"Census Tract 14, Calhoun County, Alabama",3105,43.7,47.1,2502,2343,618,922,46,54,...,1400,1178,5.7,42083,1015001400,21093280,8.144,33.659900,-85.889541,5.291667
4,"Census Tract 21.02, Calhoun County, Alabama",3350,39.4,34.8,2576,2320,80,382,521,629,...,2102,1312,6.5,69875,1015002102,20789534,8.027,33.835620,-85.738289,7.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54445,"Census Tract 14, Winnebago County, Wisconsin",3928,41.6,44.2,3661,2864,232,1234,326,123,...,1400,1827,5.4,47156,55139001400,6286797,2.427,43.991282,-88.560126,12.583333
54446,"Census Tract 34, Winnebago County, Wisconsin",4606,34.5,35.2,3913,2987,211,1277,426,135,...,3400,1819,5.2,47628,55139003400,3913783,1.511,44.184596,-88.478399,11.625000
54447,"Census Tract 11, Winnebago County, Wisconsin",4200,37.0,43.4,3687,2871,162,1140,458,153,...,1100,1990,4.7,35446,55139001100,2698989,1.042,44.019767,-88.571056,14.666667
54448,"Census Tract 20, Winnebago County, Wisconsin",7192,41.8,42.4,7091,4909,220,1715,1080,338,...,2000,2886,6.0,68457,55139002000,182238158,70.363,43.988283,-88.716849,6.100000


In [23]:
schools = pd.read_excel("data/GRF19/grf19_lea_tract.xlsx")
schools.rename(columns={"TRACT" : "GEOID"}, inplace=True)
schools[:10]

Unnamed: 0,LEAID,NAME_LEA19,GEOID,COUNT,LANDAREA,WATERAREA
0,100001,Fort Rucker School District,1031010300,2,23.428498,0.0
1,100001,Fort Rucker School District,1045020000,2,66.513225,1.081745
2,100003,Maxwell AFB School District,1101000900,3,3.35659,0.143795
3,100003,Maxwell AFB School District,1101001000,3,0.001526,0.0
4,100003,Maxwell AFB School District,1101006000,3,0.003588,0.0
5,100005,Albertville City School District,1095030701,8,2.116012,0.0
6,100005,Albertville City School District,1095030702,8,0.800889,0.00993
7,100005,Albertville City School District,1095030801,8,5.437714,0.012075
8,100005,Albertville City School District,1095030802,8,5.297315,0.017784
9,100005,Albertville City School District,1095030902,8,7.293396,0.033274


In [24]:
tracts_with_schools = tracts_with_land.merge(schools, how = "left", on = "GEOID")
tracts_with_schools.shape

(76380, 33)

In [25]:
#Some geoids map to multiple school districts so let's aggregate them all into a list
tracts_with_schools.groupby("GEOID")["NAME_LEA19"].agg(list)

GEOID
1015000200                       [Anniston City School District]
1015000300                       [Anniston City School District]
1015000400     [Anniston City School District, Calhoun County...
1015000500     [Anniston City School District, Calhoun County...
1015000600     [Anniston City School District, Calhoun County...
                                     ...                        
55139003400                             [Neenah School District]
55139003500                             [Neenah School District]
55139003600                             [Neenah School District]
55139003701                             [Neenah School District]
55139003702                             [Neenah School District]
Name: NAME_LEA19, Length: 54450, dtype: object

In [26]:
tracts_with_land_walk_schools = tracts_with_land_walk.merge(tracts_with_schools.groupby("GEOID")["NAME_LEA19"].agg(list), on="GEOID")
tracts_with_land_walk_schools

Unnamed: 0,NAME,B01003_001E,B01002_002E,B01002_001E,B02001_002E,B06009_001E,B06009_002E,B06009_003E,B06009_005E,B06009_006E,...,DP02_0001E,DP04_0037E,DP03_0062E,GEOID,ALAND,ALAND_SQMI,INTPTLAT,INTPTLONG,NatWalkInd,NAME_LEA19
0,"Census Tract 20, Calhoun County, Alabama",7331,39.2,39.8,6926,5131,387,1758,598,481,...,2735,6.0,66217,1015002000,239320450,92.402,33.738267,-85.681293,3.066667,"[Calhoun County School District, Jacksonville ..."
1,"Census Tract 8, Calhoun County, Alabama",981,39.2,41.7,419,752,204,278,43,35,...,423,5.1,27656,1015000800,3607490,1.393,33.645722,-85.827015,11.500000,[Anniston City School District]
2,"Census Tract 9, Calhoun County, Alabama",3617,40.9,44.3,2476,2610,266,638,459,448,...,1528,6.4,50810,1015000900,12114697,4.678,33.644908,-85.796520,5.166667,"[Anniston City School District, Calhoun County..."
3,"Census Tract 14, Calhoun County, Alabama",3105,43.7,47.1,2502,2343,618,922,46,54,...,1178,5.7,42083,1015001400,21093280,8.144,33.659900,-85.889541,5.291667,[Calhoun County School District]
4,"Census Tract 21.02, Calhoun County, Alabama",3350,39.4,34.8,2576,2320,80,382,521,629,...,1312,6.5,69875,1015002102,20789534,8.027,33.835620,-85.738289,7.166667,"[Calhoun County School District, Jacksonville ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54445,"Census Tract 14, Winnebago County, Wisconsin",3928,41.6,44.2,3661,2864,232,1234,326,123,...,1827,5.4,47156,55139001400,6286797,2.427,43.991282,-88.560126,12.583333,[Oshkosh Area School District]
54446,"Census Tract 34, Winnebago County, Wisconsin",4606,34.5,35.2,3913,2987,211,1277,426,135,...,1819,5.2,47628,55139003400,3913783,1.511,44.184596,-88.478399,11.625000,[Neenah School District]
54447,"Census Tract 11, Winnebago County, Wisconsin",4200,37.0,43.4,3687,2871,162,1140,458,153,...,1990,4.7,35446,55139001100,2698989,1.042,44.019767,-88.571056,14.666667,[Oshkosh Area School District]
54448,"Census Tract 20, Winnebago County, Wisconsin",7192,41.8,42.4,7091,4909,220,1715,1080,338,...,2886,6.0,68457,55139002000,182238158,70.363,43.988283,-88.716849,6.100000,"[Omro School District, Oshkosh Area School Dis..."


In [27]:
tracts_with_land_walk_schools.to_csv(f"{data_directory}/merged_data.csv", index=False)

## Next let's manipulate some of the variables to create new columns to be what we want to compare

In [38]:
# Let's rename our big df
#Indices to keep as is: NatWalkInd, ALAND, AWATER, 
df = pd.read_csv(f"{data_directory}/merged_data.csv")
new_columns = ["population_density", "housing_density", "median_age", "median_male_age", "pct_white", "median_income",
              "school_districts", "pct_lt_hs_grad", "median_rooms", "pct_hs_grad", "pct_at_least_col_grad", "GEOID",
               "state", "tract", "NAME", "NatWalkInd", "ALAND", "ALAND_SQMI", "county", "pct_black", "pct_hisp",
              "pct_asian", "pct_native", "pct_not_hispanic"]

In [39]:
df.columns

Index(['NAME', 'B01003_001E', 'B01002_002E', 'B01002_001E', 'B02001_002E',
       'B06009_001E', 'B06009_002E', 'B06009_003E', 'B06009_005E',
       'B06009_006E', 'B03002_001E', 'B03002_012E', 'B03002_004E',
       'B03002_003E', 'B03002_006E', 'B03002_005E', 'B03002_002E', 'state',
       'county', 'tract', 'DP02_0001E', 'DP04_0037E', 'DP03_0062E', 'GEOID',
       'ALAND', 'ALAND_SQMI', 'INTPTLAT', 'INTPTLONG', 'NatWalkInd',
       'NAME_LEA19'],
      dtype='object')

In [40]:
#Columns to be renamed
df = df.rename(columns={"B01002_001E" : "median_age" , "B01002_002E" : "median_male_age" ,
                    "DP03_0062E" : "median_income",  "NAME_LEA19" : "school_districts" , 
                        "DP04_0037E" : "median_rooms"})

In [41]:
def safe_division(x, y):
    try:
        return x / y
    except:
        return 0

In [42]:
#Columns to be created
df["population_density"] = df.apply(lambda x: safe_division(float(x['B01003_001E']),float(x["ALAND_SQMI"])), axis=1)
df["housing_density"] = df.apply(lambda x: safe_division(float(x['DP02_0001E']),float(x["ALAND_SQMI"])), axis=1)
df["pct_white"] = df.apply(lambda x: safe_division(float(x['B03002_003E']),float(x["B03002_001E"])), axis=1)
df["pct_hisp"] = df.apply(lambda x: safe_division(float(x['B03002_012E']),float(x["B03002_001E"])), axis=1)
df["pct_black"] = df.apply(lambda x: safe_division(float(x['B03002_004E']),float(x["B03002_001E"])), axis=1)
df["pct_asian"] = df.apply(lambda x: safe_division(float(x['B03002_006E']),float(x["B03002_001E"])), axis=1)
df["pct_not_hispanic"] = df.apply(lambda x: safe_division(float(x['B03002_002E']),float(x["B03002_001E"])), axis=1)
df["pct_native"] = df.apply(lambda x: safe_division(float(x['B03002_005E']),float(x["B03002_001E"])), axis=1)

df["pct_lt_hs_grad"] = df.apply(lambda x: safe_division(float(x['B06009_002E']),float(x["B06009_001E"])), axis=1)
df["pct_hs_grad"] = df.apply(lambda x: safe_division(float(x['B06009_003E']),float(x["B06009_001E"])), axis=1)
df["pct_at_least_col_grad"] = df.apply(lambda x: safe_division((float(x["B06009_005E"]) + float(x["B06009_006E"])),float(x["B06009_001E"])), axis=1)


In [43]:
df.columns

Index(['NAME', 'B01003_001E', 'median_male_age', 'median_age', 'B02001_002E',
       'B06009_001E', 'B06009_002E', 'B06009_003E', 'B06009_005E',
       'B06009_006E', 'B03002_001E', 'B03002_012E', 'B03002_004E',
       'B03002_003E', 'B03002_006E', 'B03002_005E', 'B03002_002E', 'state',
       'county', 'tract', 'DP02_0001E', 'median_rooms', 'median_income',
       'GEOID', 'ALAND', 'ALAND_SQMI', 'INTPTLAT', 'INTPTLONG', 'NatWalkInd',
       'school_districts', 'population_density', 'housing_density',
       'pct_white', 'pct_hisp', 'pct_black', 'pct_asian', 'pct_not_hispanic',
       'pct_native', 'pct_lt_hs_grad', 'pct_hs_grad', 'pct_at_least_col_grad'],
      dtype='object')

In [44]:
new_df = df[new_columns]
new_df.columns

Index(['population_density', 'housing_density', 'median_age',
       'median_male_age', 'pct_white', 'median_income', 'school_districts',
       'pct_lt_hs_grad', 'median_rooms', 'pct_hs_grad',
       'pct_at_least_col_grad', 'GEOID', 'state', 'tract', 'NAME',
       'NatWalkInd', 'ALAND', 'ALAND_SQMI', 'county', 'pct_black', 'pct_hisp',
       'pct_asian', 'pct_native', 'pct_not_hispanic'],
      dtype='object')

In [45]:
new_df.to_csv(f"{data_directory}/tracts_data_hisp.csv", index=False)