# Looking at where incidents are occurring: find census tracts for our lat/lngs
Are pipelines that were installed after 2010 breaking in areas with higher population density than pipelines installed before 2010?

In [1]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
# Lift pandas' display limits so frames are shown with every column and every row.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)



In [2]:
# Load the cleaned pipeline-incident file and list its columns.
incidents_csv_path = '../data/processed/pipeline_incidents_2010_present_all_CLEAN.csv'
df_raw = pd.read_csv(incidents_csv_path)
df_raw.columns

Index(['datafile_as_of', 'ff', 'significant', 'serious', 'report_number',
       'supplemental_number', 'report_received_date', 'report_type',
       'operator_id', 'name', 'operator_street_address', 'operator_city_name',
       'operator_state_abbreviation', 'operator_postal_code', 'local_datetime',
       'time_zone', 'daylight_savings_ind', 'iyear', 'location_street_address',
       'location_city_name', 'location_county_name',
       'location_state_abbreviation', 'location_postal_code',
       'location_latitude', 'location_longitude', 'total_cost',
       'total_cost_current', 'injury_ind', 'injure', 'num_pub_evacuated',
       'fatal', 'cause', 'cause_details', 'material_involved',
       'material_details', 'narrative', 'unintentional_release',
       'installation_year'],
      dtype='object')

In [3]:
# number of incident rows loaded
df_raw.shape[0]

7499

In [4]:
# The geocoder links are assembled by plain string concatenation further down,
# so spaces in the address parts are swapped for '+' here (overwrites df_raw in place).
for address_col in ('location_street_address', 'location_city_name', 'location_county_name'):
    df_raw[address_col] = df_raw[address_col].str.replace(' ', '+')
df_raw.head(1)

Unnamed: 0,datafile_as_of,ff,significant,serious,report_number,supplemental_number,report_received_date,report_type,operator_id,name,operator_street_address,operator_city_name,operator_state_abbreviation,operator_postal_code,local_datetime,time_zone,daylight_savings_ind,iyear,location_street_address,location_city_name,location_county_name,location_state_abbreviation,location_postal_code,location_latitude,location_longitude,total_cost,total_cost_current,injury_ind,injure,num_pub_evacuated,fatal,cause,cause_details,material_involved,material_details,narrative,unintentional_release,installation_year
0,2/28/22,NO,YES,NO,20100001,15047,3/11/10,SUPPLEMENTAL FINAL,15007,PACIFIC GAS & ELECTRIC CO,"PG&E - GAS OPERATIONS, REGULATORY COMPLIANCE 6...",SAN RAMON,CA,94583,2/13/10 23:35,,,2010,1617+EAST+9TH+STREET,STOCKTON,SAN+JOAQUIN,CA,95201,37.93188,-121.26133,102500,124764.082311,NO,0,0.0,0,OTHER OUTSIDE FORCE DAMAGE,OTHER OUTSIDE FORCE DAMAGE,OTHER,ALUMINUN,A FIRE AT AN UNOCCUPIED HOME OCCURRED AT APPRO...,10.0,


In [5]:
# how many rows have no street address at all
df_raw['location_street_address'].isna().sum()

6192

In [6]:
# keep only the rows where street, city and county are all present (new dataframe, df_raw is untouched)
required_address_columns = ['location_street_address', 'location_city_name', 'location_county_name']
df = df_raw.dropna(subset=required_address_columns)

In [7]:
# rows left after dropping incomplete addresses
df.shape[0]

1265

In [8]:
# TESTING OUTPUT USING SAMPLE FROM: https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf

# url = 'https://geocoding.geo.census.gov/geocoder/geographies/address?street=4600+Silver+Hill+Rd&city=Washington&state=DC&benchmark=Public_AR_Census2020&vintage=Census2020_Census2020&layers=10&format=json'

# response = requests.get(url)
# data = response.json()

# data

In [9]:
# create API links for each row
GEOCODER_ENDPOINT = 'https://geocoding.geo.census.gov/geocoder/geographies/address'
GEOCODER_OPTIONS = '&benchmark=Public_AR_Census2020&vintage=Census2020_Census2020&layers=10&format=json'


def _encode_query_values(values):
    """Percent-encode each value of a column for use inside a query string.

    '+' is left alone because spaces in the address columns were already turned
    into '+'; everything else that is reserved in a URL ('&', '#', '%', '/', ...)
    is escaped so it cannot break the query apart. Missing values stay missing.
    """
    return values.map(lambda value: quote(str(value), safe='+'), na_action='ignore')


# The string concatenation is vectorised: one expression builds the link for
# every row of df, so no per-row loop is needed. Rows with a missing state end
# up with NaN instead of a link.
links = (
    GEOCODER_ENDPOINT
    + '?street=' + _encode_query_values(df['location_street_address'])
    + '&city=' + _encode_query_values(df['location_city_name'])
    + '&state=' + _encode_query_values(df['location_state_abbreviation'])
    + GEOCODER_OPTIONS
)

# Kept as a one-element list because the request cells read the links from urls_list[0].
urls_list = [links]

print(len(urls_list[0]))
# .iloc[0] is the first link by position; a plain [0] would look up the index label 0
print(urls_list[0].iloc[0])

1265
https://geocoding.geo.census.gov/geocoder/geographies/address?street=1617+EAST+9TH+STREET&city=STOCKTON&state=CA&benchmark=Public_AR_Census2020&vintage=Census2020_Census2020&layers=10&format=json


In [10]:
# request data using the api (first batch: positions 0-399)
data_requests = []
for url in urls_list[0].iloc[:400]:
    if not isinstance(url, str):
        # a missing address part leaves NaN instead of a link -- nothing to request
        continue
    try:
        # timeout so one stalled connection cannot hang the whole batch
        response = requests.get(url, timeout=30)
        # treat HTTP error responses as failures instead of storing their body
        response.raise_for_status()
        data_entry = response.json()
    except (requests.RequestException, ValueError) as err:
        # network problem, HTTP error or a body that is not JSON: report it and move on
        print(f'skipped {url}: {err}')
        continue
    data_requests.append(data_entry)

len(data_requests)

397

In [11]:
# next 400 (positions 400-799)
# Slice ends are exclusive, so the batch has to start at 400 -- starting at 401
# would silently skip the link at position 400.
# Note: this appends to data_requests, so re-running the cell adds duplicates.
for url in urls_list[0].iloc[400:800]:
    if not isinstance(url, str):
        # a missing address part leaves NaN instead of a link -- nothing to request
        continue
    try:
        # timeout so one stalled connection cannot hang the whole batch
        response = requests.get(url, timeout=30)
        # treat HTTP error responses as failures instead of storing their body
        response.raise_for_status()
        data_entry = response.json()
    except (requests.RequestException, ValueError) as err:
        # network problem, HTTP error or a body that is not JSON: report it and move on
        print(f'skipped {url}: {err}')
        continue
    data_requests.append(data_entry)

len(data_requests)

796

In [12]:
# and the last section (position 800 to the end)
# Slice ends are exclusive, so the batch has to start at 800 -- starting at 801
# would silently skip the link at position 800. The open-ended slice avoids
# hard-coding the number of rows.
# Note: this appends to data_requests, so re-running the cell adds duplicates.
for url in urls_list[0].iloc[800:]:
    if not isinstance(url, str):
        # a missing address part leaves NaN instead of a link -- nothing to request
        continue
    try:
        # timeout so one stalled connection cannot hang the whole batch
        response = requests.get(url, timeout=30)
        # treat HTTP error responses as failures instead of storing their body
        response.raise_for_status()
        data_entry = response.json()
    except (requests.RequestException, ValueError) as err:
        # network problem, HTTP error or a body that is not JSON: report it and move on
        print(f'skipped {url}: {err}')
        continue
    data_requests.append(data_entry)

len(data_requests)

1256

In [13]:
# peek at the first stored response to see how the geocoder's JSON is laid out
first_response = data_requests[0]
first_response

{'result': {'input': {'benchmark': {'id': '2020',
    'benchmarkName': 'Public_AR_Census2020',
    'benchmarkDescription': 'Public Address Ranges - Census 2020 Benchmark',
    'isDefault': False},
   'vintage': {'id': '2020',
    'vintageName': 'Census2020_Census2020',
    'vintageDescription': 'Census 2020 Vintage - Census 2020 Benchmark',
    'isDefault': True},
   'address': {'street': '1617 EAST 9TH STREET',
    'city': 'STOCKTON',
    'state': 'CA'}},
  'addressMatches': [{'matchedAddress': '1617 E 9TH ST, STOCKTON, CA, 95206',
    'coordinates': {'x': -121.261536, 'y': 37.931587},
    'tigerLine': {'tigerLineId': '133852977', 'side': 'L'},
    'addressComponents': {'fromAddress': '1601',
     'toAddress': '1699',
     'preQualifier': '',
     'preDirection': 'E',
     'preType': '',
     'streetName': '9TH',
     'suffixType': 'ST',
     'suffixDirection': '',
     'suffixQualifier': '',
     'city': 'STOCKTON',
     'state': 'CA',
     'zip': '95206'},
    'geographies': {'Censu

In [14]:
# extract the sections we need into a list of dictionaries to be later converted to a dataframe
#
# Each row carries the address echoed back by the geocoder plus, when the
# address was matched, the identifiers of its census block. Lookups use .get()
# so that a response with an unexpected shape (for example an error payload
# with no 'result') cannot abort the whole extraction with a KeyError.
data = []
for entry in data_requests:
    result = entry.get('result') if isinstance(entry, dict) else None
    if not result:
        # not a geocoding result -- there is no address to report
        continue

    address = result.get('input', {}).get('address', {})
    each_row = {
        'street': address.get('street'),
        'city': address.get('city'),
        'state': address.get('state'),
    }

    # Unmatched addresses have an empty 'addressMatches' list and keep only the
    # three address fields. When several matches/blocks come back, the last one wins.
    for section in result.get('addressMatches', []):
        for subsection in section.get('geographies', {}).get('Census Blocks', []):
            each_row['geoid'] = subsection.get('GEOID')  # unique identifier, accounts for county and state
            each_row['state_code'] = subsection.get('STATE')
            each_row['county_code'] = subsection.get('COUNTY')
            each_row['tract'] = subsection.get('TRACT')

    data.append(each_row)

len(data)

1256

In [15]:
# turn the list of row dictionaries into a dataframe (keys missing from a row become NaN)
df_final = pd.DataFrame(data)
df_final.shape[0]

1256

In [16]:
# first five geocoded rows
df_final.head(5)

Unnamed: 0,street,city,state,geoid,state_code,county_code,tract
0,1617 EAST 9TH STREET,STOCKTON,CA,60770022012010.0,6.0,77.0,2201.0
1,3835 SANDPIPER COVE RUN,SOUTH BEND,IN,,,,
2,8725 COUNTRY CLUB DRIVE,PINETOP,AZ,40179649021018.0,4.0,17.0,964902.0
3,359 HAWTHORNE CIRCLE,MOUNT PROSPECT,IL,170318051114001.0,17.0,31.0,805111.0
4,303 COUNTY RD. 6100,KIRTLAND,NM,350450005071005.0,35.0,45.0,507.0


In [17]:
# write the geocoded rows to the processed-data folder, without the row index
tracts_csv_path = '../data/processed/geolocate-census-tracts.csv'
df_final.to_csv(tracts_csv_path, index=False)