# Looking at where incidents are occurring: Merging in census tracts with rest of data
Are pipelines that were installed after 2010 breaking in areas with higher population density than pipelines installed before 2010?

In [1]:
import pandas as pd
import numpy as np
# Lift pandas' display limits so previews show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)



## Load in data

In [2]:
# Address-to-census-geography lookup (street, city, state, geoid, ...).
# dtype=str reads every column as text instead of letting pandas infer
# numeric types.
census_tracts_path = '../data/processed/geolocate-census-tracts.csv'
df_census_tracts = pd.read_csv(census_tracts_path, dtype=str)
df_census_tracts.head(n=5)

Unnamed: 0,street,city,state,geoid,state_code,county_code,tract
0,1617 EAST 9TH STREET,STOCKTON,CA,60770022012010.0,6.0,77.0,2201.0
1,3835 SANDPIPER COVE RUN,SOUTH BEND,IN,,,,
2,8725 COUNTRY CLUB DRIVE,PINETOP,AZ,40179649021018.0,4.0,17.0,964902.0
3,359 HAWTHORNE CIRCLE,MOUNT PROSPECT,IL,170318051114001.0,17.0,31.0,805111.0
4,303 COUNTY RD. 6100,KIRTLAND,NM,350450005071005.0,35.0,45.0,507.0


In [3]:
# Census population data downloaded from the Decennial Census 2020 P1 (Race) table.
population_path = '../data/source/DECENNIALPL2020.P1_2022-04-26T102225_TRACTS/DECENNIALPL2020.P1_data_with_overlays_2022-04-22T122804.csv'

# Only three columns are needed: GEO_ID, NAME (area), P1_001N (total).
population_columns = ['GEO_ID', 'NAME', 'P1_001N']

# Read just those columns, and read them as text. Parsing the whole table with
# inferred types made this cell emit a warning (likely pandas' mixed-type
# DtypeWarning - the first row under the header is not numeric).
# NOTE: P1_001N is therefore text; convert with pd.to_numeric before doing maths.
df_population = pd.read_csv(population_path, usecols=population_columns, dtype=str)

# Drop the first row (presumably the descriptive label row that sits under the
# header in "data_with_overlays" exports - confirm), fix the column order, and
# take an explicit copy so that adding columns later does not raise
# SettingWithCopyWarning.
df_population = df_population.iloc[1:][population_columns].copy()

df_population.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,GEO_ID,NAME,P1_001N
1,1400000US01001020100,"Census Tract 201, Autauga County, Alabama",1775
2,1400000US01001020200,"Census Tract 202, Autauga County, Alabama",2055
3,1400000US01001020300,"Census Tract 203, Autauga County, Alabama",3216
4,1400000US01001020400,"Census Tract 204, Autauga County, Alabama",4246
5,1400000US01001020501,"Census Tract 205.01, Autauga County, Alabama",4322


### Clean GEOIDs for merging

In [4]:
# Keep only the part of each population GEO_ID that follows the '1400000US'
# prefix, so 1400000US01001020100 becomes 01001020100.
#
# The vectorised .str accessor replaces a row-by-row apply(lambda ...): it
# gives the same result for well-formed IDs, and a missing GEO_ID becomes NaN
# instead of raising an AttributeError in the middle of the column.
df_population['GEO_ID_clean'] = df_population['GEO_ID'].str.split('1400000US').str[1]
df_population['GEO_ID_clean'].head(2)

1    01001020100
2    01001020200
Name: GEO_ID_clean, dtype: object

In [5]:
# Trim our GEOIDs so they line up with the population GEOIDs: the final four
# characters are dropped.
#
# Examples (population GEOID  <-  our GEOID):
#   06077002201  <-  060770022012010   (Stockton address)
#   04017964902  <-  040179649021018
trimmed_geoids = df_census_tracts['geoid'].str.slice(stop=-4)
df_census_tracts['geoid_clean'] = trimmed_geoids
df_census_tracts['geoid_clean'].head(3)

0    06077002201
1            NaN
2    04017964902
Name: geoid_clean, dtype: object

## Merge population data and our census tract numbers

In [6]:
# Left join from the population table: every population row is kept, and rows
# whose GEO_ID_clean matches no geoid_clean get NaN in the address columns.
df_merge = pd.merge(
    df_population,
    df_census_tracts,
    how='left',
    left_on='GEO_ID_clean',
    right_on='geoid_clean',
)
df_merge.head(2)

Unnamed: 0,GEO_ID,NAME,P1_001N,GEO_ID_clean,street,city,state,geoid,state_code,county_code,tract,geoid_clean
0,1400000US01001020100,"Census Tract 201, Autauga County, Alabama",1775,1001020100,,,,,,,,
1,1400000US01001020200,"Census Tract 202, Autauga County, Alabama",2055,1001020200,,,,,,,,


## Merge with pipeline incidents data

In [7]:
# Read the postal-code columns as text. Left to type inference, an all-digit
# column is parsed as a number, which strips leading zeros (e.g. '02139' ->
# 2139) or adds a trailing '.0' when values are missing - and those altered
# values would be written back out when the merged frame is saved.
pipeline_path = '../data/processed/pipeline_incidents_2010_present_all_CLEAN.csv'
postal_code_dtypes = {'operator_postal_code': str, 'location_postal_code': str}
df_pipeline = pd.read_csv(pipeline_path, dtype=postal_code_dtypes)
df_pipeline.head(2)

Unnamed: 0,datafile_as_of,ff,significant,serious,report_number,supplemental_number,report_received_date,report_type,operator_id,name,operator_street_address,operator_city_name,operator_state_abbreviation,operator_postal_code,local_datetime,time_zone,daylight_savings_ind,iyear,location_street_address,location_city_name,location_county_name,location_state_abbreviation,location_postal_code,location_latitude,location_longitude,total_cost,total_cost_current,injury_ind,injure,num_pub_evacuated,fatal,cause,cause_details,material_involved,material_details,narrative,unintentional_release,installation_year,decade
0,2/28/22,NO,YES,NO,20100001,15047,3/11/10,SUPPLEMENTAL FINAL,15007,PACIFIC GAS & ELECTRIC CO,"PG&E - GAS OPERATIONS, REGULATORY COMPLIANCE 6...",SAN RAMON,CA,94583,2/13/10 23:35,,,2010,1617 EAST 9TH STREET,STOCKTON,SAN JOAQUIN,CA,95201,37.93188,-121.26133,102500,124764.082311,NO,0,0.0,0,OTHER OUTSIDE FORCE DAMAGE,OTHER OUTSIDE FORCE DAMAGE,OTHER,ALUMINUN,A FIRE AT AN UNOCCUPIED HOME OCCURRED AT APPRO...,10.0,,
1,2/28/22,NO,NO,NO,20100002,15553,3/12/10,SUPPLEMENTAL FINAL,13730,NORTHERN INDIANA PUBLIC SERVICE CO,801 E 86TH AVENUE,MERRILLVILLE,IN,46410,2/16/10 10:38,,,2010,3835 SANDPIPER COVE RUN,SOUTH BEND,ST. JOSEPH,IN,46628,41.72931,-86.27531,57500,68334.121206,NO,0,20.0,0,EXCAVATION DAMAGE,EXCAVATION DAMAGE BY THIRD PARTY,PLASTIC,,"CONTRACTOR BORING ELECTRIC LINE PARALLEL TO 3""...",310.0,1990-01-01,1990-1999


In [None]:
# df_merge keeps every population tract, so many of its rows carry no address
# (street is NaN). pandas matches NaN merge keys to each other, so an incident
# with a missing street address would be joined to every one of those
# address-less tracts. Keep only the rows that have an address before joining.
tracts_with_address = df_merge.dropna(subset=['street'])

# Inner join (the default): incidents whose street address has no located
# tract are dropped.
# NOTE(review): the key is the street address alone, so the same address text
# in two different cities would cross-match - consider also matching on
# city/state once the two files are confirmed to spell them identically.
df_final = df_pipeline.merge(
    tracts_with_address,
    left_on='location_street_address',
    right_on='street',
)
df_final.head(2)

In [None]:
# Write the merged incidents to disk, leaving the row index out of the file.
merged_output_path = '../data/processed/merged-located-pipelines.csv'
df_final.to_csv(merged_output_path, index=False)