# Pipeline incidents since 2010

Data from [PHMSA](https://www.phmsa.dot.gov/data-and-statistics/pipeline/source-data)
* Includes both 'Significant' and 'non-significant' events
* Events include the following: 
    * Gas distribution
    * Gas Transmission and Gathering
    * Hazardous liquid
    
    **NOTE:** Liquefied Natural Gas (LNG) incidents are not included in this analysis (32 total LNG incidents since 2011)

In [1]:
import pandas as pd
import numpy as np
# Never truncate DataFrame output: show every column and every row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)



In [2]:
# Read the cleaned incident records and preview the first two rows
incidents_path = '../data/processed/pipeline_incidents_2010_present_all_CLEAN.csv'
df = pd.read_csv(incidents_path)
df.head(2)

Unnamed: 0,datafile_as_of,ff,significant,serious,report_number,supplemental_number,report_received_date,report_type,operator_id,name,operator_street_address,operator_city_name,operator_state_abbreviation,operator_postal_code,local_datetime,time_zone,daylight_savings_ind,iyear,location_street_address,location_city_name,location_county_name,location_state_abbreviation,location_postal_code,location_latitude,location_longitude,total_cost,total_cost_current,injury_ind,injure,num_pub_evacuated,fatal,cause,cause_details,material_involved,material_details,narrative,unintentional_release,installation_year,decade,year_dt,pipeline_age
0,2/28/22,NO,YES,NO,20100001,15047,3/11/10,SUPPLEMENTAL FINAL,15007,PACIFIC GAS & ELECTRIC CO,"PG&E - GAS OPERATIONS, REGULATORY COMPLIANCE 6...",SAN RAMON,CA,94583,2/13/10 23:35,,,2010.0,1617 EAST 9TH STREET,STOCKTON,SAN JOAQUIN,CA,95201,37.93188,-121.26133,102500,124764.082311,NO,0,0.0,0,OTHER OUTSIDE FORCE DAMAGE,OTHER OUTSIDE FORCE DAMAGE,OTHER,ALUMINUN,A FIRE AT AN UNOCCUPIED HOME OCCURRED AT APPRO...,10.0,,,,
1,2/28/22,NO,NO,NO,20100002,15553,3/12/10,SUPPLEMENTAL FINAL,13730,NORTHERN INDIANA PUBLIC SERVICE CO,801 E 86TH AVENUE,MERRILLVILLE,IN,46410,2/16/10 10:38,,,2010.0,3835 SANDPIPER COVE RUN,SOUTH BEND,ST. JOSEPH,IN,46628,41.72931,-86.27531,57500,68334.121206,NO,0,20.0,0,EXCAVATION DAMAGE,EXCAVATION DAMAGE BY THIRD PARTY,PLASTIC,,"CONTRACTOR BORING ELECTRIC LINE PARALLEL TO 3""...",310.0,1990-01-01,1990-1999,1990.0,20.0


## Incidents by decade

In [3]:
# Parse the YYYY-MM-DD installation dates into datetimes (needed for the
# date comparisons and the .dt accessor used in later cells)
raw_install_dates = df['installation_year']
df['installation_year'] = pd.to_datetime(raw_install_dates, format='%Y-%m-%d')

In [4]:
# Create column for decade of installation.
# Bin on the calendar year of installation. Edges are left-closed:
# [1900, 1910), [1910, 1920), ..., [2000, 2010), [2010, inf).
decade_edges = list(range(1900, 2011, 10)) + [np.inf]

# One label per bin. NOTE: '>2010' also covers pipe installed in 2010 itself;
# the label is kept unchanged because other cells filter on this exact string.
outputs = [f'{start}-{start + 9}' for start in range(1900, 2010, 10)] + ['>2010']

install_year = df['installation_year'].dt.year

# pd.cut leaves missing (or pre-1900) installation years as real NaN.
# The previous np.select(conditions, outputs, np.nan) call coerced the default
# into the *string* 'nan', which .isna()/.dropna() do not recognise, and newer
# NumPy releases reject the str/float mix with a TypeError.
df['decade'] = pd.cut(
    install_year, bins=decade_edges, right=False, labels=outputs
).astype(object)
df.decade.unique()

array(['nan', '1990-1999', '1970-1979', '1980-1989', '1960-1969',
       '2000-2009', '1950-1959', '1940-1949', '>2010', '1920-1929',
       '1930-1939', '1900-1909', '1910-1919'], dtype=object)

In [5]:
# Share of incidents that involve pipe in the '>2010' installation bucket.
# Denominator: every incident with iyear >= 2010, whether or not an
# installation year was recorded.
installed_since_2010 = int((df['decade'] == '>2010').sum())
incidents_since_2010 = int((df['iyear'] >= 2010).sum())
installed_since_2010 / incidents_since_2010

# 20.5% of all accidents that occurred post 2010 had installation years after 2010

0.20469395919455927

## Overview of incidents

In [6]:
# Sum of incident costs, using the inflation-adjusted cost column
df.total_cost_current.sum()

7994914265.472826

In [7]:
# Number of injuries summed across all incidents
df['injure'].sum()

716

In [8]:
# Number of fatalities summed across all incidents
df['fatal'].sum()

156

In [9]:
# Total incidents: df.shape is (rows, columns), so the first element is the
# number of incident records and the second is the number of columns
df.shape

(7499, 41)

## Incidents per miles of pipeline
Mileage data from [PHMSA](https://www.phmsa.dot.gov/data-and-statistics/pipeline/annual-report-mileage-hazardous-liquid-or-carbon-dioxide-systems), last updated May 2, 2022.

In [10]:
# Tally incidents per installation year (year kept as a 4-digit string),
# ordered chronologically; rows without an installation year are not counted
install_year_label = df['installation_year'].dt.strftime('%Y')
df_count = (
    install_year_label
    .value_counts()
    .sort_index()
    .rename_axis('installation_year')
    .reset_index(name='count')
)
df_count.head()

Unnamed: 0,installation_year,count
0,1900,3
1,1901,1
2,1902,1
3,1903,2
4,1906,2


In [11]:
# Load in data with total mileage of pipelines each year
mileage_path = '../data/processed/pipeline-mileage-over-time.csv'
df_mileage = pd.read_csv(mileage_path)

In [12]:
# Cast the year column (not the mileage values) to string so it can be
# matched against string-typed year keys when merging
df_mileage['year'] = df_mileage['year'].astype(str)

In [13]:
# Attach each year's pipeline mileage to the incident counts. The left join
# keeps every row of df_count; years with no mileage record get NaN.
# NOTE(review): the join key is the pipe's *installation* year, not the year
# the incident occurred - confirm this is the intended normalisation.
df_merge = pd.merge(
    df_count,
    df_mileage,
    how='left',
    left_on='installation_year',
    right_on='year',
)
df_merge.head()

Unnamed: 0,installation_year,count,year,Gas Distribution,Gas Transmission and Gathering,Hazardous Liquid,total
0,1900,3,,,,,
1,1901,1,,,,,
2,1902,1,,,,,
3,1903,2,,,,,
4,1906,2,,,,,


In [14]:
# Incident count divided by total mileage, times 100. Despite the column
# name this is incidents per 100 miles of pipeline, not a true percentage.
incidents_per_mile = df_merge['count'].div(df_merge['total'])
df_merge['percent_incidents'] = incidents_per_mile.mul(100)

In [15]:
# Ten years with the highest incident rate
ranked_by_rate = df_merge.sort_values(by='percent_incidents', ascending=False)
ranked_by_rate.head(10)

Unnamed: 0,installation_year,count,year,Gas Distribution,Gas Transmission and Gathering,Hazardous Liquid,total,percent_incidents
105,2015,202,2015,2190257.0,318949.0,208622.0,2717828.0,0.007432
104,2014,195,2014,2169375.0,319354.0,199795.0,2688524.0,0.007253
106,2016,180,2016,2211522.0,318218.0,212164.0,2741904.0,0.006565
102,2012,154,2012,2138000.0,319926.0,186221.0,2644147.0,0.005824
103,2013,154,2013,2149819.0,320257.0,192412.0,2662488.0,0.005784
108,2018,130,2018,2238709.0,319542.0,219137.0,2777388.0,0.004681
109,2019,128,2019,2262931.0,320101.0,225001.0,2808033.0,0.004558
107,2017,117,2017,2226053.0,318832.0,216052.0,2760937.0,0.004238
99,2009,107,2009,2086689.0,324936.0,175965.0,2587590.0,0.004135
100,2010,98,2010,2102483.0,324432.0,181986.0,2608901.0,0.003756


In [16]:
# Save the merged counts and mileage table, without the DataFrame index
normalized_path = '../data/processed/pipeline-incidents-normalized-by-mileage.csv'
df_merge.to_csv(normalized_path, index=False)