# Patterns by decade

In [18]:
import pandas as pd
import numpy as np
# Lift pandas' display truncation so wide/long frames render in full
# (None means "no limit" for both settings).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Load the processed incident file (per its name: all incidents, 2010 to present, cleaned).
incidents_csv = '../data/processed/pipeline_incidents_2010_present_all_CLEAN.csv'
df = pd.read_csv(incidents_csv)

# Preview the first two rows to check the columns that were read.
df.head(2)

Unnamed: 0,datafile_as_of,ff,significant,serious,report_number,supplemental_number,report_received_date,report_type,operator_id,name,operator_street_address,operator_city_name,operator_state_abbreviation,operator_postal_code,local_datetime,time_zone,daylight_savings_ind,iyear,location_street_address,location_city_name,location_county_name,location_state_abbreviation,location_postal_code,location_latitude,location_longitude,total_cost,total_cost_current,injury_ind,injure,num_pub_evacuated,fatal,cause,cause_details,narrative,unintentional_release,installation_year
0,2/28/22,NO,YES,NO,20100001,15047,3/11/10,SUPPLEMENTAL FINAL,15007,PACIFIC GAS & ELECTRIC CO,"PG&E - GAS OPERATIONS, REGULATORY COMPLIANCE 6...",SAN RAMON,CA,94583,2/13/10 23:35,,,2010,1617 EAST 9TH STREET,STOCKTON,SAN JOAQUIN,CA,95201,37.93188,-121.26133,102500,124764.082311,NO,0,0.0,0,OTHER OUTSIDE FORCE DAMAGE,OTHER OUTSIDE FORCE DAMAGE,A FIRE AT AN UNOCCUPIED HOME OCCURRED AT APPRO...,10.0,
1,2/28/22,NO,NO,NO,20100002,15553,3/12/10,SUPPLEMENTAL FINAL,13730,NORTHERN INDIANA PUBLIC SERVICE CO,801 E 86TH AVENUE,MERRILLVILLE,IN,46410,2/16/10 10:38,,,2010,3835 SANDPIPER COVE RUN,SOUTH BEND,ST. JOSEPH,IN,46628,41.72931,-86.27531,57500,68334.121206,NO,0,20.0,0,EXCAVATION DAMAGE,EXCAVATION DAMAGE BY THIRD PARTY,"CONTRACTOR BORING ELECTRIC LINE PARALLEL TO 3""...",310.0,1990-01-01


In [4]:
# Parse the YYYY-MM-DD installation dates into datetime64 so they can be
# compared and bucketed; blank entries become NaT.
install_dates = pd.to_datetime(df['installation_year'], format='%Y-%m-%d')
df['installation_year'] = install_dates

In [21]:
# Split incidents into buckets by the decade of the pipeline's installation year.
#
# The labels are the group keys of every `groupby('decade')` in this notebook,
# so they are kept exactly as before:
#   '1900-1909' ... '2000-2009'  one bucket per decade
#   '>2010'                      installed in 2010 or later (2010 itself included)
#   'nan'                        installation year missing or earlier than 1900
FIRST_DECADE_START = 1900
OPEN_ENDED_START = 2010    # every installation from this year on shares one bucket
OPEN_ENDED_LABEL = '>2010'
UNKNOWN_DECADE = 'nan'

# Bucket on the calendar year. pd.to_datetime leaves an already-converted
# datetime column unchanged, so this cell does not depend on the conversion
# cell having been run first. Missing dates (NaT) give a NaN year.
install_year = pd.to_datetime(df['installation_year'], format='%Y-%m-%d').dt.year

decade_starts = range(FIRST_DECADE_START, OPEN_ENDED_START, 10)

# One boolean mask per decade (both ends inclusive), plus the open-ended bucket.
# A NaN year matches no mask and therefore falls through to the default.
conditions = [install_year.between(start, start + 9) for start in decade_starts]
conditions.append(install_year >= OPEN_ENDED_START)

outputs = [f'{start}-{start + 9}' for start in decade_starts]
outputs.append(OPEN_ENDED_LABEL)

# The default must be a string like the labels. The previous float `np.nan`
# default only produced the text 'nan' through implicit float -> str coercion,
# and newer NumPy releases raise a TypeError (no common dtype) instead.
df['decade'] = np.select(conditions, outputs, default=UNKNOWN_DECADE)
df.decade.unique()

array(['nan', '1990-1999', '1970-1979', '1980-1989', '1960-1969',
       '2000-2009', '1950-1959', '1940-1949', '>2010', '1920-1929',
       '1930-1939', '1900-1909', '1910-1919'], dtype=object)

## Pipeline accidents occurring since 2010, for pipelines built at any time

In [22]:
# Fatalities summed by the decade in which the pipeline was installed;
# the 'nan' group holds incidents with no installation decade assigned.
df.groupby('decade')['fatal'].sum()

decade
1900-1909     0
1910-1919     1
1920-1929     9
1930-1939     0
1940-1949     7
1950-1959    20
1960-1969    21
1970-1979    11
1980-1989    17
1990-1999    11
2000-2009    13
>2010        19
nan          27
Name: fatal, dtype: int64

In [24]:
# Injuries summed by the decade in which the pipeline was installed;
# the 'nan' group holds incidents with no installation decade assigned.
df.groupby('decade')['injure'].sum()

decade
1900-1909      1
1910-1919      1
1920-1929     12
1930-1939     10
1940-1949     23
1950-1959    102
1960-1969     88
1970-1979     56
1980-1989     62
1990-1999     56
2000-2009     55
>2010        106
nan          144
Name: injure, dtype: int64

In [25]:
# Reported total cost (the `total_cost` column, not `total_cost_current`)
# summed by the decade in which the pipeline was installed.
df.groupby('decade')['total_cost'].sum()

decade
1900-1909       6306582
1910-1919       2269162
1920-1929      20347696
1930-1939      43592639
1940-1949     219590037
1950-1959    1105906750
1960-1969    3354332951
1970-1979     600178891
1980-1989     218618557
1990-1999     551108794
2000-2009     407268158
>2010         367858122
nan           278520891
Name: total_cost, dtype: int64