# 3 - Select columns, filter onshore

In [1]:
import pandas as pd
import numpy as np
from datetime import date

today = date.today().isoformat()

In [2]:
import rpy2.rinterface

In [3]:
%load_ext rpy2.ipython

In [4]:
%%R
suppressMessages(library(tidyverse))

## 3.1 Extract relevant columns of the pipeline incidents dataset

In [5]:
incidents = pd.read_excel('../data/incidents_2019-08-01/hl2010toPresent.xlsx', 
                          sheet_name=1)
incidents_selected = incidents[['OPERATOR_ID', 'LOCAL_DATETIME', 'NAME', 'COMMODITY_RELEASED_TYPE', 
                                'SERIOUS', 'SIGNIFICANT', 'LOCATION_LATITUDE', 'LOCATION_LONGITUDE', 
                                'ON_OFF_SHORE']].copy()

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY_RELEASED_TYPE,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ON_OFF_SHORE
781,2552,2012-04-10 07:00:00,COLONIAL PIPELINE CO,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,NO,32.02821,-89.10162,ONSHORE
1411,11169,2013-11-06 09:13:00,"ENBRIDGE ENERGY, LIMITED PARTNERSHIP",CRUDE OIL,NO,NO,46.69,-92.06,ONSHORE
1687,22855,2014-06-11 11:19:00,"KOCH PIPELINE COMPANY, L.P.",CRUDE OIL,NO,YES,26.650465,-98.460842,ONSHORE
3751,300,2019-04-23 15:00:00,"PLAINS PIPELINE, L.P.",CRUDE OIL,NO,NO,34.69902,-97.6944,ONSHORE
2209,32147,2015-09-02 08:39:00,MARATHON PIPE LINE LLC,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,NO,40.708665,-80.693831,ONSHORE


In [6]:
import numpy as np

np.unique(incidents_selected['COMMODITY_RELEASED_TYPE'])

array(['BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)',
       'CO2 (CARBON DIOXIDE)', 'CRUDE OIL',
       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS',
       'REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS'],
      dtype=object)

In [7]:
incidents_selected['ON_OFF_SHORE'].value_counts()

ONSHORE     3791
OFFSHORE      28
Name: ON_OFF_SHORE, dtype: int64

### 3.1.1 Fix data types

In [8]:
incidents_selected.dtypes

OPERATOR_ID                         int64
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

In [9]:
incidents_selected['OPERATOR_ID'] = incidents_selected['OPERATOR_ID'].astype(str)
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

Make sure SERIOUS and SIGNIFICANT are booleans.

In [10]:
(incidents_selected[['SERIOUS']] == 'YES')['SERIOUS'].value_counts()

False    3803
True       16
Name: SERIOUS, dtype: int64

In [11]:
incidents_selected['SERIOUS'] = incidents_selected[['SERIOUS']] == 'YES'

In [12]:
(incidents_selected[['SIGNIFICANT']] == 'YES')['SIGNIFICANT'].value_counts()

False    2364
True     1455
Name: SIGNIFICANT, dtype: int64

In [13]:
incidents_selected['SIGNIFICANT'] = incidents_selected[['SIGNIFICANT']] == 'YES'

In [14]:
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                              bool
SIGNIFICANT                          bool
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

### 3.1.2 Recode on/offshore to boolean, fix column names

In [15]:
incidents_selected['ONSHORE'] = incidents_selected[['ON_OFF_SHORE']] == 'ONSHORE'
incidents_selected = incidents_selected.drop(columns=['ON_OFF_SHORE'])
incidents_selected = incidents_selected.rename(columns={'COMMODITY_RELEASED_TYPE': 'COMMODITY'})

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ONSHORE
2978,31174,2017-06-27 17:05:00,"SHELL PIPELINE CO., L.P.",CRUDE OIL,False,False,35.649592,-119.745399,True
824,31174,2012-05-08 08:40:00,"SHELL PIPELINE CO., L.P.",CRUDE OIL,False,False,30.0064,-90.4013,True
359,15915,2011-01-28 10:00:00,"PIPELINES OF PUERTO RICO INCD, THE",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,True,18.44801,-66.07476,True
2592,18718,2016-07-25 07:45:00,SUNOCO PIPELINE L.P.,CRUDE OIL,False,True,32.06899,-96.47586,True
3804,4906,2019-06-11 13:50:00,EXXONMOBIL PIPELINE CO,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,False,True,30.173127,-90.787278,True


In [16]:
incidents_selected.to_feather(f'../preprocessed_data/incidents_selected_{today}.feather')

## 3.2 Extract relevant columns of the pipeline system dataset (2010-)

### 3.2.1 Select relevant columns

In [17]:
from os import listdir

pipelines_2010_present = [file for file in listdir('../data/pipelines_2010_present_2019-08-02/') if 'annual_hazardous_liquid' in file]
pipelines_2010_present = pd.concat([pd.read_excel(f'../data/pipelines_2010_present_2019-08-02/{file}', skiprows=2) 
                                    for file in pipelines_2010_present])
pipelines_2010_present = pipelines_2010_present.reset_index(drop = True)

pipelines_2010_present.sample(5)

Unnamed: 0,DATAFILE_AS_OF,REPORT_YEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,OPERATOR_ID,PARTA2NAMEOFCOMP,PARTA4STREET,PARTA4CITY,PARTA4STATE,PARTA4ZIP,...,PARTE2010HF,PARTE2010LF,PARTE2010TOTAL,PARTETOTAL,PARTETOTALHF,PARTETOTALLF,REPORT_SUBMISSION_TYPE,REPORT_DATE,FILING_DATE,FORM_REV
5214,2019-07-01 11:08:29,2015,20161969,14214,39090,"NUTAAQ PIPELINE, LLC",601 W 5TH AVE SUITE 310,ANCHORAGE,AK,99501,...,0.01,0.0,0.01,25.05,25.05,0.0,SUPPLEMENTAL,2016-06-09 18:27:12,2016-06-09 19:18:39,7000-1.1 (Rev. 06-2014)
4031,2019-07-01 11:21:18,2016,20172861,15240,994,"WILLIAMS FIELD SERVICES - GULF COAST COMPANY, LP",ONE WILLIAMS CENTER P.O. BOX 645,TULSA,OK,74172,...,0.01,0.0,0.01,28.87,2.33,26.54,INITIAL,2017-06-15 11:18:09,2017-06-15 11:18:09,7000-1.1 (Rev. 06-2014)
2054,2019-07-01 10:43:40,2012,20130263,12062,31554,CHEVRON PETROCHEMICAL PIPELINE LLC,4800 FOURANCE PLACE,HOUSTON,TX,774012324,...,0.0,0.0,0.0,174.5,1.9,172.6,INITIAL,2013-06-13 09:44:16,2013-06-13 09:44:16,7000-1.1 (Rev. 06-2011)
4062,2019-07-01 11:21:18,2016,20172390,16288,3535,DOW PIPELINE CO,1000 COUNTY ROAD 340,ANGLETON,TX,77515,...,0.0,0.0,0.0,197.43,135.66,61.77,SUPPLEMENTAL,2017-03-10 16:25:27,2018-08-26 14:15:43,7000-1.1 (Rev. 06-2014)
715,2019-07-01 10:36:07,2011,20120197,11792,2767,COOK INLET PIPE LINE CO,909 WEST 9TH STREET,ANCHORAGE,AK,99501,...,0.0,0.0,0.0,34.0,0.0,34.0,SUPPLEMENTAL,2012-06-12 16:40:14,2013-05-22 15:52:29,7000-1.1 (Rev. 06-2011)


In [18]:
pipelines_2010_selected = pipelines_2010_present[[
    'OPERATOR_ID', 'REPORT_YEAR', 'PARTA2NAMEOFCOMP', 'PARTA5COMMODITY', 'PARTBHCAONSHORE', 
    'PARTEUNKNTOTAL', 'PARTEPRE40TOTAL', 'PARTE1940TOTAL', 'PARTE1950TOTAL', 'PARTE1960TOTAL', 
    'PARTE1970TOTAL', 'PARTE1980TOTAL', 'PARTE1990TOTAL', 'PARTE2000TOTAL', 'PARTE2010TOTAL',
    'PARTBHCAOFFSHORE', 'PARTBHCATOTAL']].copy()

pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,REPORT_YEAR,PARTA2NAMEOFCOMP,PARTA5COMMODITY,PARTBHCAONSHORE,PARTEUNKNTOTAL,PARTEPRE40TOTAL,PARTE1940TOTAL,PARTE1950TOTAL,PARTE1960TOTAL,PARTE1970TOTAL,PARTE1980TOTAL,PARTE1990TOTAL,PARTE2000TOTAL,PARTE2010TOTAL,PARTBHCAOFFSHORE,PARTBHCATOTAL
721,3535,2011,DOW PIPELINE CO,HVL,37.5,0.0,0.0,0.0,0.0,103.0,44.0,57.0,0.0,37.0,0.0,,37.5
2226,32503,2012,"CALUMET SHREVEPORT FUELS, LLC",Crude Oil,8.0,0.0,0.0,0.0,0.0,0.0,7.99,0.0,0.0,0.0,0.01,,8.0
191,30781,2017,OLYMPIC PIPE LINE COMPANY,Refined and/or Petroleum Product (non-HVL),380.7,0.0,0.0,0.0,0.0,273.3,85.5,0.6,4.3,4.9,0.0,,380.7
763,12628,2011,EXXONMOBIL PIPELINE CO,HVL,66.0,0.0,0.0,0.0,80.0,56.5,73.0,15.0,7.0,12.5,0.0,,66.0
3964,39654,2018,ORYX DELAWARE OIL TRANSPORT LLC,Crude Oil,7.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.93,,7.11


In [19]:
pipelines_2010_selected.dtypes

OPERATOR_ID           int64
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCAONSHORE     float64
PARTEUNKNTOTAL      float64
PARTEPRE40TOTAL     float64
PARTE1940TOTAL      float64
PARTE1950TOTAL      float64
PARTE1960TOTAL      float64
PARTE1970TOTAL      float64
PARTE1980TOTAL      float64
PARTE1990TOTAL      float64
PARTE2000TOTAL      float64
PARTE2010TOTAL      float64
PARTBHCAOFFSHORE    float64
PARTBHCATOTAL       float64
dtype: object

In [20]:
pipelines_2010_selected['OPERATOR_ID'] = pipelines_2010_selected['OPERATOR_ID'].astype(str)
pipelines_2010_selected['OPERATOR_ID'].dtype

dtype('O')

In [21]:
pipelines_2010_selected = pipelines_2010_selected.rename(
    columns={'REPORT_YEAR': 'YEAR', 'PARTA2NAMEOFCOMP': 'NAME', 'PARTA5COMMODITY': 'COMMODITY', 'PARTBHCAONSHORE': "MILES", 
             'PARTBHCAOFFSHORE': 'OFFSHORE_MILES', 'PARTBHCATOTAL': 'TOTAL_MILES', 'PARTEUNKNTOTAL': 'AGE_UNKNOWN_MILES', 
             'PARTEPRE40TOTAL': 'MILES_PRE_1940', 'PARTE1940TOTAL': 'MILES_1940', 'PARTE1950TOTAL': 'MILES_1950', 
             'PARTE1960TOTAL': 'MILES_1960', 'PARTE1970TOTAL': 'MILES_1970', 'PARTE1980TOTAL': 'MILES_1980', 
             'PARTE1990TOTAL': 'MILES_1990', 'PARTE2000TOTAL': 'MILES_2000', 'PARTE2010TOTAL': 'MILES_2010'})
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES
3365,473,2018,ANADARKO PETROLEUM CORP,HVL,5.2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,16.5,37.55,,5.2
4371,32009,2016,EXXONMOBIL OIL CORPORATION-TERMINALS,Refined and/or Petroleum Product (non-HVL),0.21,0.0,0.0,0.0,0.0,0.27,0.0,0.0,0.16,0.0,1.95,,0.21
2775,32597,2013,"ENLINK PERMIAN II, LLC",HVL,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.604,1.25,,0.0
3005,22830,2010,WOLVERINE PIPELINE CO,Refined and/or Petroleum Product (non-HVL),551.13,174.0,0.0,287.0,180.0,41.0,207.0,0.0,6.0,65.0,1.0,,551.13
3823,39046,2018,"BRIDGER SWAN RANCH, LLC",Crude Oil,4.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,4.46


## 3.3 Extract relevant columns of the pipeline system dataset (2004)

In [22]:
pipelines_2004_2009 = [file for file in listdir('../data/pipelines_2004_2009_2019-08-02/') if 'annual_hazardous_liquid' in file]
pipelines_2004_2009 = pd.concat([pd.read_excel(f'../data/pipelines_2004_2009_2019-08-02/{file}') 
                                 for file in pipelines_2004_2009])
pipelines_2004_2009 = pipelines_2004_2009.reset_index(drop = True)

pipelines_2004_2009.sample(5)

Unnamed: 0,ORIGINAL,SUPPLEMENT,YR,RPTID,HL_SUP_ID,SYSTEM_TYPE,OPERATOR_ID,NAME,PARENT,OFSTREET,...,BA_5,BA_6,BA_7,BA_8,DOR,DOE,PNAME,PPHONE,PEMAIL,PFAX
2003,Y,N,2004,20040072,97,CRUDE OIL,26303,UNOCAL MIDSTREAM & TRADE,UNION OIL COMPANY OF CALIFORNIA,HIGHWAY 366 OR PO BOX 237,...,0.0,0.0,0.0,,2005-06-06,2005-06-06 14:36:58,LARRY SAVAGLIO,4097243353,LJSAVAGL@UNOCAL.COM,4097240000.0
2268,Y,N,2004,20040088,113,HVLS,32004,QUESTAR GAS MANAGEMENT,QUESTAR CORPORATION,1955 BLAIRTOWN ROAD,...,0.0,0.0,0.0,,2005-03-07,2005-06-07 00:00:00,NATHAN UHRIG PSM COORDINATOR,3073527568,NATHAN.UHRIG@QUESTAR.COM,3073530000.0
2404,Y,N,2007,20070108,3470,PETROLEUM & REFINED PRODUCTS,26049,"COUNTRYMARK COOPERATIVE, LLP",,1200 REFINERY RD,...,0.0,24.0,0.0,,2008-05-06,2008-05-06 00:00:00,"RICHARD HARDY ""PIPELINE INTEGRITY MANAGER""",8128388505,HARDY@COUNTRYMARK.COM,8128330000.0
1638,N,Y,2005,20050430,6582,HVLS,31517,SOLUTIA INC,,FM 2917 & SOLUTIA DRIVE,...,0.0,0.0,0.0,,2006-10-12,2010-04-16 13:06:12,ROBERT NEIBERT PIPELINE MANAGER,2812284563,RANEIB@ASCENDMATERIALS.COM,
2230,N,Y,2004,20040387,1062,CRUDE OIL,32283,FRONT RANGE PIPELINE LLC,CHS INC,803 HIGHWAY 212 SOUTH,...,0.0,0.0,0.0,,2005-06-27,2006-05-30 00:00:00,"MICK GEE, ENGINEERING MANAGER",4066285302,MICK.GEE@CHSINC.COM,4066290000.0


### 3.3.1 Clean name column

In [23]:
pipelines_2004_2009['NAME_FIXED'] = np.where(pd.isnull(pipelines_2004_2009['PARENT']), 
                                             pipelines_2004_2009['NAME'], 
                                             pipelines_2004_2009['PARENT'])
pd.isnull(pipelines_2004_2009['NAME_FIXED']).value_counts()

False    2730
Name: NAME_FIXED, dtype: int64

In [24]:
pipelines_2004_2009[['NAME_FIXED', 'NAME', 'PARENT']].sample(5)

Unnamed: 0,NAME_FIXED,NAME,PARENT
57,VOPAK TERMINAL LOS ANGELES INC.,VOPAK TERMINAL LOS ANGELES INC.,
734,CALNEV PIPELINE CO.,CALNEV PIPELINE CO.,
135,NATIONAL COOP REFINERY ASSCO,NATIONAL COOP REFINERY ASSCO,
874,STONE ENERGY CORPORATION,STONE ENERGY CORPORATION,
1400,WILLIAMS MIDSTREAM,WILLIAMS FIELD SERVICES,WILLIAMS MIDSTREAM


### 3.3.2 Select columns

In [25]:
pipelines_2004_selected = pipelines_2004_2009[['OPERATOR_ID', 'YR', 'NAME_FIXED', 'SYSTEM_TYPE', 'HCAONM', 'ERWTM_1',
                                               'ERWTM_2', 'ERWTM_3', 'ERWTM_4', 'ERWTM_5', 'ERWTM_6', 'ERWTM_7',
                                               'ERWTM_8', 'HCAOFFM', 'HCAMT']].copy()
pipelines_2004_selected.dtypes

OPERATOR_ID      int64
YR               int64
NAME_FIXED      object
SYSTEM_TYPE     object
HCAONM         float64
ERWTM_1        float64
ERWTM_2        float64
ERWTM_3        float64
ERWTM_4        float64
ERWTM_5        float64
ERWTM_6        float64
ERWTM_7        float64
ERWTM_8        float64
HCAOFFM        float64
HCAMT          float64
dtype: object

In [26]:
pipelines_2004_selected['OPERATOR_ID'] = pipelines_2004_selected['OPERATOR_ID'].astype(str)
pipelines_2004_selected.dtypes

OPERATOR_ID     object
YR               int64
NAME_FIXED      object
SYSTEM_TYPE     object
HCAONM         float64
ERWTM_1        float64
ERWTM_2        float64
ERWTM_3        float64
ERWTM_4        float64
ERWTM_5        float64
ERWTM_6        float64
ERWTM_7        float64
ERWTM_8        float64
HCAOFFM        float64
HCAMT          float64
dtype: object

In [27]:
pipelines_2004_selected = pipelines_2004_selected.rename(
    columns={'YR': 'YEAR', 'NAME_FIXED': 'NAME', 'HCAONM': 'MILES', 'HCAOFFM': 'OFFSHORE_MILES', 
             'HCAMT': 'TOTAL_MILES', 'SYSTEM_TYPE': 'COMMODITY', 
             'ERWTM_1': 'MILES_PRE_1940', 
             'ERWTM_2': 'MILES_1940',
             'ERWTM_3': 'MILES_1950',
             'ERWTM_4': 'MILES_1960',
             'ERWTM_5': 'MILES_1970',
             'ERWTM_6': 'MILES_1980',
             'ERWTM_7': 'MILES_1990',
             'ERWTM_8': 'MILES_2000'})
pipelines_2004_selected['MILES_2010'] = 0.0
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,OFFSHORE_MILES,TOTAL_MILES,MILES_2010
799,4906,2006,EXXONMOBIL PIPELINE COMPANY,CRUDE OIL,1176.0,46.0,66.0,606.0,275.0,354.0,141.0,175.0,46.0,100.0,1276.0,0.0
1307,32179,2009,EXXONMOBIL BILLINGS REFINERY,CRUDE OIL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1508,30825,2005,CITGO PETROLEUM,HVLS,7.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.09,0.0
1784,32011,2005,"HOLLY ENERGY PARTNERS - OPERATING, L.P.",PETROLEUM & REFINED PRODUCTS,267.0,0.0,0.0,411.0,195.0,0.0,148.0,97.0,0.0,0.0,267.0,0.0
2268,32004,2004,QUESTAR CORPORATION,HVLS,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


### 3.3.3 Handle "duplicate" observations

How are the instances of diverging names treated by Pandas?

In [28]:
%%R -i pipelines_2004_selected
glimpse(pipelines_2004_selected)

Observations: 2,730
Variables: 16
$ OPERATOR_ID    [3m[90m<chr>[39m[23m "31336", "4805", "8175", "26302", "32147", "4906", "19…
$ YEAR           [3m[90m<int>[39m[23m 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, …
$ NAME           [3m[90m<chr>[39m[23m "CHEVRON U.S.A. INC", "EXPLORER PIPELINE CO", "INTERST…
$ COMMODITY      [3m[90m<chr>[39m[23m "CRUDE OIL", "PETROLEUM & REFINED PRODUCTS", "PETROLEU…
$ MILES          [3m[90m<dbl>[39m[23m 6.000, 1204.110, 8.000, 13.000, 1629.158, 373.000, 40.…
$ MILES_PRE_1940 [3m[90m<dbl>[39m[23m 0.000, 0.000, 0.000, 0.000, 26.841, 1.000, 0.000, 0.00…
$ MILES_1940     [3m[90m<dbl>[39m[23m 0.000, 0.000, 0.000, 0.000, 119.533, 101.000, 0.000, 0…
$ MILES_1950     [3m[90m<dbl>[39m[23m 0.000, 7.200, 0.000, 0.000, 585.263, 126.000, 0.000, 0…
$ MILES_1960     [3m[90m<dbl>[39m[23m 10.647, 7.800, 11.600, 0.000, 107.873, 342.000, 0.000,…
$ MILES_1970     [3m[90m<dbl>[39m[23m 0.000, 593.200, 0.000, 0.000, 243.374, 43.

In [29]:
%%R -i pipelines_2004_selected

pipelines_2004_selected <- pipelines_2004_selected %>%
    group_by(OPERATOR_ID, YEAR, COMMODITY) %>%
# We lose some information by how we create the name column, but since we mostly use the OPERATOR_ID, it's alright.
    summarize(NAME = first(NAME), 
              MILES = sum(MILES), 
              MILES_PRE_1940 = sum(MILES_PRE_1940), 
              MILES_1940 = sum(MILES_1940), 
              MILES_1950 = sum(MILES_1950), 
              MILES_1960 = sum(MILES_1960), 
              MILES_1970 = sum(MILES_1970), 
              MILES_1980 = sum(MILES_1980), 
              MILES_1990 = sum(MILES_1990), 
              MILES_2000 = sum(MILES_2000), 
              MILES_2010 = 0,
              OFFSHORE_MILES = sum(OFFSHORE_MILES), 
              TOTAL_MILES = sum(TOTAL_MILES), 
              AGE_UNKNOWN_MILES = 0)
pipelines_2004_selected <- as.data.frame(pipelines_2004_selected)
    
glimpse(pipelines_2004_selected)

Observations: 2,577
Variables: 17
$ OPERATOR_ID       [3m[90m<chr>[39m[23m "10012", "10012", "10012", "10012", "10012", "10012…
$ YEAR              [3m[90m<int>[39m[23m 2004, 2004, 2005, 2005, 2006, 2006, 2007, 2007, 200…
$ COMMODITY         [3m[90m<chr>[39m[23m "HVLS", "PETROLEUM & REFINED PRODUCTS", "HVLS", "PE…
$ NAME              [3m[90m<chr>[39m[23m "KANEB PIPE LINE COMPANY LLC", "KANEB PIPE LINE COM…
$ MILES             [3m[90m<dbl>[39m[23m 1191.000, 1196.000, 1616.000, 530.000, 1758.000, 72…
$ MILES_PRE_1940    [3m[90m<dbl>[39m[23m 0, 22, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ MILES_1940        [3m[90m<dbl>[39m[23m 0.000, 0.000, 0.000, 0.000, 0.000, 12.000, 0.000, 1…
$ MILES_1950        [3m[90m<dbl>[39m[23m 0.000, 582.000, 0.000, 582.000, 9.000, 50.000, 140.…
$ MILES_1960        [3m[90m<dbl>[39m[23m 1222.140, 941.000, 1222.000, 661.000, 0.000, 176.00…
$ MILES_1970        [3m[90m<dbl>[39m[23m 709.000, 13.000, 709.000, 13.000, 0.000

In [30]:
pipelines_2004_selected = %Rget pipelines_2004_selected
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,NAME,MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES,AGE_UNKNOWN_MILES
1498,31579,2009,PETROLEUM & REFINED PRODUCTS,"MAGELLAN PIPELINES HOLDINGS, LP",14.0,0.0,0.0,0.0,11.0,0.0,1.0,0.0,1.0,0.0,0.0,14.0,0.0
1119,31174,2004,PETROLEUM & REFINED PRODUCTS,SHELL OIL PRODUCTS US,364.0,0.0,0.0,66.0,115.0,87.0,29.0,57.0,2.0,0.0,0.0,364.0,0.0
788,2731,2009,HVLS,CHEVRON PIPE LINE CO,443.34,1249.29,257.05,448.2,905.75,519.06,130.43,85.82,91.36,0.0,0.0,443.34,0.0
1785,31864,2007,CRUDE OIL,APACHE CORPORATION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0
458,2190,2007,PETROLEUM & REFINED PRODUCTS,CENTRAL FLORIDA PIPELINE CORP,168.0,0.0,0.0,0.0,0.0,3.79,1.59,0.5,0.5,0.0,0.0,168.0,0.0


## 3.4 Merge 2010- and 2004- data

### 3.4.1 Unify commodity names

In [31]:
new_names_2010 = {'Crude Oil': 'crude', 
                  'CO2': 'co2',
                  'Fuel Grade Ethanol (dedicated system)': 'fge', 
                  'HVL': 'hvl',
                  'Refined and/or Petroleum Product (non-HVL)': 'non-hvl'}

pipelines_2010_selected = pipelines_2010_selected.replace({'COMMODITY': new_names_2010})
pipelines_2010_selected.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES
2847,39071,2013,HOLLIMON OIL CORPORATION,crude,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
82,12628,2017,MOBIL PIPE LINE COMPANY,hvl,119.76,15.88,0.0,0.0,19.7,0.63,53.31,2.29,1.02,10.33,0.0,,119.76
2497,30825,2013,CITGO PETROLEUM CORPORATION (REFINERY),hvl,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,8.0


In [32]:
new_names_2004 = {'CRUDE OIL': 'crude', 
                  'HVLS': 'hvl', 
                  'PETROLEUM & REFINED PRODUCTS': 'non-hvl'}

pipelines_2004_selected = pipelines_2010_selected.replace({'COMMODITY': new_names_2004})
pipelines_2004_selected.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES
105,18275,2017,SHELL CHEMICAL CO,non-hvl,6.3,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,0.0,0.0,6.3
3610,31476,2018,ROSE ROCK MIDSTREAM L.P.,non-hvl,27.155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.843,,27.155
4210,30735,2016,TESORO ALASKA PIPELINE COMPANY LLC,non-hvl,69.2,0.0,0.0,0.0,0.0,0.0,54.8,0.0,0.5,0.4,0.0,,69.2


### 3.4.2 Merge

In [33]:
pipelines_2004_selected['YEAR'] = pipelines_2004_selected['YEAR'].astype('int64')
pipelines_2004_selected.dtypes

OPERATOR_ID           object
YEAR                   int64
NAME                  object
COMMODITY             object
MILES                float64
AGE_UNKNOWN_MILES    float64
MILES_PRE_1940       float64
MILES_1940           float64
MILES_1950           float64
MILES_1960           float64
MILES_1970           float64
MILES_1980           float64
MILES_1990           float64
MILES_2000           float64
MILES_2010           float64
OFFSHORE_MILES       float64
TOTAL_MILES          float64
dtype: object

In [34]:
pipelines_2010_selected.dtypes

OPERATOR_ID           object
YEAR                   int64
NAME                  object
COMMODITY             object
MILES                float64
AGE_UNKNOWN_MILES    float64
MILES_PRE_1940       float64
MILES_1940           float64
MILES_1950           float64
MILES_1960           float64
MILES_1970           float64
MILES_1980           float64
MILES_1990           float64
MILES_2000           float64
MILES_2010           float64
OFFSHORE_MILES       float64
TOTAL_MILES          float64
dtype: object

Making some adjustments to make the merging seamless.

In [35]:
pipelines_2004_selected = pipelines_2004_selected[['OPERATOR_ID', 'YEAR', 'NAME', 'COMMODITY', 'MILES', 
                                                   'AGE_UNKNOWN_MILES', 'MILES_PRE_1940', 'MILES_1940', 
                                                   'MILES_1950', 'MILES_1960', 'MILES_1970', 'MILES_1980', 
                                                   'MILES_1990', 'MILES_2000', 'MILES_2010', 'OFFSHORE_MILES', 
                                                   'TOTAL_MILES']]

In [36]:
pre_sample = pd.concat([pipelines_2010_selected, pipelines_2004_selected])
pre_sample.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES
423,32488,2017,"ENI US OPERATING CO, INC",crude,15.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.3,3.6,18.9
3824,39047,2018,"GEL OFFSHORE PIPELINE, LLC",crude,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4526,38964,2016,PHILADELPHIA ENERGY SOLUTIONS REFINING AND MAR...,non-hvl,15.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.66,0.0,0.0,,15.24
3391,2731,2018,CHEVRON PIPE LINE CO,non-hvl,283.1,0.0,0.0,9.9,0.0,86.1,33.7,1.3,11.7,7.1,0.3,,283.1
4139,19410,2016,THUMS LONG BEACH CO,hvl,0.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.27


In [37]:
%%R -i pre_sample
nrow(pre_sample %>%
    filter(OPERATOR_ID == '31618') %>%
    filter(YEAR == '2017') %>%
    filter(COMMODITY == 'hvl'))

[1] 2


## 3.5 Calculate percentage offshore and average age

In [38]:
def calc_avg_age(df):
    avg_age = ((df['MILES_PRE_1940'] * 90 + 
                df['MILES_1940'] * 75 + 
                df['MILES_1950'] * 65 + 
                df['MILES_1960'] * 55 + 
                df['MILES_1970'] * 45 + 
                df['MILES_1980'] * 35 + 
                df['MILES_1990'] * 25 + 
                df['MILES_2000'] * 15 + 
                df['MILES_2010'] * 5) /
               (df['MILES_PRE_1940'] + df['MILES_1940'] + df['MILES_1950'] + df['MILES_1960'] + 
                df['MILES_1970'] + df['MILES_1980'] + df['MILES_1990'] + df['MILES_2000'] + df['MILES_2010']))
    return avg_age

In [39]:
pre_sample['AVG_AGE'] = calc_avg_age(pre_sample)
pre_sample['PERC_OFFSHORE'] = pre_sample['OFFSHORE_MILES'].fillna(0.0) / (pre_sample['TOTAL_MILES'].fillna(0.0) + 0.1)


pre_sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES,AVG_AGE,PERC_OFFSHORE
2376,11551,2013,"DELEK LOGISTICS OPERATING, LLC.",crude,146.88,272.79,0.0,102.98,43.69,0.0,2.73,0.0,4.5,96.62,10.65,,146.88,47.100548,0.0
4590,39229,2016,CAMINO REAL GATHERING CO LLC,crude,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.68,,0.0,5.0,0.0
1157,32602,2011,OXY USA INC,crude,5.542,0.0,0.0,0.0,0.0,0.0,2.029,0.0,0.0,3.513,0.0,,5.542,25.983399,0.0


## 3.6 Filter commodities and remove offshore operators

### 3.6.1 Commodities

In [40]:
pre_sample = pre_sample[pre_sample['COMMODITY'].isin(['crude', 'hvl', 'non-hvl'])]
pre_sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES,AVG_AGE,PERC_OFFSHORE
2813,38926,2013,WILDCAT MIDSTREAM OPERATING LLC,hvl,1.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.1,,1.21,5.0,0.0
1129,32483,2011,"BRIDGER LAKE, LLC",crude,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,,0.0
4993,31776,2015,CONTINUUM MIDSTREAM LLC,hvl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.34,0.0,,0.0,15.0,0.0


### 3.6.2 Offshore operators

### Double check that we are filtering the correct observations.

In [41]:
# For the operators, we remove only those segments (commodities) that have a share of offshore. 
pre_sample['OFFSHORE_MAX'] = (pre_sample['PERC_OFFSHORE'].
                              groupby([pre_sample['OPERATOR_ID'], 
                              pre_sample['COMMODITY']]).transform('max'))
pre_sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES,AVG_AGE,PERC_OFFSHORE,OFFSHORE_MAX
2743,32455,2013,"HAWTHORN OIL TRANSPORTATION (OKLAHOMA), INC",crude,8.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.41,,8.75,5.0,0.0,0.0
4646,39474,2016,"OXY MIDSTREAM OPERATING COMPANY, LLC",hvl,6.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.1,,6.91,5.0,0.0,0.0
5290,99001,2015,DEFENSE FUEL SUPPLY POINT,non-hvl,4.72,0.0,0.0,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,,4.72,55.0,0.0,0.0


In [42]:
%%R -i pre_sample
pre_sample %>%
    group_by(OPERATOR_ID, COMMODITY) %>%
    summarize(max_offshore = max(PERC_OFFSHORE)) %>%
    mutate(max_offshore = sprintf('%0.2f', max_offshore)) %>%
    {table(.$max_offshore)}


0.00 0.01 0.02 0.03 0.04 0.08 0.09 0.11 0.13 0.14 0.20 0.21 0.25 0.34 0.38 0.45 
 815    1    1    1    1    2    2    1    1    1    1    1    1    1    1    1 
0.56 0.62 0.69 0.70 0.78 0.84 0.87 0.91 0.96 0.97 0.98 0.99 1.00 
   1    1    1    1    1    1    3    2    1    4    3    8    4 


In [43]:
%%R -i pre_sample 
pre_sample %>%
    group_by(OPERATOR_ID, COMMODITY) %>%
    summarize(max_offshore = first(OFFSHORE_MAX)) %>%
    mutate(max_offshore = sprintf('%0.2f', max_offshore)) %>%
    {table(.$max_offshore)}


0.00 0.01 0.02 0.03 0.04 0.08 0.09 0.11 0.13 0.14 0.20 0.21 0.25 0.34 0.38 0.45 
 815    1    1    1    1    2    2    1    1    1    1    1    1    1    1    1 
0.56 0.62 0.69 0.70 0.78 0.84 0.87 0.91 0.96 0.97 0.98 0.99 1.00 
   1    1    1    1    1    1    3    2    1    4    3    8    4 


The results are the same, so we defined this variable correctly. Seems like we don't loose too many operators when we drop all that have any share in offshore.

In [44]:
%%R -i pre_sample
pre_sample %>%
    group_by(OPERATOR_ID, COMMODITY) %>%
    summarize(max_offshore = max(PERC_OFFSHORE)) %>%
    filter(max_offshore < 0.1) %>%
    mutate(max_offshore = sprintf('%0.2f', max_offshore)) %>%
    {table(.$max_offshore)}


0.00 0.01 0.02 0.03 0.04 0.08 0.09 
 815    1    1    1    1    2    2 


### Filter

In [45]:
len(pre_sample)

9970

In [46]:
pre_sample = pre_sample.loc[pre_sample['OFFSHORE_MAX'] == 0.0].reset_index(drop=True)
pre_sample = pre_sample.drop(columns=['OFFSHORE_MILES', 'TOTAL_MILES', 'OFFSHORE_MAX'])
pre_sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,AVG_AGE,PERC_OFFSHORE
7482,32109,2010,ONEOK NGL PIPELINE LP,non-hvl,291.28,0.0,0.0,0.0,221.59,163.34,0.0,35.93,14.73,0.15,0.0,57.408317,0.0
8114,39649,2018,"IRONWOOD MIDSTREAM ENERGY PARTNERS, LLC",crude,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.88,5.0,0.0
6008,32096,2014,WYNNEWOOD REFINERY COMPANY,crude,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0.0,15.0,0.0


In [47]:
len(pre_sample)

9296

## 3.7 Save results

In [48]:
pre_sample.to_feather(f'../preprocessed_data/pre_sample_{today}.feather')

## 3.7.1 Write original data to .feather for reference

Some columns get erroneously read to data type 'O'. We convert those manually to str type.

In [49]:
pipelines_2010_present.loc[:, pipelines_2010_present.dtypes == 'O'] = pipelines_2010_present.loc[
    :, pipelines_2010_present.dtypes == 'O'].astype(str)

pipelines_2004_2009.loc[:, pipelines_2004_2009.dtypes == 'O'] = pipelines_2004_2009.loc[
    :, pipelines_2004_2009.dtypes == 'O'].astype(str)

incidents.loc[:, incidents.dtypes == 'O'] = incidents.loc[
    :, incidents.dtypes == 'O'].astype(str)

In [50]:
pipelines_2010_present.to_feather(f'../data/pipelines_2010_{today}.feather')
pipelines_2004_2009.to_feather(f'../data/pipelines_2004_{today}.feather')
incidents.to_feather(f'../data/incidents_{today}.feather')

In [51]:
%%R -i pre_sample
nrow(pre_sample %>%
    filter(OPERATOR_ID == 31618) %>%
    filter(YEAR == 2017) %>%
    filter(COMMODITY == 'hvl'))

[1] 2
