# 3 - Select columns, filter onshore

In [1]:
import pandas as pd
import numpy as np
from datetime import date

today = date.today().isoformat()

## 3.1 Extract relevant columns of the pipeline incidents dataset

In [2]:
incidents = pd.read_excel('../data/incidents_2019-08-01/hl2010toPresent.xlsx', 
                          sheet_name=1)
incidents_selected = incidents[['OPERATOR_ID', 'LOCAL_DATETIME', 'NAME', 'COMMODITY_RELEASED_TYPE', 
                                'SERIOUS', 'SIGNIFICANT', 'LOCATION_LATITUDE', 'LOCATION_LONGITUDE', 
                                'ON_OFF_SHORE']].copy()

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY_RELEASED_TYPE,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ON_OFF_SHORE
662,22610,2011-11-30 06:49:00,"MAGELLAN PIPELINE COMPANY, LP",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,YES,29.79428,-95.28162,ONSHORE
750,22610,2012-03-09 09:58:00,"MAGELLAN PIPELINE COMPANY, LP",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,NO,41.36417,-96.05833,ONSHORE
964,26085,2012-09-28 12:10:00,"PLAINS MARKETING, L.P.",CRUDE OIL,NO,YES,33.86206,-118.22369,ONSHORE
1926,2552,2015-01-20 10:15:00,COLONIAL PIPELINE CO,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,NO,39.849005,-75.502586,ONSHORE
2668,31672,2016-09-22 10:23:00,"CHAPARRAL ENERGY, LLC",CO2 (CARBON DIOXIDE),NO,YES,36.202923,-100.920509,ONSHORE


In [3]:
import numpy as np

np.unique(incidents_selected['COMMODITY_RELEASED_TYPE'])

array(['BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)',
       'CO2 (CARBON DIOXIDE)', 'CRUDE OIL',
       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS',
       'REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS'],
      dtype=object)

In [4]:
incidents_selected['ON_OFF_SHORE'].value_counts()

ONSHORE     3791
OFFSHORE      28
Name: ON_OFF_SHORE, dtype: int64

### 3.1.1 Fix data types

In [5]:
incidents_selected.dtypes

OPERATOR_ID                         int64
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

In [6]:
incidents_selected['OPERATOR_ID'] = incidents_selected['OPERATOR_ID'].astype(str)
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

Make sure SERIOUS and SIGNIFICANT are booleans.

In [7]:
(incidents_selected[['SERIOUS']] == 'YES')['SERIOUS'].value_counts()

False    3803
True       16
Name: SERIOUS, dtype: int64

In [8]:
incidents_selected['SERIOUS'] = incidents_selected[['SERIOUS']] == 'YES'

In [9]:
(incidents_selected[['SIGNIFICANT']] == 'YES')['SIGNIFICANT'].value_counts()

False    2364
True     1455
Name: SIGNIFICANT, dtype: int64

In [10]:
incidents_selected['SIGNIFICANT'] = incidents_selected[['SIGNIFICANT']] == 'YES'

In [11]:
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                              bool
SIGNIFICANT                          bool
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

### 3.1.2 Recode on/offshore to boolean, fix column names

In [12]:
incidents_selected['ONSHORE'] = incidents_selected[['ON_OFF_SHORE']] == 'ONSHORE'
incidents_selected = incidents_selected.drop(columns=['ON_OFF_SHORE'])
incidents_selected = incidents_selected.rename(columns={'COMMODITY_RELEASED_TYPE': 'COMMODITY'})

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ONSHORE
2038,31618,2015-04-16 07:45:00,ENTERPRISE PRODUCTS OPERATING LLC,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,False,False,29.57863,-95.017273,True
3400,39153,2018-07-05 09:00:00,"ENLINK MIDSTREAM OPERATING, LP",CRUDE OIL,False,False,35.843945,-98.243177,True
1247,26041,2013-05-24 17:30:00,"KINDER MORGAN LIQUID TERMINALS, LLC",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,False,40.52243,-74.25384,True
3487,31574,2018-09-18 07:45:00,"WESTERN REFINING PIPELINE, LLC",CRUDE OIL,False,False,32.0644,-103.67686,True
2666,31189,2016-09-19 06:45:00,BP PIPELINE (NORTH AMERICA) INC.,CRUDE OIL,False,True,35.989585,-96.578637,True


In [13]:
incidents_selected.to_feather(f'../preprocessed_data/incidents_selected_{today}.feather')

## 3.2 Extract relevant columns of the pipeline system dataset (2010-)

### 3.2.1 Select relevant columns

In [14]:
from os import listdir

pipelines_2010_present = [file for file in listdir('../data/pipelines_2010_present_2019-08-02/') if 'annual_hazardous_liquid' in file]
pipelines_2010_present = pd.concat([pd.read_excel(f'../data/pipelines_2010_present_2019-08-02/{file}', skiprows=2) 
                                    for file in pipelines_2010_present])
pipelines_2010_present = pipelines_2010_present.reset_index(drop = True)

pipelines_2010_present.sample(5)

Unnamed: 0,DATAFILE_AS_OF,REPORT_YEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,OPERATOR_ID,PARTA2NAMEOFCOMP,PARTA4STREET,PARTA4CITY,PARTA4STATE,PARTA4ZIP,...,PARTE2010HF,PARTE2010LF,PARTE2010TOTAL,PARTETOTAL,PARTETOTALHF,PARTETOTALLF,REPORT_SUBMISSION_TYPE,REPORT_DATE,FILING_DATE,FORM_REV
2530,2019-07-01 10:54:15,2013,20140850,13168,31178,XTO ENERGY INC,810 HOUSTON STREET,FORT WORTH,TX,76102,...,0.0,0.0,0.0,0.0,0.0,0.0,SUPPLEMENTAL,2014-06-12 15:35:50,2014-07-08 08:49:54,7000-1.1 (Rev. 06-2011)
1608,2019-07-01 11:03:57,2014,20151378,13874,32409,"GENESIS C02 PIPELINE, L.P.",919 MILAM; SUITE 2100,HOUSTON,TX,77002-5417,...,0.0,0.0,0.0,8.99,8.99,0.0,SUPPLEMENTAL,2015-06-11 10:17:36,2015-08-04 08:52:46,7000-1.1 (Rev. 06-2014)
4164,2019-07-01 11:21:18,2016,20172687,15062,26041,"KINDER MORGAN LIQUID TERMINALS, LLC",1001 LOUISIANA STREET,HOUSTON,TX,77002,...,0.0,0.0,0.0,0.72,0.72,0.0,INITIAL,2017-06-13 18:10:18,2017-06-13 18:10:18,7000-1.1 (Rev. 06-2014)
478,2019-07-01 11:28:09,2017,20183129,15634,38926,BLACK BEAR MIDSTREAM LLC,9805 KATY FREEWAY SUITE 950,HOUSTON,TX,77024,...,40.1,0.0,40.1,40.1,40.1,0.0,INITIAL,2018-05-23 11:25:23,2018-05-23 11:25:23,7000-1.1 (Rev. 06-2014)
2861,2019-07-01 10:54:15,2013,20141121,13212,39191,CONSOLIDATED EDISON CO OF NY,4 IRVING PLACE,NEW YORK,NY,10003,...,0.0,0.0,0.0,0.0,0.0,0.0,INITIAL,2014-11-12 16:58:33,2014-11-12 16:58:33,7000-1.1 (Rev. 06-2011)


In [15]:
pipelines_2010_selected = pipelines_2010_present[[
    'OPERATOR_ID', 'REPORT_YEAR', 'PARTA2NAMEOFCOMP', 'PARTA5COMMODITY', 'PARTBHCAONSHORE', 
    'PARTEUNKNTOTAL', 'PARTEPRE40TOTAL', 'PARTE1940TOTAL', 'PARTE1950TOTAL', 'PARTE1960TOTAL', 
    'PARTE1970TOTAL', 'PARTE1980TOTAL', 'PARTE1990TOTAL', 'PARTE2000TOTAL', 'PARTE2010TOTAL',
    'PARTBHCAOFFSHORE', 'PARTBHCATOTAL']].copy()

pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,REPORT_YEAR,PARTA2NAMEOFCOMP,PARTA5COMMODITY,PARTBHCAONSHORE,PARTEUNKNTOTAL,PARTEPRE40TOTAL,PARTE1940TOTAL,PARTE1950TOTAL,PARTE1960TOTAL,PARTE1970TOTAL,PARTE1980TOTAL,PARTE1990TOTAL,PARTE2000TOTAL,PARTE2010TOTAL,PARTBHCAOFFSHORE,PARTBHCATOTAL
4534,39013,2016,TESORO SOCAL PIPELINE COMPANY LLC,Refined and/or Petroleum Product (non-HVL),106.2,28.7,0.03,0.0,18.0,8.4,18.0,5.9,7.4,7.4,0.1,,106.2
3055,30683,2010,WESTLAKE PETROCHEMICALS LLC,Refined and/or Petroleum Product (non-HVL),0.663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.663
4960,31570,2015,TESORO HIGH PLAINS PIPELINE COMPANY LLC,Crude Oil,16.1,4.4,0.0,0.0,155.2,40.0,38.8,73.9,72.2,0.0,18.7,,16.1
3253,32096,2010,WYNNEWOOD REFINERY COMPANY,Crude Oil,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0.0,,2.1
2945,12105,2010,"Magellan Ammonia Pipeline, L.P.",HVL,273.0,0.0,0.0,0.0,0.0,678.0,404.0,0.0,0.5,0.0,0.0,,273.0


In [16]:
pipelines_2010_selected.dtypes

OPERATOR_ID           int64
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCAONSHORE     float64
PARTEUNKNTOTAL      float64
PARTEPRE40TOTAL     float64
PARTE1940TOTAL      float64
PARTE1950TOTAL      float64
PARTE1960TOTAL      float64
PARTE1970TOTAL      float64
PARTE1980TOTAL      float64
PARTE1990TOTAL      float64
PARTE2000TOTAL      float64
PARTE2010TOTAL      float64
PARTBHCAOFFSHORE    float64
PARTBHCATOTAL       float64
dtype: object

In [17]:
pipelines_2010_selected['OPERATOR_ID'] = pipelines_2010_selected['OPERATOR_ID'].astype(str)
pipelines_2010_selected['OPERATOR_ID'].dtype

dtype('O')

In [18]:
pipelines_2010_selected = pipelines_2010_selected.rename(
    columns={'REPORT_YEAR': 'YEAR', 'PARTA2NAMEOFCOMP': 'NAME', 'PARTA5COMMODITY': 'COMMODITY', 'PARTBHCAONSHORE': "MILES", 
             'PARTBHCAOFFSHORE': 'OFFSHORE_MILES', 'PARTBHCATOTAL': 'TOTAL_MILES', 'PARTEUNKNTOTAL': 'AGE_UNKNOWN_MILES', 
             'PARTEPRE40TOTAL': 'MILES_PRE_1940', 'PARTE1940TOTAL': 'MILES_1940', 'PARTE1950TOTAL': 'MILES_1950', 
             'PARTE1960TOTAL': 'MILES_1960', 'PARTE1970TOTAL': 'MILES_1970', 'PARTE1980TOTAL': 'MILES_1980', 
             'PARTE1990TOTAL': 'MILES_1990', 'PARTE2000TOTAL': 'MILES_2000', 'PARTE2010TOTAL': 'MILES_2010'})
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES
5222,39108,2015,BLUEFISH PIPELINE LLC,Refined and/or Petroleum Product (non-HVL),4.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,4.01
872,30782,2011,HARVEST PIPELINE COMPANY,Crude Oil,111.306,0.42,0.0,0.0,0.0,13.995,0.71,4.94,138.66,4.496,78.045,,111.306
1808,2731,2012,CHEVRON PIPE LINE CO,Crude Oil,357.7,68.0,0.0,128.0,333.0,223.0,186.0,180.0,15.0,0.0,0.0,3.5,361.2
473,38919,2017,LEGACY RESERVES OPERATING LP,Crude Oil,0.0,0.0,0.0,0.0,0.0,11.4,0.0,0.0,0.0,0.0,0.0,,0.0
3242,32051,2010,"MARTIN OPERATING PARTNERSHIP, L.P.",HVL,10.0,207.845,0.0,0.0,0.0,0.0,3.55,0.0,0.0,1.39,0.0,,10.0


### 3.2.2 Calculate percentage offshore pipelines and average age of pipelines

In [19]:
def calc_avg_age(df):
    avg_age = ((df['MILES_PRE_1940'] * 90 + 
                df['MILES_1940'] * 75 + 
                df['MILES_1950'] * 65 + 
                df['MILES_1960'] * 55 + 
                df['MILES_1970'] * 45 + 
                df['MILES_1980'] * 35 + 
                df['MILES_1990'] * 25 + 
                df['MILES_2000'] * 15 + 
                df['MILES_2010'] * 5) /
               (df['MILES_PRE_1940'] + df['MILES_1940'] + df['MILES_1950'] + df['MILES_1960'] + 
                df['MILES_1970'] + df['MILES_1980'] + df['MILES_1990'] + df['MILES_2000'] + df['MILES_2010']))
    return avg_age

pipelines_2010_selected['AVG_AGE'] = calc_avg_age(pipelines_2010_selected)

In [20]:
def calc_perc_offshore(offshore_miles: pd.Series, total_miles: pd.Series):
    offshore_miles = offshore_miles.fillna(0.0)
    total_miles = total_miles.fillna(0.0)
    
    # Adding +0.1 Miles to avoid division by zero
    perc_offshore = (offshore_miles / (total_miles + 0.1))
    return offshore_miles

In [21]:
pipelines_2010_selected['OFFSHORE_MILES'].isna().value_counts()

True     4667
False     627
Name: OFFSHORE_MILES, dtype: int64

In [22]:
(pipelines_2010_selected['TOTAL_MILES'] == 0.0).value_counts()

False    4192
True     1102
Name: TOTAL_MILES, dtype: int64

In [23]:
# Add 0.1 mile to divisor to avoid NaNs
pipelines_2010_selected['PERC_OFFSHORE'] = calc_perc_offshore(pipelines_2010_selected['OFFSHORE_MILES'],
                                                              pipelines_2010_selected['TOTAL_MILES'])
pipelines_2010_selected[['OPERATOR_ID', 'YEAR', 'COMMODITY', 'AVG_AGE', 'PERC_OFFSHORE']].sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,AVG_AGE,PERC_OFFSHORE
1172,39038,2011,Crude Oil,35.0,0.0
278,31579,2017,Refined and/or Petroleum Product (non-HVL),31.193353,0.0
914,31189,2011,HVL,43.148148,0.0
4004,39910,2018,HVL,,0.0
3700,32109,2018,HVL,31.470428,0.0


In [24]:
pipelines_2010_selected['PERC_OFFSHORE'].isna().value_counts()

False    5294
Name: PERC_OFFSHORE, dtype: int64

### 3.2.3 Filter onshore

In [25]:
# For the operators, we remove only those segments (commodities) that have a share of offshore. 
pipelines_2010_selected['OFFSHORE_MAX'] = (pipelines_2010_selected['PERC_OFFSHORE'].
                                           groupby([pipelines_2010_selected['OPERATOR_ID'], 
                                                    pipelines_2010_selected['COMMODITY']]).transform('max'))
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,OFFSHORE_MILES,TOTAL_MILES,AVG_AGE,PERC_OFFSHORE,OFFSHORE_MAX
3844,39122,2018,UTICA EAST OHIO MIDSTREAM LLC,HVL,2.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.8,,2.9,5.0,0.0,0.0
4182,26099,2016,TAMPA BAY PIPELINE CO.,HVL,92.8,0.0,0.0,0.0,0.0,0.0,45.82,28.99,1.19,20.75,1.5,,92.8,34.86056,0.0,0.0
3353,39014,2010,"GALVESTON BAY ENERGY, LLC",Crude Oil,,13.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,13.2
3510,26103,2018,"TEXAS EASTMAN DIVISION, EASTMAN CHEMICAL CO",HVL,116.26,0.0,0.0,0.0,0.0,183.0,0.0,218.1,0.0,0.0,1.6,,116.26,43.969456,0.0,0.0
1441,31443,2014,"ALON USA, LP",Refined and/or Petroleum Product (non-HVL),12.6,0.0,0.0,0.0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,,12.6,45.0,0.0,0.0


In [26]:
len(pipelines_2010_selected)

5294

In [27]:
pipelines_2010_selected = pipelines_2010_selected.loc[pipelines_2010_selected['OFFSHORE_MAX'] == 0.0].reset_index(drop=True)
pipelines_2010_selected = pipelines_2010_selected.drop(columns=
                                                       ['OFFSHORE_MILES', 'TOTAL_MILES', 'OFFSHORE_MAX'])
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,AVG_AGE,PERC_OFFSHORE
2351,31059,2013,BASF CORPORATION,HVL,5.1,0.0,0.0,0.0,0.0,0.0,0.24,7.24,0.0,0.0,0.0,35.320856,0.0
3177,7063,2018,HARBOR PIPELINE CO,Refined and/or Petroleum Product (non-HVL),78.41,0.0,0.0,0.0,77.36,0.0,0.0,0.0,0.0,0.0,3.03,62.738525,0.0
4573,30909,2015,TRANSMONTAIGNE OPERATING COMPANY L.P.,Refined and/or Petroleum Product (non-HVL),66.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2343,31023,2013,CITGO REFINING & CHEMICAL CO. L.P.,Crude Oil,7.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
0,300,2017,"PLAINS PIPELINE, L.P.",HVL,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,5.004173,0.0


In [28]:
len(pipelines_2010_selected)

4957

In [29]:
len(np.unique(pipelines_2010_selected['OPERATOR_ID']))

667

## 3.3 Extract relevant columns of the pipeline system dataset (2004)

In [30]:
pipelines_2004_2009 = [file for file in listdir('../data/pipelines_2004_2009_2019-08-02/') if 'annual_hazardous_liquid' in file]
pipelines_2004_2009 = pd.concat([pd.read_excel(f'../data/pipelines_2004_2009_2019-08-02/{file}') 
                                 for file in pipelines_2004_2009])
pipelines_2004_2009 = pipelines_2004_2009.reset_index(drop = True)

pipelines_2004_2009.sample(5)

Unnamed: 0,ORIGINAL,SUPPLEMENT,YR,RPTID,HL_SUP_ID,SYSTEM_TYPE,OPERATOR_ID,NAME,PARENT,OFSTREET,...,BA_5,BA_6,BA_7,BA_8,DOR,DOE,PNAME,PPHONE,PEMAIL,PFAX
729,Y,N,2006,20060160,2397,CO2 OR OTHER,2731,CHEVRON PIPE LINE COMPANY,,4800 FOURNACE PLACE,...,0.0,0.0,0.0,,2007-06-08,2007-06-08 15:11:46,J. R. BURKE,7134320000.0,RBURKE@CHEVRON.COM,7134320000.0
1451,N,Y,2005,20050199,1150,CRUDE OIL,31618,ENTERPRISE PRODUCTS OPERATING LP,,2727 NORTH WEST LOOP,...,0.0,0.0,0.0,,2006-06-08,2006-06-08 16:14:27,JOHN JEWETT,7138037918.0,JJEWETT@EPROD.COM,7138040000.0
7,Y,N,2008,20080146,4787,CRUDE OIL,31552,ALPINE TRANSPORTATION CO.,,"700 G STREET, P. O. BOX 100360",...,0.0,0.0,0.0,,2009-05-12,2009-05-12 19:31:38,"SHERRY TIMMERMAN, DOT PROGRAM COORDINATOR",9072633704.0,SHERRY.A.TIMMERMAN@CONOCOPHILLIPS.COM,9072633748.0
579,Y,N,2006,20060254,2511,HVLS,12624,EXXONMOBIL OIL CORPORATION - BEAUMONT REFINERY...,,800 BELL STREET,...,0.0,0.0,0.0,,2007-06-14,2007-06-14 00:00:00,THAD MASSENGALE,7136560000.0,THAD.MASSENGALE@EXXONMOBIL.COM,7136570000.0
1306,Y,N,2009,20090188,7066,HVLS,31875,MERIT ENERGY COMPANY,,1510 EAST THOMAS ROAD,...,0.0,0.0,0.0,0.0,2010-06-02,2010-06-02 11:52:56,BILL ELLSWORTH,3073282345.0,BILL.LONEY@MERITENERGY.COM,


### 3.3.1 Clean name column

In [31]:
pipelines_2004_2009['NAME_FIXED'] = np.where(pd.isnull(pipelines_2004_2009['PARENT']), 
                                             pipelines_2004_2009['NAME'], 
                                             pipelines_2004_2009['PARENT'])
pd.isnull(pipelines_2004_2009['NAME_FIXED']).value_counts()

False    2730
Name: NAME_FIXED, dtype: int64

In [32]:
pipelines_2004_2009[['NAME_FIXED', 'NAME', 'PARENT']].sample(5)

Unnamed: 0,NAME_FIXED,NAME,PARENT
2145,MOBIL CORP,MOBIL CORP,
1633,FPL GROUP,FLORIDA POWER & LIGHT,FPL GROUP
1678,"ENBRIDGE ENERGY PARTNERS, L.P.","ENBRIDGE ENERGY, LIMITED PARTNERSHIP","ENBRIDGE ENERGY PARTNERS, L.P."
2593,"PETROLOGISTICS OLEFINS, LLC","PETROLOGISTICS OLEFINS, LLC",
2272,ONEOK PARTNERS,"BEAR PAW ENERGY, LLC",ONEOK PARTNERS


### 3.3.2 Select columns

In [33]:
pipelines_2004_selected = pipelines_2004_2009[['OPERATOR_ID', 'YR', 'NAME_FIXED', 'SYSTEM_TYPE', 'HCAONM', 'ERWTM_1',
                                               'ERWTM_2', 'ERWTM_3', 'ERWTM_4', 'ERWTM_5', 'ERWTM_6', 'ERWTM_7',
                                               'ERWTM_8', 'HCAOFFM', 'HCAMT']].copy()
pipelines_2004_selected.dtypes

OPERATOR_ID      int64
YR               int64
NAME_FIXED      object
SYSTEM_TYPE     object
HCAONM         float64
ERWTM_1        float64
ERWTM_2        float64
ERWTM_3        float64
ERWTM_4        float64
ERWTM_5        float64
ERWTM_6        float64
ERWTM_7        float64
ERWTM_8        float64
HCAOFFM        float64
HCAMT          float64
dtype: object

In [34]:
pipelines_2004_selected['OPERATOR_ID'] = pipelines_2004_selected['OPERATOR_ID'].astype(str)
pipelines_2004_selected.dtypes

OPERATOR_ID     object
YR               int64
NAME_FIXED      object
SYSTEM_TYPE     object
HCAONM         float64
ERWTM_1        float64
ERWTM_2        float64
ERWTM_3        float64
ERWTM_4        float64
ERWTM_5        float64
ERWTM_6        float64
ERWTM_7        float64
ERWTM_8        float64
HCAOFFM        float64
HCAMT          float64
dtype: object

In [35]:
pipelines_2004_selected = pipelines_2004_selected.rename(
    columns={'YR': 'YEAR', 'NAME_FIXED': 'NAME', 'HCAONM': 'MILES', 'ERWTM_1': 'MILES_PRE_1940', 
             'HCAOFFM': 'MILES_OFFSHORE', 'HCAMT': 'MILES_TOTAL', 'SYSTEM_TYPE': 'COMMODITY', 
             'ERWTM_2': 'MILES_1940',
             'ERWTM_3': 'MILES_1950',
             'ERWTM_4': 'MILES_1960',
             'ERWTM_5': 'MILES_1970',
             'ERWTM_6': 'MILES_1980',
             'ERWTM_7': 'MILES_1990',
             'ERWTM_8': 'MILES_2000'})
pipelines_2004_selected['MILES_2010'] = 0.0
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_OFFSHORE,MILES_TOTAL,MILES_2010
2622,31583,2007,"TESORO REFINING & MARKETING CO., MOUNTAIN REGION",CRUDE OIL,5.2,2.3,0.0,0.0,0.9,0.0,0.0,2.0,0.0,0.0,5.2,0.0
2228,31130,2004,"DUKE ENERGY FIELD SERVICES, LP",HVLS,2.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0,2.0,0.0
2133,18386,2004,BP PIPELINES NORTH AMERICA INC.,PETROLEUM & REFINED PRODUCTS,393.0,140.0,2.0,137.0,87.0,96.0,8.0,6.0,1.0,0.0,393.0,0.0
1623,31684,2005,CONOCOPHILLIPS PIPE LINE CO.,CRUDE OIL,1353.0,573.0,637.0,1024.0,741.0,98.0,117.0,108.0,17.0,0.0,1353.0,0.0
1863,26149,2004,ALYESKA PIPELINE SERVICE COMPANY,CRUDE OIL,171.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,171.0,0.0


### 3.3.3 Calculate on/offshore, age

In [36]:
pipelines_2004_selected['AVG_AGE'] = calc_avg_age(pipelines_2004_selected)

In [37]:
pipelines_2004_selected['PERC_OFFSHORE'] = (pipelines_2004_selected['MILES_OFFSHORE'].fillna(0.0) / 
                                            (pipelines_2004_selected['MILES_TOTAL'].fillna(0.0) + 0.1))
# Function somehow does not work (does not divide ?!)
# calc_perc_offshore(pipelines_2004_selected['MILES_OFFSHORE'], pipelines_2004_selected['ON_AND_OFFSHORE'])
pipelines_2004_selected = pipelines_2004_selected.drop(columns = ['MILES_OFFSHORE', 'MILES_TOTAL'])
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,AVG_AGE,PERC_OFFSHORE
2499,30666,2007,"ENMARK ENERGY, INC",CO2 OR OTHER,70.0,0.0,0.0,0.0,0.0,5.0,47.0,0.0,18.0,0.0,30.571429,0.0
1113,31897,2009,"THE GEORGE R. BROWN PARTNERSHIP, L.P.",CO2 OR OTHER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.7,0.0,15.0,0.0
368,22430,2008,WEST SHORE PIPELINE CO,PETROLEUM & REFINED PRODUCTS,462.726,0.0,33.532,197.464,275.15,3.184,0.0,0.0,0.0,0.0,60.131133,0.0
1520,30829,2005,"TTEPPCO CRUDE PIPELINE, L.P.",CRUDE OIL,336.0,1.0,28.0,15.0,0.0,7.0,0.0,28.0,0.0,0.0,52.911392,0.0
95,31455,2008,CONNACHER OIL AND GAS LTD.,CRUDE OIL,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6,0.0,15.0,0.0


## 3.4 Save results

In [38]:
pipelines_2004_selected.to_feather(f'../preprocessed_data/pipelines_2004_selected_{today}.feather')

In [39]:
pipelines_2010_selected.to_feather(f'../preprocessed_data/pipelines_2010_selected_{today}.feather')

## 3.4.1 Write original data to .feather for reference

Some columns get erroneously read to data type 'O'. We convert those manually to str type.

In [40]:
pipelines_2010_present.loc[:, pipelines_2010_present.dtypes == 'O'] = pipelines_2010_present.loc[
    :, pipelines_2010_present.dtypes == 'O'].astype(str)

pipelines_2004_2009.loc[:, pipelines_2004_2009.dtypes == 'O'] = pipelines_2004_2009.loc[
    :, pipelines_2004_2009.dtypes == 'O'].astype(str)

incidents.loc[:, incidents.dtypes == 'O'] = incidents.loc[
    :, incidents.dtypes == 'O'].astype(str)

In [41]:
pipelines_2010_present.to_feather(f'../data/pipelines_2010_{today}.feather')
pipelines_2004_2009.to_feather(f'../data/pipelines_2004_{today}.feather')
incidents.to_feather(f'../data/incidents_{today}.feather')