# 3 - Select columns

In [1]:
from datetime import date
from os import listdir

import numpy as np
import pandas as pd

# Run date as YYYY-MM-DD; it is embedded in every output file name below.
today = date.today().strftime('%Y-%m-%d')

## 3.1 Extract relevant columns of the pipeline incidents dataset

In [2]:
# Columns of the incidents workbook that are kept for this notebook.
incident_columns = [
    'OPERATOR_ID', 'LOCAL_DATETIME', 'NAME', 'COMMODITY_RELEASED_TYPE',
    'SERIOUS', 'SIGNIFICANT', 'LOCATION_LATITUDE', 'LOCATION_LONGITUDE',
]

# sheet_name=1 reads the second sheet of the workbook (sheet indices are zero-based).
incidents = pd.read_excel('../data/incidents_2019-08-01/hl2010toPresent.xlsx', sheet_name=1)

# Work on an independent copy so the full `incidents` frame stays untouched.
incidents_selected = incidents.loc[:, incident_columns].copy()

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY_RELEASED_TYPE,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
2227,39085,2015-09-10 00:27:00,RIMROCK MIDSTREAM,CRUDE OIL,NO,YES,37.80348,-96.5137
2548,18718,2016-06-06 15:30:00,SUNOCO PIPELINE L.P.,CRUDE OIL,NO,NO,32.08145,-96.41445
2190,2552,2015-08-21 06:12:00,COLONIAL PIPELINE CO,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,NO,36.07101,-79.93669
1352,32147,2013-08-29 07:30:00,MARATHON PIPE LINE LLC,CRUDE OIL,NO,YES,39.327116,-87.898132
1531,31684,2014-02-16 07:00:00,PHILLIPS 66 PIPELINE LLC,BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL B...,NO,NO,35.265278,-101.890833


In [3]:
# Inspect the dtypes pandas inferred for the selected incident columns.
incidents_selected.dtypes

OPERATOR_ID                         int64
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
dtype: object

In [4]:
# Cast OPERATOR_ID from integer to str; the dtype listing then reports it as object.
incidents_selected = incidents_selected.astype({'OPERATOR_ID': str})
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
dtype: object

Make sure SERIOUS and SIGNIFICANT are booleans.

In [5]:
# Preview how many rows would become True/False once SERIOUS is turned into a boolean.
incidents_selected['SERIOUS'].eq('YES').value_counts()

False    3803
True       16
Name: SERIOUS, dtype: int64

In [6]:
# Replace the text flag with a boolean: True only where the value is exactly 'YES'.
# Not idempotent: re-running after the conversion compares booleans with 'YES'
# and would turn every row False.
incidents_selected['SERIOUS'] = incidents_selected['SERIOUS'].eq('YES')

In [7]:
# Preview how many rows would become True/False once SIGNIFICANT is turned into a boolean.
incidents_selected['SIGNIFICANT'].eq('YES').value_counts()

False    2364
True     1455
Name: SIGNIFICANT, dtype: int64

In [8]:
# Replace the text flag with a boolean: True only where the value is exactly 'YES'.
# Not idempotent: re-running after the conversion compares booleans with 'YES'
# and would turn every row False.
incidents_selected['SIGNIFICANT'] = incidents_selected['SIGNIFICANT'].eq('YES')

In [9]:
# Confirm that SERIOUS and SIGNIFICANT are now reported as bool.
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                              bool
SIGNIFICANT                          bool
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
dtype: object

In [10]:
# Shorten the commodity column name to 'COMMODITY'.
incident_renames = {'COMMODITY_RELEASED_TYPE': 'COMMODITY'}
incidents_selected = incidents_selected.rename(columns=incident_renames)

In [11]:
# Persist the selected incident columns; the run date is part of the file name.
incidents_selected_path = f'../preprocessed_data/incidents_selected_{today}.feather'
incidents_selected.to_feather(incidents_selected_path)

## 3.2 Extract relevant columns of the pipeline system dataset

In [12]:
# Collect the annual hazardous-liquid workbooks for 2010-present.
# `listdir` returns entries in arbitrary order, so the names are sorted to make
# the row order of the concatenated frame reproducible across runs and machines.
pipelines_2010_dir = '../data/pipelines_2010_present_2019-08-02/'
pipelines_2010_files = sorted(
    file for file in listdir(pipelines_2010_dir) if 'annual_hazardous_liquid' in file)

# Each workbook is read with its first two rows skipped (skiprows=2);
# ignore_index=True renumbers the combined rows 0..n-1.
pipelines_2010_present = pd.concat(
    [pd.read_excel(f'{pipelines_2010_dir}{file}', skiprows=2) for file in pipelines_2010_files],
    ignore_index=True)

pipelines_2010_present.sample(5)

Unnamed: 0,DATAFILE_AS_OF,REPORT_YEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,OPERATOR_ID,PARTA2NAMEOFCOMP,PARTA4STREET,PARTA4CITY,PARTA4STATE,PARTA4ZIP,...,PARTE2010HF,PARTE2010LF,PARTE2010TOTAL,PARTETOTAL,PARTETOTALHF,PARTETOTALLF,REPORT_SUBMISSION_TYPE,REPORT_DATE,FILING_DATE,FORM_REV
4802,2019-07-01 11:08:29,2015,20161933,14170,20202,ULTRAMAR INC,2402 EAST ANAHEIM STREET,WILMINGTON,CA,90744,...,0.0,0.0,0.0,4.31,4.31,0.0,INITIAL,2016-06-08 15:23:32,2016-06-08 15:23:32,7000-1.1 (Rev. 06-2014)
591,2019-07-01 11:28:09,2017,20183638,16184,39410,RELIANCE GATHERING LLC,300 N. MARIENFELD SUITE 1100,MIDLAND,TX,79701,...,0.0,0.0,0.0,0.0,0.0,0.0,INITIAL,2018-06-15 16:51:17,2018-06-15 16:51:17,7000-1.1 (Rev. 06-2014)
4526,2019-07-01 11:21:18,2016,20172617,14987,38964,PHILADELPHIA ENERGY SOLUTIONS REFINING AND MAR...,1735 MARKET STREET,PHILADELPHIA,PA,19103,...,0.0,0.0,0.0,2.66,2.66,0.0,INITIAL,2017-06-12 13:58:00,2017-06-12 13:58:00,7000-1.1 (Rev. 06-2014)
3347,2019-07-01 10:31:50,2010,20110448,10484,32602,OXY USA INC,301 E. OCEAN BLVD.,LONG BEACH,CA,90802,...,0.0,0.0,0.0,5.542,5.542,0.0,INITIAL,2011-10-07 11:35:47,2011-10-07 11:35:47,7000-1.1 (Rev. 06-2011)
1188,2019-07-01 11:03:57,2014,20151157,13260,515,DAKOTA GASIFICATION COMPANY,GREAT PLAINS SYNFUELS PLANT 1717 EAST INTERSTA...,BISMARCK,ND,58503-0564,...,0.26,0.0,0.26,167.04,167.04,0.0,INITIAL,2015-04-27 11:43:58,2015-04-27 11:43:58,7000-1.1 (Rev. 06-2014)


In [13]:
# Collect the annual hazardous-liquid workbooks for 2004-2009.
# `listdir` returns entries in arbitrary order, so the names are sorted to make
# the row order of the concatenated frame reproducible across runs and machines.
pipelines_2004_dir = '../data/pipelines_2004_2009_2019-08-02/'
pipelines_2004_files = sorted(
    file for file in listdir(pipelines_2004_dir) if 'annual_hazardous_liquid' in file)

# ignore_index=True renumbers the combined rows 0..n-1.
pipelines_2004_2009 = pd.concat(
    [pd.read_excel(f'{pipelines_2004_dir}{file}') for file in pipelines_2004_files],
    ignore_index=True)

pipelines_2004_2009.sample(5)

Unnamed: 0,ORIGINAL,SUPPLEMENT,YR,RPTID,HL_SUP_ID,SYSTEM_TYPE,OPERATOR_ID,NAME,PARENT,OFSTREET,...,BA_5,BA_6,BA_7,BA_8,DOR,DOE,PNAME,PPHONE,PEMAIL,PFAX
2238,Y,N,2004,20040047,71,CRUDE OIL,31630,BP AMERICA PRODUCTION COMPANY,,501 WESTLAKE PARK BLVD,...,0.0,0.0,0.0,,2005-06-01,2005-06-01 16:31:18,PAUL C. FALGOUT,2813666382.0,PAUL.FALGOUT@BP.COM,3373370000.0
613,Y,N,2006,20080488,5740,HVLS,32454,"DRY TRAILS MIDSTREAM ENERGY, LLC",,"8801 S YALE, SUITE 350",...,45.17,0.0,0.0,,2009-12-09,2009-12-09 13:46:02,REGINA GREGORY,9183890000.0,RGREGORY@MIDSTREAMENERGYLLC.COM,
1153,Y,N,2009,20090364,7354,PETROLEUM & REFINED PRODUCTS,26054,KEY WEST PIPELINE CO,,"TRUMBO POINT NAVAL ANNEX, BUILDING D-19",...,0.0,0.0,0.0,0.0,2010-06-15,2010-06-15 10:08:16,MARK RAUCH,7136271700.0,MARK@PTMC.US,7136220000.0
1953,N,Y,2004,20040075,440,PETROLEUM & REFINED PRODUCTS,26048,HUNT REFINING COMPANY,,1855 FAIRLAWN ROAD,...,0.0,0.0,0.0,,2005-04-13,2005-06-28 10:25:05,GERRY HALL OPERATIONS SPECIALIST,2053913379.0,GHALL@HUNTREFINING.COM,2053910000.0
422,Y,N,2008,20080031,4363,CO2 OR OTHER,32141,RESOLUTE NATURAL RESOURCES COMPANY,,"1675 BROADWAY, SUITE 1950",...,0.0,0.0,0.0,,2009-03-02,2009-03-02 17:47:37,DWIGHT MALLORY,3035344600.0,DMALLORY@RNRC.NET,


### 3.2.1 Select relevant columns

#### For 2010-present

In [14]:
# Columns of the 2010-present annual reports that are kept for this notebook.
pipelines_2010_columns = [
    'OPERATOR_ID', 'REPORT_YEAR', 'PARTA2NAMEOFCOMP', 'PARTA5COMMODITY', 'PARTBHCATOTAL',
]

# Independent copy, so later edits do not touch `pipelines_2010_present`.
pipelines_2010_selected = pipelines_2010_present.loc[:, pipelines_2010_columns].copy()

pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,REPORT_YEAR,PARTA2NAMEOFCOMP,PARTA5COMMODITY,PARTBHCATOTAL
718,3445,2011,DIXIE PIPELINE COMPANY LLC,HVL,577.251
687,879,2011,CHEMOIL TERMINALS CORP.,Refined and/or Petroleum Product (non-HVL),9.78
267,31526,2017,DELAWARE STORAGE AND PIPELINE COMPANY,Refined and/or Petroleum Product (non-HVL),6.72
1108,32364,2011,"VALERO REFINING-NEW ORLEANS, L.L.C.",Crude Oil,1.5
834,26085,2011,"PLAINS MARKETING, L.P.",Refined and/or Petroleum Product (non-HVL),6.24


In [15]:
# Inspect the dtypes pandas inferred for the selected 2010-present columns.
pipelines_2010_selected.dtypes

OPERATOR_ID           int64
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCATOTAL       float64
dtype: object

In [16]:
# Cast OPERATOR_ID from integer to str; the dtype listing then reports it as object.
pipelines_2010_selected = pipelines_2010_selected.astype({'OPERATOR_ID': str})
pipelines_2010_selected.dtypes

OPERATOR_ID          object
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCATOTAL       float64
dtype: object

In [17]:
# Map the report's column names onto the short names used in this notebook.
pipelines_2010_renames = {
    'REPORT_YEAR': 'YEAR',
    'PARTA2NAMEOFCOMP': 'NAME',
    'PARTA5COMMODITY': 'COMMODITY',
    'PARTBHCATOTAL': "MILES",
}
pipelines_2010_selected = pipelines_2010_selected.rename(columns=pipelines_2010_renames)
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES
1739,39106,2014,PROGRESS SOLUTIONS LLC,Crude Oil,0.0
2844,39065,2013,HESS ND,Refined and/or Petroleum Product (non-HVL),0.0
3747,32428,2018,"INTERCONTINENTAL TERMINAL COMPANY, LLC",HVL,3.0
1520,31887,2014,DUKE ENERGY KENTUCKY - LIQUID,HVL,3.41
1364,30658,2014,KERN OIL & REFINING CO.,Crude Oil,13.65


In [18]:
# Persist the selected 2010-present columns; the run date is part of the file name.
pipelines_2010_selected_path = f'../preprocessed_data/pipelines_2010_selected_{today}.feather'
pipelines_2010_selected.to_feather(pipelines_2010_selected_path)

#### For 2004-2009

Clean name column

In [19]:
# Build NAME_FIXED: take PARENT where it is present, otherwise fall back to NAME.
# `Series.where` keeps values where the condition holds and substitutes the
# second argument elsewhere.
parent_name = pipelines_2004_2009['PARENT']
pipelines_2004_2009['NAME_FIXED'] = parent_name.where(
    parent_name.notna(), pipelines_2004_2009['NAME'])

# Sanity check: count how many NAME_FIXED values are still missing.
pipelines_2004_2009['NAME_FIXED'].isna().value_counts()

False    2730
Name: NAME_FIXED, dtype: int64

In [20]:
# Spot-check the derived name against the two columns it was built from.
pipelines_2004_2009.loc[:, ['NAME_FIXED', 'NAME', 'PARENT']].sample(5)

Unnamed: 0,NAME_FIXED,NAME,PARENT
1107,VALERO REFINING COMPANY - CALIFORNIA,VALERO REFINING COMPANY - CALIFORNIA,
2018,"DUKE ENERGY FIELD SERVICES, LP","DUKE ENERGY FIELD SERVICES, LP",
1519,"CHAPARRAL ENERGY, L.L.C.","CHAPARRAL ENERGY, L.L.C.",
1675,MARATHON PETROLEUM COMPANY LLC,"MARATHON ASHLAND PIPE LINE, LLC",MARATHON PETROLEUM COMPANY LLC
1711,"THE DOW CHEMICAL COMPANY, INCORPORATED",UCAR PIPELINE INCORPORATED,"THE DOW CHEMICAL COMPANY, INCORPORATED"


Select columns

In [21]:
# Columns of the 2004-2009 annual reports that are kept for this notebook.
pipelines_2004_columns = ['OPERATOR_ID', 'YR', 'NAME_FIXED', 'HCAMT']

# Independent copy, so later edits do not touch `pipelines_2004_2009`.
pipelines_2004_selected = pipelines_2004_2009.loc[:, pipelines_2004_columns].copy()
pipelines_2004_selected.dtypes

OPERATOR_ID      int64
YR               int64
NAME_FIXED      object
HCAMT          float64
dtype: object

In [22]:
# Cast OPERATOR_ID from integer to str; the dtype listing then reports it as object.
pipelines_2004_selected = pipelines_2004_selected.astype({'OPERATOR_ID': str})
pipelines_2004_selected.dtypes

OPERATOR_ID     object
YR               int64
NAME_FIXED      object
HCAMT          float64
dtype: object

In [23]:
# Map the report's column names onto the short names used in this notebook.
# NOTE(review): the 2010-present table names its mileage column 'MILES', while
# this one uses 'MILES_TOTAL' - confirm the difference is intended downstream.
pipelines_2004_renames = {
    'YR': 'YEAR',
    'NAME_FIXED': 'NAME',
    'HCAMT': 'MILES_TOTAL',
}
pipelines_2004_selected = pipelines_2004_selected.rename(columns=pipelines_2004_renames)
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,MILES_TOTAL
122,31875,2008,MERIT ENERGY COMPANY,89.5
1075,32037,2009,ENERDYNE POWER SYSTEMS,0.0
1463,31672,2005,"CHAPARRAL ENERGY, L.L.C.",0.0
857,31067,2006,AERA ENERGY LLC,1.583
145,31822,2008,SUNCOR ENERGY (USA) PIPELINE CO.,82.394


In [24]:
# Persist the selected 2004-2009 columns; the run date is part of the file name.
pipelines_2004_selected_path = f'../preprocessed_data/pipelines_2004_selected_{today}.feather'
pipelines_2004_selected.to_feather(pipelines_2004_selected_path)

## 3.3 Write original data to .feather for reference

Some columns are erroneously read in as data type 'O' (object). We manually convert those columns to `str`.

In [25]:
# Cast every object-dtype ('O') column of the three raw frames to str, in place.
# The same assignment is applied to each frame in turn.
for raw_frame in (pipelines_2010_present, pipelines_2004_2009, incidents):
    is_object_column = raw_frame.dtypes == 'O'
    raw_frame.loc[:, is_object_column] = raw_frame.loc[:, is_object_column].astype(str)

In [26]:
# Write the full (unselected) frames next to the raw data, stamped with the run date.
reference_outputs = {
    f'../data/pipelines_2010_{today}.feather': pipelines_2010_present,
    f'../data/pipelines_2004_{today}.feather': pipelines_2004_2009,
    f'../data/incidents_{today}.feather': incidents,
}
for reference_path, reference_frame in reference_outputs.items():
    reference_frame.to_feather(reference_path)