# 3 - Select columns

In [1]:
import pandas as pd
import numpy as np
from datetime import date

today = date.today().isoformat()

## 3.1 Extract relevant columns of the pipeline incidents dataset

In [2]:
incidents = pd.read_excel('../data/incidents_2019-08-01/hl2010toPresent.xlsx', 
                          sheet_name=1)
incidents_selected = incidents[[
    'OPERATOR_ID', 'LOCAL_DATETIME', 'NAME', 'SERIOUS', 'SIGNIFICANT', 'LOCATION_LATITUDE', 'LOCATION_LONGITUDE']].copy()

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
2579,22610,2016-07-14 08:10:00,"MAGELLAN PIPELINE COMPANY, LP",NO,NO,40.20069,-104.60941
1851,31574,2014-11-25 09:30:00,"WESTERN REFINING PIPELINE, LLC",NO,YES,36.487768,-108.129948
1712,300,2014-07-10 01:30:00,"PLAINS PIPELINE, L.P.",NO,NO,32.53193,-100.83051
3791,31579,2019-06-04 16:00:00,"MAGELLAN PIPELINES HOLDINGS, LP",NO,NO,35.9904,-96.577985
3287,9175,2018-03-08 17:50:00,JAYHAWK PIPELINE LLC,NO,NO,38.34031,-98.31146


In [3]:
incidents_selected.dtypes

OPERATOR_ID                    int64
LOCAL_DATETIME        datetime64[ns]
NAME                          object
SERIOUS                       object
SIGNIFICANT                   object
LOCATION_LATITUDE            float64
LOCATION_LONGITUDE           float64
dtype: object

In [4]:
incidents_selected['OPERATOR_ID'] = incidents_selected['OPERATOR_ID'].astype(str)
incidents_selected.dtypes

OPERATOR_ID                   object
LOCAL_DATETIME        datetime64[ns]
NAME                          object
SERIOUS                       object
SIGNIFICANT                   object
LOCATION_LATITUDE            float64
LOCATION_LONGITUDE           float64
dtype: object

Make sure SERIOUS and SIGNIFICANT are booleans.

In [5]:
(incidents_selected[['SERIOUS']] == 'YES')['SERIOUS'].value_counts()

False    3803
True       16
Name: SERIOUS, dtype: int64

In [6]:
(incidents_selected[['SIGNIFICANT']] == 'YES')['SIGNIFICANT'].value_counts()

False    2364
True     1455
Name: SIGNIFICANT, dtype: int64

In [7]:
incidents_selected['SIGNIFICANT'] = incidents_selected[['SIGNIFICANT']] == 'YES'

In [8]:
incidents_selected.dtypes

OPERATOR_ID                   object
LOCAL_DATETIME        datetime64[ns]
NAME                          object
SERIOUS                       object
SIGNIFICANT                     bool
LOCATION_LATITUDE            float64
LOCATION_LONGITUDE           float64
dtype: object

In [9]:
incidents_selected.to_feather(f'../preprocessed_data/incidents_selected_{today}.feather')

## 3.2 Extract relevant columns of the pipeline system dataset

In [10]:
from os import listdir

pipelines_2010_present = [file for file in listdir('../data/pipelines_2010_present_2019-08-02/') if 'annual_hazardous_liquid' in file]
pipelines_2010_present = pd.concat([pd.read_excel(f'../data/pipelines_2010_present_2019-08-02/{file}', skiprows=2) 
                                    for file in pipelines_2010_present])
pipelines_2010_present = pipelines_2010_present.reset_index(drop = True)

pipelines_2010_present.sample(5)

Unnamed: 0,DATAFILE_AS_OF,REPORT_YEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,OPERATOR_ID,PARTA2NAMEOFCOMP,PARTA4STREET,PARTA4CITY,PARTA4STATE,PARTA4ZIP,...,PARTE2010HF,PARTE2010LF,PARTE2010TOTAL,PARTETOTAL,PARTETOTALHF,PARTETOTALLF,REPORT_SUBMISSION_TYPE,REPORT_DATE,FILING_DATE,FORM_REV
696,2019-07-01 10:36:07,2011,20120359,13215,1845,"BUCKEYE PARTNERS, LP",FIVE TEK PARK,BREINIGSVILLE,PA,18031,...,0.149,0.0,0.149,4721.585,641.736,4079.849,SUPPLEMENTAL,2012-06-15 10:35:39,2014-11-26 11:15:21,7000-1.1 (Rev. 06-2011)
1223,2019-07-01 11:03:57,2014,20151411,13537,3535,DOW PIPELINE CO,1000 COUNTY ROAD 340,ANGLETON,TX,77515,...,0.0,0.0,0.0,196.76,136.54,60.22,INITIAL,2015-06-11 15:18:23,2015-06-11 15:18:23,7000-1.1 (Rev. 06-2014)
2386,2019-07-01 10:54:15,2013,20141051,13078,12628,MOBIL PIPE LINE COMPANY,"800 BELL STREET, ROOM 741-D",HOUSTON,TX,77002,...,0.0,0.0,0.0,189.0,107.5,81.5,INITIAL,2014-06-16 01:36:47,2014-06-16 01:36:47,7000-1.1 (Rev. 06-2011)
3597,2019-07-01 11:40:32,2018,20190145,16546,31434,PALMER PETROLEUM INC,LANDFILL RD,MONROEVILLE,AL,36460,...,0.0,0.0,0.0,0.0,0.0,0.0,INITIAL,2019-05-30 11:04:29,2019-05-30 11:04:29,7000-1.1 (Rev. 06-2014)
853,2019-07-01 10:36:07,2011,20120333,11371,30005,MOBIL PACIFIC PIPELINE CO,800 Bell St Room 623-F,TORRANCE,Texas,77002,...,0.0,0.0,0.0,20.5,15.0,5.5,INITIAL,2012-06-14 18:12:24,2012-06-14 18:12:24,7000-1.1 (Rev. 06-2011)


In [11]:
pipelines_2004_2009 = [file for file in listdir('../data/pipelines_2004_2009_2019-08-02/') if 'annual_hazardous_liquid' in file]
pipelines_2004_2009 = pd.concat([pd.read_excel(f'../data/pipelines_2004_2009_2019-08-02/{file}') 
                                 for file in pipelines_2004_2009])
pipelines_2004_2009 = pipelines_2004_2009.reset_index(drop = True)

pipelines_2004_2009.sample(5)

Unnamed: 0,ORIGINAL,SUPPLEMENT,YR,RPTID,HL_SUP_ID,SYSTEM_TYPE,OPERATOR_ID,NAME,PARENT,OFSTREET,...,BA_5,BA_6,BA_7,BA_8,DOR,DOE,PNAME,PPHONE,PEMAIL,PFAX
2218,Y,N,2004,20040133,165,PETROLEUM & REFINED PRODUCTS,2714,"DOMINION TRANSMISION, INCORPORATED",DOMINION RESOURCES,445 WEST MAIN STREET,...,0.0,0.0,0.0,,2005-06-13,2005-06-13 14:07:33,MARTIN C. SCHWOEBLE,7244687726.0,MARTIN_C._SCHWOEBLE@DOM.COM,7244690000.0
1031,Y,N,2009,20090236,7193,HVLS,32428,"INTERCONTINENTAL TERMINAL COMPANY, LLC",,1943 BATTLEGROUND ROAD,...,0.0,0.0,3.0,0.0,2010-06-08,2010-06-08 00:00:00,WILLIAM BUDD,2818840391.0,WBUDD@ITERM.COM,2818840000.0
1281,Y,N,2009,20090302,7260,CRUDE OIL,10250,KIANTONE PIPELINE CORP,UNITED REFINING COMPANY,PO BOX 780,...,73.24,0.0,0.0,1.53,2010-06-11,2010-06-11 08:58:42,DANIEL SOBINA,8147264846.0,DANSOBINA@URC.COM,8147260000.0
501,Y,N,2006,20060218,2475,CO2 OR OTHER,32141,RESOLUTE NATURAL RESOURCES COMPANY,,"1675 BROADWAY, SUITE 1950",...,0.0,0.0,0.0,,2007-06-13,2007-06-13 11:29:59,DWIGHT E MALLORY,3035340000.0,DMALLORY@RNRC.NET,3036230000.0
2539,Y,N,2007,20070196,3591,CRUDE OIL,26085,"PLAINS MARKETING, L.P.",,"333 CLAY ST., SUITE 1600",...,3.5,22.0,0.0,,2008-06-09,2008-06-09 10:49:15,"GILBERT C. SCHUTZA, P.E., COMPLIANCE ENGINEER",7136464433.0,CE.GCSCHUTZA@PAALP.COM,7136460000.0


### 3.2.1 Select relevant columns

#### For 2010-

In [12]:
pipelines_2010_selected = pipelines_2010_present[[
    'OPERATOR_ID', 'REPORT_YEAR', 'PARTA2NAMEOFCOMP', 'PARTA5COMMODITY', 'PARTBHCATOTAL']].copy()

pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,REPORT_YEAR,PARTA2NAMEOFCOMP,PARTA5COMMODITY,PARTBHCATOTAL
5263,39313,2015,"MAVERICK TERMINAL THREE RIVERS, LLC",HVL,0.0
2729,32364,2013,"VALERO REFINING-NEW ORLEANS, L.L.C.",Refined and/or Petroleum Product (non-HVL),1.5
1226,4430,2014,VALERO TERMINALING AND DISTRIBUTION COMPANY,Refined and/or Petroleum Product (non-HVL),0.3
3738,32364,2018,"VALERO REFINING-NEW ORLEANS, L.L.C.",Crude Oil,1.35
1784,994,2012,"WILLIAMS FIELD SERVICES - GULF COAST COMPANY, LP",Refined and/or Petroleum Product (non-HVL),15.3


In [13]:
pipelines_2010_selected.dtypes

OPERATOR_ID           int64
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCATOTAL       float64
dtype: object

In [14]:
pipelines_2010_selected['OPERATOR_ID'] = pipelines_2010_selected['OPERATOR_ID'].astype(str)
pipelines_2010_selected.dtypes

OPERATOR_ID          object
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCATOTAL       float64
dtype: object

In [15]:
pipelines_2010_selected = pipelines_2010_selected.rename(
    columns={'REPORT_YEAR': 'YEAR', 'PARTA2NAMEOFCOMP': 'NAME', 'PARTA5COMMODITY': 'COMMODITY', 'PARTBHCATOTAL': "MILES"})
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES
1222,3527,2014,DOW PIPELINE CO,HVL,158.54
4304,31579,2016,"MAGELLAN PIPELINES HOLDINGS, LP",Refined and/or Petroleum Product (non-HVL),30.18
971,31582,2011,"ONEOK ROCKIES MIDSTREAM, LLC",HVL,28.05
2372,11032,2013,LACLEDE GAS CO,HVL,39.0
2788,32634,2013,"ENLINK PERMIAN, LLC",HVL,0.0


In [16]:
pipelines_2010_selected.to_feather(f'../preprocessed_data/pipelines_2010_selected_{today}.feather')

#### For 2004-

Clean name column

In [17]:
import numpy as np

pipelines_2004_2009['NAME_FIXED'] = np.where(pd.isnull(pipelines_2004_2009['PARENT']), 
                                             pipelines_2004_2009['NAME'], 
                                             pipelines_2004_2009['PARENT'])
pd.isnull(pipelines_2004_2009['NAME_FIXED']).value_counts()

False    2730
Name: NAME_FIXED, dtype: int64

In [18]:
pipelines_2004_2009[['NAME_FIXED', 'NAME', 'PARENT']].sample(5)

Unnamed: 0,NAME_FIXED,NAME,PARENT
1556,"GOLDEN VALLEY ELECTRIC ASSOCIATION, INC.","GOLDEN VALLEY ELECTRIC ASSOCIATION, INC.",
599,"PLAINS MARKETING, L.P.","PLAINS MARKETING, L.P.",
167,EXXONMOBIL PIPELINE CO,EXXONMOBIL PIPELINE CO,
74,"TEPPCO MIDSTREAM COMPANIES, L.P.","TEPPCO MIDSTREAM COMPANIES, L.P.",
202,TEXAS PETROCHEMICALS CORP,TEXAS PETROCHEMICALS CORP,


Select columns

In [19]:
pipelines_2004_selected = pipelines_2004_2009[['OPERATOR_ID', 'YR', 'NAME_FIXED', 'HCAMT']].copy()
pipelines_2004_selected.dtypes

OPERATOR_ID      int64
YR               int64
NAME_FIXED      object
HCAMT          float64
dtype: object

In [20]:
pipelines_2004_selected['OPERATOR_ID'] = pipelines_2004_selected['OPERATOR_ID'].astype(str)
pipelines_2004_selected.dtypes

OPERATOR_ID     object
YR               int64
NAME_FIXED      object
HCAMT          float64
dtype: object

In [21]:
pipelines_2004_selected = pipelines_2004_selected.rename(
    columns={'YR': 'YEAR', 'NAME_FIXED': 'NAME', 'HCAMT': 'MILES_TOTAL'})
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,MILES_TOTAL
2558,13162,2007,NAVAJO REFINING CO,6.9
2288,26041,2007,"KINDER MORGAN LIQUID TERMINALS, LLC",10.54
2373,15844,2007,ANADARKO PETROLEUM,0.0
2192,31312,2004,PINNACLE WEST CAPITAL CORPORATION,2.0
814,14391,2006,"OSAGE PIPELINE COMPANY, LLC",59.0


In [22]:
pipelines_2004_selected.to_feather(f'../preprocessed_data/pipelines_2004_selected_{today}.feather')

### 3.3 Write original data to .feather for reference

Some columns get erroneously read to data type 'O'. We convert those manually to str type.

In [23]:
pipelines_2010_present.loc[:, pipelines_2010_present.dtypes == 'O'] = pipelines_2010_present.loc[
    :, pipelines_2010_present.dtypes == 'O'].astype(str)

pipelines_2004_2009.loc[:, pipelines_2004_2009.dtypes == 'O'] = pipelines_2004_2009.loc[
    :, pipelines_2004_2009.dtypes == 'O'].astype(str)

incidents.loc[:, incidents.dtypes == 'O'] = incidents.loc[
    :, incidents.dtypes == 'O'].astype(str)

In [24]:
pipelines_2010_present.to_feather(f'../data/pipelines_2010_{today}.feather')
pipelines_2004_2009.to_feather(f'../data/pipelines_2004_{today}.feather')
incidents.to_feather(f'../data/incidents_{today}.feather')