# 3 - Select columns and clean

In [1]:
import pandas as pd
import numpy as np
from datetime import date

# Run date as a YYYY-MM-DD string (str() of a date gives the same text as
# isoformat()); it is embedded in the output file names written below.
today = str(date.today())

## 3.1 Extract relevant columns of the pipeline incidents dataset

In [2]:
# Load the incidents workbook. sheet_name=1 selects by position, and sheet
# positions are zero-indexed, so this reads the workbook's second sheet.
incidents = pd.read_excel(
    '../data/incidents_2019-08-01/hl2010toPresent.xlsx',
    sheet_name=1,
)

In [3]:
# Keep only the relevant columns. The explicit .copy() makes `incidents` an
# independent frame, so the column assignments in later cells modify it
# directly instead of triggering pandas' SettingWithCopyWarning.
incidents = incidents[[
    'OPERATOR_ID',
    'LOCAL_DATETIME',
    'NAME',
    'SERIOUS',
    'SIGNIFICANT',
    'LOCATION_LATITUDE',
    'LOCATION_LONGITUDE',
]].copy()

# Show a few random rows as a quick sanity check of the selection.
incidents.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
296,9175,2010-11-15 08:35:00,JAYHAWK PIPELINE LLC,NO,NO,37.414383,-99.320819
1428,22855,2013-12-01 08:11:00,"KOCH PIPELINE COMPANY, L.P.",NO,NO,27.839803,-97.531604
3243,300,2018-02-04 09:00:00,"PLAINS PIPELINE, L.P.",NO,NO,33.91934,-98.43655
586,31618,2011-09-12 13:45:00,ENTERPRISE PRODUCTS OPERATING LLC,NO,YES,29.878864,-93.986341
445,32109,2011-05-14 15:40:00,ONEOK NGL PIPELINE LP,NO,YES,41.639016,-88.124155


In [4]:
# Inspect the column dtypes before any cleaning is applied.
incidents.dtypes

OPERATOR_ID                    int64
LOCAL_DATETIME        datetime64[ns]
NAME                          object
SERIOUS                       object
SIGNIFICANT                   object
LOCATION_LATITUDE            float64
LOCATION_LONGITUDE           float64
dtype: object

Make sure SERIOUS and SIGNIFICANT are booleans.

In [5]:
# Preview how many rows are flagged 'YES' before the column is overwritten.
incidents['SERIOUS'].eq('YES').value_counts()

False    3803
True       16
Name: SERIOUS, dtype: int64

In [6]:
# Convert the YES/NO text flag into a boolean column.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# after the conversion would compare booleans to 'YES' and silently turn every
# row into False.
if incidents['SERIOUS'].dtype != bool:
    incidents['SERIOUS'] = incidents['SERIOUS'].eq('YES')

In [7]:
# Preview how many rows are flagged 'YES' before the column is overwritten.
incidents['SIGNIFICANT'].eq('YES').value_counts()

False    2364
True     1455
Name: SIGNIFICANT, dtype: int64

In [8]:
# Convert the YES/NO text flag into a boolean column.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# after the conversion would compare booleans to 'YES' and silently turn every
# row into False.
if incidents['SIGNIFICANT'].dtype != bool:
    incidents['SIGNIFICANT'] = incidents['SIGNIFICANT'].eq('YES')

In [9]:
# Confirm that SERIOUS and SIGNIFICANT are now bool columns.
incidents.dtypes

OPERATOR_ID                    int64
LOCAL_DATETIME        datetime64[ns]
NAME                          object
SERIOUS                         bool
SIGNIFICANT                     bool
LOCATION_LATITUDE            float64
LOCATION_LONGITUDE           float64
dtype: object

In [10]:
# Persist the filtered incidents as a Feather file; the file name carries the
# run date held in `today`.
incidents.to_feather(f'../preprocessed_data/incidents_filtered_{today}.feather')

## 3.2 Extract relevant columns of the pipeline system dataset

In [11]:
from os import listdir

# Collect the annual report workbooks. os.listdir returns entries in arbitrary
# order, so the names are sorted to keep the concatenated row order (and the
# resulting index) reproducible from run to run. The file list gets its own
# name so `pipelines_2010_present` only ever refers to the DataFrame.
pipelines_2010_present_files = sorted(
    file for file in listdir('../data/pipelines_2010_present_2019-08-02/')
    if 'annual_hazardous_liquid' in file
)

# skiprows=2 skips the first two rows of each sheet, so the header is read from
# the third row. ignore_index=True gives the combined frame a fresh 0..n-1 index.
pipelines_2010_present = pd.concat(
    [pd.read_excel(f'../data/pipelines_2010_present_2019-08-02/{file}', skiprows=2)
     for file in pipelines_2010_present_files],
    ignore_index=True,
)

pipelines_2010_present.sample()

Unnamed: 0,DATAFILE_AS_OF,REPORT_YEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,OPERATOR_ID,PARTA2NAMEOFCOMP,PARTA4STREET,PARTA4CITY,PARTA4STATE,PARTA4ZIP,...,PARTE2010HF,PARTE2010LF,PARTE2010TOTAL,PARTETOTAL,PARTETOTALHF,PARTETOTALLF,REPORT_SUBMISSION_TYPE,REPORT_DATE,FILING_DATE,FORM_REV
2182,2019-07-01 10:43:40,2012,20130189,11973,32283,"FRONT RANGE PIPELINE, LLC.",803 HIGHWAY 212 SOUTH,Laurel,MT,59044,...,0.0,0.0,0.0,421.7,405.5,16.2,INITIAL,2013-06-10 12:41:19,2013-06-10 12:41:19,7000-1.1 (Rev. 06-2011)


In [12]:
# Collect the annual report workbooks. os.listdir returns entries in arbitrary
# order, so the names are sorted to keep the concatenated row order (and the
# resulting index) reproducible from run to run. The file list gets its own
# name so `pipelines_2004_2009` only ever refers to the DataFrame.
pipelines_2004_2009_files = sorted(
    file for file in listdir('../data/pipelines_2004_2009_2019-08-02/')
    if 'annual_hazardous_liquid' in file
)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
pipelines_2004_2009 = pd.concat(
    [pd.read_excel(f'../data/pipelines_2004_2009_2019-08-02/{file}')
     for file in pipelines_2004_2009_files],
    ignore_index=True,
)

pipelines_2004_2009.sample(5)

Unnamed: 0,ORIGINAL,SUPPLEMENT,YR,RPTID,HL_SUP_ID,SYSTEM_TYPE,OPERATOR_ID,NAME,PARENT,OFSTREET,...,BA_5,BA_6,BA_7,BA_8,DOR,DOE,PNAME,PPHONE,PEMAIL,PFAX
680,Y,N,2006,20060312,2569,CRUDE OIL,32169,PACIFIC ATLANTIC TERMINALS LLC,"PLAINS ALL AMERICAN PIPELINE, L.P.",5900 CHERRY AVENUE,...,0.0,0.0,0.0,,2007-06-15,2007-06-15 12:56:42,"ANN GOODINE, COMPLIANCE SPECIALIST",5627280000.0,RAGOODINE@PAALP.COM,5627280000.0
1465,N,Y,2005,20050052,2792,CO2 OR OTHER,401,AMERADA HESS CORPORATION,,3.5 MI NW HWY 214 P.O.BOX 1570,...,0.0,0.0,0.0,,2006-02-07,2007-08-16 11:12:45,THOMAS O. DAVIS,4327588615.0,TDAVIS@HESS.COM,4327590000.0
687,Y,N,2006,20060116,2344,HVLS,31778,ENCANA OIL & GAS (USA) INC.,,3606 COUNTY RD 116,...,0.0,0.0,0.0,,2007-06-04,2007-06-04 10:34:36,"JAKE JACOBS, EHS ANALYST",7208770000.0,JAKE.JACOBS@ENCANA.COM,7208770000.0
1699,Y,N,2005,20050380,1354,HVLS,31742,THE PREMCOR PIPELINE CO.,VALERO ENERGY CORPORATION,ONE VALERO WAY,...,0.0,0.0,0.0,,2006-06-30,2006-06-30 11:04:44,HENRY P. CARTAYA,2813620300.0,HCARTAYA@ENERGISTIXINC.COM,2813620000.0
1288,Y,N,2009,20090113,6852,CO2 OR OTHER,515,DAKOTA GASIFICATION COMPANY,BASIN ELECTRIC POWER COOPERATIVE,420 COUNTY ROAD 26,...,49.0,51.0,118.0,7.4,2010-05-20,2010-05-20 12:12:24,CLARENCE WINFREY,7018736773.0,CWINFREY@BEPC.COM,7018740000.0


### 3.2.1 Select relevant columns

#### For 2010-present

In [103]:
# Narrow the 2010-present reports down to the five columns kept for this
# analysis, then preview a few random rows.
pipelines_2010_selected = pipelines_2010_present.loc[:, [
    'OPERATOR_ID',
    'REPORT_YEAR',
    'PARTA2NAMEOFCOMP',
    'PARTA5COMMODITY',
    'PARTBHCATOTAL',
]]

pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,REPORT_YEAR,PARTA2NAMEOFCOMP,PARTA5COMMODITY,PARTBHCATOTAL
2416,19319,2013,"TPC GROUP, LLC",HVL,117.71
3371,999,2018,"PACIFIC COAST ENERGY COMPANY, LP",Crude Oil,1.7
2657,32004,2013,QEP FIELD SERVICES COMPANY,HVL,13.77
882,30959,2011,THE DOW CHEMICAL COMPANY,HVL,75.5
2174,32223,2012,VALERO REFINING COMPANY - CALIFORNIA,Refined and/or Petroleum Product (non-HVL),13.8


In [104]:
# Inspect the dtypes of the selected columns.
pipelines_2010_selected.dtypes

OPERATOR_ID           int64
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCATOTAL       float64
dtype: object

In [105]:
# Replace the report's raw field names with short, readable column names.
# NOTE(review): the 2004-2009 selection renames what looks like the counterpart
# column to 'MILES_TOTAL', not 'MILES' - confirm the difference is intended.
pipelines_2010_selected = pipelines_2010_selected.rename(columns={
    'REPORT_YEAR': 'YEAR',
    'PARTA2NAMEOFCOMP': 'NAME',
    'PARTA5COMMODITY': 'COMMODITY',
    'PARTBHCATOTAL': 'MILES',
})
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES
1253,11169,2014,"ENBRIDGE ENERGY, LIMITED PARTNERSHIP",Crude Oil,1423.0
1468,31579,2014,"MAGELLAN PIPELINES HOLDINGS, LP",Crude Oil,5.25
1743,39124,2014,STATOIL PIPELINES LLC,Crude Oil,3.68
42,4430,2017,VALERO TERMINALING AND DISTRIBUTION COMPANY,Refined and/or Petroleum Product (non-HVL),14.298
671,39806,2017,"EVX MIDSTREAM PARTNERS, LLC",Crude Oil,0.0


In [116]:
# Persist the selected 2010-present columns as a Feather file; the file name
# carries the run date held in `today`.
pipelines_2010_selected.to_feather(f'../preprocessed_data/pipelines_2010_selected_{today}.feather')

#### For 2004-2009

Clean the name column: use `PARENT` where it is filled in, otherwise fall back to `NAME`.

In [110]:
# Build NAME_FIXED: take the PARENT value where it is present and fall back to
# NAME where PARENT is null. Series.fillna expresses this directly (both columns
# share the frame's index), so the cell no longer needs np.where or a second
# numpy import - numpy is already imported in the notebook's first cell.
pipelines_2004_2009['NAME_FIXED'] = pipelines_2004_2009['PARENT'].fillna(
    pipelines_2004_2009['NAME']
)

In [111]:
# Count null vs. non-null entries in NAME_FIXED.
pipelines_2004_2009['NAME_FIXED'].isna().value_counts()

False    2730
Name: NAME_FIXED, dtype: int64

In [112]:
# Show NAME_FIXED next to its two source columns for a random handful of rows.
pipelines_2004_2009[['NAME_FIXED', 'NAME', 'PARENT']].sample(10)

Unnamed: 0,NAME_FIXED,NAME,PARENT
519,"KINDER MORGAN LIQUIDS TERMINALS, LLC","KINDER MORGAN LIQUIDS TERMINALS, LLC",
1436,"BUCKEYE PARTNERS, LP","BUCKEYE PARTNERS, LP",
1138,"COUNTRYMARK COOPERATIVE, LLP","COUNTRYMARK COOPERATIVE, LLP",
2397,"TOTAL PETROCHEMICALS PIPELINE USA, INC","TOTAL PETROCHEMICALS PIPELINE USA, INC",
477,"TEXAS PETROCHEMICALS, LP","TEXAS PETROCHEMICALS, LP",
1016,"MAGELLAN PIPELINE COMPANY, LP","MAGELLAN PIPELINE COMPANY, LP",
11,BASF CORPORATION,BASF CORPORATION,
187,"VENOCO, INC","VENOCO, INC",
893,"PIPELINE TECHNOLOGY VI, L.L.C.","PIPELINE TECHNOLOGY VI, L.L.C.",
2286,WYNNEWOOD REFINERY COMPANY,WYNNEWOOD REFINERY COMPANY,


Select columns

In [113]:
# Narrow the 2004-2009 reports down to the four columns kept for this analysis.
pipelines_2004_selected = pipelines_2004_2009.loc[:, [
    'OPERATOR_ID',
    'YR',
    'NAME_FIXED',
    'HCAMT',
]]

In [114]:
# Replace the report's raw field names with short, readable column names, then
# preview a few random rows.
pipelines_2004_selected = pipelines_2004_selected.rename(columns={
    'YR': 'YEAR',
    'NAME_FIXED': 'NAME',
    'HCAMT': 'MILES_TOTAL',
})
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,MILES_TOTAL
2691,31805,2007,HOLLY REFINING & MARKETING COMPANY,4.0
696,30765,2006,TESORO LOS ANGELES REFINERY,6.95
1364,31663,2009,NAVAJO NATION OIL AND GAS COMPANY,72.53
1996,32073,2004,YATES PETROLEUM CORPORTION,0.0
418,473,2008,ANADARKO PETROLEUM CORP,12.0


In [115]:
# Persist the selected 2004-2009 columns as a Feather file; the file name
# carries the run date held in `today`.
pipelines_2004_selected.to_feather(f'../preprocessed_data/pipelines_2004_selected_{today}.feather')