# 6. Create preliminary sample

We grab the 50 operators with the largest network of pipelines (in any year) and obtain the most recent company name.

Most recent file versions used in this workbook:

In [55]:
# Inputs read from ../preprocessed_data (feather files).
pipeline_2010_selected_file = '../preprocessed_data/pipelines_2010_selected_2019-08-11.feather'
incidents_selected_file = '../preprocessed_data/incidents_selected_2019-08-16.feather'
# Ranking of the largest operators (generated in workbook 5).
largest_observations_file = '../preprocessed_data/largest_companies_2019-08-11.feather'

# Raw inputs read from ../data; pipelines_2010_raw is the default source file of find_info below.
pipelines_2010_raw = '../data/pipelines_2010_2019-08-11.feather'
incidents_raw = '../data/incidents_2019-08-11.feather'

In [56]:
# Number of operators kept in the preliminary sample; the assertions further down check against this value.
sample_len = 50

## Setup

In [57]:
import pandas as pd
import numpy as np
from datetime import date

# ISO-formatted current date (YYYY-MM-DD); used as a suffix in the name of the output file written at the end.
today = date.today().isoformat()

# Load data

In [58]:
# Pipeline records with one row per OPERATOR_ID / YEAR / COMMODITY and the corresponding MILES.
pipelines_2010 = pd.read_feather(pipeline_2010_selected_file)
# Show three randomly drawn rows (the rows differ between runs because no random_state is set).
pipelines_2010.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES
4972,31618,2015,ENTERPRISE PRODUCTS OPERATING LLC,Refined and/or Petroleum Product (non-HVL),624.302
2876,879,2010,CHEMOIL TERMINALS CORP.,Refined and/or Petroleum Product (non-HVL),9.78
3211,31871,2010,"MARKWEST MICHIGAN PIPELINE, LLC",Crude Oil,24.9


In [59]:
# Incident records, one row per incident, including the SERIOUS / SIGNIFICANT flags.
incidents = pd.read_feather(incidents_selected_file)
# Show three randomly drawn rows (the rows differ between runs because no random_state is set).
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
2308,1845,2015-11-03 15:50:00,"BUCKEYE PARTNERS, LP",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,False,40.56473,-74.250181
3097,10012,2017-09-28 09:29:00,NUSTAR PIPELINE OPERATING PARTNERSHIP L.P.,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,False,True,43.08949,-93.903342
702,18718,2012-01-12 22:18:00,SUNOCO PIPELINE L.P.,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,True,41.17974,-82.21559


## 6.1 Functions to reference from raw data

In [60]:
from functools import partial

def find_info(OPERATOR_ID, info_col:str, file = pipelines_2010_raw, id_col = 'OPERATOR_ID', year_col = 'REPORT_YEAR'):
    """Look up the values an operator reported in `info_col` and the years in which it reported them.

    Parameters
    ----------
    OPERATOR_ID : str or int
        Operator to look up; cast to int before it is compared with `id_col`.
    info_col : str
        Column whose distinct values should be listed (e.g. a name or address column).
    file : str
        Path of the feather file to read. Defaults to the raw pipelines file.
    id_col : str
        Column holding the operator ID.
    year_col : str
        Column holding the reporting year.

    Returns
    -------
    dict
        ``{value: {'start_year': first_year, 'end_year': last_year}}`` with one entry per
        distinct non-missing value of `info_col` reported by this operator, ordered by value.
        The dict is empty if the operator does not occur in the file.
    """
    df = pd.read_feather(file)

    # Restrict to the requested operator *before* computing the year range. Taking min/max
    # over the whole file would mix in the years of any other operator that reports the
    # same value (e.g. affiliated companies sharing one street address).
    operator_rows = df.loc[df[id_col] == int(OPERATOR_ID)]

    result = {}
    # groupby sorts the keys and skips missing values, so the entries come out in sorted
    # order and rows without a value for `info_col` are ignored.
    for value, years in operator_rows.groupby(info_col)[year_col]:
        result[value] = {'start_year': years.min(), 'end_year': years.max()}
    return result

# Convenience wrapper around find_info that looks up the 'PARTA4STREET' column of the raw pipelines file.
find_address = partial(find_info, info_col = 'PARTA4STREET')
# Example call; the operator ID may be passed as a string because find_info casts it to int.
find_address('300')

{'333 CLAY STREET': {'start_year': 2010, 'end_year': 2011},
 '333 CLAY STREET, SUITE 1600': {'start_year': 2012, 'end_year': 2018}}

## 6.2 Largest operators - add parents

We use the list of the largest operators that we have generated in workbook 5.

In [61]:
# Read the ranking from the path configured at the top of the notebook rather than from a
# second copy of the literal, so that updating the file version in one place is enough.
largest_pipeline_operators = pd.read_feather(largest_observations_file)
# Keep the first `sample_len` rows (the file appears to be ordered by TOTAL_MILES, largest first).
sample = largest_pipeline_operators[:sample_len]

sample

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES
0,31618,2018.0,ENTERPRISE PRODUCTS OPERATING LLC,8325.499
1,32109,2018.0,"ONEOK NGL PIPELINE, LLC",4756.61
2,22610,2014.0,"MAGELLAN PIPELINE COMPANY, LP",4505.5
3,2552,2015.0,COLONIAL PIPELINE CO,4500.92
4,31684,2017.0,PHILLIPS 66 PIPELINE LLC,4474.1
5,1845,2015.0,"BUCKEYE PARTNERS, LP",4298.29
6,32147,2018.0,MARATHON PIPE LINE LLC,4037.2
7,18718,2010.0,SUNOCO PIPELINE L.P.,3466.74
8,4906,2013.0,EXXONMOBIL PIPELINE CO,2944.4
9,22855,2013.0,"FLINT HILLS RESOURCES, LC",2757.45


All parent companies were retrieved from LexisNexis, except where a comment in the code below states otherwise.

In [62]:
from collections import namedtuple

# Lightweight record type for the hand-collected mapping from operator to parent company.
# OPERATOR_ID is stored as a string.
company = namedtuple('company', 'OPERATOR_ID NAME PARENT')

# One entry per operator in the sample; the assertion below checks that the number of entries equals sample_len.
# Several operators can share the same parent (e.g. 'Energy Transfer', 'Kinder Morgan').
parent_companies = [company('31618', 'ENTERPRISE PRODUCTS OPERATING LLC', 'Enterprise Products Partners'),
                    company('32109', 'ONEOK NGL PIPELINE, LLC', 'ONEOK'), 
                    company('22610', 'MAGELLAN PIPELINE COMPANY, LP', 'Magellan Midstream Partners'), 
                    company('2552', 'COLONIAL PIPELINE CO', 'Colonial Pipeline'), 
                    company('31684', 'PHILLIPS 66 PIPELINE LLC', 'Phillips 66'), 
                    company('1845', 'BUCKEYE PARTNERS, LP', 'Buckeye Partners'), 
                    company('32147', 'MARATHON PIPE LINE LLC', 'Marathon Petroleum'),
                    company('18718', 'SUNOCO PIPELINE L.P.', 'Energy Transfer'), 
                    company('4906', 'EXXONMOBIL PIPELINE CO', 'Exxon Mobil'), 
                    company('22855', 'FLINT HILLS RESOURCES, LC', 'Koch Industries'), 
                    company('10012', 'NUSTAR PIPELINE OPERATING PARTNERSHIP L.P.', 'NuStar Energy'), 
                    company('15674', 'PLANTATION PIPE LINE CO', 'Kinder Morgan'),
                    company('11169', 'ENBRIDGE ENERGY, LIMITED PARTNERSHIP', 'Enbridge'), 
                    company('300', 'PLAINS PIPELINE, L.P.', 'Plains All American Pipeline'),
                    company('31174', 'SHELL PIPELINE CO., L.P.', 'Royal Dutch Shell'), 
                    company('18092', 'SFPP, LP', 'Kinder Morgan'), 
                    company('2731', 'CHEVRON PIPE LINE CO', 'Chevron'), 
                    company('4805', 'EXPLORER PIPELINE CO', 'Explorer Pipeline'), 
                    company('32099', 'ENERGY TRANSFER COMPANY', 'Energy Transfer'), 
                    company('31371', 'BUCKEYE DEVELOPMENT & LOGISTICS, LLC', 'Buckeye Partners'), 
                    company('30829', 'ENTERPRISE CRUDE PIPELINE LLC', 'Enterprise Products Partners'), 
                    company('31189', 'BP PIPELINE (NORTH AMERICA) INC.', 'BP'), 
                    company('12628', 'MOBIL PIPE LINE COMPANY', 'Exxon Mobil'), 
                    company('39205', 'DAPL-ETCO OPERATIONS MANAGEMENT, LLC', 'Energy Transfer'), 
                    company('39596', 'PERMIAN EXPRESS PARTNERS LLC', 'Energy Transfer'), 
                    company('25146', 'EQUISTAR CHEMICALS, L.P.', 'LyondellBasell'), 
                    company('38933', 'TESORO LOGISTICS OPERATIONS, LLC', 'Marathon Petroleum'), 
                    company('31454', 'NUSTAR LOGISTICS, L.P.', 'NuStar Energy'), 
                    company('31130', 'DCP MIDSTREAM', 'DCP Midstream'), 
                    # For TC OIL PIPELINE OPERATIONS INC (OPERATOR_ID 32334), we confirmed the information by comparing the address
                    # as stated in the dataset (see next cell) with the address on LexisNexis.
                    company('32334', 'TC OIL PIPELINE OPERATIONS INC', 'TransCanada USA'),
                    company('395', 'AMOCO OIL CO', 'Amoco',), 
                    company('31666', 'ROCKY MOUNTAIN PIPELINE SYSTEM, LLC', 'Plains All American Pipeline'), 
                    company('32080', 'CCPS TRANSPORTATION, LLC', 'Enbridge'), 
                    company('12470', 'MID - VALLEY PIPELINE CO', 'Energy Transfer'), 
                    company('3445', 'DIXIE PIPELINE COMPANY LLC', 'Enterprise Products Partners'), 
                    company('32103', 'CRIMSON PIPELINE L.P.', 'Crimson Pipeline'), 
                    company('32258', 'KINDER MORGAN COCHIN LLC', 'Kinder Morgan'), 
                    company('22430', 'WEST SHORE PIPELINE CO', 'West Shore Pipeline'), 
                    # For WOLVERINE PIPELINE CO (OPERATOR_ID 22830), the information was found by searching for "Wolverine Pipe Line".
                    company('22830', 'WOLVERINE PIPELINE CO', 'Marathon Petroleum'), 
                    # For SINCLAIR TRANSPORTATION COMPANY (OPERATOR_ID 15156), the information was found through a web search
                    # (http://www.pipelinesafetyinfo.com/user/file/Missouri%20(PAM)/Sinclair_Transportation_Company.pdf).
                    company('15156', 'SINCLAIR TRANSPORTATION COMPANY', 'Sinclair Oil'), 
                    company('39043', 'TALLGRASS PONY EXPRESS PIPELINE, LLC', 'Tallgrass Energy'), 
                    company('32011', 'HOLLY ENERGY PARTNERS - OPERATING, L.P.', 'HollyFrontier'), 
                    company('32502', 'ENBRIDGE PIPELINES (SOUTHERN LIGHTS) L.L.C.', 'Enbridge'), 
                    company('39029', 'TESORO LOGISTICS NORTHWEST PIPELINE LLC', 'Marathon Petroleum'), 
                    company('30826', 'WILLIAMS FIELD SERVICES', 'The Williams Companies'), 
                    company('31720', 'EXPRESS HOLDINGS (USA), LLC', 'Enbridge'), 
                    company('39398', 'INEOS USA LLC', 'INEOS'), 
                    # CALNEV PIPELINE CO (OPERATOR_ID 26125) was found as "Calnev Pipe Line".
                    company('26125', 'CALNEV PIPELINE CO', 'Kinder Morgan'), 
                    company('26149', 'ALYESKA PIPELINE SERVICE CO', 'Alyeska Pipeline Service'), 
                    company('30781', 'OLYMPIC PIPE LINE COMPANY', 'BP')                    
                   ]

# Turn the list of namedtuples into a DataFrame; the namedtuple fields become the columns.
parent_companies = pd.DataFrame(parent_companies)
# Guard against a forgotten or superfluous entry in the hand-typed list above.
assert len(parent_companies) == sample_len
parent_companies.sample(3)

Unnamed: 0,OPERATOR_ID,NAME,PARENT
17,4805,EXPLORER PIPELINE CO,Explorer Pipeline
18,32099,ENERGY TRANSFER COMPANY,Energy Transfer
11,15674,PLANTATION PIPE LINE CO,Kinder Morgan


Checking address for 'TC OIL PIPELINE OPERATIONS INC' (see comment above)

In [63]:
find_address('32334')

{'700 LOUISIANA SUITE 700': {'start_year': 2016, 'end_year': 2018},
 '717 TEXAS AVE': {'start_year': 2010, 'end_year': 2015}}

## 6.3 Add data to sample

In [64]:
# Keep every pipeline record (all years and commodities) that belongs to one of the sampled operators.
sample = pipelines_2010[pipelines_2010['OPERATOR_ID'].isin(sample['OPERATOR_ID'])].copy()
# Every sampled operator has to be present in the pipeline data.
assert sample['OPERATOR_ID'].nunique(dropna=False) == sample_len
sample.head(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES
0,300,2017,"PLAINS PIPELINE, L.P.",Crude Oil,2101.06
1,300,2017,"PLAINS PIPELINE, L.P.",HVL,72.9
2,300,2017,"PLAINS PIPELINE, L.P.",Refined and/or Petroleum Product (non-HVL),0.0


### 6.3.1 Merge in parents

Ensure all parents are there.

In [65]:
# Attach the parent company to every record of the operator (inner join on OPERATOR_ID).
sample = pd.merge(
    sample,
    parent_companies.loc[:, ['OPERATOR_ID', 'PARENT']],
    on='OPERATOR_ID',
)
# An operator without an entry in parent_companies would have been dropped by the join.
assert sample['OPERATOR_ID'].nunique(dropna=False) == sample_len
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT
69,2731,2011,CHEVRON PIPE LINE CO,Crude Oil,355.5,Chevron
325,22610,2015,"MAGELLAN PIPELINE COMPANY, LP",Refined and/or Petroleum Product (non-HVL),3827.0,Magellan Midstream Partners
310,22610,2012,"MAGELLAN PIPELINE COMPANY, LP",Refined and/or Petroleum Product (non-HVL),3503.0,Magellan Midstream Partners


### 6.3.2 Merge in incidents (by type)

#### See if there are any conflicts between observations in the same year

In [66]:
# If an operator carried two different names within the same year, the number of distinct
# (OPERATOR_ID, YEAR, NAME) triples would exceed the number of distinct (OPERATOR_ID, YEAR) pairs.
assert (
    sample[['OPERATOR_ID', 'YEAR', 'NAME']].drop_duplicates().shape[0]
    == sample[['OPERATOR_ID', 'YEAR']].drop_duplicates().shape[0]
)

We're good.

### Clean commodity names - pipelines

In [67]:
# Legacy code that could be used to create wide format data

# sample = sample.pivot_table(index=['OPERATOR_ID', 'YEAR', 'NAME'],columns='COMMODITY',values='MILES',fill_value=0)
# sample = sample.reset_index()
# sample.head(10)

In [68]:
sample['COMMODITY'].unique()

array(['Crude Oil', 'HVL', 'Refined and/or Petroleum Product (non-HVL)',
       'CO2', 'Fuel Grade Ethanol (dedicated system)'], dtype=object)

In [69]:
# Short commodity labels for the pipeline records.
new_names_pipelines = {
    'Crude Oil': 'crude',
    'CO2': 'co2',
    'Fuel Grade Ethanol (dedicated system)': 'fge',
    'HVL': 'hvl',
    'Refined and/or Petroleum Product (non-HVL)': 'non-hvl',
}

# Only the COMMODITY column is recoded; values without an entry in the mapping stay as they are.
sample = sample.assign(COMMODITY=sample['COMMODITY'].replace(new_names_pipelines))
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT
688,32109,2013,"ONEOK NGL PIPELINE, LLC",non-hvl,382.0,ONEOK
766,38933,2018,"TESORO LOGISTICS OPERATIONS, LLC",non-hvl,657.3,Marathon Petroleum
727,32258,2014,KINDER MORGAN COCHIN LLC,crude,423.06,Kinder Morgan


### Clean commodity names - incidents

In [70]:
incidents['COMMODITY'].unique()

array(['REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS',
       'CO2 (CARBON DIOXIDE)',
       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS',
       'CRUDE OIL',
       'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)'],
      dtype=object)

In [71]:
# Short commodity labels for the incident records. They must use the same vocabulary as the
# pipeline records ('crude', 'co2', 'fge', 'hvl', 'non-hvl'), because COMMODITY is one of the
# keys on which the incident counts are merged into the sample.
#
# The biofuel / ethanol category was previously mapped to 'hvl'. That counted ethanol incidents
# as HVL incidents and left the 'fge' (fuel grade ethanol) pipeline rows without any matching
# incidents. It is mapped to 'fge' here, the pipeline category it corresponds to.
# NOTE(review): incidents involving ethanol blends on systems that are not reported as dedicated
# ethanol pipelines find no matching 'fge' row and are dropped by the merge - confirm that this
# is the intended treatment.
new_names_incidents = {'REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS': 'non-hvl',
                       'CO2 (CARBON DIOXIDE)': 'co2',
                       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS': 'hvl',
                       'CRUDE OIL': 'crude',
                       'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)': 'fge'}

# Only the COMMODITY column is recoded; values without an entry in the mapping stay as they are.
incidents = incidents.replace({'COMMODITY': new_names_incidents})
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
3341,39084,2018-04-21 10:30:00,"CRIMSON GULF, LLC",crude,False,False,29.260833,-89.199167
342,4906,2011-01-06 17:20:00,EXXONMOBIL PIPELINE CO,hvl,False,False,27.552918,-97.922826
1276,2552,2013-06-18 22:30:00,COLONIAL PIPELINE CO,non-hvl,False,False,39.673538,-76.125691


The index column has a funny name, but that name gets dropped when writing the sample to .feather.

### Merge and save

In [72]:
# Calendar year of each incident, taken from its local timestamp; YEAR is one of the keys used to merge
# the incident counts into the sample.
incidents['YEAR'] = incidents['LOCAL_DATETIME'].dt.year

All incidents

In [73]:
# Number of incidents per operator, year and commodity.
incident_counts = (
    incidents
    .groupby(['OPERATOR_ID', 'YEAR', 'COMMODITY'])
    .size()
    .reset_index(name='INCIDENTS')
)

# Both sides of the upcoming merge must be unique on the merge keys; otherwise the merge would multiply rows.
assert not incident_counts.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY']).any()
assert not sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY']).any()

# The preview has to be the last expression of the cell to be displayed; in the middle of
# the cell it was evaluated and silently discarded.
incident_counts.sample(3)

In [74]:
# Left join: operator/year/commodity rows without any recorded incident are kept ...
sample = pd.merge(sample, incident_counts, how='left', on=['OPERATOR_ID', 'YEAR', 'COMMODITY'])
# ... and receive a count of zero instead of a missing value.
sample['INCIDENTS'] = sample['INCIDENTS'].fillna(0)
# The merge must not have duplicated any row.
assert not sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY']).any()
sample.head(10)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT,INCIDENTS
0,300,2017,"PLAINS PIPELINE, L.P.",crude,2101.06,Plains All American Pipeline,18.0
1,300,2017,"PLAINS PIPELINE, L.P.",hvl,72.9,Plains All American Pipeline,0.0
2,300,2017,"PLAINS PIPELINE, L.P.",non-hvl,0.0,Plains All American Pipeline,0.0
3,300,2011,"PLAINS PIPELINE, L.P.",crude,1988.14,Plains All American Pipeline,12.0
4,300,2011,"PLAINS PIPELINE, L.P.",non-hvl,326.8,Plains All American Pipeline,0.0
5,300,2014,"PLAINS PIPELINE, L.P.",crude,2233.02,Plains All American Pipeline,34.0
6,300,2014,"PLAINS PIPELINE, L.P.",non-hvl,0.0,Plains All American Pipeline,0.0
7,300,2012,"PLAINS PIPELINE, L.P.",crude,2085.22,Plains All American Pipeline,25.0
8,300,2012,"PLAINS PIPELINE, L.P.",non-hvl,265.17,Plains All American Pipeline,1.0
9,300,2013,"PLAINS PIPELINE, L.P.",crude,2138.0,Plains All American Pipeline,20.0


All significant incidents

In [75]:
# Number of incidents flagged as SIGNIFICANT per operator, year and commodity.
significant_incident_counts = (
    incidents.loc[incidents['SIGNIFICANT'].eq(True)]
    .groupby(['OPERATOR_ID', 'YEAR', 'COMMODITY'])
    .size()
    .reset_index(name='SIGNIFICANT_INCIDENTS')
)
significant_incident_counts.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,SIGNIFICANT_INCIDENTS
414,31684,2018,crude,2
419,31720,2014,crude,1
257,30782,2017,crude,3


In [76]:
# Left join: rows without any significant incident are kept and receive a count of zero below.
sample = sample.merge(significant_incident_counts, on=['OPERATOR_ID', 'YEAR', 'COMMODITY'], how='left')
sample['SIGNIFICANT_INCIDENTS'] = sample['SIGNIFICANT_INCIDENTS'].fillna(value=0)

# Same post-merge check as after merging the overall incident counts: no row may have been duplicated.
assert not sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY']).any()
# Significant incidents are a subset of all incidents counted under the same keys,
# so their count can never exceed the overall count.
assert (sample['SIGNIFICANT_INCIDENTS'] <= sample['INCIDENTS']).all()
sample.head(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS
0,300,2017,"PLAINS PIPELINE, L.P.",crude,2101.06,Plains All American Pipeline,18.0,9.0
1,300,2017,"PLAINS PIPELINE, L.P.",hvl,72.9,Plains All American Pipeline,0.0,0.0
2,300,2017,"PLAINS PIPELINE, L.P.",non-hvl,0.0,Plains All American Pipeline,0.0,0.0
3,300,2011,"PLAINS PIPELINE, L.P.",crude,1988.14,Plains All American Pipeline,12.0,7.0
4,300,2011,"PLAINS PIPELINE, L.P.",non-hvl,326.8,Plains All American Pipeline,0.0,0.0


In [77]:
# Write the finished sample to ../preprocessed_data; the file name carries the date on which the notebook was run.
sample.to_feather(f'../preprocessed_data/sample_{today}.feather')