# 6. Create preliminary sample

We grab the 150 operators (see `sample_len` below) with the largest network of pipelines (in any year) and obtain the most recent company name.

Most recent file versions used in this workbook:

In [1]:
# Pre-processed, date-stamped feather inputs; the first three are loaded further down in this notebook.
pipeline_2010_selected_file = '../preprocessed_data/pipelines_2010_selected_2019-08-24.feather'
incidents_selected_file = '../preprocessed_data/incidents_selected_2019-08-22.feather'
largest_observations_file = '../preprocessed_data/largest_companies_2019-09-01.feather'

# Raw feather files; pipelines_2010_raw is the default `file` argument of find_info().
pipelines_2010_raw = '../data/pipelines_2010_2019-08-11.feather'
incidents_raw = '../data/incidents_2019-08-11.feather'

# CSV that carries a PARENT company for each OPERATOR_ID.
parent_companies_file = '../input/largest_companies_w_parents_2019-08-31.csv'

In [2]:
# Number of rows taken from the largest-operators table; later cells assert
# that exactly this many unique operators end up in the sample.
sample_len = 150

## Setup

In [3]:
import pandas as pd
import numpy as np
from datetime import date

# ISO-formatted run date; used as a suffix in the output file names written at the end.
today = date.today().isoformat()

In [4]:
# Raise the pandas display option for the maximum number of rows to 75.
pd.options.display.max_rows = 75

# Load data

In [5]:
# Load the selected pipeline table and preview three random rows.
pipelines_2010 = pd.read_feather(pipeline_2010_selected_file)
pipelines_2010.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE
3573,39065,2018,HESS ND,HVL,2.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.01,0.0,24.44,0.0,11.211566
1432,31933,2014,CALCASIEU REFINING COMPANY,Refined and/or Petroleum Product (non-HVL),5.4,0.0,0.0,0.0,0.0,0.0,0.0,5.4,0.0,0.0,0.0,0.0,35.0
3402,31723,2018,EXXONMOBIL FUELS AND LUBRICANTS COMPANY,Refined and/or Petroleum Product (non-HVL),23.413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [6]:
# Load the selected incidents table and preview three random rows.
incidents = pd.read_feather(incidents_selected_file)
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ONSHORE
971,32532,2012-10-12 08:15:00,"ENERGY XXI PIPELINE, LLC",CRUDE OIL,False,False,29.96887,-89.25292,True
3792,4906,2019-05-29 13:15:00,EXXONMOBIL PIPELINE CO,CRUDE OIL,False,True,30.010793,-90.860891,True
156,32288,2010-07-06 09:45:00,"WHITE CLIFFS PIPELINE, LLC",CRUDE OIL,False,False,36.01526,-96.75729,True


In [7]:
# Load the operator -> parent company table.
parents = pd.read_csv(parent_companies_file)
# Preview three random rows from the first `sample_len` rows. Using the shared
# constant instead of a repeated literal keeps this preview in step with the
# sample size used everywhere else in the notebook.
parents[:sample_len].sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES,PARENT
119,32107,2016,"ENLINK NGL PIPELINE, LP",111.089,Global Infrastructure Management
55,300,2011,"PLAINS PIPELINE, L.P.",326.8,Plains GP Holding
88,2714,2014,"DOMINION ENERGY TRANSMISSION, INC.",166.995,Dominion Energy


In [8]:
# Show the dtypes before and after casting OPERATOR_ID to str.
# The pipeline table's OPERATOR_ID is compared against string literals further down,
# so this cast presumably aligns the merge key dtypes — TODO confirm.
print(parents.dtypes)
parents['OPERATOR_ID'] = parents['OPERATOR_ID'].astype(str)
print(parents.dtypes)

OPERATOR_ID      int64
YEAR             int64
NAME            object
TOTAL_MILES    float64
PARENT          object
dtype: object
OPERATOR_ID     object
YEAR             int64
NAME            object
TOTAL_MILES    float64
PARENT          object
dtype: object


## 6.1 Functions to reference from raw data

In [9]:
from functools import partial

def find_info(OPERATOR_ID, info_col:str, file = pipelines_2010_raw, id_col = 'OPERATOR_ID', year_col = 'REPORT_YEAR'):
    """Look up the values one operator reported in ``info_col`` and when.

    Parameters
    ----------
    OPERATOR_ID : str or int
        Operator identifier; converted with ``int()`` before it is compared
        against ``id_col``.
    info_col : str
        Column whose distinct values are collected (e.g. a name or address column).
    file : str
        Path of the feather file to read.
    id_col : str
        Column holding the operator identifier.
    year_col : str
        Column holding the report year.

    Returns
    -------
    dict
        ``{value: {'start_year': first_year, 'end_year': last_year}}`` with one
        entry per distinct non-missing value, ordered by value. Empty if the
        operator does not occur in the file.
    """
    df = pd.read_feather(file)

    # Restrict to this operator's rows *before* computing the year range.
    # Otherwise a value shared with other operators (e.g. a common street
    # address) would pick up the years reported by those other operators.
    operator_rows = df.loc[df[id_col] == int(OPERATOR_ID)]

    result = {}
    # groupby sorts the keys and skips missing values in info_col.
    for value, years in operator_rows.groupby(info_col)[year_col]:
        result[value] = {'start_year': years.min(), 'end_year': years.max()}
    return result

# find_info pre-bound to the PARTA4STREET column; called here for operator 300.
find_address = partial(find_info, info_col = 'PARTA4STREET')
find_address('300')

{'333 CLAY STREET': {'start_year': 2010, 'end_year': 2011},
 '333 CLAY STREET, SUITE 1600': {'start_year': 2012, 'end_year': 2018}}

In [10]:
# find_info pre-bound to the PARTA2NAMEOFCOMP column; called here for operator 300.
find_names = partial(find_info, info_col = 'PARTA2NAMEOFCOMP')
find_names('300')

{'PLAINS PIPELINE, L.P.': {'start_year': 2010, 'end_year': 2018}}

## 6.2 Largest operators - add parents

We use the list of the largest operators that we have generated in workbook 5.

All parent companies are retrieved from LexisNexis. Where the search yielded unclear results, we consult the address column in the original .xls file by FERC. In some rare cases, we did additional research (mostly company documents) to resolve conflicts.

In [11]:
# Read the largest-operators table and keep its first `sample_len` rows.
largest_pipeline_operators = pd.read_feather(largest_observations_file)
sample = largest_pipeline_operators.head(sample_len)

sample.head(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES
0,31618,2018.0,ENTERPRISE PRODUCTS OPERATING LLC,8311.369
1,32109,2018.0,"ONEOK NGL PIPELINE, LLC",4756.61
2,22610,2014.0,"MAGELLAN PIPELINE COMPANY, LP",4505.5
3,2552,2015.0,COLONIAL PIPELINE CO,4500.92
4,31684,2017.0,PHILLIPS 66 PIPELINE LLC,4474.1


In [12]:
parents.head()

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES,PARENT
0,31618,2018,ENTERPRISE PRODUCTS OPERATING LLC,8311.369,Enterprise Products Partners
1,32109,2018,"ONEOK NGL PIPELINE, LLC",4756.61,ONEOK
2,22610,2014,"MAGELLAN PIPELINE COMPANY, LP",4505.5,Magellan Midstream Partners
3,2552,2015,COLONIAL PIPELINE CO,4500.92,Colonial Pipeline
4,31684,2017,PHILLIPS 66 PIPELINE LLC,4474.1,Phillips 66


In [13]:
# Each sampled operator must have exactly one row in `parents` ...
assert parents['OPERATOR_ID'].isin(sample['OPERATOR_ID']).sum() == sample_len
# ... and none of those rows may be missing its PARENT.
assert not parents.loc[parents['OPERATOR_ID'].isin(sample['OPERATOR_ID']), 'PARENT'].isna().any()

## 6.3 Finalize sample

In [14]:
# Keep every pipeline record that belongs to one of the sampled operators.
sample = pipelines_2010[pipelines_2010['OPERATOR_ID'].isin(sample['OPERATOR_ID'])].copy()
# All sampled operators must be present in the pipeline data.
assert sample['OPERATOR_ID'].nunique(dropna=False) == sample_len
sample.head(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE
0,300,2017,"PLAINS PIPELINE, L.P.",HVL,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,0.0,5.004173
1,300,2017,"PLAINS PIPELINE, L.P.",Refined and/or Petroleum Product (non-HVL),0.0,0.0,3.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
2,395,2017,AMOCO OIL CO,Refined and/or Petroleum Product (non-HVL),662.3,0.0,0.6,15.0,0.0,0.9,1.5,18.5,58.2,5.1,0.7,0.0,34.61194


### Drop offshore incidents

In [15]:
# Keep onshore incidents only, renumber the rows, and drop the now-constant flag column.
incidents = (
    incidents[incidents['ONSHORE'] == True]
    .reset_index(drop=True)
    .drop(columns=['ONSHORE'])
)

### 6.3.1 Merge in parents

Ensure all parents are there.

In [16]:
# Attach the parent company to every pipeline record (inner join on the operator id).
sample = sample.merge(
    parents[['OPERATOR_ID', 'PARENT']],
    how='inner',
    on='OPERATOR_ID',
)
# No operator may be lost by the join.
assert sample['OPERATOR_ID'].nunique(dropna=False) == sample_len
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT
1535,32543,2010,"DENBURY GREEN PIPELINE-TEXAS, LLC",CO2,50.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,119.43,0.0,5.0,Denbury Resources
1163,31570,2016,TESORO HIGH PLAINS PIPELINE COMPANY LLC,Crude Oil,16.1,4.4,0.0,0.0,155.2,39.8,38.8,73.9,72.2,0.0,26.2,0.0,45.667323,Marathon Petroleum
1349,32080,2012,"CCPS TRANSPORTATION, LLC",Crude Oil,209.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,35.0,Enbridge


### 6.3.2 Merge in incidents (by type) 

#### See if there are any conflicts between observations in the same year

In [17]:
# Check that every (OPERATOR_ID, YEAR) pair carries a single NAME.
# NOTE(review): this assertion fails on the current data (see the AssertionError in the
# saved output), so a top-to-bottom "Run All" stops here; the following cells inspect
# the offending rows.
assert len(sample[['OPERATOR_ID', 'YEAR', 'NAME']].drop_duplicates()) == len(sample[['OPERATOR_ID', 'YEAR']].drop_duplicates())

AssertionError: 

Let's look at this further. We keep the first instance of every ID-Year combination and see which entries don't fit this pattern.

In [18]:
# Distinct (OPERATOR_ID, YEAR, NAME) triples; the rows flagged by duplicated() are the
# second and later names recorded for the same operator-year.
combinations = sample[['OPERATOR_ID', 'YEAR', 'NAME']].drop_duplicates()
combinations.loc[combinations.duplicated(subset=['OPERATOR_ID', 'YEAR'])]

Unnamed: 0,OPERATOR_ID,YEAR,NAME
545,19319,2011,"TPC GROUP, LLC"
915,30777,2010,Motiva Enterprises LLC Convent Refinery
916,30777,2010,MOTIVA ENTERPRISE LLC


In [19]:
# All rows for operator 19319 in 2011, one of the operator-years with more than one name.
sample.loc[(sample['OPERATOR_ID'] == '19319') & (sample['YEAR'] == 2011)]

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT
544,19319,2011,TEXAS PETROCHEMICALS CORP,HVL,105.24,0.0,0.0,1.72,0.0,0.3,0.0,0.0,3.5,0.0,0.0,0.0,42.210145,TPC Group
545,19319,2011,"TPC GROUP, LLC",Refined and/or Petroleum Product (non-HVL),17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,TPC Group


In [20]:
# All rows for operator 30777 in 2010, the other operator-year with more than one name.
sample.loc[(sample['OPERATOR_ID'] == '30777') & (sample['YEAR'] == 2010)]

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT
914,30777,2010,Motiva Enterprises LLC,Crude Oil,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,45.0,Saudi Refining Inc
915,30777,2010,Motiva Enterprises LLC Convent Refinery,HVL,21.0,0.0,0.0,0.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,Saudi Refining Inc
916,30777,2010,MOTIVA ENTERPRISE LLC,Refined and/or Petroleum Product (non-HVL),44.0,21.0,0.0,5.0,12.0,0.0,2.0,9.0,0.0,6.0,0.0,0.0,48.529412,Saudi Refining Inc


It does not appear that there are major conflicts in our dataset, just alternative spellings.

### 6.3.3 Clean commodity names - pipelines

In [21]:
# Legacy code that could be used to create wide format data

# sample = sample.pivot_table(index=['OPERATOR_ID', 'YEAR', 'NAME'],columns='COMMODITY',values='MILES',fill_value=0)
# sample = sample.reset_index()
# sample.head(10)

In [22]:
# Commodity labels present in the pipeline sample.
sample['COMMODITY'].unique()

array(['HVL', 'Refined and/or Petroleum Product (non-HVL)', 'Crude Oil',
       'CO2', 'Fuel Grade Ethanol (dedicated system)'], dtype=object)

In [23]:
# Short, lower-case codes for the commodity labels used in the pipeline data.
new_names_pipelines = {
    'Crude Oil': 'crude',
    'CO2': 'co2',
    'Fuel Grade Ethanol (dedicated system)': 'fge',
    'HVL': 'hvl',
    'Refined and/or Petroleum Product (non-HVL)': 'non-hvl',
}

# Apply the mapping to the COMMODITY column only.
sample = sample.replace(to_replace={'COMMODITY': new_names_pipelines})
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT
1754,39848,2018,LAMBDA ENERGY RESOURCES LLC,hvl,89.5,0.0,0.0,0.0,0.0,0.0,225.0,0.0,0.0,0.0,0.0,0.0,45.0,Lambda Energy Resources
1217,31663,2014,NAVAJO NATION OIL AND GAS COMPANY,crude,87.51,0.0,0.0,0.0,87.34,0.0,0.0,0.0,0.0,0.0,0.165,0.0,64.886864,Navajo Nation Oil and Gas
1001,31045,2011,"GENESIS PIPELINE USA, L.P.",crude,118.31,0.0,0.0,0.0,0.0,79.93,87.25,0.0,49.23,10.73,0.0,0.0,42.767016,Genesis Energy


### 6.3.4 Clean commodity names - incidents

In [24]:
# Commodity labels present in the incidents table.
incidents['COMMODITY'].unique()

array(['REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS',
       'CO2 (CARBON DIOXIDE)',
       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS',
       'CRUDE OIL',
       'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)'],
      dtype=object)

In [25]:
# Map the verbose incident commodity labels onto the same short codes that are
# used for the pipeline data, so both tables can be merged on COMMODITY.
new_names_incidents = {
    'REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS': 'non-hvl',
    'CO2 (CARBON DIOXIDE)': 'co2',
    'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS': 'hvl',
    'CRUDE OIL': 'crude',
    # Biofuel / ethanol incidents belong with the pipeline category
    # 'Fuel Grade Ethanol (dedicated system)', which is coded 'fge'. Coding them as
    # 'hvl' would count them as HVL incidents after the merge and leave 'fge'
    # pipelines without any incidents.
    # NOTE(review): confirm that 'fge' is the intended counterpart for this label.
    'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)': 'fge',
}

incidents = incidents.replace({'COMMODITY': new_names_incidents})
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
1412,30829,2013-11-25 15:30:00,ENTERPRISE CRUDE PIPELINE LLC,crude,False,False,34.2391,-97.5136
935,26094,2012-09-11 11:50:00,NUSTAR TERMINALS OPERATIONS PARTNERSHIP L. P.,non-hvl,False,False,40.608277,-74.221761
2698,32537,2014-10-02 12:00:00,WYOMING PIPELINE COMPANY,crude,False,False,43.783069,-104.407283


The index column has a funny name, but that name gets dropped when writing the sample to .feather.

### Merge and save

In [26]:
# Calendar year of each incident; used below as a grouping and merge key together with
# OPERATOR_ID and COMMODITY.
incidents['YEAR'] = incidents['LOCAL_DATETIME'].dt.year

All incidents

In [27]:
# Number of incidents per operator, year and commodity.
incident_counts = incidents.groupby(['OPERATOR_ID', 'YEAR', 'COMMODITY']).size().reset_index(name='INCIDENTS')
# NOTE(review): this preview is not the last expression of the cell, so its result is not displayed.
incident_counts.sample(3)

# The merge key must be unique on both sides, otherwise the merge below would multiply rows.
assert len(incident_counts.loc[incident_counts.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0
assert len(sample.loc[sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0

In [28]:
# Left-join the incident counts; operator-year-commodity rows without a match get 0 incidents.
sample = sample.merge(
    incident_counts,
    how='left',
    on=['OPERATOR_ID', 'YEAR', 'COMMODITY'],
)
sample['INCIDENTS'] = sample['INCIDENTS'].fillna(0)
# The join must not have introduced duplicate keys.
assert not sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY']).any()
sample.head(10)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT,INCIDENTS
0,300,2017,"PLAINS PIPELINE, L.P.",hvl,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,0.0,5.004173,Plains GP Holding,0.0
1,300,2017,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,3.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,Plains GP Holding,0.0
2,300,2011,"PLAINS PIPELINE, L.P.",non-hvl,326.8,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,0.0
3,300,2014,"PLAINS PIPELINE, L.P.",non-hvl,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,0.0
4,300,2012,"PLAINS PIPELINE, L.P.",non-hvl,265.17,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,1.0
5,300,2010,"PLAINS PIPELINE, L.P.",non-hvl,62.28,0.0,0.0,0.0,244.98,4.76,0.0,0.0,2.86,4.05,0.0,0.0,63.579778,Plains GP Holding,0.0
6,300,2018,"PLAINS PIPELINE, L.P.",hvl,29.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.99,0.0,5.004167,Plains GP Holding,0.0
7,300,2018,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,0.0
8,300,2016,"PLAINS PIPELINE, L.P.",hvl,121.23,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,9.08,0.0,5.022002,Plains GP Holding,0.0
9,300,2016,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,1.0


All significant incidents

In [29]:
# Number of incidents flagged SIGNIFICANT per operator, year and commodity.
significant_incident_counts = (
    incidents.loc[incidents['SIGNIFICANT'] == True]
    .groupby(['OPERATOR_ID', 'YEAR', 'COMMODITY'])
    .size()
    .reset_index(name='SIGNIFICANT_INCIDENTS')
)
significant_incident_counts.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,SIGNIFICANT_INCIDENTS
490,32147,2010,non-hvl,2
389,31672,2015,co2,5
41,1248,2011,crude,3


In [30]:
# Left-join the significant-incident counts; rows without a match get 0.
sample = sample.merge(
    significant_incident_counts,
    how='left',
    on=['OPERATOR_ID', 'YEAR', 'COMMODITY'],
)
sample['SIGNIFICANT_INCIDENTS'] = sample['SIGNIFICANT_INCIDENTS'].fillna(0)
sample.head(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS
0,300,2017,"PLAINS PIPELINE, L.P.",hvl,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,0.0,5.004173,Plains GP Holding,0.0,0.0
1,300,2017,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,3.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,Plains GP Holding,0.0,0.0
2,300,2011,"PLAINS PIPELINE, L.P.",non-hvl,326.8,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,0.0,0.0
3,300,2014,"PLAINS PIPELINE, L.P.",non-hvl,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,0.0,0.0
4,300,2012,"PLAINS PIPELINE, L.P.",non-hvl,265.17,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,1.0,1.0


In [31]:
# Write the final sample and the renamed incidents table to feather files stamped with the run date.
sample.to_feather(f'../preprocessed_data/sample_{today}.feather')
incidents.to_feather(f'../preprocessed_data/incidents_renamed_{today}.feather')