# 6. Create preliminary sample

We take the operators with the largest network of pipelines (in any year) and obtain the most recent company name for each. The number of operators is set by `sample_len` below (currently 200).

Most recent file versions used in this workbook:

In [1]:
# --- Raw tables ---
pipelines_2004_raw = '../data/pipelines_2004_2019-08-11.feather'
pipelines_2010_raw = '../data/pipelines_2010_2019-08-11.feather'
incidents_raw = '../data/incidents_2019-08-11.feather'

# --- Preprocessed tables ---
pre_sample_file = '../preprocessed_data/pre_sample_2019-09-08.feather'
incidents_selected_file = '../preprocessed_data/incidents_selected_2019-08-22.feather'
largest_observations_file = '../preprocessed_data/largest_companies_2019-09-08.feather'

# --- Largest operators with their parent companies (see section 6.2) ---
parent_companies_file = '../input/largest_companies_w_parents_2019-09-09.csv'

# --- Name/address reference table; default input of find_info ---
# NOTE(review): section 6.1 writes a names table stamped with today's date, while
# this path pins the 2019-09-09 version - confirm the pinned file is the one intended.
names_table = '../preprocessed_data/names_table_2019-09-09.feather'

In [2]:
# Number of rows taken from the top of the largest-operators table; the
# assertions further down expect exactly this many distinct operators.
sample_len = 200

## Setup

In [3]:
import pandas as pd
import numpy as np
from datetime import date

# ISO-formatted run date, interpolated into the names of the files this notebook writes.
today = date.today().isoformat()

In [4]:
# pd.options.display.max_rows = 200

# Load data

In [5]:
# Read the preprocessed pipeline table and show three random rows as a sanity check.
pipelines = pd.read_feather(pre_sample_file)
pipelines.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,AVG_AGE,PERC_OFFSHORE
2042,12913,2013,"CALUMET SUPERIOR, LLC",non-hvl,5.572,0.0,0.0,0.0,3.349,0.0,0.0,0.0,0.525,0.0,0.0,59.579246,0.0
1389,32473,2014,OP RENO LLC,non-hvl,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1520,99037,2014,SOUTH WILMINGTON-PIPELINE,non-hvl,28.6,0.0,0.0,0.0,0.0,0.0,28.64,0.0,0.0,0.0,0.0,45.0,0.0


In [6]:
# Read the selected incidents and show three random rows as a sanity check.
incidents = pd.read_feather(incidents_selected_file)
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ONSHORE
282,22610,2010-11-12 09:48:00,"MAGELLAN PIPELINE COMPANY, LP",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,True,44.77328,-93.35844,True
2056,22610,2015-04-29 09:00:00,"MAGELLAN PIPELINE COMPANY, LP",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,False,35.97974,-96.00985,True
3746,31618,2019-04-29 07:00:00,ENTERPRISE PRODUCTS OPERATING LLC,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,False,False,36.4978,-90.2871,True


In [7]:
# Read the largest-operators list that carries the PARENT column.
parents = pd.read_csv(parent_companies_file)
# NOTE(review): only the first 150 rows are sampled for display; the reason for
# this cut-off is not visible here - confirm whether it is still needed.
parents[:150].sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES,PARENT
70,9175,2009,JAYHAWK PIPELINE LLC,331.0,CHS
122,39368,2016,CRESTWOOD MIDSTREAM PARTNERS LP,129.702,Crestwood Equity Partners
111,39083,2013,ARROW PIPELINE,146.74,Crestwood Equity Partners


In [8]:
# Cast the operator key from int64 to string and print the dtypes before and
# after the cast (presumably to match the key dtype of the tables it is joined
# with later - confirm).
print(parents.dtypes)
parents = parents.astype({'OPERATOR_ID': str})
print(parents.dtypes)

OPERATOR_ID      int64
YEAR             int64
NAME            object
TOTAL_MILES    float64
PARENT          object
dtype: object
OPERATOR_ID     object
YEAR             int64
NAME            object
TOTAL_MILES    float64
PARENT          object
dtype: object


## 6.1 Reference table for company names

In [9]:
# Bring the two raw pipeline tables to one layout: OPERATOR_ID / YEAR / NAME / STREET.
columns_2010 = {'OPERATOR_ID': 'OPERATOR_ID', 'REPORT_YEAR': 'YEAR',
                'PARTA2NAMEOFCOMP': 'NAME', 'PARTA4STREET': 'STREET'}
columns_2004 = {'OPERATOR_ID': 'OPERATOR_ID', 'YR': 'YEAR',
                'NAME': 'NAME', 'OFSTREET': 'STREET'}

names_1 = pd.read_feather(pipelines_2010_raw)[list(columns_2010)].rename(columns=columns_2010)
names_2 = pd.read_feather(pipelines_2004_raw)[list(columns_2004)].rename(columns=columns_2004)

# Stack both tables under a fresh 0..n-1 index and store the result under today's date.
names = pd.concat([names_1, names_2], ignore_index=True)
names.to_feather(f'../preprocessed_data/names_table_{today}.feather')

In [10]:
from functools import partial

def find_info(OPERATOR_ID, info_col:str, file = names_table, id_col = 'OPERATOR_ID', year_col = 'YEAR'):
    """Look up every value an operator reported in ``info_col`` and the years it was reported.

    Parameters
    ----------
    OPERATOR_ID : str or int
        Operator identifier; converted with ``int()`` before it is compared
        against ``id_col``.
    info_col : str
        Column whose distinct values are collected (e.g. 'NAME' or 'STREET').
    file : str
        Path of the feather file holding the reference table. The file is
        re-read on every call.
    id_col : str
        Column holding the operator identifier.
    year_col : str
        Column holding the reporting year.

    Returns
    -------
    dict
        ``{value: {'start_year': first_year, 'end_year': last_year}}`` with one
        entry per distinct non-missing value, in sorted order of the values.
        Empty if the operator does not occur in the table.
    """
    df = pd.read_feather(file)

    # Restrict to this operator BEFORE computing the year range. Matching on the
    # value alone would also pick up the years of other operators that happen to
    # share the same name or address.
    operator_rows = df.loc[df[id_col] == int(OPERATOR_ID)]

    # Rows without a value carry no information, and mixing missing values with
    # strings cannot be sorted, so they are left out.
    operator_rows = operator_rows.dropna(subset=[info_col])

    result = {}
    for value, years in operator_rows.groupby(info_col, sort=True)[year_col]:
        result[value] = {'start_year': years.min(), 'end_year': years.max()}
    return result

# Address lookup: find_info with the info column fixed to 'STREET'.
find_address = partial(find_info, info_col = 'STREET')
# Example call for operator 300.
find_address('300')

{'333 CLAY ST., SUITE 1600': {'start_year': 2004, 'end_year': 2009},
 '333 CLAY STREET': {'start_year': 2010, 'end_year': 2011},
 '333 CLAY STREET, SUITE 1600': {'start_year': 2012, 'end_year': 2018}}

In [11]:
# Company-name lookup: find_info with the info column fixed to 'NAME'.
find_names = partial(find_info, info_col = 'NAME')
# Example call for operator 300.
find_names('300')

{'PLAINS PIPELINE, L.P.': {'start_year': 2004, 'end_year': 2018}}

## 6.2 Largest operators - add parents

We use the list of the largest operators that we have generated in workbook 5.

All parent companies are retrieved from LexisNexis. Where the search yields unclear results, we consult the address column in the original .xls file by FERC. In a few rare cases, we do additional research (mostly in company documents) to resolve conflicts.

In [12]:
# Load the largest-operators table and keep its first `sample_len` rows.
largest_pipeline_operators = pd.read_feather(largest_observations_file)
sample = largest_pipeline_operators.head(sample_len)

sample.head()

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES
0,31618,2018.0,ENTERPRISE PRODUCTS OPERATING LLC,8311.369
1,32109,2018.0,"ONEOK NGL PIPELINE, LLC",4756.61
2,31684,2005.0,PHILLIPS 66 PIPELINE LLC,4691.0
3,22610,2014.0,"MAGELLAN PIPELINE COMPANY, LP",4505.5
4,2552,2015.0,COLONIAL PIPELINE CO,4500.92


In [13]:
# Show the first parent rows for comparison with the largest-operators table above.
parents.head()

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES,PARENT
0,31618,2018,ENTERPRISE PRODUCTS OPERATING LLC,8311.369,Enterprise Products Partners
1,32109,2018,"ONEOK NGL PIPELINE, LLC",4756.61,ONEOK
2,31684,2005,PHILLIPS 66 PIPELINE LLC,4691.0,Phillips 66
3,22610,2014,"MAGELLAN PIPELINE COMPANY, LP",4505.5,Magellan Midstream Partners
4,2552,2015,COLONIAL PIPELINE CO,4500.92,Colonial Pipeline


In [14]:
# Parent rows whose operator belongs to the sample. `sample` is used as is: it
# was already cut to `sample_len` rows when it was created, and slicing it again
# here could make the check fail if this cell is re-run after `sample` has been
# replaced by the operator-year panel further down.
sample_parents = parents.loc[parents['OPERATOR_ID'].isin(sample['OPERATOR_ID'])]

# Every sampled operator needs a parent row ...
assert len(sample_parents) == sample_len, (
    f'expected {sample_len} parent rows for the sample, found {len(sample_parents)}')
# ... and none of those rows may have an empty PARENT.
assert sample_parents['PARENT'].isna().sum() == 0, 'sample operators without a parent company'

## 6.3 Finalize sample

In [15]:
# Replace the operator list by all pipeline rows that belong to those operators.
selected_ids = sample['OPERATOR_ID']
in_sample = pipelines['OPERATOR_ID'].isin(selected_ids)
sample = pipelines.loc[in_sample].copy()

# None of the selected operators may be missing from the pipeline table.
assert sample['OPERATOR_ID'].nunique(dropna=False) == sample_len
sample.head(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,AVG_AGE,PERC_OFFSHORE
0,300,2017,"PLAINS PIPELINE, L.P.",hvl,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,5.004173,0.0
1,300,2017,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,3.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,0.0
2,395,2017,AMOCO OIL CO,non-hvl,662.3,0.0,0.6,15.0,0.0,0.9,1.5,18.5,58.2,5.1,0.7,34.61194,0.0


### Drop offshore incidents

In [16]:
# Keep onshore incidents only, then drop the ONSHORE flag and renumber the rows.
incidents = (
    incidents
    .loc[incidents['ONSHORE'].eq(True)]
    .drop(columns=['ONSHORE'])
    .reset_index(drop=True)
)

### 6.3.1 Merge in parents

Ensure all parents are there.

In [17]:
# Attach the parent company to every panel row (inner join on the operator key).
parent_lookup = parents.loc[:, ['OPERATOR_ID', 'PARENT']]
sample = pd.merge(sample, parent_lookup, on='OPERATOR_ID')

# An operator without a parent row would have been dropped by the inner join.
assert sample['OPERATOR_ID'].nunique(dropna=False) == sample_len
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,AVG_AGE,PERC_OFFSHORE,PARENT
2397,31878,2018,BRIDGER PIPELINE LLC,crude,51.0,0.0,0.0,0.0,73.0,13.0,114.0,95.0,6.0,22.0,129.0,33.274336,0.0,Bridger Pipe Line
498,7660,2009,HUNT CRUDE OIL SUPPLY CO,crude,63.0,0.0,0.0,0.0,0.0,31.0,126.0,0.0,46.0,7.0,0.0,41.095238,0.0,Hunt Crude Oil
1008,19319,2009,TEXAS PETROCHEMICALS CORP,non-hvl,5.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,TPC Group


### 6.3.2 Merge in incidents (by type) 

#### See if there are any conflicts between observations in the same year

In [18]:
# If an operator reported two different names for the same year and commodity,
# adding NAME to the key would produce more distinct combinations.
n_keys = len(sample[['OPERATOR_ID', 'COMMODITY', 'YEAR']].drop_duplicates())
n_keys_with_name = len(sample[['OPERATOR_ID', 'YEAR', 'COMMODITY', 'NAME']].drop_duplicates())
assert n_keys_with_name == n_keys

### 6.3.3 Clean commodity names - incidents

In [19]:
# List the raw commodity labels used in the incident table before recoding them.
incidents['COMMODITY'].unique()

array(['REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS',
       'CO2 (CARBON DIOXIDE)',
       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS',
       'CRUDE OIL',
       'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)'],
      dtype=object)

In [20]:
# Long incident commodity labels -> short codes.
# NOTE(review): biofuel / ethanol incidents are folded into 'hvl' - confirm that
# this matches the commodity coding of the pipeline table.
new_names_incidents = {
    'REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS':
        'non-hvl',
    'CO2 (CARBON DIOXIDE)':
        'co2',
    'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS':
        'hvl',
    'CRUDE OIL':
        'crude',
    'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)':
        'hvl',
}

# Recode the COMMODITY column only; labels missing from the mapping stay as they are.
incidents = incidents.assign(COMMODITY=incidents['COMMODITY'].replace(new_names_incidents))
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
1306,26134,2013-08-01 20:00:00,EXXONMOBIL OIL CORP - WEST COAST,crude,False,False,35.431862,-119.68993
2409,31618,2016-03-06 04:45:00,ENTERPRISE PRODUCTS OPERATING LLC,hvl,False,False,32.18726,-94.25998
3020,30829,2017-08-16 09:00:00,ENTERPRISE CRUDE PIPELINE LLC,crude,False,False,32.75599,-102.719168


The index column has a funny name, but that name gets dropped when writing the sample to .feather.

### Merge and save

In [21]:
# Calendar year of the incident; used as part of the key when counting and merging below.
incidents['YEAR'] = incidents['LOCAL_DATETIME'].dt.year

All incidents

In [22]:
# Number of incidents per operator, year and commodity.
incident_counts = incidents.groupby(['OPERATOR_ID', 'YEAR', 'COMMODITY']).size().reset_index(name='INCIDENTS')

# Both tables must be unique on the merge key, otherwise the merge that follows
# would multiply rows.
assert len(incident_counts.loc[incident_counts.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0
assert len(sample.loc[sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0

# The preview goes last: an expression in the middle of a cell is evaluated but
# never displayed.
incident_counts.sample(3)

In [23]:
merge_keys = ['OPERATOR_ID', 'YEAR', 'COMMODITY']

# Row count before and after is printed to show that the left join keeps every panel row.
print(len(sample))
sample = pd.merge(sample, incident_counts, how='left', on=merge_keys)
# Panel rows without a matching count get 0 instead of NaN.
sample = sample.assign(INCIDENTS=sample['INCIDENTS'].fillna(0))
assert not sample.duplicated(subset=merge_keys).any()
print(len(sample))

3200
3200


All significant incidents

In [24]:
# Same count as for all incidents, restricted to rows flagged as SIGNIFICANT.
is_significant = incidents['SIGNIFICANT'].eq(True)
significant_incident_counts = (
    incidents.loc[is_significant]
    .groupby(['OPERATOR_ID', 'YEAR', 'COMMODITY'])
    .size()
    .reset_index(name='SIGNIFICANT_INCIDENTS')
)
significant_incident_counts.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,SIGNIFICANT_INCIDENTS
225,2731,2012,non-hvl,1
285,31130,2013,hvl,1
301,31174,2015,crude,2


In [25]:
# Merge the significant-incident counts into the panel. The row count is printed
# before and after, and additionally asserted, so that a merge which changes the
# panel cannot go unnoticed.
rows_before = len(sample)
print(rows_before)
sample = sample.merge(significant_incident_counts, on=['OPERATOR_ID', 'YEAR', 'COMMODITY'], how='left')
# Panel rows without a matching count get 0 instead of NaN.
sample['SIGNIFICANT_INCIDENTS'] = sample['SIGNIFICANT_INCIDENTS'].fillna(value=0)
# Same guard as after the merge of the overall incident counts: the panel must
# stay unique on the merge key and keep its length.
assert len(sample.loc[sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0
assert len(sample) == rows_before, f'merge changed the row count: {rows_before} -> {len(sample)}'
print(len(sample))

3200
3200


In [26]:
# Store the finished panel and the recoded incident table, both stamped with today's date.
sample_out = f'../preprocessed_data/sample_{today}.feather'
incidents_out = f'../preprocessed_data/incidents_renamed_{today}.feather'

sample.to_feather(sample_out)
incidents.to_feather(incidents_out)