# 6. Create preliminary sample

We grab the 50 operators with the largest network of pipelines (in any year) and obtain the most recent company name.

Most recent file versions used in this workbook:

In [1]:
pipeline_2010_selected_file = '../preprocessed_data/pipelines_2010_selected_2019-08-24.feather'
incidents_selected_file = '../preprocessed_data/incidents_selected_2019-08-22.feather'
largest_observations_file = '../preprocessed_data/largest_companies_2019-08-22.feather'

pipelines_2010_raw = '../data/pipelines_2010_2019-08-11.feather'
incidents_raw = '../data/incidents_2019-08-11.feather'

In [2]:
sample_len = 75

## Setup

In [3]:
import pandas as pd
import numpy as np
from datetime import date

today = date.today().isoformat()

In [4]:
pd.options.display.max_rows = 75

# Load data

In [5]:
pipelines_2010 = pd.read_feather(pipeline_2010_selected_file)
pipelines_2010.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE
1953,31723,2012,EXXONMOBIL REFINING AND SUPPLY COMPANY,Crude Oil,11.296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3080,32355,2010,REGENCY LIQUIDS PIPELINE LLC,HVL,1.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.3,0.0,0.0,0.0,25.0
2363,31178,2013,XTO ENERGY INC,CO2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [6]:
incidents = pd.read_feather(incidents_selected_file)
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ONSHORE
2262,31174,2015-09-26 13:15:00,"SHELL PIPELINE CO., L.P.",CRUDE OIL,False,False,27.946944,-90.996944,False
2636,11551,2016-08-17 07:05:00,"DELEK LOGISTICS OPERATING, LLC.",CRUDE OIL,False,False,33.34219,-92.70125,True
2551,22610,2016-06-11 13:47:00,"MAGELLAN PIPELINE COMPANY, LP",CRUDE OIL,False,True,29.720014,-95.14234,True


## 6.1 Functions to reference from raw data

In [7]:
from functools import partial

def find_info(OPERATOR_ID, info_col:str, file = pipelines_2010_raw, id_col = 'OPERATOR_ID', year_col = 'REPORT_YEAR'):
    df = pd.read_feather(file)
    values = np.unique(df[df[id_col] == int(OPERATOR_ID)][info_col]).tolist()
    result = {}
    for value in values:
        start_year = df[df[info_col] == value][year_col].min()
        end_year = df[df[info_col] == value][year_col].max()
        result[value] = {'start_year': start_year, 'end_year': end_year}
    return(result)

find_address = partial(find_info, info_col = 'PARTA4STREET')
find_address('300')

{'333 CLAY STREET': {'start_year': 2010, 'end_year': 2011},
 '333 CLAY STREET, SUITE 1600': {'start_year': 2012, 'end_year': 2018}}

In [8]:
find_names = partial(find_info, info_col = 'PARTA2NAMEOFCOMP')
find_names('300')

{'PLAINS PIPELINE, L.P.': {'start_year': 2010, 'end_year': 2018}}

## 6.2 Largest operators - add parents

We use the list of the largest operators that we have generated in workbook 5.

In [9]:
largest_pipeline_operators = pd.read_feather(largest_observations_file)
sample = largest_pipeline_operators[:sample_len]

sample

Unnamed: 0,OPERATOR_ID,YEAR,NAME,TOTAL_MILES
0,31618,2018.0,ENTERPRISE PRODUCTS OPERATING LLC,8311.369
1,32109,2018.0,"ONEOK NGL PIPELINE, LLC",4756.61
2,22610,2014.0,"MAGELLAN PIPELINE COMPANY, LP",4505.5
3,2552,2015.0,COLONIAL PIPELINE CO,4500.92
4,31684,2017.0,PHILLIPS 66 PIPELINE LLC,4474.1
5,1845,2015.0,"BUCKEYE PARTNERS, LP",4298.29
6,18718,2010.0,SUNOCO PIPELINE L.P.,3466.74
7,22855,2013.0,"FLINT HILLS RESOURCES, LC",2757.45
8,10012,2018.0,NUSTAR PIPELINE OPERATING PARTNERSHIP L.P.,2645.3
9,15674,2016.0,PLANTATION PIPE LINE CO,2564.73


All parent companies are retrieved from LexisNexis, except where comments added.

In [10]:
from collections import namedtuple

company = namedtuple('company', 'OPERATOR_ID NAME PARENT')

parent_companies = [company('31618', 'ENTERPRISE PRODUCTS OPERATING LLC', 'Enterprise Products Partners'),
                    company('32109', 'ONEOK NGL PIPELINE, LLC', 'ONEOK'), 
                    company('22610', 'MAGELLAN PIPELINE COMPANY, LP', 'Magellan Midstream Partners'), 
                    company('2552', 'COLONIAL PIPELINE CO', 'Colonial Pipeline'), 
                    company('31684', 'PHILLIPS 66 PIPELINE LLC', 'Phillips 66'), 
                    company('1845', 'BUCKEYE PARTNERS, LP', 'Buckeye Partners'), 
                    company('18718', 'SUNOCO PIPELINE L.P.', 'Energy Transfer'), 
                    company('22855', 'FLINT HILLS RESOURCES, LC', 'Koch Industries'), 
                    company('10012', 'NUSTAR PIPELINE OPERATING PARTNERSHIP L.P.', 'NuStar Energy'), 
                    company('15674', 'PLANTATION PIPE LINE CO', 'Kinder Morgan'),
                    company('11169', 'ENBRIDGE ENERGY, LIMITED PARTNERSHIP', 'Enbridge'), 
                    company('32147', 'MARATHON PIPE LINE LLC', 'Marathon Petroleum'),
                    company('4906', 'EXXONMOBIL PIPELINE CO', 'Exxon Mobil'), 
                    company('18092', 'SFPP, LP', 'Kinder Morgan'), 
                    company('2731', 'CHEVRON PIPE LINE CO', 'Chevron'), 
                    company('4805', 'EXPLORER PIPELINE CO', 'Explorer Pipeline'), 
                    company('32099', 'ENERGY TRANSFER COMPANY', 'Energy Transfer'), 
                    company('31371', 'BUCKEYE DEVELOPMENT & LOGISTICS, LLC', 'Buckeye Partners'), 
                    company('30829', 'ENTERPRISE CRUDE PIPELINE LLC', 'Enterprise Products Partners'), 
                    company('12628', 'MOBIL PIPE LINE COMPANY', 'Exxon Mobil'), 
                    company('39205', 'DAPL-ETCO OPERATIONS MANAGEMENT, LLC', 'Energy Transfer'), 
                    company('39596', 'PERMIAN EXPRESS PARTNERS LLC', 'Energy Transfer'), 
                    company('25146', 'EQUISTAR CHEMICALS, L.P.', 'LyondellBasell'), 
                    company('38933', 'TESORO LOGISTICS OPERATIONS, LLC', 'Marathon Petroleum'), 
                    company('31454', 'NUSTAR LOGISTICS, L.P.', 'NuStar Energy'), 
                    company('31130', 'DCP MIDSTREAM', 'DCP Midstream'), 
                    company('31174', 'SHELL PIPELINE CO., L.P.', 'Royal Dutch Shell'), 
                    # For TC OIL PIPELINE OPERTIONS INC (OPERATOR_ID 32334), we confirmed the information by comparing the address
                    # as stated in the dataset (see next cell) and on LexisNexis.
                    company('32334', 'TC OIL PIPELINE OPERATIONS INC', 'TransCanada USA'),
                    company('395', 'AMOCO OIL CO', 'Amoco'), 
                    company('31666', 'ROCKY MOUNTAIN PIPELINE SYSTEM, LLC', 'Plains GP Holding'), 
                    company('32080', 'CCPS TRANSPORTATION, LLC', 'Enbridge'), 
                    company('12470', 'MID - VALLEY PIPELINE CO', 'Energy Transfer'), 
                    company('3445', 'DIXIE PIPELINE COMPANY LLC', 'Enterprise Products Partners'), 
                    company('32103', 'CRIMSON PIPELINE L.P.', 'Crimson Pipeline'), 
                    company('32258', 'KINDER MORGAN COCHIN LLC', 'Kinder Morgan'), 
                    company('22430', 'WEST SHORE PIPELINE CO', 'West Shore Pipeline'), 
                    # For WOLVERINE PIPELINE CO (OPERATOR_ID 22830) , the information were found by searching for "Wolverine Pipe Line".
                    company('22830', 'WOLVERINE PIPELINE CO', 'Marathon Petroleum'), 
                    # For SINCLAUR TRANSPORTATION COMPANY (OPERATOR_ID 15156) the information were found through a web search 
                    # (http://www.pipelinesafetyinfo.com/user/file/Missouri%20(PAM)/Sinclair_Transportation_Company.pdf).
                    company('15156', 'SINCLAIR TRANSPORTATION COMPANY', 'Sinclair Oil'), 
                    company('31189', 'BP PIPELINE (NORTH AMERICA) INC.', 'BP'), 
                    company('39043', 'TALLGRASS PONY EXPRESS PIPELINE, LLC', 'Tallgrass Energy'), 
                    company('32011', 'HOLLY ENERGY PARTNERS - OPERATING, L.P.', 'HollyFrontier'), 
                    company('32502', 'ENBRIDGE PIPELINES (SOUTHERN LIGHTS) L.L.C.', 'Enbridge'), 
                    company('39029', 'TESORO LOGISTICS NORTHWEST PIPELINE LLC', 'Marathon Petroleum'), 
                    company('30826', 'WILLIAMS FIELD SERVICES', 'The Williams Companies'), 
                    company('31720', 'EXPRESS HOLDINGS (USA), LLC', 'Enbridge'), 
                    company('39398', 'INEOS USA LLC', 'INEOS'), 
                    # CALNEV PIPELINE (OPERATOR_ID 26125) found as Calnev Pipe Line.
                    company('26125', 'CALNEV PIPELINE CO', 'Kinder Morgan'), 
                    company('26149', 'ALYESKA PIPELINE SERVICE CO', 'Alyeska Pipeline Service'), 
                    company('30781', 'OLYMPIC PIPE LINE COMPANY', 'BP'), 
                    company('31045', 'GENESIS PIPELINE USA, L.P.', 'Gensis Energy'), 
                    company('32683', 'INLAND CORPORATION', 'Energy Transfer'), 
                    company('18386', 'BP OIL PIPELINE CO', 'BP'), 
                    company('32551', 'BKEP PIPELINE, LLC', 'Blueknight Energy Partners'),
                    company('15485', 'PHILLIPS 66 COMPANY - SWEENY REFINERY', 'Phillips 66'), 
                    company('26085', 'PLAINS MARKETING, L.P.', 'Plains GP Holding'), 
                    company('300', 'PLAINS PIPELINE, L.P.', 'Plains GP Holding'),
                    company('32296', 'TARGA RESOURCES OPERATING LLC', 'Targa Resources Corp'),
                    # Some references to Dow Chemical, e.g., https://www.seabrooktx.gov/DocumentCenter/View/5798/pipeline_Exhibit-A
                    company('26086', 'SEADRIFT PIPELINE CORP', 'Dow Chemical'), 
                    company('2170', 'CENEX PIPELINE LLC', 'CHS'), 
                    company('30782', 'HARVEST MIDSTREAM COMPANY', 'Harvest Midstream'), 
                    company('31947', 'ENBRIDGE PIPELINES (OZARK) L.L.C.', 'Enbridge'), 
                    company('26049', 'COUNTRYMARK REFINING AND LOGISTICS, LLC', 'Countrymark'), 
                    company('3527', 'DOW PIPELINE CO', 'DowDuPont'), 
                    company('30755', 'CITGO PRODUCTS PIPELINE CO', 'Citgo'),
                    company('9175', 'JAYHAWK PIPELINE LLC', 'CHS'), 
                    company('26134', 'EXXONMOBIL OIL CORP - WEST COAST', 'Exxon Mobil'),
                    company('12105', 'MAGELLAN AMMONIA PIPELINE, L.P.', 'Magellan Midstream'), 
                    company('39138', 'BOARDWALK LOUISIANA MIDSTREAM, LLC', 'Loews Corporation'), 
                    company('31325', 'PACIFIC PIPELINE SYSTEM LLC', 'Plains'), 
                    company('31888', 'CENTURION PIPELINE L.P.', 'Occidental Petroleum Corporation'), 
                    company('39105', 'VALERO PARTNERS OPERATING CO. LLC', 'Valero Energy Corporation'), 
                    company('5081', 'COFFEYVILLE RESOURCES CRUDE TRANSPORTATION, LLC', 'Icahn Enterprises'), 
                    company('39504', 'MAGELLAN CRUDE OIL PIPELINE COMPANY, L.P.', 'Magellan Midstream Partners'), 
                    company('30626', 'TARGA NGL PIPE LINE CO', 'Targa Resources'),
                    # According to http://ir.semgroupcorp.com/press-releases/press-release-details/2018/SemGroup-to-Diversify-White-Cliffs-Pipeline-Service-with-NGL-Capabilities-Signs-Long-Term-Contract-with-DCP-Midstream/default.aspx
                    company('32288', 'WHITE CLIFFS PIPELINE, LLC', 'SemGroup')
                   ]

parent_companies = pd.DataFrame(parent_companies)
# assert len(parent_companies) == sample_len
assert len(parent_companies['OPERATOR_ID']) == sample_len
assert len(sample['OPERATOR_ID']) == sample_len
assert len(set.union(set(parent_companies['OPERATOR_ID'].tolist()), set(sample['OPERATOR_ID'].tolist()))) == sample_len
parent_companies.sample(3)

Unnamed: 0,OPERATOR_ID,NAME,PARENT
53,15485,PHILLIPS 66 COMPANY - SWEENY REFINERY,Phillips 66
73,30626,TARGA NGL PIPE LINE CO,Targa Resources
32,3445,DIXIE PIPELINE COMPANY LLC,Enterprise Products Partners


In [11]:
find_names('15485')

{'CONOCOPHILLIPS COMPANY': {'start_year': 2010, 'end_year': 2010},
 'PHILLIPS 66 COMPANY': {'start_year': 2011, 'end_year': 2011},
 'PHILLIPS 66 COMPANY - SWEENY REFINERY': {'start_year': 2012,
  'end_year': 2018}}

Checking address for 'TC OIL PIPELINE OPERATIONS INC' (see comment above)

In [12]:
find_address('32334')

{'700 LOUISIANA SUITE 700': {'start_year': 2016, 'end_year': 2018},
 '717 TEXAS AVE': {'start_year': 2010, 'end_year': 2015}}

## 6.3 Finalize sample

In [13]:
sample = pipelines_2010.loc[pipelines_2010['OPERATOR_ID'].isin(sample['OPERATOR_ID'])].copy()
assert len(sample['OPERATOR_ID'].unique()) == sample_len
sample.head(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE
0,300,2017,"PLAINS PIPELINE, L.P.",HVL,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,0.0,5.004173
1,300,2017,"PLAINS PIPELINE, L.P.",Refined and/or Petroleum Product (non-HVL),0.0,0.0,3.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
2,395,2017,AMOCO OIL CO,Refined and/or Petroleum Product (non-HVL),662.3,0.0,0.6,15.0,0.0,0.9,1.5,18.5,58.2,5.1,0.7,0.0,34.61194


### Drop offshore incidents

In [14]:
incidents = incidents.loc[incidents['ONSHORE'] == True].reset_index(drop = True)
incidents = incidents.drop(columns = ['ONSHORE'])

### 6.3.1 Merge in parents

Ensure all parents are there.

In [15]:
sample = sample.merge(parent_companies[['OPERATOR_ID', 'PARENT']], on='OPERATOR_ID')
assert len(sample['OPERATOR_ID'].unique()) == sample_len
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT
83,2731,2018,CHEVRON PIPE LINE CO,HVL,417.9,0.0,0.0,0.0,56.5,144.5,164.4,71.4,139.2,43.5,66.2,0.0,37.889019,Chevron
713,31454,2018,"NUSTAR LOGISTICS, L.P.",Crude Oil,189.5,0.0,0.0,0.0,30.5,4.4,33.0,36.2,452.6,13.4,288.4,0.0,20.890507,NuStar Energy
278,15485,2015,PHILLIPS 66 COMPANY - SWEENY REFINERY,Crude Oil,27.79,0.0,0.0,0.0,26.96,0.0,27.1,0.03,0.01,0.0,0.0,0.0,54.957486,Phillips 66


## 6.3.2 Merge in incidents (by type) 

#### See if there are any conflicts between observations in the same year

In [16]:
assert len(sample[['OPERATOR_ID', 'YEAR', 'NAME']].drop_duplicates()) == len(sample[['OPERATOR_ID', 'YEAR']].drop_duplicates())

We're good.

### Clean commodity names - pipelines

In [17]:
# Legacy code that could be used to create wide format data

# sample = sample.pivot_table(index=['OPERATOR_ID', 'YEAR', 'NAME'],columns='COMMODITY',values='MILES',fill_value=0)
# sample = sample.reset_index()
# sample.head(10)

In [18]:
sample['COMMODITY'].unique()

array(['HVL', 'Refined and/or Petroleum Product (non-HVL)', 'Crude Oil',
       'CO2', 'Fuel Grade Ethanol (dedicated system)'], dtype=object)

In [19]:
new_names_pipelines = {'Crude Oil': 'crude', 
                       'CO2': 'co2',
                       'Fuel Grade Ethanol (dedicated system)': 'fge', 
                       'HVL': 'hvl',
                       'Refined and/or Petroleum Product (non-HVL)': 'non-hvl'}

sample = sample.replace({'COMMODITY': new_names_pipelines})
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT
942,32502,2013,ENBRIDGE PIPELINES (SOUTHERN LIGHTS) L.L.C.,non-hvl,365.1,0.0,0.0,0.0,135.0,0.0,0.0,1.0,0.0,680.0,0.0,0.0,23.296569,Enbridge
790,31947,2014,ENBRIDGE PIPELINES (OZARK) L.L.C.,crude,284.0,0.0,0.0,447.0,1.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,74.558499,Enbridge
759,31684,2018,PHILLIPS 66 PIPELINE LLC,hvl,1266.15,0.42,0.04,293.03,525.92,869.84,675.96,245.4,900.65,22.15,307.91,0.0,43.584082,Phillips 66


### Clean commodity names - incidents

In [20]:
incidents['COMMODITY'].unique()

array(['REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS',
       'CO2 (CARBON DIOXIDE)',
       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS',
       'CRUDE OIL',
       'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)'],
      dtype=object)

In [21]:
new_names_incidents = {'REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS': 'non-hvl', 
                       'CO2 (CARBON DIOXIDE)': 'co2', 
                       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS': 'hvl', 
                       'CRUDE OIL': 'crude', 
                       'BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)': 'hvl'}

incidents = incidents.replace({'COMMODITY': new_names_incidents})
incidents.sample(3)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE
3029,30781,2017-08-22 19:28:00,OLYMPIC PIPE LINE COMPANY,non-hvl,False,False,45.590306,-122.776702
701,2552,2012-01-20 07:00:00,COLONIAL PIPELINE CO,non-hvl,False,False,32.02879,-89.10231
1758,1845,2014-08-30 14:30:00,"BUCKEYE PARTNERS, LP",non-hvl,False,False,40.563565,-74.245512


The index column has a funny name, but that name gets dropped when writing the sample to .feather.

### Merge and safe

In [22]:
incidents['YEAR'] = incidents['LOCAL_DATETIME'].dt.year

All incidents

In [23]:
incident_counts = incidents.groupby(['OPERATOR_ID', 'YEAR', 'COMMODITY']).size().reset_index(name='INCIDENTS')
incident_counts.sample(3)

assert len(incident_counts.loc[incident_counts.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0
assert len(sample.loc[sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0

In [24]:
sample = sample.merge(incident_counts, on=['OPERATOR_ID', 'YEAR', 'COMMODITY'], how='left')
sample['INCIDENTS'] = sample['INCIDENTS'].fillna(value=0)
assert len(sample.loc[sample.duplicated(subset=['OPERATOR_ID', 'YEAR', 'COMMODITY'])]) == 0
sample.head(10)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT,INCIDENTS
0,300,2017,"PLAINS PIPELINE, L.P.",hvl,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,0.0,5.004173,Plains GP Holding,0.0
1,300,2017,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,3.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,Plains GP Holding,0.0
2,300,2011,"PLAINS PIPELINE, L.P.",non-hvl,326.8,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,0.0
3,300,2014,"PLAINS PIPELINE, L.P.",non-hvl,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,0.0
4,300,2012,"PLAINS PIPELINE, L.P.",non-hvl,265.17,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,1.0
5,300,2010,"PLAINS PIPELINE, L.P.",non-hvl,62.28,0.0,0.0,0.0,244.98,4.76,0.0,0.0,2.86,4.05,0.0,0.0,63.579778,Plains GP Holding,0.0
6,300,2018,"PLAINS PIPELINE, L.P.",hvl,29.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.99,0.0,5.004167,Plains GP Holding,0.0
7,300,2018,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,0.0
8,300,2016,"PLAINS PIPELINE, L.P.",hvl,121.23,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,9.08,0.0,5.022002,Plains GP Holding,0.0
9,300,2016,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,1.0


All significant incidents

In [25]:
significant_incident_counts = incidents[incidents['SIGNIFICANT'] == True].groupby(
    ['OPERATOR_ID', 'YEAR', 'COMMODITY']).size().reset_index(name='SIGNIFICANT_INCIDENTS')
significant_incident_counts.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,SIGNIFICANT_INCIDENTS
182,2552,2018,non-hvl,6
377,31618,2015,non-hvl,1
167,22855,2019,crude,1


In [26]:
sample = sample.merge(significant_incident_counts, on=['OPERATOR_ID', 'YEAR', 'COMMODITY'], how='left')
sample['SIGNIFICANT_INCIDENTS'] = sample['SIGNIFICANT_INCIDENTS'].fillna(value=0)
sample.head(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,MILES_PRE_1940,MILES_1940,MILES_1950,MILES_1960,MILES_1970,MILES_1980,MILES_1990,MILES_2000,MILES_2010,PERC_OFFSHORE,AVG_AGE,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS
0,300,2017,"PLAINS PIPELINE, L.P.",hvl,72.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,47.92,0.0,5.004173,Plains GP Holding,0.0,0.0
1,300,2017,"PLAINS PIPELINE, L.P.",non-hvl,0.0,0.0,3.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,Plains GP Holding,0.0,0.0
2,300,2011,"PLAINS PIPELINE, L.P.",non-hvl,326.8,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,0.0,0.0
3,300,2014,"PLAINS PIPELINE, L.P.",non-hvl,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Plains GP Holding,0.0,0.0
4,300,2012,"PLAINS PIPELINE, L.P.",non-hvl,265.17,0.0,0.0,0.0,0.8,188.43,1.17,4.43,2.85,0.35,0.0,0.0,54.03146,Plains GP Holding,1.0,1.0


In [27]:
sample.to_feather(f'../preprocessed_data/sample_{today}.feather')
incidents.to_feather(f'../preprocessed_data/incidents_renamed_{today}.feather')