# 3 - Select columns

In [1]:
import pandas as pd
import numpy as np
from datetime import date

# ISO-8601 date stamp (YYYY-MM-DD); it is embedded in every output file name
# written by this notebook.
today = str(date.today())

## 3.1 Extract relevant columns of the pipeline incidents dataset

In [2]:
# Columns of the incident workbook that are kept for the analysis.
incident_columns = [
    'OPERATOR_ID', 'LOCAL_DATETIME', 'NAME', 'COMMODITY_RELEASED_TYPE',
    'SERIOUS', 'SIGNIFICANT', 'LOCATION_LATITUDE', 'LOCATION_LONGITUDE',
    'ON_OFF_SHORE',
]

# sheet_name=1 is a zero-based position, i.e. the second worksheet.
incidents = pd.read_excel(
    '../data/incidents_2019-08-01/hl2010toPresent.xlsx',
    sheet_name=1,
)

# .copy() detaches the selection from `incidents`, so the column assignments
# in the following cells do not touch the raw frame.
incidents_selected = incidents.loc[:, incident_columns].copy()

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY_RELEASED_TYPE,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ON_OFF_SHORE
255,31454,2010-01-11 14:47:00,"NUSTAR LOGISTICS, L.P.",REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,YES,28.05529,-97.32797,ONSHORE
1029,32099,2012-12-01 10:35:00,ENERGY TRANSFER COMPANY,HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,NO,YES,28.894955,-97.759029,ONSHORE
2899,2170,2017-03-27 11:00:00,CENEX PIPELINE LLC,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,NO,YES,46.89783,-96.90347,ONSHORE
1285,12105,2013-07-01 15:00:00,"MAGELLAN AMMONIA PIPELINE, L.P.",HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS...,NO,YES,42.80667,-94.36011,ONSHORE
1533,18718,2014-02-18 12:00:00,SUNOCO PIPELINE L.P.,CRUDE OIL,NO,NO,32.519797,-94.870959,ONSHORE


In [3]:
# Distinct commodity categories; np.unique returns them sorted.
# (numpy is already imported as `np` in the first cell, so it is not
# re-imported here.)
np.unique(incidents_selected['COMMODITY_RELEASED_TYPE'])

array(['BIOFUEL / ALTERNATIVE FUEL(INCLUDING ETHANOL BLENDS)',
       'CO2 (CARBON DIOXIDE)', 'CRUDE OIL',
       'HVL OR OTHER FLAMMABLE OR TOXIC FLUID WHICH IS A GAS AT AMBIENT CONDITIONS',
       'REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHICH IS A LIQUID AT AMBIENT CONDITIONS'],
      dtype=object)

In [4]:
# Tally onshore vs. offshore incidents while ON_OFF_SHORE is still a text column.
incidents_selected['ON_OFF_SHORE'].value_counts()

ONSHORE     3791
OFFSHORE      28
Name: ON_OFF_SHORE, dtype: int64

### 3.1.1 Fix data types

In [5]:
# Inspect the dtypes as read from Excel, before any conversion.
incidents_selected.dtypes

OPERATOR_ID                         int64
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

In [6]:
# Store operator IDs as strings instead of int64; every other column keeps
# its dtype.
incidents_selected = incidents_selected.astype({'OPERATOR_ID': str})
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                            object
SIGNIFICANT                        object
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

Make sure SERIOUS and SIGNIFICANT are booleans.

In [7]:
# Preview the boolean encoding of SERIOUS ('YES' -> True, anything else -> False).
(incidents_selected['SERIOUS'] == 'YES').value_counts()

False    3803
True       16
Name: SERIOUS, dtype: int64

In [8]:
# Encode SERIOUS as a boolean: True where the raw value is 'YES'.
# The dtype guard makes the cell safe to re-run: without it, a second run
# would compare the already-boolean column to 'YES' and set every row to False.
if not pd.api.types.is_bool_dtype(incidents_selected['SERIOUS']):
    incidents_selected['SERIOUS'] = incidents_selected['SERIOUS'] == 'YES'

In [9]:
# Preview the boolean encoding of SIGNIFICANT ('YES' -> True, anything else -> False).
(incidents_selected['SIGNIFICANT'] == 'YES').value_counts()

False    2364
True     1455
Name: SIGNIFICANT, dtype: int64

In [10]:
# Encode SIGNIFICANT as a boolean: True where the raw value is 'YES'.
# The dtype guard makes the cell safe to re-run: without it, a second run
# would compare the already-boolean column to 'YES' and set every row to False.
if not pd.api.types.is_bool_dtype(incidents_selected['SIGNIFICANT']):
    incidents_selected['SIGNIFICANT'] = incidents_selected['SIGNIFICANT'] == 'YES'

In [11]:
# Confirm that SERIOUS and SIGNIFICANT are now boolean columns.
incidents_selected.dtypes

OPERATOR_ID                        object
LOCAL_DATETIME             datetime64[ns]
NAME                               object
COMMODITY_RELEASED_TYPE            object
SERIOUS                              bool
SIGNIFICANT                          bool
LOCATION_LATITUDE                 float64
LOCATION_LONGITUDE                float64
ON_OFF_SHORE                       object
dtype: object

In [12]:
# Replace the text column ON_OFF_SHORE with a boolean ONSHORE flag (appended
# as the last column) and shorten COMMODITY_RELEASED_TYPE to COMMODITY.
incidents_selected = (
    incidents_selected
    .assign(ONSHORE=incidents_selected['ON_OFF_SHORE'] == 'ONSHORE')
    .drop(columns=['ON_OFF_SHORE'])
    .rename(columns={'COMMODITY_RELEASED_TYPE': 'COMMODITY'})
)

incidents_selected.sample(5)

Unnamed: 0,OPERATOR_ID,LOCAL_DATETIME,NAME,COMMODITY,SERIOUS,SIGNIFICANT,LOCATION_LATITUDE,LOCATION_LONGITUDE,ONSHORE
1044,32080,2012-12-21 21:00:00,"CCPS TRANSPORTATION, LLC",CRUDE OIL,False,False,40.3375,-89.82917,True
2321,31618,2015-12-02 13:00:00,ENTERPRISE PRODUCTS OPERATING LLC,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,False,29.912017,-94.008532,True
1796,31618,2014-09-18 17:36:00,ENTERPRISE PRODUCTS OPERATING LLC,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,True,35.13221,-91.59935,True
2976,300,2017-06-19 16:11:00,"PLAINS PIPELINE, L.P.",CRUDE OIL,False,False,32.48288,-94.830489,True
59,31684,2010-04-08 18:58:00,CONOCOPHILLIPS,REFINED AND/OR PETROLEUM PRODUCT (NON-HVL) WHI...,False,False,38.63064,-93.39656,True


### Drop offshore accidents

In [13]:
# Keep onshore rows only, then discard the now-constant ONSHORE flag.
# reset_index() is called without drop=True, so the previous row labels are
# kept as a column named `index`.
# NOTE(review): other frames in this notebook use reset_index(drop=True);
# confirm the extra `index` column is wanted in the output.
is_onshore = incidents_selected['ONSHORE']
incidents_selected = (
    incidents_selected[is_onshore]
    .reset_index()
    .drop(columns=['ONSHORE'])
)

In [14]:
# Persist the cleaned onshore incidents; the file name carries today's date.
incidents_selected_path = f'../preprocessed_data/incidents_selected_{today}.feather'
incidents_selected.to_feather(incidents_selected_path)

## 3.2 Extract relevant columns of the pipeline system dataset

In [15]:
from os import listdir

# Folder holding the annual report workbooks for 2010 onwards.
pipelines_2010_dir = '../data/pipelines_2010_present_2019-08-02/'

# os.listdir() returns entries in arbitrary order; sorting the names makes the
# row order of the concatenated frame (and of the feather files written from
# it) reproducible across machines and runs.
pipelines_2010_files = sorted(
    file for file in listdir(pipelines_2010_dir) if 'annual_hazardous_liquid' in file)

# skiprows=2 skips the first two rows of each sheet before the header is read.
pipelines_2010_present = pd.concat(
    [pd.read_excel(pipelines_2010_dir + file, skiprows=2) for file in pipelines_2010_files])
# Each workbook restarts its row labels at 0; renumber the combined rows.
pipelines_2010_present = pipelines_2010_present.reset_index(drop=True)

pipelines_2010_present.sample(5)

Unnamed: 0,DATAFILE_AS_OF,REPORT_YEAR,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,OPERATOR_ID,PARTA2NAMEOFCOMP,PARTA4STREET,PARTA4CITY,PARTA4STATE,PARTA4ZIP,...,PARTE2010HF,PARTE2010LF,PARTE2010TOTAL,PARTETOTAL,PARTETOTALHF,PARTETOTALLF,REPORT_SUBMISSION_TYPE,REPORT_DATE,FILING_DATE,FORM_REV
2772,2019-07-01 10:54:15,2013,20140789,12798,32572,"TOLEDO REFINING COMPANY, LLC",1819 WOODVILLE ROAD,OREGON,OH,43616,...,0.0,0.0,0.0,0.0,0.0,0.0,INITIAL,2014-06-11 09:03:17,2014-06-11 09:03:17,7000-1.1 (Rev. 06-2011)
3186,2019-07-01 10:31:50,2010,20110370,11634,31684,CONOCOPHILLIPS,600 North Dairy Ashford,,TX,77079,...,4.04,0.0,4.04,3555.05,1406.83,2148.22,SUPPLEMENTAL,2011-08-16 15:01:25,2012-11-26 14:36:26,7000-1.1 (Rev. 06-2011)
877,2019-07-01 10:36:07,2011,20120184,12305,30834,TIDELANDS OIL PRODUCTION COMPANY,111 W Ocean Blvd. #800,LONG BEACH,CA,90802,...,0.439,0.0,0.439,2.283,1.121,1.162,SUPPLEMENTAL,2012-06-11 17:27:57,2013-06-18 09:19:56,7000-1.1 (Rev. 06-2011)
1581,2019-07-01 11:03:57,2014,20151497,13629,32258,KINDER MORGAN COCHIN LLC,"1001 LOUISIANA ST., SUITE 1000",HOUSTON,TX,77002,...,1.38,0.0,1.38,297.73,297.48,0.25,INITIAL,2015-06-12 15:48:58,2015-06-12 15:48:58,7000-1.1 (Rev. 06-2014)
4651,2019-07-01 11:21:18,2016,20172477,14838,39492,"CELANESE CHEMICALS, INC",9502 BAYPORT BLVD.,PASADENA,TX,77507,...,0.0,0.0,0.0,0.0,0.0,0.0,INITIAL,2017-05-27 14:00:05,2017-05-27 14:00:05,7000-1.1 (Rev. 06-2014)


In [16]:
# Folder holding the annual report workbooks for 2004-2009.
pipelines_2004_dir = '../data/pipelines_2004_2009_2019-08-02/'

# `listdir` was imported from `os` in the previous cell. It returns entries in
# arbitrary order; sorting the names makes the row order of the concatenated
# frame (and of the feather files written from it) reproducible.
pipelines_2004_files = sorted(
    file for file in listdir(pipelines_2004_dir) if 'annual_hazardous_liquid' in file)

pipelines_2004_2009 = pd.concat(
    [pd.read_excel(pipelines_2004_dir + file) for file in pipelines_2004_files])
# Each workbook restarts its row labels at 0; renumber the combined rows.
pipelines_2004_2009 = pipelines_2004_2009.reset_index(drop=True)

pipelines_2004_2009.sample(5)

Unnamed: 0,ORIGINAL,SUPPLEMENT,YR,RPTID,HL_SUP_ID,SYSTEM_TYPE,OPERATOR_ID,NAME,PARENT,OFSTREET,...,BA_5,BA_6,BA_7,BA_8,DOR,DOE,PNAME,PPHONE,PEMAIL,PFAX
1787,Y,N,2005,20050065,937,HVLS,32007,SUPERIOR PIPELINE COMPANY L.L.C.,UNIT CORPORATION,7130 S. LEWIS SUITE 510,...,0.0,0.0,0.0,,2006-04-11,2006-04-11 16:59:43,GARY COOK,9184775607.0,GCOOK@SUPERIORPIPELINE.COM,9183830000.0
2407,Y,N,2007,20070123,3494,CO2 OR OTHER,31875,MERIT ENERGY COMPANY,,P.O. BOX 748,...,0.0,0.0,0.0,,2008-05-13,2008-05-13 15:07:43,BILL ELLSWORTH,3073282345.0,RICKY.WELCHER@MERITENERGY.COM,
24,Y,N,2008,20080367,5174,PETROLEUM & REFINED PRODUCTS,31371,BUCKEYE GULF COAST PIPELINE LP,,"5002 BUCKEYE RD, PO BOX 368",...,0.0,25.0,28.0,,2009-06-12,2009-06-12 15:05:12,DONALD E HANKEY,6109044410.0,DHANKEY@BUCKEYE.COM,6109044545.0
533,Y,N,2006,20060063,2164,PETROLEUM & REFINED PRODUCTS,99002,TAMPA PIPELINE LIMITED PARTNERSHIP,TAMPA PIPELINE CORPORATION,7403 HOADLEY ST.,...,0.0,0.0,0.0,,2007-04-30,2007-04-30 09:32:08,"DON BYRD, GENERAL MANAGER",8138390000.0,DONBYRD265@MSN.COM,8138360000.0
116,Y,N,2008,20080132,4726,HVLS,31375,"AGRIUM US, INC",,227515 E. BOWLES ROAD,...,0.0,0.0,0.0,,2009-05-07,2009-05-07 11:13:38,W. CARTER HOPE,5095865469.0,WC.HOPE@GMAIL.COM,5095865440.0


### 3.2.1 Select relevant columns

#### For 2010-present

In [17]:
# Identification, commodity, HCA mileage and mileage-by-decade columns that
# are kept from the 2010-present annual reports.
pipeline_columns_2010 = [
    'OPERATOR_ID', 'REPORT_YEAR', 'PARTA2NAMEOFCOMP', 'PARTA5COMMODITY',
    'PARTBHCAONSHORE',
    'PARTEUNKNTOTAL', 'PARTEPRE40TOTAL', 'PARTE1940TOTAL', 'PARTE1950TOTAL',
    'PARTE1960TOTAL', 'PARTE1970TOTAL', 'PARTE1980TOTAL', 'PARTE1990TOTAL',
    'PARTE2000TOTAL', 'PARTE2010TOTAL',
    'PARTBHCAOFFSHORE', 'PARTBHCATOTAL',
]

# .copy() so that later column assignments do not touch the raw frame.
pipelines_2010_selected = pipelines_2010_present.loc[:, pipeline_columns_2010].copy()

pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,REPORT_YEAR,PARTA2NAMEOFCOMP,PARTA5COMMODITY,PARTBHCAONSHORE,PARTEUNKNTOTAL,PARTEPRE40TOTAL,PARTE1940TOTAL,PARTE1950TOTAL,PARTE1960TOTAL,PARTE1970TOTAL,PARTE1980TOTAL,PARTE1990TOTAL,PARTE2000TOTAL,PARTE2010TOTAL,PARTBHCAOFFSHORE,PARTBHCATOTAL
411,32412,2017,"MARKWEST LIBERTY MIDSTREAM & RESOURCES, LLC",HVL,164.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,426.51,,164.86
1012,31869,2011,"PRISM GAS SYSTEMS I, L.P.",HVL,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,13.0,0.0,,2.0
5060,32141,2015,RESOLUTE NATURAL RESOURCES COMPANY,CO2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.1,0.0,0.0,0.0,,0.0
1995,31130,2012,DCP MIDSTREAM,HVL,485.307,513.864,0.0,38.776,1.76,260.958,428.36,44.359,356.547,196.558,244.886,,485.307
137,25146,2017,"EQUISTAR CHEMICALS, L.P.",Refined and/or Petroleum Product (non-HVL),101.81,0.0,0.0,0.0,0.1,0.0,39.75,0.0,70.89,0.0,0.0,,101.81


In [18]:
# Inspect the dtypes of the selected 2010-present columns before conversion.
pipelines_2010_selected.dtypes

OPERATOR_ID           int64
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCAONSHORE     float64
PARTEUNKNTOTAL      float64
PARTEPRE40TOTAL     float64
PARTE1940TOTAL      float64
PARTE1950TOTAL      float64
PARTE1960TOTAL      float64
PARTE1970TOTAL      float64
PARTE1980TOTAL      float64
PARTE1990TOTAL      float64
PARTE2000TOTAL      float64
PARTE2010TOTAL      float64
PARTBHCAOFFSHORE    float64
PARTBHCATOTAL       float64
dtype: object

In [19]:
# Store operator IDs as strings, matching the incidents frame; every other
# column keeps its dtype.
pipelines_2010_selected = pipelines_2010_selected.astype({'OPERATOR_ID': str})
pipelines_2010_selected.dtypes

OPERATOR_ID          object
REPORT_YEAR           int64
PARTA2NAMEOFCOMP     object
PARTA5COMMODITY      object
PARTBHCAONSHORE     float64
PARTEUNKNTOTAL      float64
PARTEPRE40TOTAL     float64
PARTE1940TOTAL      float64
PARTE1950TOTAL      float64
PARTE1960TOTAL      float64
PARTE1970TOTAL      float64
PARTE1980TOTAL      float64
PARTE1990TOTAL      float64
PARTE2000TOTAL      float64
PARTE2010TOTAL      float64
PARTBHCAOFFSHORE    float64
PARTBHCATOTAL       float64
dtype: object

In [20]:
# Raw report field name -> short column name used in the rest of the analysis.
column_names_2010 = {
    'REPORT_YEAR': 'YEAR',
    'PARTA2NAMEOFCOMP': 'NAME',
    'PARTA5COMMODITY': 'COMMODITY',
    'PARTBHCAONSHORE': 'MILES',
    'PARTBHCAOFFSHORE': 'OFFSHORE_MILES',
    'PARTBHCATOTAL': 'TOTAL_MILES',
    'PARTEUNKNTOTAL': 'AGE_UNKNOWN_MILES',
    'PARTEPRE40TOTAL': 'PRE_1940_MILES',
    'PARTE1940TOTAL': '1940_MILES',
    'PARTE1950TOTAL': '1950_MILES',
    'PARTE1960TOTAL': '1960_MILES',
    'PARTE1970TOTAL': '1970_MILES',
    'PARTE1980TOTAL': '1980_MILES',
    'PARTE1990TOTAL': '1990_MILES',
    'PARTE2000TOTAL': '2000_MILES',
    'PARTE2010TOTAL': '2010_MILES',
}
pipelines_2010_selected = pipelines_2010_selected.rename(columns=column_names_2010)
pipelines_2010_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,AGE_UNKNOWN_MILES,PRE_1940_MILES,1940_MILES,1950_MILES,1960_MILES,1970_MILES,1980_MILES,1990_MILES,2000_MILES,2010_MILES,OFFSHORE_MILES,TOTAL_MILES
4132,18718,2016,SUNOCO PIPELINE L.P.,Crude Oil,1711.93,216.0,765.6,250.2,388.55,189.2,48.5,29.2,28.0,32.13,658.15,,1711.93
2937,10346,2010,KUPARUK TRANSPORTATION CO,Crude Oil,36.862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,36.862
4591,39231,2016,XPLORER MIDSTREAM LLC,HVL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6,,0.0
487,38960,2017,"RANCHO LPG HOLDINGS, LLC",HVL,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1112,32386,2011,OILTANKING BEAUMONT PARTNERS L.P.,Refined and/or Petroleum Product (non-HVL),14.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,14.37


### Calculate percentage offshore pipelines and average age of pipelines

In [21]:
# Treat missing offshore mileage as zero.
pipelines_2010_selected['OFFSHORE_MILES'] = pipelines_2010_selected['OFFSHORE_MILES'].fillna(0.0)

# Share of offshore miles in the total. Rows whose TOTAL_MILES is 0 come out
# as NaN here (0 / 0); they are dealt with a few cells further down.
pipelines_2010_selected['PERC_OFFSHORE'] = (
    pipelines_2010_selected['OFFSHORE_MILES'] / pipelines_2010_selected['TOTAL_MILES'])

# Approximate age in years of the pipe installed in each decade bucket. Pipe
# is assumed to have been built, on average, in the middle of its decade
# (e.g., 1990s pipelines on average are built in 1995), and the ages
# correspond to a reference year of 2020 (1995 + 25). Pre-1940 pipe is
# assigned 90 years. Miles of unknown age (AGE_UNKNOWN_MILES) are left out.
# NOTE(review): the age is not relative to the report YEAR of each row -
# confirm that a fixed reference year is intended.
DECADE_AGE_YEARS = {
    'PRE_1940_MILES': 90,
    '1940_MILES': 75,
    '1950_MILES': 65,
    '1960_MILES': 55,
    '1970_MILES': 45,
    '1980_MILES': 35,
    '1990_MILES': 25,
    '2000_MILES': 15,
    '2010_MILES': 5,
}

# Mileage-weighted mean age. The built-in sum() adds the columns with `+` in
# the order listed above, so a NaN in any bucket propagates to the result.
# Rows with no dated mileage at all yield NaN (0 / 0).
age_weighted_miles = sum(
    pipelines_2010_selected[column] * age for column, age in DECADE_AGE_YEARS.items())
dated_miles = sum(pipelines_2010_selected[column] for column in DECADE_AGE_YEARS)
pipelines_2010_selected['AVG_AGE'] = age_weighted_miles / dated_miles

pipelines_2010_selected[['OPERATOR_ID', 'YEAR', 'COMMODITY', 'AVG_AGE', 'PERC_OFFSHORE']].sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,AVG_AGE,PERC_OFFSHORE
210,31051,2017,Refined and/or Petroleum Product (non-HVL),37.57085,0.0
2393,15156,2013,Crude Oil,41.617336,0.0
855,30544,2011,Refined and/or Petroleum Product (non-HVL),65.0,0.0
147,26061,2017,Crude Oil,68.231843,0.0
2283,38942,2012,Crude Oil,,0.0


In [22]:
# How many rows ended up with an undefined (NaN) offshore share?
pipelines_2010_selected['PERC_OFFSHORE'].isnull().value_counts()

False    4192
True     1102
Name: PERC_OFFSHORE, dtype: int64

In [23]:
# How many rows report zero total miles? (Compare with the NaN count above.)
pipelines_2010_selected['TOTAL_MILES'].eq(0.0).value_counts()

False    4192
True     1102
Name: TOTAL_MILES, dtype: int64

In [24]:
# Recompute the offshore share so that rows with zero total mileage get 0.0
# instead of NaN. Only those rows are overridden; all other rows keep the
# exact ratio. (Padding the divisor with 0.1 mile would also avoid the NaNs,
# but it biases every ratio downwards - e.g. 1 offshore mile out of 1 total
# mile would give 0.909 instead of 1.0.)
total_miles = pipelines_2010_selected['TOTAL_MILES']
offshore_share = pipelines_2010_selected['OFFSHORE_MILES'] / total_miles
pipelines_2010_selected['PERC_OFFSHORE'] = offshore_share.where(total_miles != 0, 0.0)
pipelines_2010_selected[['OPERATOR_ID', 'YEAR', 'COMMODITY', 'AVG_AGE', 'PERC_OFFSHORE']].sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,COMMODITY,AVG_AGE,PERC_OFFSHORE
1435,31371,2014,Refined and/or Petroleum Product (non-HVL),48.740544,0.0
1673,32683,2014,Refined and/or Petroleum Product (non-HVL),70.776732,0.0
1574,32213,2014,Crude Oil,,0.0
4089,10250,2016,Refined and/or Petroleum Product (non-HVL),37.77368,0.0
2604,31684,2013,HVL,49.779747,0.0


In [25]:
# Verify that no undefined (NaN) offshore shares remain.
pipelines_2010_selected['PERC_OFFSHORE'].isnull().value_counts()

False    5294
Name: PERC_OFFSHORE, dtype: int64

In [26]:
# Persist the selected 2010-present pipeline data; the file name carries today's date.
pipelines_2010_selected_path = f'../preprocessed_data/pipelines_2010_selected_{today}.feather'
pipelines_2010_selected.to_feather(pipelines_2010_selected_path)

#### For 2004-2009

Clean name column

In [27]:
# Use the parent company's name where one is given and fall back to the
# operator's own NAME otherwise.
# NOTE: this must run before the object columns of `pipelines_2004_2009` are
# cast to str near the end of the notebook - that cast turns a missing PARENT
# into the string 'nan', which would then be picked up as a name here.
parent_name = pipelines_2004_2009['PARENT']
pipelines_2004_2009['NAME_FIXED'] = parent_name.where(
    parent_name.notna(), pipelines_2004_2009['NAME'])

# Check that every row ended up with a name.
pipelines_2004_2009['NAME_FIXED'].isna().value_counts()

False    2730
Name: NAME_FIXED, dtype: int64

In [28]:
# Spot-check NAME_FIXED against the two columns it was derived from.
pipelines_2004_2009.loc[:, ['NAME_FIXED', 'NAME', 'PARENT']].sample(5)

Unnamed: 0,NAME_FIXED,NAME,PARENT
1906,GENESIS PIPELINE TEXAS LP,GENESIS PIPELINE USA LP,GENESIS PIPELINE TEXAS LP
1298,DCP MIDSTREAM,DCP MIDSTREAM,
2292,"AMERADA HESS CORP, N. JERSEY PUBLIC SVC PIPELINE","AMERADA HESS CORP, N. JERSEY PUBLIC SVC PIPELINE",
1216,"ENBRIDGE ENERGY PARTNERS, L.P.",ENBRIDGE PIPELINES (TOLEDO) INC,"ENBRIDGE ENERGY PARTNERS, L.P."
419,TESORO ALASKA PIPELINE COMPANY,TESORO ALASKA PIPELINE COMPANY,


Select columns

In [29]:
# Columns kept from the 2004-2009 annual reports; .copy() so that later column
# assignments do not touch the raw frame.
pipeline_columns_2004 = ['OPERATOR_ID', 'YR', 'NAME_FIXED', 'HCAMT']
pipelines_2004_selected = pipelines_2004_2009.loc[:, pipeline_columns_2004].copy()
pipelines_2004_selected.dtypes

OPERATOR_ID      int64
YR               int64
NAME_FIXED      object
HCAMT          float64
dtype: object

In [30]:
# Store operator IDs as strings, matching the other frames; every other column
# keeps its dtype.
pipelines_2004_selected = pipelines_2004_selected.astype({'OPERATOR_ID': str})
pipelines_2004_selected.dtypes

OPERATOR_ID     object
YR               int64
NAME_FIXED      object
HCAMT          float64
dtype: object

In [31]:
# Raw report field name -> short column name.
# NOTE(review): the mileage column is called 'MILES_TOTAL' here, whereas the
# 2010-present frame uses 'MILES' and 'TOTAL_MILES' - confirm which of those
# HCAMT corresponds to before combining the two frames.
column_names_2004 = {
    'YR': 'YEAR',
    'NAME_FIXED': 'NAME',
    'HCAMT': 'MILES_TOTAL',
}
pipelines_2004_selected = pipelines_2004_selected.rename(columns=column_names_2004)
pipelines_2004_selected.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,MILES_TOTAL
1081,18526,2009,"SOUTHERN UNION GAS SERVICES, LTD",0.0
1375,30755,2009,CITGO PRODUCTS PIPELINE CO,109.89
2619,3156,2007,DAVIS GAS PROCESSING,0.0
484,32103,2006,CRIMSON PIPELINE L.P.,307.27
1282,18667,2009,PLAINS EXPLORATION & PRODUCTION COMPANY (PXP),58.03


In [32]:
# Persist the selected 2004-2009 pipeline data; the file name carries today's date.
pipelines_2004_selected_path = f'../preprocessed_data/pipelines_2004_selected_{today}.feather'
pipelines_2004_selected.to_feather(pipelines_2004_selected_path)

## 3.3 Write original data to .feather for reference

Some columns are erroneously read in with data type 'O' (object). We convert those columns to `str` manually.

In [33]:
# Cast every object-dtype column of the three raw frames to str, in place.
# NOTE: astype(str) also converts missing values into the literal string
# 'nan', so after this cell the raw frames no longer contain NaN in these
# columns.
for raw_frame in (pipelines_2010_present, pipelines_2004_2009, incidents):
    is_object_column = raw_frame.dtypes == 'O'
    raw_frame.loc[:, is_object_column] = raw_frame.loc[:, is_object_column].astype(str)

In [34]:
# Write the (string-cast) raw frames next to the source data for reference;
# each file name carries today's date.
reference_outputs = (
    (pipelines_2010_present, f'../data/pipelines_2010_{today}.feather'),
    (pipelines_2004_2009, f'../data/pipelines_2004_{today}.feather'),
    (incidents, f'../data/incidents_{today}.feather'),
)
for raw_frame, feather_path in reference_outputs:
    raw_frame.to_feather(feather_path)