# 9. Consolidate observations

In [527]:
sample_file = '../preprocessed_data/sample_2019-08-17.feather'
consolidate = '../preprocessed_data/consolidate_2019-08-17.pickle'
pipelines_2010_raw_file = '../data/pipelines_2010_2019-08-16.feather'

We will use some R functions for convenience.

In [528]:
import rpy2.rinterface

In [529]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [530]:
%%R
suppressPackageStartupMessages(library(tidyverse))

## Setup

In [531]:
import pandas as pd
import numpy as np
from datetime import date
from functools import partial

today = date.today().isoformat()

# Load data 

In [532]:
sample = pd.read_feather(sample_file)
sample.sample(3)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS
745,32334,2016,TC OIL PIPELINE OPERATIONS INC,crude,785.83,TransCanada USA,2.0,1.0
708,32147,2012,MARATHON PIPE LINE LLC,non-hvl,2043.95,Marathon Petroleum,1.0,0.0
593,31618,2015,ENTERPRISE PRODUCTS OPERATING LLC,non-hvl,624.302,Enterprise Products Partners,5.0,1.0


In [533]:
import pickle

with open(consolidate, 'rb') as file:
    company_groups, m_as, spin_offs = pickle.load(file)

In [545]:
pd.DataFrame()

AttributeError: 'list' object has no attribute 'itemgetter'

## 9.1 Calculate absolute change

We calculate absolute change before merging to obtain more accurate data.

In [534]:
%%R -i sample
sample <- sample %>%
    group_by(OPERATOR_ID, COMMODITY) %>%
    arrange(YEAR) %>%
    mutate(CHANGE = abs(MILES - lag(MILES, 1)))

In [535]:
sample = %Rget sample
sample.sample(5)

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS,CHANGE
599,38933,2016,TESORO LOGISTICS OPERATIONS LLC - SOUTHERN CAL...,crude,5.3,Marathon Petroleum,0.0,0.0,3.3
167,300,2012,"PLAINS PIPELINE, L.P.",non-hvl,265.17,Plains All American Pipeline,1.0,1.0,61.63
634,18092,2017,"SFPP, LP",non-hvl,1773.36,Kinder Morgan,5.0,3.0,5.05
614,2731,2017,CHEVRON PIPE LINE CO,crude,301.0,Chevron,0.0,0.0,0.0
444,12470,2015,MID - VALLEY PIPELINE CO,crude,662.0,Energy Transfer,1.0,1.0,6.0


## 9.2 Consolidate observations

### 9.2.1 Group observations: company_groups

In [536]:
company_groups

[{'name': 'Enterprise Products (Group)',
  'members': ['31618', '30829', '3445']},
 {'name': 'Buckeye (Group)', 'members': ['1845', '31371']},
 {'name': 'Marathon (Group)', 'members': ['32147', '22830']},
 {'name': 'Tesoro (Group)', 'members': ['38933', '39029']},
 {'name': 'Sunoco (Group)', 'members': ['18718', '12470', '39596']},
 {'name': 'Energy Transfer (Group)', 'members': ['32099', '39205']},
 {'name': 'ExxonMobil (Group)', 'members': ['4906', '12628']},
 {'name': 'NuStar', 'members': ['10012', '31454']},
 {'name': 'Kinder Morgan (Group)',
  'members': ['15674', '18092', '26125', '32258']},
 {'name': 'Enbridge (Group)', 'members': ['11169', '32080', '32502']},
 {'name': 'Plains Pipeline (Group)', 'members': ['300', '31666']},
 {'name': 'BP (Group)', 'members': ['31189', '30781']}]

In [538]:
from functools import reduce

for group in company_groups:
    new_group = sample.loc[sample['OPERATOR_ID'].isin(group['members'])]
    
    new_group_miles = new_group[['YEAR', 'COMMODITY', 'MILES']].groupby(['YEAR', 'COMMODITY']).sum()
    new_group_change = new_group[['YEAR', 'COMMODITY', 'CHANGE']].groupby(['YEAR', 'COMMODITY']).sum()
    new_group_incidents = new_group[['YEAR', 'COMMODITY', 'INCIDENTS']].groupby(['YEAR', 'INCIDENTS']).sum()
    
    new_group_dfs = [new_group_miles, new_group_change, new_group_incidents]
    new_group = reduce(lambda left, right: pd.merge(left, right, on=['YEAR', 'COMMODITY'], how='outer'), new_group_dfs)
    new_group['OPERATOR_ID'] = group['name']
    new_group['NAME'] = group['name']
    new_group['PARENT'] = 'parent'
    # new_group['YEAR'] = new_group.index
    new_group = new_group.reset_index('YEAR')
    
    sample = pd.concat([sample, new_group], sort=False, ignore_index=True)
    sample = sample.loc[~sample['OPERATOR_ID'].isin(group['members'])]

sample = sample.reset_index(drop=True)
print(sample.tail())

    OPERATOR_ID  YEAR        NAME               COMMODITY  MILES  PARENT  \
771  BP (Group)  2015  BP (Group)       non-hvlhvlnon-hvl    NaN  parent   
772  BP (Group)  2016  BP (Group)              hvlnon-hvl    NaN  parent   
773  BP (Group)  2017  BP (Group)              hvlnon-hvl    NaN  parent   
774  BP (Group)  2017  BP (Group)            non-hvlcrude    NaN  parent   
775  BP (Group)  2018  BP (Group)  non-hvlcrudehvlnon-hvl    NaN  parent   

     INCIDENTS  SIGNIFICANT_INCIDENTS  CHANGE  
771        NaN                    NaN     NaN  
772        NaN                    NaN     NaN  
773        NaN                    NaN     NaN  
774        NaN                    NaN     NaN  
775        NaN                    NaN     NaN  


### 9.2.2 Group observations: m_as

In [540]:
m_as[:1]

[{'name': 'Marathon (Group)',
  'members': ['Marathon (Group)', 'Tesoro (Group)'],
  'start_year': '2018'}]

In [541]:
for group in m_as[:1]:
    observations = [sample[sample['OPERATOR_ID'] == member] for member in group['members']]
    new_group = pd.concat(observations)
    
    if 'start_year' in group:
        observations = new_group.loc[new_group['YEAR'] >= int(group['start_year'])]
    if 'end_year' in group:
        observations = new_group.loc[new_group['YEAR'] < int(group['start_year'])]
    
    print(new_group)
    

          OPERATOR_ID  YEAR              NAME            COMMODITY    MILES  \
439  Marathon (Group)  2010  Marathon (Group)                crude   805.60   
440  Marathon (Group)  2010  Marathon (Group)                  hvl    63.60   
441  Marathon (Group)  2010  Marathon (Group)              non-hvl  2653.13   
442  Marathon (Group)  2011  Marathon (Group)                crude   824.20   
443  Marathon (Group)  2011  Marathon (Group)                  hvl    64.00   
444  Marathon (Group)  2011  Marathon (Group)              non-hvl  2617.23   
445  Marathon (Group)  2011  Marathon (Group)              non-hvl  2617.23   
446  Marathon (Group)  2012  Marathon (Group)                crude   826.50   
447  Marathon (Group)  2012  Marathon (Group)                  hvl    64.50   
448  Marathon (Group)  2012  Marathon (Group)              non-hvl  2579.06   
449  Marathon (Group)  2013  Marathon (Group)                crude  1234.50   
450  Marathon (Group)  2013  Marathon (Group)       

In [542]:
sample[sample['YEAR'].isna()]

Unnamed: 0,OPERATOR_ID,YEAR,NAME,COMMODITY,MILES,PARENT,INCIDENTS,SIGNIFICANT_INCIDENTS,CHANGE
