# 9. Consolidate observations

## Setup

In [1]:
library(tidyverse)
library(feather)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1       [32m✔[39m [34mpurrr  [39m 0.3.2  
[32m✔[39m [34mtibble [39m 2.1.1       [32m✔[39m [34mdplyr  [39m 0.8.0.[31m1[39m
[32m✔[39m [34mtidyr  [39m 0.8.3       [32m✔[39m [34mstringr[39m 1.4.0  
[32m✔[39m [34mreadr  [39m 1.3.1       [32m✔[39m [34mforcats[39m 0.4.0  
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


In [2]:
sample_file <- '../preprocessed_data/sample_2019-08-17.feather'
company_groups_file <- '../preprocessed_data/company_groups_2019-08-18.feather'
m_as_file <- '../preprocessed_data/m_as_2019-08-18.feather'

In [3]:
today <- Sys.Date()

## Read data

In [4]:
sample <- read_feather(sample_file)
glimpse(sample)

“Coercing int64 to double”

Observations: 795
Variables: 8
$ OPERATOR_ID           [3m[38;5;246m<chr>[39m[23m "300", "300", "300", "300", "300", "300", "300"…
$ YEAR                  [3m[38;5;246m<dbl>[39m[23m 2017, 2017, 2017, 2011, 2011, 2014, 2014, 2012,…
$ NAME                  [3m[38;5;246m<chr>[39m[23m "PLAINS PIPELINE, L.P.", "PLAINS PIPELINE, L.P.…
$ COMMODITY             [3m[38;5;246m<chr>[39m[23m "crude", "hvl", "non-hvl", "crude", "non-hvl", …
$ MILES                 [3m[38;5;246m<dbl>[39m[23m 2101.060, 72.900, 0.000, 1988.140, 326.800, 223…
$ PARENT                [3m[38;5;246m<chr>[39m[23m "Plains All American Pipeline", "Plains All Ame…
$ INCIDENTS             [3m[38;5;246m<dbl>[39m[23m 18, 0, 0, 12, 0, 34, 0, 25, 1, 20, 11, 0, 19, 0…
$ SIGNIFICANT_INCIDENTS [3m[38;5;246m<dbl>[39m[23m 9, 0, 0, 7, 0, 7, 0, 3, 1, 6, 3, 0, 4, 0, 0, 11…


In [5]:
company_groups <- read_feather(company_groups_file)
glimpse(company_groups)

Observations: 29
Variables: 2
$ members [3m[38;5;246m<chr>[39m[23m "31618", "30829", "3445", "1845", "31371", "32147", "22830", …
$ name    [3m[38;5;246m<chr>[39m[23m "Enterprise Products (Group)", "Enterprise Products (Group)",…


In [6]:
m_as <- read_feather(m_as_file)
glimpse(m_as)

Observations: 8
Variables: 4
$ members    [3m[38;5;246m<chr>[39m[23m "Marathon (Group)", "Tesoro (Group)", "Sunoco (Group)", "E…
$ name       [3m[38;5;246m<chr>[39m[23m "Marathon (Group)", "Marathon (Group)", "Sunoco (Group)", …
$ start_year [3m[38;5;246m<chr>[39m[23m "2018", "2018", "2017", "2017", NA, NA, "2018", "2018"
$ end_year   [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, "2013", "2013", NA, NA


## 9.1 Calculate absolute change

In [7]:
sample <- sample %>%
    group_by(OPERATOR_ID, COMMODITY) %>%
    arrange(YEAR) %>%
    mutate(CHANGE = abs(MILES - lag(MILES, 1)))
glimpse(subset(sample, YEAR > 2010))

Observations: 711
Variables: 9
Groups: OPERATOR_ID, COMMODITY [101]
$ OPERATOR_ID           [3m[38;5;246m<chr>[39m[23m "300", "300", "395", "1845", "1845", "2552", "2…
$ YEAR                  [3m[38;5;246m<dbl>[39m[23m 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,…
$ NAME                  [3m[38;5;246m<chr>[39m[23m "PLAINS PIPELINE, L.P.", "PLAINS PIPELINE, L.P.…
$ COMMODITY             [3m[38;5;246m<chr>[39m[23m "crude", "non-hvl", "non-hvl", "hvl", "non-hvl"…
$ MILES                 [3m[38;5;246m<dbl>[39m[23m 1988.140, 326.800, 527.000, 47.644, 4043.393, 4…
$ PARENT                [3m[38;5;246m<chr>[39m[23m "Plains All American Pipeline", "Plains All Ame…
$ INCIDENTS             [3m[38;5;246m<dbl>[39m[23m 12, 0, 0, 0, 14, 12, 0, 3, 4, 1, 1, 0, 2, 5, 2,…
$ SIGNIFICANT_INCIDENTS [3m[38;5;246m<dbl>[39m[23m 7, 0, 0, 0, 7, 2, 0, 2, 2, 1, 1, 0, 2, 0, 0, 2,…
$ CHANGE                [3m[38;5;246m<dbl>[39m[23m 726.700, 264.520, 202.000, 0.047, 289.292, 63

## 9.2 Consolidate observations

### 9.2.1 Company groups

In [8]:
company_groups <- rename(company_groups, new_name = name)
sample <- left_join(sample, company_groups, by=c('OPERATOR_ID' = 'members'))
glimpse(subset(sample, !is.na(sample$new_name)))

Observations: 439
Variables: 10
Groups: OPERATOR_ID, COMMODITY [60]
$ OPERATOR_ID           [3m[38;5;246m<chr>[39m[23m "300", "300", "1845", "1845", "3445", "4906", "…
$ YEAR                  [3m[38;5;246m<dbl>[39m[23m 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,…
$ NAME                  [3m[38;5;246m<chr>[39m[23m "PLAINS PIPELINE, L.P.", "PLAINS PIPELINE, L.P.…
$ COMMODITY             [3m[38;5;246m<chr>[39m[23m "crude", "non-hvl", "hvl", "non-hvl", "hvl", "c…
$ MILES                 [3m[38;5;246m<dbl>[39m[23m 1261.440, 62.280, 47.691, 3754.101, 577.251, 97…
$ PARENT                [3m[38;5;246m<chr>[39m[23m "Plains All American Pipeline", "Plains All Ame…
$ INCIDENTS             [3m[38;5;246m<dbl>[39m[23m 11, 0, 0, 10, 3, 4, 2, 1, 3, 4, 14, 0, 3, 0, 0,…
$ SIGNIFICANT_INCIDENTS [3m[38;5;246m<dbl>[39m[23m 3, 0, 0, 4, 2, 1, 0, 1, 0, 0, 7, 0, 2, 0, 0, 0,…
$ CHANGE                [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N

In [9]:
sample_no_group <- subset(sample, is.na(sample$new_name))
sample_group <- subset(sample, !is.na(sample$new_name))

sample_group <- sample_group %>%
    group_by(new_name, YEAR, COMMODITY) %>%
    summarize(OPERATOR_ID = first(new_name), 
              NAME = first(new_name),
              MILES = sum(MILES), 
              PARENT = first(PARENT), 
              INCIDENTS = sum(INCIDENTS), 
              SIGNIFICANT_INCIDENTS = sum(SIGNIFICANT_INCIDENTS), 
              CHANGE = sum(CHANGE), 
              GROUP = "group") %>%
    ungroup()
sample_no_group$GROUP = "not group"

sample <- bind_rows(sample_group, sample_no_group)
sample <- sample[ , (names(sample) != "new_name")]
glimpse(sample_n(sample, 10))

Observations: 10
Variables: 10
$ YEAR                  [3m[38;5;246m<dbl>[39m[23m 2013, 2013, 2011, 2012, 2018, 2017, 2010, 2015,…
$ COMMODITY             [3m[38;5;246m<chr>[39m[23m "hvl", "non-hvl", "crude", "co2", "hvl", "hvl",…
$ OPERATOR_ID           [3m[38;5;246m<chr>[39m[23m "32109", "Tesoro (Group)", "Marathon (Group)", …
$ NAME                  [3m[38;5;246m<chr>[39m[23m "ONEOK NGL PIPELINE, LLC", "Tesoro (Group)", "M…
$ MILES                 [3m[38;5;246m<dbl>[39m[23m 2980.230, 504.400, 824.200, 2.600, 1180.090, 40…
$ PARENT                [3m[38;5;246m<chr>[39m[23m "ONEOK", "Marathon Petroleum", "Marathon Petrol…
$ INCIDENTS             [3m[38;5;246m<dbl>[39m[23m 3, 1, 5, 0, 1, 1, 5, 1, 0, 1
$ SIGNIFICANT_INCIDENTS [3m[38;5;246m<dbl>[39m[23m 2, 1, 3, 0, 0, 0, 4, 1, 0, 1
$ CHANGE                [3m[38;5;246m<dbl>[39m[23m 326.58, NA, 18.60, 0.00, 140.03, 1.28, NA, 60.4…
$ GROUP                 [3m[38;5;246m<chr>[39m[23m "not group", "grou

In [10]:
table(sample$GROUP)


    group not group 
      289       356 

### 9.2.2 M&As

In [11]:
m_as <- rename(m_as, new_name = name)

sample <- left_join(sample, m_as, by=c('OPERATOR_ID' = 'members'))

# We need to fill in the NA values for start and end_year with values that always match, because when logical conditions encounter NAs
# they will always be wrong. The filtering in those cases happens because the new_name column is NA.
sample[is.na(sample$start_year), ]$start_year <- -9999
sample[is.na(sample$end_year), ]$end_year <- 9999
glimpse(sample)

Observations: 654
Variables: 13
$ YEAR                  [3m[38;5;246m<dbl>[39m[23m 2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012,…
$ COMMODITY             [3m[38;5;246m<chr>[39m[23m "crude", "hvl", "non-hvl", "crude", "hvl", "non…
$ OPERATOR_ID           [3m[38;5;246m<chr>[39m[23m "BP (Group)", "BP (Group)", "BP (Group)", "BP (…
$ NAME                  [3m[38;5;246m<chr>[39m[23m "BP (Group)", "BP (Group)", "BP (Group)", "BP (…
$ MILES                 [3m[38;5;246m<dbl>[39m[23m 427.0, 334.0, 546.0, 427.0, 108.0, 518.0, 427.0…
$ PARENT                [3m[38;5;246m<chr>[39m[23m "BP", "BP", "BP", "BP", "BP", "BP", "BP", "BP",…
$ INCIDENTS             [3m[38;5;246m<dbl>[39m[23m 2, 1, 0, 1, 0, 1, 0, 0, 2, 1, 0, 0, 1, 0, 2, 1,…
$ SIGNIFICANT_INCIDENTS [3m[38;5;246m<dbl>[39m[23m 2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,…
$ CHANGE                [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, 0.0, 226.0, 28.0, 0.0, 0.0, 0.0, 28…
$ GROUP                 [3m[38

In [12]:
sample_m_as <- subset(sample, (!is.na(new_name) & YEAR >= start_year & YEAR < end_year))
sample_no_m_as <- subset(sample, !(!is.na(new_name) & YEAR >= start_year & YEAR < end_year))

glimpse(sample_m_as)

Observations: 29
Variables: 13
$ YEAR                  [3m[38;5;246m<dbl>[39m[23m 2018, 2018, 2018, 2017, 2017, 2018, 2018, 2010,…
$ COMMODITY             [3m[38;5;246m<chr>[39m[23m "crude", "hvl", "non-hvl", "crude", "hvl", "cru…
$ OPERATOR_ID           [3m[38;5;246m<chr>[39m[23m "Enbridge (Group)", "Enbridge (Group)", "Enbrid…
$ NAME                  [3m[38;5;246m<chr>[39m[23m "Enbridge (Group)", "Enbridge (Group)", "Enbrid…
$ MILES                 [3m[38;5;246m<dbl>[39m[23m 2740.45, 163.74, 473.84, 887.62, 1218.28, 1087.…
$ PARENT                [3m[38;5;246m<chr>[39m[23m "Enbridge", "Enbridge", "Enbridge", "Energy Tra…
$ INCIDENTS             [3m[38;5;246m<dbl>[39m[23m 7, 0, 0, 9, 4, 7, 2, 0, 11, 2, 7, 0, 5, 2, 0, 4…
$ SIGNIFICANT_INCIDENTS [3m[38;5;246m<dbl>[39m[23m 0, 0, 0, 1, 4, 0, 2, 0, 2, 1, 1, 0, 0, 1, 0, 2,…
$ CHANGE                [3m[38;5;246m<dbl>[39m[23m 237.55, 6.26, 20.16, NA, 538.70, 200.08, 60.23,…
$ GROUP                 [3m[38;

In [13]:
sample_m_as <- sample_m_as %>%
    group_by(new_name, YEAR, COMMODITY) %>%
    summarize(OPERATOR_ID = first(new_name), 
              NAME = first(new_name),
              MILES = sum(MILES), 
              PARENT = first(PARENT), 
              INCIDENTS = sum(INCIDENTS), 
              SIGNIFICANT_INCIDENTS = sum(SIGNIFICANT_INCIDENTS), 
              CHANGE = sum(CHANGE), 
              GROUP = 'm&a') %>%
    ungroup()

sample <- bind_rows(sample_m_as, sample_no_m_as)
sample <- sample[ , !(names(sample) %in% c("new_name", "start_year", "end_year"))]
glimpse(sample)

Observations: 646
Variables: 10
$ YEAR                  [3m[38;5;246m<dbl>[39m[23m 2018, 2018, 2018, 2010, 2010, 2010, 2011, 2011,…
$ COMMODITY             [3m[38;5;246m<chr>[39m[23m "crude", "hvl", "non-hvl", "crude", "hvl", "non…
$ OPERATOR_ID           [3m[38;5;246m<chr>[39m[23m "Enbridge (Group)", "Enbridge (Group)", "Enbrid…
$ NAME                  [3m[38;5;246m<chr>[39m[23m "Enbridge (Group)", "Enbridge (Group)", "Enbrid…
$ MILES                 [3m[38;5;246m<dbl>[39m[23m 3182.99, 163.74, 473.84, 196.00, 243.21, 4660.4…
$ PARENT                [3m[38;5;246m<chr>[39m[23m "Enbridge", "Enbridge", "Enbridge", "Enbridge",…
$ INCIDENTS             [3m[38;5;246m<dbl>[39m[23m 11, 0, 0, 2, 0, 11, 0, 2, 7, 0, 0, 5, 2, 0, 4, …
$ SIGNIFICANT_INCIDENTS [3m[38;5;246m<dbl>[39m[23m 2, 0, 0, 2, 0, 2, 0, 1, 1, 0, 0, 0, 1, 0, 2, 4,…
$ CHANGE                [3m[38;5;246m<dbl>[39m[23m 271.09, 6.26, 20.16, NA, NA, NA, 0.00, 240.28, …
$ GROUP                 [3m[38

## 9.3 Save

In [14]:
feather::write_feather(sample, paste0("../preprocessed_data/sample_consolidated_", today, ".feather"))
haven::write_dta(sample, paste0("../stata_data/sample_consolidated_", today, ".dta"))