# 5. Find largest observations (R)

In [1]:
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1       [32m✔[39m [34mpurrr  [39m 0.3.2  
[32m✔[39m [34mtibble [39m 2.1.1       [32m✔[39m [34mdplyr  [39m 0.8.0.[31m1[39m
[32m✔[39m [34mtidyr  [39m 0.8.3       [32m✔[39m [34mstringr[39m 1.4.0  
[32m✔[39m [34mreadr  [39m 1.3.1       [32m✔[39m [34mforcats[39m 0.4.0  
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


In [2]:
# Input: feather file that is read into `pipelines_2010` below.
pipelines_2010_selected_file <- "../preprocessed_data/pipelines_2010_selected_2019-08-11.feather"

## Load data

In [4]:
# Read the selected pipeline records and print a column overview.
# `read_feather()` is not attached by `library(tidyverse)`, so it is called
# through its namespace, the same way `write_feather()` is called when the
# result is saved.
pipelines_2010 <- feather::read_feather(pipelines_2010_selected_file)
glimpse(pipelines_2010)

“Coercing int64 to double”

Observations: 5,294
Variables: 5
$ OPERATOR_ID [3m[38;5;246m<chr>[39m[23m "300", "300", "300", "395", "402", "473", "473", "515", "…
$ YEAR        [3m[38;5;246m<dbl>[39m[23m 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 201…
$ NAME        [3m[38;5;246m<chr>[39m[23m "PLAINS PIPELINE, L.P.", "PLAINS PIPELINE, L.P.", "PLAINS…
$ COMMODITY   [3m[38;5;246m<chr>[39m[23m "Crude Oil", "HVL", "Refined and/or Petroleum Product (no…
$ MILES       [3m[38;5;246m<dbl>[39m[23m 2101.060, 72.900, 0.000, 662.300, 1.550, 12.000, 0.000, 3…


## 5.1 Find largest observations

In [5]:
# Each operator's largest single record (ties kept), narrowed to the top ten
# of those by MILES (ties kept) and listed in descending order of MILES.
pipelines_2010 %>%
    group_by(OPERATOR_ID) %>%
    filter(min_rank(desc(MILES)) <= 1) %>%
    ungroup() %>%
    filter(min_rank(desc(MILES)) <= 10) %>%
    arrange(desc(MILES))

OPERATOR_ID,YEAR,NAME,COMMODITY,MILES
<chr>,<dbl>,<chr>,<chr>,<dbl>
31618,2018,ENTERPRISE PRODUCTS OPERATING LLC,HVL,7726.183
32109,2018,"ONEOK NGL PIPELINE, LLC",HVL,4531.48
2552,2015,COLONIAL PIPELINE CO,Refined and/or Petroleum Product (non-HVL),4500.92
1845,2015,"BUCKEYE PARTNERS, LP",Refined and/or Petroleum Product (non-HVL),4266.43
22610,2018,"MAGELLAN PIPELINE COMPANY, LP",Refined and/or Petroleum Product (non-HVL),3926.4
15674,2016,PLANTATION PIPE LINE CO,Refined and/or Petroleum Product (non-HVL),2564.73
300,2016,"PLAINS PIPELINE, L.P.",Crude Oil,2312.33
11169,2017,"ENBRIDGE ENERGY, LIMITED PARTNERSHIP",Crude Oil,2290.0
31684,2010,CONOCOPHILLIPS,Refined and/or Petroleum Product (non-HVL),2283.79
32147,2013,MARATHON PIPE LINE LLC,Refined and/or Petroleum Product (non-HVL),2281.9


#### Largest operators by total miles summed over all commodity types

In [6]:
# Distinct values of the COMMODITY column, in order of first appearance.
unique(pipelines_2010[["COMMODITY"]])

Check whether any operator ID has more than the expected maximum of five entries in a single year.

In [7]:
# Count the rows of every (YEAR, OPERATOR_ID) pair, then tabulate how often
# each row count occurs.
pipelines_2010 %>%
    count(YEAR, OPERATOR_ID) %>%
    {table(.$n)}


   1    2    3    4 
3307  565  251   26 

Everything looks fine: no operator has more than four entries in any year.

In [8]:
# One row per operator: the year in which its total mileage (summed over all
# of its rows for that year) was largest, sorted from largest to smallest.
largest_pipeline_operators <- pipelines_2010 %>%
    # Newest rows first, so that the first NAME inside every operator group is
    # the latest one, no matter which year has the most extensive network.
    arrange(desc(YEAR)) %>%
    group_by(OPERATOR_ID) %>%
    mutate(NAME = first(NAME)) %>%
    # Total mileage per operator and year.
    group_by(OPERATOR_ID, YEAR) %>%
    summarize(NAME = first(NAME), TOTAL_MILES = sum(MILES)) %>%
    # Keep each operator's peak year; the most recent year breaks ties.
    group_by(OPERATOR_ID) %>%
    filter(min_rank(desc(TOTAL_MILES)) <= 1) %>%
    filter(min_rank(desc(YEAR)) <= 1) %>%
    ungroup() %>%
    arrange(desc(TOTAL_MILES))

largest_pipeline_operators %>%
    head(50)

OPERATOR_ID,YEAR,NAME,TOTAL_MILES
<chr>,<dbl>,<chr>,<dbl>
31618,2018,ENTERPRISE PRODUCTS OPERATING LLC,8325.499
32109,2018,"ONEOK NGL PIPELINE, LLC",4756.61
22610,2014,"MAGELLAN PIPELINE COMPANY, LP",4505.5
2552,2015,COLONIAL PIPELINE CO,4500.92
31684,2017,PHILLIPS 66 PIPELINE LLC,4474.1
1845,2015,"BUCKEYE PARTNERS, LP",4298.29
32147,2018,MARATHON PIPE LINE LLC,4037.2
18718,2010,SUNOCO PIPELINE L.P.,3466.74
4906,2013,EXXONMOBIL PIPELINE CO,2944.4
22855,2013,"FLINT HILLS RESOURCES, LC",2757.45


## 5.2 Save result

In [9]:
# Write the result to a feather file whose name carries today's date.
largest_pipeline_operators %>%
    feather::write_feather(
        paste0("../preprocessed_data/largest_companies_", Sys.Date(), ".feather")
    )