# Create Dataset

## Setup

In [1]:
# tidyverse attaches dplyr, stringr and friends (used by the functions below);
# feather provides read_feather() / write_feather().
library(tidyverse)
library(feather)
# Local alias so year() can be called without attaching all of lubridate.
year <- lubridate::year

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1       [32m✔[39m [34mpurrr  [39m 0.3.2  
[32m✔[39m [34mtibble [39m 2.1.1       [32m✔[39m [34mdplyr  [39m 0.8.0.[31m1[39m
[32m✔[39m [34mtidyr  [39m 0.8.3       [32m✔[39m [34mstringr[39m 1.4.0  
[32m✔[39m [34mreadr  [39m 1.3.1       [32m✔[39m [34mforcats[39m 0.4.0  
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


In [2]:
# Input locations, relative to this notebook's directory.
# Raw incident records:
raw_data_file <- "../../../data/incidents_2019-08-11.feather"
# Pre-computed table of the largest operators (one row per operator):
largest_observations_file <- "../../../preprocessed_data/largest_companies_2019-09-08.feather"

## Read Data

In [3]:
# Load the table of largest operators and preview its first rows.
largest_observations <- read_feather(path = largest_observations_file)
head(largest_observations, n = 6)

OPERATOR_ID,YEAR,NAME,TOTAL_MILES
<chr>,<dbl>,<chr>,<dbl>
31618,2018,ENTERPRISE PRODUCTS OPERATING LLC,8311.369
32109,2018,"ONEOK NGL PIPELINE, LLC",4756.61
31684,2005,PHILLIPS 66 PIPELINE LLC,4691.0
22610,2014,"MAGELLAN PIPELINE COMPANY, LP",4505.5
2552,2015,COLONIAL PIPELINE CO,4500.92
1845,2015,"BUCKEYE PARTNERS, LP",4298.29


In [4]:
# Load the raw incident records; preview only the first 20 columns because
# the full table is very wide.
raw_data <- read_feather(path = raw_data_file)
head(raw_data[, seq_len(20)], n = 6)

“Coercing int64 to double”

DATAFILE_AS_OF,SIGNIFICANT,SERIOUS,IPE,IA_IPE,OM_IPE,REPORT_NUMBER,SUPPLEMENTAL_NUMBER,REPORT_RECEIVED_DATE,REPORT_TYPE,OPERATOR_ID,NAME,OPERATOR_STREET_ADDRESS,OPERATOR_CITY_NAME,OPERATOR_STATE_ABBREVIATION,OPERATOR_POSTAL_CODE,LOCAL_DATETIME,IYEAR,LOCATION_LATITUDE,LOCATION_LONGITUDE
<dttm>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dttm>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dttm>,<dbl>,<dbl>,<dbl>
2019-07-31 04:37:40,NO,NO,NO,NO,NO,20100001,15751,2010-03-10 08:05:46,SUPPLEMENTAL FINAL,22610,"MAGELLAN PIPELINE COMPANY, LP","MAGELLAN MIDSTREAM PARTNERS, L.P.",TULSA,OK,74172,2010-02-16 02:42:00,2010,41.94352,-88.23353
2019-07-31 04:37:40,NO,NO,NO,NO,NO,20100002,19837,2010-03-16 13:43:54,SUPPLEMENTAL FINAL,31672,"CHAPARRAL ENERGY, LLC",701 CEDAR LAKE BLVD,OKLAHOMA CITY,OK,73114,2010-03-01 06:50:00,2010,37.10847,-100.80037
2019-07-31 04:37:40,YES,NO,NO,NO,NO,20100003,18021,2010-03-17 06:38:15,SUPPLEMENTAL FINAL,32035,LDH ENERGY PIPELINE L.P.,13430 NORTHWEST FREEWAY SUITE 1200,HOUSTON,TX,77040-6019,2010-02-22 05:38:00,2010,32.22471,-101.4044
2019-07-31 04:37:40,NO,NO,NO,NO,NO,20100004,19086,2010-03-18 09:18:48,SUPPLEMENTAL FINAL,1845,"BUCKEYE PARTNERS, LP",FIVE TEK PARK,BREINIGSVILLE,PA,18031,2010-02-19 01:50:00,2010,40.6086,-74.2399
2019-07-31 04:37:40,NO,NO,NO,NO,NO,20100005,17663,2010-03-18 10:26:48,SUPPLEMENTAL FINAL,300,"PLAINS PIPELINE, L.P.",333 CLAY STREET,HOUSTON,TX,772104648,2010-02-21 07:45:00,2010,31.13284,-101.18974
2019-07-31 04:37:40,NO,NO,NO,NO,NO,20100006,17891,2010-03-19 10:48:18,SUPPLEMENTAL FINAL,31684,CONOCOPHILLIPS,600 NORTH DAIRY ASHFORD,HOUSTON,TX,77079,2010-02-22 05:56:00,2010,47.71696,-117.35583


## Define functions

In [5]:
#' Unify names within each ID by overwriting them with the most recent one.
#'
#' @param dataset A data frame / tibble containing the three columns below.
#' @param id_col Bare (unquoted) column that identifies an entity.
#' @param name_col Bare column holding the name; it is overwritten in place.
#' @param time_col Bare column used to decide which name is the latest.
#' @return `dataset`, sorted by `time_col` in descending order and ungrouped,
#'   where every row of an ID carries the name found on that ID's latest row.
get_latest_name <- function(dataset, id_col, name_col, time_col){
    quo_id_col <- enquo(id_col)
    quo_name_col <- enquo(name_col)
    quo_time_col <- enquo(time_col)
    
    dataset <- dataset %>%
        # Sort the whole table newest-first (arrange() ignores grouping), so
        # that first() inside each ID group picks the most recent name.
        arrange(desc(!! quo_time_col)) %>%
        group_by(!! quo_id_col) %>%
        mutate(!! quo_name_col := first(!! quo_name_col)) %>%
        # Drop the grouping so it does not leak into the caller's later verbs.
        ungroup()
    
    return(dataset)
}

In [6]:
#' Build the cleaned incident dataset from the raw incident table.
#'
#' @param raw_data Raw incident table.
#' @param variables Character vector of raw column names to keep.
#' @param variable_names Named character vector, `new_name = "RAW_NAME"`, used
#'   to rename the kept columns. The result must contain an `ID` column as
#'   well as the columns named by `name_col`, `time_col` and `recode_bool`.
#' @param operators Vector of operator IDs; rows whose `ID` is among them get
#'   `in_sample = TRUE`.
#' @param undesired_company_endings Regular expression; every match is removed
#'   from the name column.
#' @param name_col Name (string) of the renamed company-name column.
#' @param time_col Name (string) of the renamed incident-time column.
#' @param recode_bool Names of columns to convert from "YES"/other to logical.
#' @return An ungrouped tibble sorted newest-first with the renamed columns
#'   plus `in_sample` and `Year`.
create_dataset <- function(raw_data, variables, variable_names, operators, 
                           undesired_company_endings, name_col = "Name", time_col = "Time", 
                           recode_bool = c("Significant", "Serious")){
    # `!!` makes select() use the character vector passed by the caller, even
    # if raw_data itself happens to have a column called "variables".
    dataset <- select(raw_data, !! variables)
    dataset <- rename(dataset, !!! variable_names)
    
    # For option to filter by sample
    dataset$in_sample <- dataset$ID %in% operators
    
    # Clean (unify) company names. Honour name_col / time_col instead of
    # hard-coding Name / Time, so non-default column names actually work.
    dataset <- get_latest_name(dataset, ID, 
                               !! rlang::sym(name_col), !! rlang::sym(time_col))
    # Make sure no grouping by ID leaks into the returned data.
    dataset <- ungroup(dataset)
    
    # Various cleaning
    dataset$Year <- lubridate::year(dataset[[time_col]])
    dataset[[name_col]] <- str_remove_all(dataset[[name_col]], undesired_company_endings)
    dataset[[name_col]] <- str_to_title(dataset[[name_col]])
    
    # Recode from yes/no to boolean (anything other than "YES" becomes FALSE,
    # missing values stay NA)
    for (column in recode_bool){
        dataset[[column]] <- dataset[[column]] == "YES"
    }
    
    return(dataset)
}

## Run

In [7]:
# Rename map: new column name = raw column name.
variable_names <- c(
    "ID" = "OPERATOR_ID",
    "Name" = "NAME",
    "Time" = "LOCAL_DATETIME",
    "Lat" = "LOCATION_LATITUDE",
    "Long" = "LOCATION_LONGITUDE",
    "Significant" = "SIGNIFICANT",
    "Serious" = "SERIOUS",
    "Offshore" = "ON_OFF_SHORE"
)

# Raw columns to keep: exactly the ones that get renamed, in the same order.
variables <- unname(variable_names)

# Regular expression of legal-form suffixes to strip from company names.
# NOTE(review): the dots are unescaped (they match any character) and the bare
# "LP" alternative also matches inside longer words; confirm this is intended.
undesired_company_endings <- " L. P.| L.L.C.|, LLC|, L.P.| L.P.|, LP|LP|LLC|, LIMITED PARTNERSHIP|, A DIVISION OF EXXON MOBIL CORPORATION"

In [8]:
# Build the cleaned dataset; operators listed in largest_observations form
# the sample flagged by `in_sample`.
dataset <- create_dataset(
    raw_data = raw_data,
    variables = variables,
    variable_names = variable_names,
    operators = largest_observations$OPERATOR_ID,
    undesired_company_endings = undesired_company_endings
)
head(dataset, n = 6)

ID,Name,Time,Lat,Long,Significant,Serious,in_sample,Year
<dbl>,<chr>,<dttm>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<dbl>
26045,Par Hawaii Refining,2019-07-08 07:41:00,21.36536,-157.93803,False,False,False,2019
12105,Magellan Ammonia Pipeline,2019-07-01 04:36:00,43.93145,-94.11276,True,False,True,2019
31618,Enterprise Products Operating,2019-06-29 07:07:00,33.05238,-103.2345,False,False,True,2019
30829,Enterprise Crude Pipeline,2019-06-27 06:15:00,29.7484,-95.1285,False,False,True,2019
30829,Enterprise Crude Pipeline,2019-06-27 05:50:00,35.9548,-96.7591,True,False,True,2019
18718,Sunoco Pipeline,2019-06-26 07:26:00,36.60264,-96.15029,False,False,True,2019


In [9]:
# Save the result for the dashboard, stamped with today's date.
write_feather(
    x = dataset,
    path = paste0("../incident_dashboard/data/dataset_", Sys.Date(), ".feather")
)