# 11. Data exploration - model (numeric)

In [1]:
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1       ✔ purrr   0.3.2
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0
✔ readr   1.3.1       ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Size of inline notebook plots (repr.plot.* options).
options(
    repr.plot.width = 4,
    repr.plot.height = 3
)

In [3]:
# Paths to the preprocessed feature tables (feather format), relative to the
# notebook's working directory. Both files carry the same 2019-09-02 date stamp.
sample_crude_file <- "../preprocessed_data/sample_features_crude_2019-09-02.feather"
sample_file <- "../preprocessed_data/sample_features_2019-09-02.feather"

In [4]:
# Current date at the time the notebook is run.
# NOTE(review): `today` is not used in the visible part of this notebook;
# presumably it stamps outputs further down - confirm before removing.
today <- Sys.Date()

## Read data

In [5]:
# Load both feature tables from disk, then print a compact column overview
# of the main one. Note that `sample` shadows base::sample() in this session.
sample <- sample_file %>% feather::read_feather()
sample_crude <- sample_crude_file %>% feather::read_feather()
sample %>% glimpse()

Observations: 1,121
Variables: 61
$ YEAR                  <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018,…
$ COMMODITY             <fct> hvl, crude, hvl, non-hvl, crude, non-hvl, hvl, …
$ OPERATOR_ID           <fct> Sunoco (Group), Valero Energy (Group), Valero E…
$ NAME                  <chr> "Sunoco (Group)", "Valero Energy (Group)", "Val…
$ MILES                 <dbl> 3241.490, 92.377, 24.609, 170.415, 272.000, 0.0…
$ AGE_UNKNOWN_MILES     <dbl> 235.050, 0.000, 0.000, 0.000, 0.000, 0.000, 0.0…
$ MILES_PRE_1940        <dbl> 14.19, 0.00, 0.00, 0.00, 1.70, 0.00, 0.00, 28.4…
$ MILES_1940            <dbl> 301.630, 0.000, 0.000, 0.000, 69.800, 0.000, 0.…
$ MILES_1950            <dbl> 263.620, 84.333, 0.000, 11.233, 11.500, 0.000, …
$ MILES_1960            [3m[

## 11.1 Correlation matrix

In [6]:
# Correlation matrix of network size, incident counts, the mileage-change
# measures, average age and the M_A columns.
# `use = "pairwise.complete.obs"` computes each coefficient from the rows
# where both columns of that pair are non-missing, so different cells may be
# based on different numbers of observations.
# MILES was listed twice in the original selection; the redundant second
# mention is dropped (the resulting column set and order are unchanged).
sample %>%
    select(
        MILES, INCIDENTS, SIGNIFICANT_INCIDENTS, miles_avg_3,
        sd_change_3, sd_change_sq, sd_change_pw_3,
        AVG_AGE, M_A, M_A_3
    ) %>%
    as.matrix() %>%
    cor(use = "pairwise.complete.obs")

Unnamed: 0,MILES,INCIDENTS,SIGNIFICANT_INCIDENTS,miles_avg_3,sd_change_3,sd_change_sq,sd_change_pw_3,AVG_AGE,M_A,M_A_3
MILES,1.0,0.60289425,0.62285286,0.993393811,0.41542965,0.26056399,0.184629386,0.1379752536,0.001279154,0.0154980049
INCIDENTS,0.602894253,1.0,0.86980507,0.58916046,0.25489597,0.10705099,0.044760596,0.0182257499,0.028197345,0.0265269169
SIGNIFICANT_INCIDENTS,0.62285286,0.86980507,1.0,0.612566026,0.33419656,0.2057732,0.123400913,0.0286139935,0.015030073,0.0275739915
miles_avg_3,0.993393811,0.58916046,0.61256603,1.0,0.394147,0.21701662,0.137252532,0.1332346447,-0.009935012,-0.0125232962
sd_change_3,0.415429646,0.25489597,0.33419656,0.394146996,1.0,0.81849755,0.658183017,-0.0870069818,-0.019225021,0.1733823115
sd_change_sq,0.260563994,0.10705099,0.2057732,0.217016623,0.81849755,1.0,0.959260113,-0.0700304545,-0.010134377,0.2280907855
sd_change_pw_3,0.184629386,0.0447606,0.12340091,0.137252532,0.65818302,0.95926011,1.0,-0.0564828636,-0.005320897,0.2180406339
AVG_AGE,0.137975254,0.01822575,0.02861399,0.133234645,-0.08700698,-0.07003045,-0.056482864,1.0,-0.017245615,0.0008685243
M_A,0.001279154,0.02819735,0.01503007,-0.009935012,-0.01922502,-0.01013438,-0.005320897,-0.017245615,1.0,0.3718099403
M_A_3,0.015498005,0.02652692,0.02757399,-0.012523296,0.17338231,0.22809079,0.218040634,0.0008685243,0.37180994,1.0


In [7]:
# Correlation matrix of size, incident counts and the two change measures
# against the per-decade avg_* columns (pre-1940 through 2010).
# Coefficients use pairwise-complete observations.
sample %>%
    select(
        MILES, INCIDENTS, SIGNIFICANT_INCIDENTS,
        sd_change_3, sd_change_sq,
        avg_pre_1940, avg_1940, avg_1950, avg_1960, avg_1970,
        avg_1980, avg_1990, avg_2000, avg_2010
    ) %>%
    as.matrix() %>%
    cor(use = "pairwise.complete.obs")

Unnamed: 0,MILES,INCIDENTS,SIGNIFICANT_INCIDENTS,sd_change_3,sd_change_sq,avg_pre_1940,avg_1940,avg_1950,avg_1960,avg_1970,avg_1980,avg_1990,avg_2000,avg_2010
MILES,1.0,0.60289425,0.62285286,0.415429646,0.2605639939,-0.009011023,0.0369672283,-0.023222223,0.13683115,0.06633672,0.054284991,-0.06395612,-0.041074254,-0.152159919
INCIDENTS,0.602894253,1.0,0.869805067,0.254895973,0.1070509893,-0.018734523,0.0269398658,-0.040698874,0.09384486,-0.01018079,-0.03970907,-0.07546472,0.036704636,0.016250814
SIGNIFICANT_INCIDENTS,0.62285286,0.86980507,1.0,0.334196562,0.2057732013,-0.019305471,0.0436935466,-0.039786162,0.09051,0.01091989,-0.053718107,-0.06356092,0.019151185,-0.002078363
sd_change_3,0.415429646,0.25489597,0.334196562,1.0,0.8184975498,0.001754532,-0.044242507,-0.125232563,-0.05943569,0.13618034,0.044811652,-0.0756823,0.077816282,0.043803397
sd_change_sq,0.260563994,0.10705099,0.205773201,0.81849755,1.0,-0.018779748,0.0008984284,-0.075563801,-0.05450734,0.06100805,-0.005822185,-0.05293728,0.068116032,0.077929838
avg_pre_1940,-0.009011023,-0.01873452,-0.019305471,0.001754532,-0.0187797478,1.0,0.0758749485,-0.005999364,-0.04231905,-0.08137341,-0.039840226,-0.09741583,-0.002463487,-0.053950646
avg_1940,0.036967228,0.02693987,0.043693547,-0.044242507,0.0008984284,0.075874949,1.0,-0.001107723,-0.11052943,-0.17913487,-0.120918238,-0.04647108,-0.07210982,-0.127605185
avg_1950,-0.023222223,-0.04069887,-0.039786162,-0.125232563,-0.0755638008,-0.005999364,-0.0011077234,1.0,-0.19428123,-0.26939435,-0.13679345,-0.14552498,-0.210323435,-0.186260855
avg_1960,0.136831146,0.09384486,0.090509997,-0.059435691,-0.0545073427,-0.042319055,-0.1105294258,-0.194281234,1.0,-0.1298649,-0.081095926,-0.2193429,-0.127022755,-0.209647487
avg_1970,0.066336719,-0.01018079,0.010919892,0.136180336,0.0610080459,-0.081373415,-0.1791348733,-0.269394353,-0.1298649,1.0,-0.102236866,-0.16003008,-0.195983351,-0.220965091


In [8]:
# Same correlation check on the crude feature table, restricted to the
# change measures and two of the 3-suffixed mileage columns.
# Coefficients use pairwise-complete observations.
cor(
    as.matrix(
        select(sample_crude, sd_change_3, sd_change_sq, miles_avg_3, miles_1950_3)
    ),
    use = "pairwise.complete.obs"
)

Unnamed: 0,sd_change_3,sd_change_sq,miles_avg_3,miles_1950_3
sd_change_3,1.0,0.86047616,0.40918,0.05316338
sd_change_sq,0.86047616,1.0,0.2659409,0.01704418
miles_avg_3,0.40918003,0.26594091,1.0,0.32139253
miles_1950_3,0.05316338,0.01704418,0.3213925,1.0


In [9]:
# Rows with the 30 largest CHANGE values, largest first, shown with their
# identifying columns. top_n() keeps ties, so more than 30 rows can be
# returned.
sample %>%
    top_n(n = 30, wt = CHANGE) %>%
    arrange(desc(CHANGE)) %>%
    select(YEAR, COMMODITY, OPERATOR_ID, NAME, MILES, CHANGE, sd_change_3)

YEAR,COMMODITY,OPERATOR_ID,NAME,MILES,CHANGE,sd_change_3
<dbl>,<fct>,<fct>,<chr>,<dbl>,<dbl>,<dbl>
2014,non-hvl,2552,COLONIAL PIPELINE CO,4458.6,249.0,154.04765
2009,hvl,32109,ONEOK INC,2962.0,247.0,
2009,non-hvl,Marathon (Group),Marathon (Group),2454.629,243.471,
2016,hvl,Sunoco (Group),Sunoco (Group),1877.7,241.04,784.83568
2011,hvl,Kinder Morgan (Group),Kinder Morgan (Group),486.49,240.28,
2013,hvl,NuStar,NuStar,1606.5,237.79,190.28422
2016,hvl,Phillips 66 (Group),Phillips 66 (Group),1156.08,237.38,157.10195
2018,hvl,Sunoco (Group),Sunoco (Group),3241.49,233.71,1276.67664
2015,non-hvl,Sunoco (Group),Sunoco (Group),2525.72,230.04,190.67251
2011,crude,Sunoco (Group),Sunoco (Group),2391.74,230.0,
