Permalink
Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
799 lines (635 sloc) 28.4 KB
---
title: "Article Followup"
author: "Jason Baik"
date: "1/7/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r, include= FALSE}
library(tidyverse)
library(survival)
library(survminer)
library(rms)
library(ggfortify)
library(gganimate)
library(RColorBrewer)
theme_set(theme_light())
# Disable scientific notation
options(scipen=999)
# Load pbp
pbp_raw <- read_csv("./data/aggregated_pbp.csv")
# Create SAVE event_type for goalie saves to get goalie percentage
# Filter for Regular Season
pbp <- pbp_raw %>%
mutate(event_type = if_else(event_description == "GOALIE STOPPED", "SAVE", event_type)) %>%
filter(session == "R")
```
```{r include=FALSE}
cox_regression_period <- function(period_x, period_y, event_type_fun) {
pbp_filtered <- pbp %>%
dplyr::filter(game_period %in% c(period_x, period_y)) %>%
mutate(game_period = if_else(game_period == period_y, 1, 0))
# time_diff_game_end: Difference between time at which event last occured and end of the game
time_diff_game_end <- pbp_filtered %>%
dplyr::filter(event_type == event_type_fun) %>%
# Time difference in minutes (seconds / 60)
group_by(game_id, game_period) %>%
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)) %>%
dplyr::select(game_id, game_period, game_seconds, event_type, time_diff) %>%
group_by(game_id, game_period) %>%
summarise(time_diff_sum = sum(time_diff)) %>%
mutate(time_diff_game_end = 60 - time_diff_sum) %>%
pull(time_diff_game_end)
# Build new dataset for Cox regression
pbp_cox <- pbp_filtered %>%
dplyr::filter(event_type == event_type_fun) %>%
group_by(game_id, game_period) %>%
summarise(event_index = last(event_index)) %>%
ungroup() %>%
mutate(event_type = "game_end",
time_diff = time_diff_game_end) %>%
bind_rows(pbp_filtered %>%
dplyr::filter(event_type == event_type_fun) %>%
group_by(game_id, game_period) %>%
# Time difference in minutes (seconds / 60)
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)), .) %>%
arrange(game_period) %>%
dplyr::select(game_id, game_period, event_type, time_diff) %>%
mutate(event_type = if_else(event_type == event_type_fun, 1, 0))
pbp_cox_fit <- survival::coxph(formula = survival::Surv(time_diff, event_type) ~ game_period, data = pbp_cox)
hazard_ratio <- coef(summary(pbp_cox_fit))[[2]]
lower_ci <- summary(pbp_cox_fit)$conf.int[3]
upper_ci <- summary(pbp_cox_fit)$conf.int[4]
final <- c(lower_ci, hazard_ratio, upper_ci)
final
}
```
Run iteration
```{r include=FALSE}
event_type_cox <- c("GOAL", "HIT", "SHOT",
"FAC", "STOP", "BLOCK",
"MISS", "GIVE", "TAKE",
"CHL", "SAVE")
period_x <- rep(c(1:3), times = 11)
period_y <- rep(c(2:4), times = 11)
event_types <- rep(c("GOAL", "HIT", "SHOT",
"FAC", "STOP", "BLOCK",
"MISS", "GIVE", "TAKE",
"CHL", "SAVE"), each = 3)
args_cox <- list(period_x = period_x, period_y = period_y, event_type_fun = event_types)
hazard_ratio_final <- args_cox %>%
pmap(cox_regression_period)
hazard_ratio_final
```
Create hazard ratio for plotting
```{r include=FALSE}
# First value of each list
lower_ci_intermediate <- hazard_ratio_final %>%
map_dbl(1)
# Second value of each list
hazard_ratio_intermediate <- hazard_ratio_final %>%
map_dbl(2)
# Third value of each list
upper_ci_intermediate <- hazard_ratio_final %>%
map_dbl(3)
# Create data frame
hazard_ratio_plot <- data.frame(lower_ci = lower_ci_intermediate,
hazard_ratio = hazard_ratio_intermediate,
upper_ci = upper_ci_intermediate)
hazard_ratio_plot <- hazard_ratio_plot %>%
mutate(event_type = event_types) %>%
select(event_type, everything())
# Round numeric columns
hazard_ratio_plot <- hazard_ratio_plot %>%
mutate_if(is.numeric, round, 3)
```
### Plot Period 1 vs Period 2 (Filter out Coach's Challenge)
```{r}
hazard_ratio_plot %>%
group_by(event_type) %>%
filter(row_number() == 1) %>%
ungroup() %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio)) %>%
ggplot(aes(event_type, hazard_ratio, col = event_type)) +
geom_point(show.legend = FALSE) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), position = "identity") +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
scale_x_discrete(labels = c("Hit",
"Giveaway",
"Takeaway",
"Faceoff",
"Goalie Save",
"Blocked Shot",
"Stoppage in Play",
"Missed Shot",
"Shot",
"Goal",
"Coach's Challenge")) +
coord_flip() +
theme(legend.position = "none") +
theme(text = element_text(family = "Optima"),
plot.title=element_text(family="Times", face="bold", size=18)) +
ggtitle("Distribution of Hazard Ratios by Event Types", subtitle = "1st Period vs 2nd Period") +
ylim(0.80, 1.04) +
annotate("text", x = "GOAL", y = 0.81, label = "Higher Rate\n1st Period") +
annotate("text", x = "GOAL", y = 1.03, label = "Higher Rate\n2nd Period")
```
### Plot Period 2 vs Period 3 (Filter out Coach's Challenge)
```{r}
hazard_ratio_plot %>%
group_by(event_type) %>%
filter(row_number() == 2) %>%
ungroup() %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio)) %>%
ggplot(aes(event_type, hazard_ratio, col = event_type)) +
geom_point(show.legend = FALSE) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), position = "identity") +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
coord_flip() +
theme(legend.position = "none") +
theme(text = element_text(family = "Optima"),
plot.title=element_text(family="Times", face="bold", size=18)) +
scale_x_discrete(labels = c("Goal",
"Goalie Save",
"Takeaway",
"Giveaway",
"Hit",
"Missed Shot",
"Blocked Shot",
"Shot",
"Faceoff",
"Stoppage in Play")) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
ylim(0.50, 1.1) +
ggtitle("Distribution of Hazard Ratios by Event Types", subtitle = "2nd Period vs 3rd Period") +
annotate("text", x = "MISS", y = 0.81, label = "Higher Rate\n2nd Period") +
annotate("text", x = "MISS", y = 1.06, label = "Higher Rate\n3rd Period")
```
### Filtering for Period
```{r}
### PBP Hack
pbp_hack <- pbp_raw %>%
mutate(event_type = if_else(event_description == "GOALIE STOPPED", "SAVE", event_type))
### Function
cox_regression_period_hack <- function(period_hack, event_types_hack) {
pbp_hack <- pbp_hack %>%
filter(game_period == period_hack) %>%
mutate(session = if_else(session == "P", 1, 0))
# time_diff_game_end: Difference between time at which event last occured
# and end of the game.
time_diff_game_end <- pbp_hack %>%
filter(event_type == event_types_hack) %>%
# Time difference in minutes (seconds / 60)
group_by(game_id, session) %>%
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)) %>%
select(game_id, session, game_seconds, event_type, time_diff) %>%
group_by(game_id, session) %>%
summarise(time_diff_sum = sum(time_diff)) %>%
mutate(time_diff_game_end = 20 - time_diff_sum) %>%
pull(time_diff_game_end)
# Build new dataset for Cox regression
df_cox <- pbp_hack %>%
filter(event_type == event_types_hack) %>%
group_by(game_id, session) %>%
summarise(event_index = last(event_index)) %>%
ungroup() %>%
mutate(event_type = "period_end",
time_diff = time_diff_game_end) %>%
bind_rows(pbp_hack %>%
filter(event_type == event_types_hack) %>%
group_by(game_id, session) %>%
# Time difference in minutes (seconds / 60)
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)), .) %>%
arrange(game_id) %>%
select(game_id, session, event_type, time_diff) %>%
mutate(event_type = if_else(event_type == event_types_hack, 1, 0))
cox_fit <- survival::coxph(formula = survival::Surv(time_diff, event_type) ~ session, data = df_cox)
hazard_ratio <- coef(summary(cox_fit))[[2]]
lower_ci <- summary(cox_fit)$conf.int[3]
upper_ci <- summary(cox_fit)$conf.int[4]
final <- c(lower_ci, hazard_ratio, upper_ci)
final
}
### Iteration
period_hack <- rep(c(1:3), each = 11)
event_types_hack <- rep(c("GOAL", "HIT", "SHOT",
"FAC", "STOP", "BLOCK",
"MISS", "GIVE", "TAKE",
"CHL", "SAVE"), times = 3)
args_cox_hack <- data.frame(period_hack = period_hack, event_types_hack = event_types_hack) %>%
mutate(event_types_hack = as.character(event_types_hack))
hazard_ratio_hack <- args_cox_hack %>%
pmap(cox_regression_period_hack)
# Make hazard_ratio_hack for plotting
# First value of each list
lower_ci_intermediate_hack <- hazard_ratio_hack %>%
map_dbl(1)
# Second value of each list
hazard_ratio_intermediate_hack <- hazard_ratio_hack %>%
map_dbl(2)
# Third value of each list
upper_ci_intermediate_hack <- hazard_ratio_hack %>%
map_dbl(3)
# Create data frame
hazard_ratio_hack_plot <- data.frame(lower_ci = lower_ci_intermediate_hack,
hazard_ratio = hazard_ratio_intermediate_hack,
upper_ci = upper_ci_intermediate_hack)
hazard_ratio_hack_plot <- hazard_ratio_hack_plot %>%
mutate(event_type = event_types_hack) %>%
select(event_type, everything())
# Round numeric columns
hazard_ratio_hack_plot <- hazard_ratio_hack_plot %>%
mutate_if(is.numeric, round, 3)
# Mutate period column
hazard_ratio_hack_plot <- hazard_ratio_hack_plot %>%
mutate(period = case_when(
between(row_number(), 1, 11) ~ 1,
between(row_number(), 12, 22) ~ 2,
between(row_number(), 23, 33) ~ 3,
))
```
### 1st Period
```{r}
hazard_ratio_hack_plot %>%
slice(1:11) %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio)) %>%
ggplot(aes(event_type, hazard_ratio, col = event_type)) +
geom_point(show.legend = FALSE) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), position = "identity") +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
coord_flip() +
theme(legend.position = "none") +
theme(text = element_text(family = "Optima"),
plot.title=element_text(family="Times", face="bold", size=18)) +
scale_x_discrete(labels = c("Takeaway",
"Shot",
"Goal",
"Goalie Save",
"Missed Shot",
"Faceoff",
"Giveaway",
"Blocked Shot",
"Stoppage in Play",
"Hit")) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
ggtitle("Hazard Ratios by Event Types in the 1st Period") +
annotate("text", x = "HIT", y = 0.949, label = "Higher Rate\nReg Season") +
annotate("text", x = "HIT", y = 1.049, label = "Higher Rate\nPlayoff")
```
* Took out Coach's Challenge because it had a wide confidence interval due to small sample size.
* Hazard ratio for goals is 1.007, meaning _as we move from regular season to the playoffs, goals increase by approximately 0.7% in the first period_. Note that the confidence interval crosses 1.0, demonstrating it is not statistically significant.
* Hits hazard ratio is 1.437, signifying _an increase of 43% as we move from regular season to playoffs in 1st period._ Just like my previous article, magnitude of increase in hits visibly outweighs that of other events.
* With a hazard ratio of 1.087, stoppages in play show the _second highest increase from regular season to playoffs in the 1st period._
* On the other side of the spectrum, rate of takeaways decrease by 4.4%.
### 2nd Period
```{r}
hazard_ratio_hack_plot %>%
slice(12:22) %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio)) %>%
ggplot(aes(event_type, hazard_ratio, col = event_type)) +
geom_point(show.legend = FALSE) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), position = "identity") +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
coord_flip() +
scale_x_discrete(labels = c("Goal",
"Takeaway",
"Shot",
"Goalie Save",
"Missed Shot",
"Faceoff",
"Stoppage in Play",
"Giveaway",
"Blocked Shot",
"Hit")) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
ggtitle("Distribution of Hazard Ratios by Event Types in the 2nd Period") +
annotate("text", x = "HIT", y = 0.949, label = "Higher Rate\nReg Season") +
annotate("text", x = "HIT", y = 1.04, label = "Higher Rate\nPlayoff") +
theme(legend.position = "none") +
theme(text = element_text(family = "Optima"),
plot.title=element_text(family="Times", face="bold", size=18))
```
* Hazard ratio for goals is 0.976, demonstrating _a decrease of 2.4%_ as we move from regular season to playoffs in 2nd period. Importantly, this change is not statistically significant.
* Again, there is a much higer rate of hits. Its hazard ratio is 1.277. In other words, _hits increase by almost 30%_ from regular season to playoffs in the 2nd period.
* Stoppages in play's hazard ratio is 1.062, meaning that we see a 6.2% increase as we move from regular season to playoffs. More stoppages in play means the play is choppy, and teams lose momentum from referees blowing the play down.
* Given the previous result, the fact that the rate of faceoffs increased by 3.3% makes sense. Referees drop puck in faceoff plays to resume play. Thus, coaches can devise more faceoff plays in the playoffs.
### 3rd Period
```{r}
hazard_ratio_hack_plot %>%
slice(23:nrow(hazard_ratio_hack_plot)) %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio)) %>%
ggplot(aes(event_type, hazard_ratio, col = event_type)) +
geom_point(show.legend = FALSE) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), position = "identity") +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
coord_flip() +
theme(legend.position = "none") +
scale_x_discrete(labels = c("Goal",
"Shot",
"Takeaway",
"Goalie Save",
"Missed Shot",
"Faceoff",
"Blocked Shot",
"Stoppage in Play",
"Giveaway",
"Hit")) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
ggtitle("Distribution of Hazard Ratios by Event Types in the 3rd Period") +
annotate("text", x = "HIT", y = 0.949, label = "Higher Rate\nReg Season") +
annotate("text", x = "HIT", y = 1.04, label = "Higher Rate\nPlayoff")
```
* Rate of goals decreases by 2%, but this result is not statistically significant.
* Although not statistically significant (barely), we do observe a decrease in shots (1.6%) as we move from regular season to playoffs in the 3rd period.
* Rate of hits increase by almost 25% in the 3rd period from regular season to playoffs.
* We do observe an increase in rate of blocked shots in all three periods from the regular season to playoffs.
Animation
```{r}
hazard_ratio_hack_plot <- read_csv("hazard-ratio-period-filter.csv")
anim <- hazard_ratio_hack_plot %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio),
event_type = str_to_title(event_type)) %>%
ggplot(aes(event_type, hazard_ratio, col = event_type)) +
geom_point(show.legend = FALSE) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), position = "identity") +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
coord_flip() +
theme(legend.position = "none") +
scale_x_discrete(labels = c("Blocked Shot",
"Faceoff",
"Giveaway",
"Goal",
"Hit",
"Missed Shot",
"Goalie Save",
"Shot",
"Stoppage in Play",
"Takeaway")) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
theme(text = element_text(family = "Optima", size = 15),
plot.title=element_text(family="Times", face="bold", size = 20)) +
ggtitle("Hazard Ratios in Period {closest_state}") +
transition_states(period)
animate(anim)
```
### Hazard Ratio among 17/18 Playoff Teams
Data Prep
```{r}
playoff_team <- c("T.B", "BOS", "TOR", "WSH", "PIT", "PHI", "CBJ", "N.J", "NSH", "WPG", "MIN", "COL", "VGK", "ANA", "S.J", "L.A")
pbp_1718 <- pbp_raw %>%
filter(season == 20172018, event_team %in% playoff_team)
```
```{r Cox Regression 5v5, include=FALSE}
cox_regression_5v5 <- function(df, event_type_fun) {
# Only look at games that ended in regulation
df <- df %>%
filter(game_period <= 3,
game_strength_state == "5v5") %>%
mutate(session = if_else(session == "P", 1, 0))
# time_diff_game_end: Difference between time at which event last occured
# and end of the game.
time_diff_game_end <- df %>%
filter(event_type == event_type_fun) %>%
# Time difference in minutes (seconds / 60)
group_by(game_id, session) %>%
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)) %>%
select(game_id, session, game_seconds, event_type, time_diff) %>%
group_by(game_id, session) %>%
summarise(time_diff_sum = sum(time_diff)) %>%
mutate(time_diff_game_end = 60 - time_diff_sum) %>%
pull(time_diff_game_end)
# Build new dataset for Cox regression
df_cox <- df %>%
filter(event_type == event_type_fun) %>%
group_by(game_id, session) %>%
summarise(event_index = last(event_index)) %>%
ungroup() %>%
mutate(event_type = "game_end",
time_diff = time_diff_game_end) %>%
bind_rows(df %>%
filter(event_type == event_type_fun) %>%
group_by(game_id, session) %>%
# Time difference in minutes (seconds / 60)
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)), .) %>%
arrange(game_id) %>%
select(game_id, session, event_type, time_diff) %>%
mutate(event_type = if_else(event_type == event_type_fun, 1, 0))
cox_fit <- survival::coxph(formula = survival::Surv(time_diff, event_type) ~ session, data = df_cox)
summary(cox_fit)
}
```
Took out "Stoppages in Play" and "Goalie Save" because it was throwing an error for some reason
```{r, message=FALSE, include = FALSE}
event_type_cox <- c("GOAL",
"HIT",
"SHOT",
"FAC",
"BLOCK",
"MISS",
"GIVE",
"TAKE",
"CHL")
all_summary_data_5v5 <- vector("list", length(event_type_cox))
for (ii in seq_along(event_type_cox)) {
# summary_data = summary of cox regression
summary_data <- cox_regression_5v5(pbp_1718, event_type_cox[ii])
# Now, add that into all_summary_data
all_summary_data_5v5[[ii]] <- summary_data
}
```
```{r Cox Regression PP, include = FALSE}
cox_regression_pp <- function(df, event_type_fun) {
# Only look at games that ended in regulation
df <- df %>%
filter(game_period <= 3,
game_strength_state %in% c("4v5",
"5v4",
"3v5",
"5v3")) %>%
mutate(session = if_else(session == "P", 1, 0))
# time_diff_game_end: Difference between time at which event last occured
# and end of the game.
time_diff_game_end <- df %>%
filter(event_type == event_type_fun) %>%
# Time difference in minutes (seconds / 60)
group_by(game_id, session) %>%
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)) %>%
select(game_id, session, game_seconds, event_type, time_diff) %>%
group_by(game_id, session) %>%
summarise(time_diff_sum = sum(time_diff)) %>%
mutate(time_diff_game_end = 60 - time_diff_sum) %>%
pull(time_diff_game_end)
# Build new dataset for Cox regression
df_cox <- df %>%
filter(event_type == event_type_fun) %>%
group_by(game_id, session) %>%
summarise(event_index = last(event_index)) %>%
ungroup() %>%
mutate(event_type = "game_end",
time_diff = time_diff_game_end) %>%
bind_rows(df %>%
filter(event_type == event_type_fun) %>%
group_by(game_id, session) %>%
# Time difference in minutes (seconds / 60)
mutate(time_diff = game_seconds - lag(game_seconds, default = 0),
time_diff = round(time_diff / 60, 2)), .) %>%
arrange(game_id) %>%
select(game_id, session, event_type, time_diff) %>%
mutate(event_type = if_else(event_type == event_type_fun, 1, 0))
cox_fit <- survival::coxph(formula = survival::Surv(time_diff, event_type) ~ session, data = df_cox)
summary(cox_fit)
}
```
```{r, message=FALSE, include=FALSE}
all_summary_data_pp <- vector("list", length(event_type_cox))
for (ii in seq_along(event_type_cox)) {
# summary_data = summary of cox regression
summary_data <- cox_regression_pp(pbp_1718, event_type_cox[ii])
# Now, add that into all_summary_data
all_summary_data_pp[[ii]] <- summary_data
}
```
### Plot of Hazard Ratios
```{r}
####################################################################################
## Make hazard_ratio_combined
####################################################################################
# 5v5 Hazard Ratio
hazard_ratio_5v5 <- vector("numeric", length = 9)
for (ii in seq(length(all_summary_data_5v5))) {
hazard_ratio_5v5[ii] <- coef(all_summary_data_5v5[[ii]])[[2]]
}
# 5v5 Hazard Ratio CI
hazard_ratio_lower_ci_5v5 <- vector("numeric", length = 9)
for (ii in seq(length(all_summary_data_5v5))) {
hazard_ratio_lower_ci_5v5[ii] <- all_summary_data_5v5[[ii]]$conf.int[3]
}
hazard_ratio_upper_ci_5v5 <- vector("numeric", length = 9)
for (ii in seq(length(all_summary_data_5v5))) {
hazard_ratio_upper_ci_5v5[ii] <- all_summary_data_5v5[[ii]]$conf.int[4]
}
# PP Hazard Ratio
hazard_ratio_pp <- vector("numeric", length = 9)
for (ii in seq(length(all_summary_data_pp))) {
hazard_ratio_pp[ii] <- coef(all_summary_data_pp[[ii]])[[2]]
}
# PP Hazard Ratio CI
hazard_ratio_lower_ci_pp <- vector("numeric", length = 9)
for (ii in seq(length(all_summary_data_pp))) {
hazard_ratio_lower_ci_pp[ii] <- all_summary_data_pp[[ii]]$conf.int[3]
}
hazard_ratio_upper_ci_pp <- vector("numeric", length = 9)
for (ii in seq(length(all_summary_data_pp))) {
hazard_ratio_upper_ci_pp[ii] <- all_summary_data_pp[[ii]]$conf.int[4]
}
hazard_ratio_combined <- data.frame(event_type = event_type_cox,
hazard_ratio_5v5 = hazard_ratio_5v5,
hazard_ratio_lower_ci_5v5 = hazard_ratio_lower_ci_5v5,
hazard_ratio_upper_ci_5v5 = hazard_ratio_upper_ci_5v5,
hazard_ratio_pp = hazard_ratio_pp,
hazard_ratio_lower_ci_pp = hazard_ratio_lower_ci_pp,
hazard_ratio_upper_ci_pp = hazard_ratio_upper_ci_pp)
```
```{r}
####################################################################################
## RESULTS: 5v5 Plots
####################################################################################
hazard_ratio_combined_5v5 <- hazard_ratio_combined %>%
select(event_type, contains("5v5")) %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio_5v5, .fun = identity)) %>%
ggplot(aes(event_type, hazard_ratio_5v5, color = event_type)) +
geom_point() +
geom_errorbar(aes(ymin = hazard_ratio_lower_ci_5v5, ymax = hazard_ratio_upper_ci_5v5), position = "identity") +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
coord_flip() +
theme(legend.position = "none") +
theme(text = element_text(family = "Optima"),
plot.title=element_text(family="Times", face="bold", size=18)) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
scale_x_discrete(labels = c("Goal",
"Shot",
"Faceoff",
"Takeaway",
"Missed Shot",
"Giveaway",
"Blocked Shot",
"Hit")) +
ggtitle("Hazard Ratios on 5v5", subtitle = "17/18 Playoff Teams")
```
* Hits hazard ratio is 1.903, which signifies the rate of hits increased by 90% among playoff teams from the regular season to playoffs on 5v5!
```{r}
####################################################################################
## RESULTS PP Plots
####################################################################################
hazard_ratio_combined_pp <- hazard_ratio_combined %>%
select(event_type, ends_with("pp")) %>%
filter(event_type != "CHL") %>%
mutate(event_type = fct_reorder(event_type, hazard_ratio_pp, .fun = identity)) %>%
ggplot(aes(event_type, hazard_ratio_pp, color = event_type)) +
geom_point() +
geom_hline(yintercept = 1.0, size = 1.2, linetype = 2) +
geom_errorbar(aes(ymin = hazard_ratio_lower_ci_pp, ymax = hazard_ratio_upper_ci_pp), position = "dodge") +
coord_flip() +
theme(legend.position = "none") +
theme(text = element_text(family = "Optima"),
plot.title=element_text(family="Times", face="bold", size=18)) +
labs(x = "Event Type",
y = "Hazard Ratio",
caption = "Source: Corsica Hockey\n@jsonbaik") +
scale_x_discrete(labels = c("Giveaway",
"Missed Shot",
"Takeaway",
"Shot",
"Goal",
"Faceoff",
"Blocked Shot",
"Hit")) +
ggtitle("Hazard Ratios on PP", subtitle = "17/18 Playoff Teams")
```
* Noticeably, playoff-bound teams show a statistically significant increase in all events, especially hits and blocks.
* This difference in rate of events from the regular season to playoffs explains the excitement of NHL playoff hockey.
Combine graphs
```{r}
cowplot::plot_grid(hazard_ratio_combined_5v5, hazard_ratio_combined_pp)
```
Find distance of hazard ratio from 1.0
```{r}
hazard_ratio_combined %>%
select(event_type, hazard_ratio_5v5, hazard_ratio_pp) %>%
filter(event_type != "CHL") %>%
mutate(distance_5v5 = hazard_ratio_5v5 - 1.0,
distance_pp = hazard_ratio_pp - 1.0) %>%
summarise(distance_5v5_avg = 100 * mean(distance_5v5),
distance_pp_avg = 100 * mean(distance_pp))
```