-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Merge branch 'afltables-playerstats' # Conflicts: # DESCRIPTION
- Loading branch information
Showing
56 changed files
with
10,916 additions
and
262 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,22 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(convert_results) | ||
export(get_afltables_stats) | ||
export(get_afltables_urls) | ||
export(get_fixture) | ||
export(get_footywire_stats) | ||
export(get_match_results) | ||
export(get_score_progression_raw) | ||
export(get_squiggle_data) | ||
export(replace_teams) | ||
export(scrape_afltables_match) | ||
export(update_footywire_stats) | ||
import(dplyr) | ||
import(tidyr) | ||
importFrom(dplyr,mutate) | ||
importFrom(magrittr,"%>%") | ||
importFrom(purrr,map) | ||
importFrom(purrr,map2) | ||
importFrom(rvest,html_nodes) | ||
importFrom(rvest,html_text) | ||
importFrom(utils,type.convert) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#' Return afltables match stats | ||
#' | ||
#' \code{get_afltables_stats} returns a data frame containing match stats for each game within the specified date range | ||
#' | ||
#' This function returns a data frame containing match stats for each game within the specified date range. The data from contains all stats on afltables match pages and returns 1 row per player. | ||
#' | ||
#' The data for this function is hosted on github to avoid extensive scraping of historical data from afltables.com. This will be updated regularly. | ||
#' | ||
#' @param start_date character string for start date return to URLs from, in "dmy" or "ymd" format | ||
#' @param end_date optional, character string for end date to return URLS, in "dmy" or "ymd" format | ||
#' | ||
#' @return a data table containing player stats for each game between start date and end date | ||
#' @export | ||
#' | ||
#' @examples | ||
#' # Gets all data | ||
#' get_afltables_stats() | ||
#' | ||
#' # Specify a date range | ||
#' get_afltables_stats("01/01/2018", end_date = "01/04/2018") | ||
#' @importFrom magrittr %>% | ||
#' @importFrom purrr map | ||
get_afltables_stats <- function(start_date = "1897-05-08", end_date = Sys.Date()) { | ||
start_date <- lubridate::parse_date_time(start_date, c("dmy", "ymd")) | ||
if (is.na(start_date)) stop(paste("Date format not reccognised. Check that start_date is in dmy or ymd format")) | ||
end_date <- lubridate::parse_date_time(end_date, c("dmy", "ymd")) | ||
if (is.na(end_date)) stop(paste("Date format not reccognised. Check that end_date is in dmy or ymd format")) | ||
message(paste0("Returning data from ", start_date, " to ", end_date)) | ||
|
||
dat_url <- url("https://github.com/jimmyday12/fitzRoy/raw/afltables-playerstats/data-raw/afl_tables_playerstats/afldata.rda") | ||
|
||
loadRData <- function(fileName) { | ||
load(fileName) | ||
get(ls()[ls() != "fileName"]) | ||
} | ||
|
||
dat <- loadRData(dat_url) | ||
max_date <- max(dat$Date) | ||
|
||
if (end_date > max_date) { | ||
urls <- get_afltables_urls(max_date, end_date) | ||
dat_new <- scrape_afltables_match(urls) | ||
dat <- dplyr::bind_rows(dat, dat_new) | ||
} | ||
message("Finished getting afltables data") | ||
dplyr::filter(dat, Date > start_date & Date < end_date) | ||
} | ||
|
||
#' Return match URLs for specified dates | ||
#' | ||
#' \code{get_afltables_urls} returns a character vector containing match URLs for the specified date range | ||
#' | ||
#' This function returns match URLs for the specified date range. This will typically be used to pass to | ||
#' to `scrape_afltables_match` to return player match results. | ||
#' | ||
#' @param start_date character string for start date return to URLs from, in "dmy" or "ymd" format | ||
#' @param end_date optional, character string for end date to return URLS, in "dmy" or "ymd" format | ||
#' | ||
#' @return a character vector of match URL's between `start_date` and `end_date` | ||
#' @export | ||
#' | ||
#' @examples | ||
#' get_afltables_urls("01/01/2018") | ||
#' get_afltables_urls("01/01/2018", end_date = "01/04/2018") | ||
#' @importFrom magrittr %>% | ||
#' @importFrom purrr map | ||
get_afltables_urls <- function(start_date, | ||
end_date = Sys.Date()) { | ||
start_date <- lubridate::parse_date_time(start_date, c("dmy", "ymd")) | ||
if (is.na(start_date)) stop(paste("Date format not reccognised. Check that start_date is in dmy or ymd format")) | ||
end_date <- lubridate::parse_date_time(end_date, c("dmy", "ymd")) | ||
if (is.na(end_date)) stop(paste("Date format not reccognised. Check that end_date is in dmy or ymd format")) | ||
|
||
Seasons <- format(start_date, "%Y"):format(end_date, "%Y") | ||
|
||
html_games <- Seasons %>% | ||
map(~ paste0("https://afltables.com/afl/seas/", ., ".html")) %>% | ||
map(xml2::read_html) | ||
|
||
dates <- html_games %>% | ||
map(rvest::html_nodes, "table+ table tr:nth-child(1) > td:nth-child(4)") %>% | ||
map(rvest::html_text) %>% | ||
map(stringr::str_extract, "\\d{1,2}-[A-z]{3}-\\d{4}") %>% | ||
map(lubridate::dmy) %>% | ||
map(~.x > start_date & .x < end_date) | ||
|
||
match_ids <- html_games %>% | ||
map(rvest::html_nodes, "tr+ tr b+ a") %>% | ||
map(rvest::html_attr, "href") %>% | ||
map(~stringr::str_replace(., "..", "https://afltables.com/afl")) | ||
|
||
# Return only id's that match | ||
match_ids <- match_ids %>% | ||
purrr::map2(.y = dates, ~magrittr::extract(.x, .y)) %>% | ||
purrr::reduce(c) | ||
|
||
match_ids[!is.na(match_ids)] | ||
} | ||
|
||
|
||
get_afltables_player_ids <- function(seasons) { | ||
base_url <- function(x) { | ||
if (x < 2017) { | ||
stop("season must be greater than 2016") | ||
} else if (x == 2017) { | ||
"https://raw.githubusercontent.com/jimmyday12/fitzRoy/afltables-playerstats/data-raw/afl_tables_playerstats/afltables_playerstats_2017.csv" | ||
} else { | ||
paste0("https://afltables.com/afl/stats/", x, "_stats.txt") | ||
} | ||
} | ||
|
||
if (min(seasons) < 2017) stop("season must be 2017 onwards") | ||
urls <- purrr::map_chr(seasons, base_url) | ||
|
||
vars <- c("Season", "Player", "ID", "Team") | ||
|
||
id_data <- urls %>% | ||
purrr::map(readr::read_csv, col_types = readr::cols(Round = "c")) %>% | ||
purrr::map2_dfr(.y = seasons, ~mutate(., Season = .y)) | ||
|
||
id_data %>% | ||
dplyr::select(!! vars) %>% | ||
dplyr::distinct() %>% | ||
dplyr::rename(Team.abb = Team) %>% | ||
dplyr::left_join(team_abbr, by = c("Team.abb" = "Team.abb")) %>% | ||
dplyr::select(!! vars) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
|
||
#' Return afltables playr match stats | ||
#' | ||
#' \code{scrape_afltables_match} returns a character vector containing match URLs for the specified date range | ||
#' | ||
#' This function returns the full afltables.com match stats for each player and each game specified in `match_urls`. | ||
#' It is useful to use the helper function `get_afltables_urls` to return these or simply navigate to afltables.com | ||
#' and find the match of interest. | ||
#' | ||
#' @param match_urls A list of URL's for matches to scrape data from | ||
#' @return data table of afltables.com match results, with a row per player per match. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' scrape_afltables_match("https://afltables.com/afl/stats/games/2018/071120180602.html") | ||
#' \dontrun{ | ||
#' scrape_afltables_match(get_afltables_urls("01/06/2018, "01/06/2018")) | ||
#' } | ||
#' @importFrom magrittr %>% | ||
#' @importFrom purrr map | ||
#' @importFrom purrr map2 | ||
#' @importFrom dplyr mutate | ||
#' @importFrom utils type.convert | ||
scrape_afltables_match <- function(match_urls) { | ||
|
||
# For each game url, download data, extract the stats tables #3 and #5 and bind together | ||
message("Downloading data\n") | ||
pb <- progress_estimated(length(match_urls)) | ||
|
||
match_xmls <- match_urls %>% | ||
map(~{ | ||
pb$tick()$print() | ||
xml2::read_html(.) | ||
}) | ||
message("\nFinished downloading data. Processing XMLs\n") | ||
|
||
|
||
replace_names <- function(x) { | ||
names(x) <- x[1, ] | ||
x[-1, ] | ||
} | ||
|
||
details <- match_xmls %>% | ||
map(rvest::html_nodes, "br+ table td") %>% | ||
map(rvest::html_text) | ||
|
||
home_scores <- match_xmls %>% | ||
map(rvest::html_nodes, "br+ table tr:nth-child(2) td") %>% | ||
map(rvest::html_text) | ||
|
||
away_scores <- match_xmls %>% | ||
map(rvest::html_nodes, "br+ table tr:nth-child(3) td") %>% | ||
map(rvest::html_text) | ||
|
||
games <- match_xmls %>% | ||
map(rvest::html_table, fill = TRUE) %>% | ||
map(magrittr::extract, c(3, 5)) %>% | ||
purrr::modify_depth(1, ~ purrr::map(., replace_names)) | ||
|
||
home_games <- games %>% | ||
rvest::pluck(1) %>% | ||
map2(.y = details, ~ mutate(.x, Playing.for = .y[4])) | ||
|
||
away_games <- games %>% | ||
rvest::pluck(2) %>% | ||
map2(.y = details, ~ mutate(.x, Playing.for = .y[9])) | ||
|
||
games <- home_games %>% | ||
map2(.y = away_games, ~bind_rows(.x, .y)) | ||
|
||
|
||
games_df <- games %>% | ||
map2(.y = details, ~ mutate( | ||
.x, | ||
Round = stringr::str_extract(.y[2], "(?<=Round:\\s)(.*)(?=\\sVenue)"), | ||
Venue = stringr::str_extract(.y[2], "(?<=Venue:\\s)(.*)(?=\\Date)"), | ||
Date = stringr::str_extract(.y[2], "(?<=Date:\\s)(.*)(?=\\sAtt)"), | ||
Attendance = stringr::str_extract(.y[2], "(?<=Attendance:\\s)(.*)"), | ||
Umpires = .y[length(.y)] | ||
)) %>% | ||
map2(.y = home_scores, ~mutate( | ||
.x, | ||
Home.team = .y[1], | ||
HQ1 = .y[2], | ||
HQ2 = .y[3], | ||
HQ3 = .y[4], | ||
HQ4 = .y[5] | ||
)) %>% | ||
map2(.y = away_scores, ~mutate( | ||
.x, | ||
Away.team = .y[1], | ||
AQ1 = .y[2], | ||
AQ2 = .y[3], | ||
AQ3 = .y[4], | ||
AQ4 = .y[5] | ||
)) %>% | ||
purrr::reduce(dplyr::bind_rows) | ||
|
||
games_df <- games_df %>% | ||
mutate(Date = gsub("\\([^]]*)", "", Date)) | ||
|
||
# Remove columns with NA and abbreviations | ||
games_df <- games_df[, !(names(games_df) %in% "NA")] | ||
games_df <- games_df[, !(stringr::str_detect(names(games_df), "Abbreviations"))] | ||
|
||
# Fix names | ||
names(games_df) <- make.names(names(games_df)) | ||
if ("X." %in% names(games_df)) games_df <- rename(games_df, Jumper.No. = X.) | ||
if ("X1." %in% names(games_df)) games_df <- rename(games_df, One.Percenters = X1.) | ||
if ("X.P" %in% names(games_df)) games_df <- rename(games_df, TOG = X.P) | ||
|
||
# change column types | ||
games_df <- games_df %>% | ||
dplyr::filter(!Player %in% c("Rushed", "Totals", "Opposition")) | ||
|
||
games_df <- as.data.frame(lapply(games_df, function(x) type.convert(x, na.strings = "NA", as.is = TRUE)), stringsAsFactors = FALSE) | ||
|
||
games_cleaned <- games_df %>% | ||
mutate( | ||
Date = lubridate::dmy_hm(Date), | ||
Local.start.time = as.integer(format(Date, "%H%M")), | ||
Date = lubridate::ymd(format(Date, "%Y-%m-%d")), | ||
Season = as.integer(lubridate::year(Date)) | ||
) %>% | ||
tidyr::separate(Player, into = c("Surname", "First.name"), sep = ",") %>% | ||
dplyr::mutate_at(c("Surname", "First.name"), stringr::str_squish) %>% | ||
tidyr::separate(Umpires, into = c("Umpire.1", "Umpire.2", "Umpire.3", "Umpire.4"), sep = ",", fill = "right") %>% | ||
dplyr::mutate_at(vars(starts_with("Umpire")), stringr::str_replace, " \\(.*\\)", "") | ||
|
||
sep <- function(...) { | ||
dots <- list(...) | ||
tidyr::separate_(..., into = sprintf("%s%s", dots[[2]], c("G", "B", "P")), sep = "\\.") | ||
} | ||
|
||
score_cols <- c("HQ1", "HQ2", "HQ3", "HQ4", "AQ1", "AQ2", "AQ3", "AQ4") | ||
games_cleaned <- games_cleaned %>% | ||
Reduce(f = sep, x = score_cols) %>% | ||
dplyr::mutate_at(vars(contains("HQ")), as.integer) %>% | ||
dplyr::mutate_at(vars(contains("AQ")), as.integer) %>% | ||
dplyr::rename( | ||
Home.score = HQ4P, | ||
Away.score = AQ4P | ||
) | ||
|
||
|
||
ids <- get_afltables_player_ids(min(games_cleaned$Season):max(games_cleaned$Season)) | ||
|
||
games_joined <- games_cleaned %>% | ||
mutate(Player = paste(First.name, Surname)) %>% | ||
dplyr::left_join(ids, by = c("Season", "Player", "Playing.for" = "Team")) %>% | ||
dplyr::select(-Player) | ||
|
||
df <- games_joined %>% | ||
dplyr::rename(!!! rlang::syms(with(stat_abbr, setNames(stat.abb, stat)))) %>% | ||
dplyr::select(one_of(afldata_cols)) | ||
|
||
df <- df %>% | ||
dplyr::mutate_if(is.numeric, ~ifelse(is.na(.), 0, .)) %>% | ||
mutate(Round = as.character(Round)) | ||
|
||
# message(paste("Returned data for", min(df$Season), "to", max(df$Season))) | ||
|
||
return(df) | ||
} |
Oops, something went wrong.