Skip to content

Commit

Permalink
Added get_afltables_stats function fixes #26 and #19
Browse files Browse the repository at this point in the history
Merge branch 'afltables-playerstats'

# Conflicts:
#	DESCRIPTION
  • Loading branch information
jimmyday12 committed Aug 15, 2018
2 parents 8ff86f8 + 6cd2df9 commit ce189c0
Show file tree
Hide file tree
Showing 56 changed files with 10,916 additions and 262 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: fitzRoy
Version: 0.1.4
Version: 0.1.5
Date: 2018-08-13
Title: Easily Scrape and Process AFL Data
Description: An easy to use tool for scraping and processing AFL data.
Expand Down Expand Up @@ -32,4 +32,5 @@ Imports:
readr,
stringr,
rlang,
jsonlite
jsonlite,
xml2
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
# Generated by roxygen2: do not edit by hand

export(convert_results)
export(get_afltables_stats)
export(get_afltables_urls)
export(get_fixture)
export(get_footywire_stats)
export(get_match_results)
export(get_score_progression_raw)
export(get_squiggle_data)
export(replace_teams)
export(scrape_afltables_match)
export(update_footywire_stats)
import(dplyr)
import(tidyr)
importFrom(dplyr,mutate)
importFrom(magrittr,"%>%")
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(rvest,html_nodes)
importFrom(rvest,html_text)
importFrom(utils,type.convert)
9 changes: 8 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
# fitzRoy 0.1.5

* new function `get_aflplayer_data` returns a data frame containing aflplayer stats for the specified games [#19](https://github.com/jimmyday12/fitzRoy/issues/19)
* new helper function `get_aflplayer_urls` returns the URLs of games falling within a date range. Useful to pass to `get_aflplayer_data`
* BREAKING CHANGE: removed `afldata` from the included data to reduce package size (in preperation for CRAN submission). Please use `update_aflplayer_data` or the helper functions
* fixed bug where `get_fixture` returned wrong teams [#23](https://github.com/jimmyday12/fitzRoy/issues/23)


# fitzRoy 0.1.4

* `update_footywire` now more efficiently searches through missing match_ids
* fixed bug where `get_fixture` returned wrong teams [#23](https://github.com/jimmyday12/fitzRoy/issues/23)

# fitzRoy 0.1.3

Expand Down
2 changes: 2 additions & 0 deletions R/afltables_basic.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ get_match_results <- function() {

#' Internal function to ensure names match between different sources and also name changes.
#' This gets applied to any web scraper
#' @param team Team name
#' @export
replace_teams <- function(team) {
# Internal function
Expand All @@ -94,6 +95,7 @@ replace_teams <- function(team) {
team == "Lions" ~ "Brisbane Lions",
team == "Brisbane" ~ "Brisbane Lions",
team == "GW Sydney" ~ "GWS",
team == "Greater Western Sydney" ~ "GWS",
team == "GC" ~ "Gold Coast",
team == "StK" ~ "St Kilda",
team == "PA" ~ "Port Adelaide",
Expand Down
127 changes: 127 additions & 0 deletions R/afltables_player.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#' Return afltables match stats
#'
#' \code{get_afltables_stats} returns a data frame containing match stats for each game within the specified date range
#'
#' This function returns a data frame containing match stats for each game within the specified date range. The data from contains all stats on afltables match pages and returns 1 row per player.
#'
#' The data for this function is hosted on github to avoid extensive scraping of historical data from afltables.com. This will be updated regularly.
#'
#' @param start_date character string for start date return to URLs from, in "dmy" or "ymd" format
#' @param end_date optional, character string for end date to return URLS, in "dmy" or "ymd" format
#'
#' @return a data table containing player stats for each game between start date and end date
#' @export
#'
#' @examples
#' # Gets all data
#' get_afltables_stats()
#'
#' # Specify a date range
#' get_afltables_stats("01/01/2018", end_date = "01/04/2018")
#' @importFrom magrittr %>%
#' @importFrom purrr map
get_afltables_stats <- function(start_date = "1897-05-08", end_date = Sys.Date()) {
start_date <- lubridate::parse_date_time(start_date, c("dmy", "ymd"))
if (is.na(start_date)) stop(paste("Date format not reccognised. Check that start_date is in dmy or ymd format"))
end_date <- lubridate::parse_date_time(end_date, c("dmy", "ymd"))
if (is.na(end_date)) stop(paste("Date format not reccognised. Check that end_date is in dmy or ymd format"))
message(paste0("Returning data from ", start_date, " to ", end_date))

dat_url <- url("https://github.com/jimmyday12/fitzRoy/raw/afltables-playerstats/data-raw/afl_tables_playerstats/afldata.rda")

loadRData <- function(fileName) {
load(fileName)
get(ls()[ls() != "fileName"])
}

dat <- loadRData(dat_url)
max_date <- max(dat$Date)

if (end_date > max_date) {
urls <- get_afltables_urls(max_date, end_date)
dat_new <- scrape_afltables_match(urls)
dat <- dplyr::bind_rows(dat, dat_new)
}
message("Finished getting afltables data")
dplyr::filter(dat, Date > start_date & Date < end_date)
}

#' Return match URLs for specified dates
#'
#' \code{get_afltables_urls} returns a character vector containing match URLs for the specified date range
#'
#' This function returns match URLs for the specified date range. This will typically be used to pass to
#' to `scrape_afltables_match` to return player match results.
#'
#' @param start_date character string for start date return to URLs from, in "dmy" or "ymd" format
#' @param end_date optional, character string for end date to return URLS, in "dmy" or "ymd" format
#'
#' @return a character vector of match URL's between `start_date` and `end_date`
#' @export
#'
#' @examples
#' get_afltables_urls("01/01/2018")
#' get_afltables_urls("01/01/2018", end_date = "01/04/2018")
#' @importFrom magrittr %>%
#' @importFrom purrr map
get_afltables_urls <- function(start_date,
end_date = Sys.Date()) {
start_date <- lubridate::parse_date_time(start_date, c("dmy", "ymd"))
if (is.na(start_date)) stop(paste("Date format not reccognised. Check that start_date is in dmy or ymd format"))
end_date <- lubridate::parse_date_time(end_date, c("dmy", "ymd"))
if (is.na(end_date)) stop(paste("Date format not reccognised. Check that end_date is in dmy or ymd format"))

Seasons <- format(start_date, "%Y"):format(end_date, "%Y")

html_games <- Seasons %>%
map(~ paste0("https://afltables.com/afl/seas/", ., ".html")) %>%
map(xml2::read_html)

dates <- html_games %>%
map(rvest::html_nodes, "table+ table tr:nth-child(1) > td:nth-child(4)") %>%
map(rvest::html_text) %>%
map(stringr::str_extract, "\\d{1,2}-[A-z]{3}-\\d{4}") %>%
map(lubridate::dmy) %>%
map(~.x > start_date & .x < end_date)

match_ids <- html_games %>%
map(rvest::html_nodes, "tr+ tr b+ a") %>%
map(rvest::html_attr, "href") %>%
map(~stringr::str_replace(., "..", "https://afltables.com/afl"))

# Return only id's that match
match_ids <- match_ids %>%
purrr::map2(.y = dates, ~magrittr::extract(.x, .y)) %>%
purrr::reduce(c)

match_ids[!is.na(match_ids)]
}


get_afltables_player_ids <- function(seasons) {
base_url <- function(x) {
if (x < 2017) {
stop("season must be greater than 2016")
} else if (x == 2017) {
"https://raw.githubusercontent.com/jimmyday12/fitzRoy/afltables-playerstats/data-raw/afl_tables_playerstats/afltables_playerstats_2017.csv"
} else {
paste0("https://afltables.com/afl/stats/", x, "_stats.txt")
}
}

if (min(seasons) < 2017) stop("season must be 2017 onwards")
urls <- purrr::map_chr(seasons, base_url)

vars <- c("Season", "Player", "ID", "Team")

id_data <- urls %>%
purrr::map(readr::read_csv, col_types = readr::cols(Round = "c")) %>%
purrr::map2_dfr(.y = seasons, ~mutate(., Season = .y))

id_data %>%
dplyr::select(!! vars) %>%
dplyr::distinct() %>%
dplyr::rename(Team.abb = Team) %>%
dplyr::left_join(team_abbr, by = c("Team.abb" = "Team.abb")) %>%
dplyr::select(!! vars)
}
164 changes: 164 additions & 0 deletions R/afltables_player_main.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@

#' Return afltables playr match stats
#'
#' \code{scrape_afltables_match} returns a character vector containing match URLs for the specified date range
#'
#' This function returns the full afltables.com match stats for each player and each game specified in `match_urls`.
#' It is useful to use the helper function `get_afltables_urls` to return these or simply navigate to afltables.com
#' and find the match of interest.
#'
#' @param match_urls A list of URL's for matches to scrape data from
#' @return data table of afltables.com match results, with a row per player per match.
#' @export
#'
#' @examples
#' scrape_afltables_match("https://afltables.com/afl/stats/games/2018/071120180602.html")
#' \dontrun{
#' scrape_afltables_match(get_afltables_urls("01/06/2018, "01/06/2018"))
#' }
#' @importFrom magrittr %>%
#' @importFrom purrr map
#' @importFrom purrr map2
#' @importFrom dplyr mutate
#' @importFrom utils type.convert
scrape_afltables_match <- function(match_urls) {

# For each game url, download data, extract the stats tables #3 and #5 and bind together
message("Downloading data\n")
pb <- progress_estimated(length(match_urls))

match_xmls <- match_urls %>%
map(~{
pb$tick()$print()
xml2::read_html(.)
})
message("\nFinished downloading data. Processing XMLs\n")


replace_names <- function(x) {
names(x) <- x[1, ]
x[-1, ]
}

details <- match_xmls %>%
map(rvest::html_nodes, "br+ table td") %>%
map(rvest::html_text)

home_scores <- match_xmls %>%
map(rvest::html_nodes, "br+ table tr:nth-child(2) td") %>%
map(rvest::html_text)

away_scores <- match_xmls %>%
map(rvest::html_nodes, "br+ table tr:nth-child(3) td") %>%
map(rvest::html_text)

games <- match_xmls %>%
map(rvest::html_table, fill = TRUE) %>%
map(magrittr::extract, c(3, 5)) %>%
purrr::modify_depth(1, ~ purrr::map(., replace_names))

home_games <- games %>%
rvest::pluck(1) %>%
map2(.y = details, ~ mutate(.x, Playing.for = .y[4]))

away_games <- games %>%
rvest::pluck(2) %>%
map2(.y = details, ~ mutate(.x, Playing.for = .y[9]))

games <- home_games %>%
map2(.y = away_games, ~bind_rows(.x, .y))


games_df <- games %>%
map2(.y = details, ~ mutate(
.x,
Round = stringr::str_extract(.y[2], "(?<=Round:\\s)(.*)(?=\\sVenue)"),
Venue = stringr::str_extract(.y[2], "(?<=Venue:\\s)(.*)(?=\\Date)"),
Date = stringr::str_extract(.y[2], "(?<=Date:\\s)(.*)(?=\\sAtt)"),
Attendance = stringr::str_extract(.y[2], "(?<=Attendance:\\s)(.*)"),
Umpires = .y[length(.y)]
)) %>%
map2(.y = home_scores, ~mutate(
.x,
Home.team = .y[1],
HQ1 = .y[2],
HQ2 = .y[3],
HQ3 = .y[4],
HQ4 = .y[5]
)) %>%
map2(.y = away_scores, ~mutate(
.x,
Away.team = .y[1],
AQ1 = .y[2],
AQ2 = .y[3],
AQ3 = .y[4],
AQ4 = .y[5]
)) %>%
purrr::reduce(dplyr::bind_rows)

games_df <- games_df %>%
mutate(Date = gsub("\\([^]]*)", "", Date))

# Remove columns with NA and abbreviations
games_df <- games_df[, !(names(games_df) %in% "NA")]
games_df <- games_df[, !(stringr::str_detect(names(games_df), "Abbreviations"))]

# Fix names
names(games_df) <- make.names(names(games_df))
if ("X." %in% names(games_df)) games_df <- rename(games_df, Jumper.No. = X.)
if ("X1." %in% names(games_df)) games_df <- rename(games_df, One.Percenters = X1.)
if ("X.P" %in% names(games_df)) games_df <- rename(games_df, TOG = X.P)

# change column types
games_df <- games_df %>%
dplyr::filter(!Player %in% c("Rushed", "Totals", "Opposition"))

games_df <- as.data.frame(lapply(games_df, function(x) type.convert(x, na.strings = "NA", as.is = TRUE)), stringsAsFactors = FALSE)

games_cleaned <- games_df %>%
mutate(
Date = lubridate::dmy_hm(Date),
Local.start.time = as.integer(format(Date, "%H%M")),
Date = lubridate::ymd(format(Date, "%Y-%m-%d")),
Season = as.integer(lubridate::year(Date))
) %>%
tidyr::separate(Player, into = c("Surname", "First.name"), sep = ",") %>%
dplyr::mutate_at(c("Surname", "First.name"), stringr::str_squish) %>%
tidyr::separate(Umpires, into = c("Umpire.1", "Umpire.2", "Umpire.3", "Umpire.4"), sep = ",", fill = "right") %>%
dplyr::mutate_at(vars(starts_with("Umpire")), stringr::str_replace, " \\(.*\\)", "")

sep <- function(...) {
dots <- list(...)
tidyr::separate_(..., into = sprintf("%s%s", dots[[2]], c("G", "B", "P")), sep = "\\.")
}

score_cols <- c("HQ1", "HQ2", "HQ3", "HQ4", "AQ1", "AQ2", "AQ3", "AQ4")
games_cleaned <- games_cleaned %>%
Reduce(f = sep, x = score_cols) %>%
dplyr::mutate_at(vars(contains("HQ")), as.integer) %>%
dplyr::mutate_at(vars(contains("AQ")), as.integer) %>%
dplyr::rename(
Home.score = HQ4P,
Away.score = AQ4P
)


ids <- get_afltables_player_ids(min(games_cleaned$Season):max(games_cleaned$Season))

games_joined <- games_cleaned %>%
mutate(Player = paste(First.name, Surname)) %>%
dplyr::left_join(ids, by = c("Season", "Player", "Playing.for" = "Team")) %>%
dplyr::select(-Player)

df <- games_joined %>%
dplyr::rename(!!! rlang::syms(with(stat_abbr, setNames(stat.abb, stat)))) %>%
dplyr::select(one_of(afldata_cols))

df <- df %>%
dplyr::mutate_if(is.numeric, ~ifelse(is.na(.), 0, .)) %>%
mutate(Round = as.character(Round))

# message(paste("Returned data for", min(df$Season), "to", max(df$Season)))

return(df)
}
Loading

0 comments on commit ce189c0

Please sign in to comment.