In [9]:
### Load the packages used in this notebook:
### repr (display options for rendered output), tidyverse (read_csv and the
### dplyr verbs), and tidymodels (initial_split / training / testing)

library(repr)
library(tidyverse)
library(tidymodels)

### Limit rendered data frames to 6 rows so cell outputs stay short
options(repr.matrix.max.rows = 6)

In [10]:
### Read the raw match data from the CSV file (a comma-delimited file, not an
### Excel workbook). The file's first column has no header, so readr names
### it `...1` on import.

tennis_data <- read_csv(file = "data/atp2017-2019.csv")

tennis_data

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m6866[39m [1mColumns: [22m[34m50[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (16): tourney_id, tourney_name, surface, tourney_level, winner_seed, win...
[32mdbl[39m (34): ...1, draw_size, tourney_date, match_num, winner_id, winner_ht, wi...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


...1,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,⋯,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,2019-M020,Brisbane,Hard,32,A,20181231,300,105453,2,⋯,54,34,20,14,10,15,9,3590,16,1977
1,2019-M020,Brisbane,Hard,32,A,20181231,299,106421,4,⋯,52,36,7,10,10,13,16,1977,239,200
2,2019-M020,Brisbane,Hard,32,A,20181231,298,105453,2,⋯,27,15,6,8,1,5,9,3590,40,1050
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
2883,2017-M-DC-2017-WG-M-BEL-FRA-01,Davis Cup WG F: BEL vs FRA,Hard,4,D,20171124,2,104542,,⋯,54,30,12,12,5,11,15,2320,76,667
2884,2017-M-DC-2017-WG-M-BEL-FRA-01,Davis Cup WG F: BEL vs FRA,Hard,4,D,20171124,4,105676,,⋯,54,44,13,14,7,10,7,3775,15,2320
2885,2017-M-DC-2017-WG-M-BEL-FRA-01,Davis Cup WG F: BEL vs FRA,Hard,4,D,20171124,5,106298,,⋯,53,29,11,11,4,11,18,2235,76,667


In [56]:
### Keep only the columns used in this project, drop rows we cannot use,
### and store the court surface as a factor

tennis_data_tidy <- tennis_data |>
    select(tourney_name, surface, tourney_date, match_num, minutes, w_ace:l_bpFaced) |>
    filter(!is.na(minutes)) |>      # drop rows with a missing match duration
    filter(surface != "None") |>    # drop rows whose surface is "None" (filter() also drops NA surfaces)
    mutate(surface = as.factor(surface))

tennis_data_tidy

tourney_name,surface,tourney_date,match_num,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,⋯,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
<chr>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Brisbane,Hard,20181231,300,124,3,3,77,44,31,⋯,6,8,6,100,54,34,20,14,10,15
Brisbane,Hard,20181231,299,82,10,1,52,33,28,⋯,1,17,2,77,52,36,7,10,10,13
Brisbane,Hard,20181231,298,66,2,2,47,33,26,⋯,2,10,3,46,27,15,6,8,1,5
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Davis Cup WG F: BEL vs FRA,Hard,20171124,2,106,12,2,64,33,31,⋯,0,1,2,81,54,30,12,12,5,11
Davis Cup WG F: BEL vs FRA,Hard,20171124,4,164,1,0,122,71,47,⋯,10,10,3,88,54,44,13,14,7,10
Davis Cup WG F: BEL vs FRA,Hard,20171124,5,94,7,1,57,38,34,⋯,0,1,0,87,53,29,11,11,4,11


In [57]:
### Split the tidy data into training (75%) and testing (25%) sets,
### stratified by surface
set.seed(100)  # fixed seed so the random split is reproducible

tennis_split <- initial_split(data = tennis_data_tidy, prop = 0.75, strata = surface)
tennis_training <- tennis_split |> training()
tennis_testing <- tennis_split |> testing()

In [63]:
### Scratch check (can be ignored for now): does the training data still
### contain rows with missing values?
### `is_na_present` is TRUE for every training row with at least one NA.

test <- tibble(is_na_present = rowSums(is.na(tennis_training)) > 0)

### Filter on the `is_na_present` column by name. Writing `test == TRUE` here
### would not refer to a column (there is none called `test`); it would compare
### the whole `test` tibble to TRUE and hand filter() a logical matrix.
test_filtered <- test |>
    filter(is_na_present)

test_filtered



is_na_present
<lgl>
True
True
True


In [64]:
### Summary of the data

### Number of training rows for each surface type

surface_types <- tennis_training |>
    count(surface, name = "count")

surface_types

surface,count
<fct>,<int>
Clay,1541
Grass,603
Hard,2904
