Can experience and age predict hours played?

In [28]:
library(tidyverse)
library(repr)
library(tidymodels)
source('cleanup.R')

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39m 1.2.1
[32m✔[39m [34mdials       [39m 1.3.0     [32m✔[39m [34mtune        [39m 1.1.2
[32m✔[39m [34minfer       [39m 1.0.7     [32m✔[39m [34mworkflows   [39m 1.1.4
[32m✔[39m [34mmodeldata   [39m 1.4.0     [32m✔[39m [34mworkflowsets[39m 1.0.1
[32m✔[39m [34mparsnip     [39m 1.2.1     [32m✔[39m [34myardstick   [39m 1.3.1
[32m✔[39m [34mrecipes     [39m 1.1.0     

── [1mConflicts[22m ───────────────────────────────────────── tidymodels_conflicts() ──
[31m✖[39m [34mscales[39m::[32mdiscard()[39m masks [34mpurrr[39m::discard()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m   masks [34mstats[39m::filter()
[31m✖[39m [34mrecipes[39m::[32mfixed()[39m  masks [34mstringr[39m::fixed()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m      masks [34mstats[39m::lag()
[31m✖[39m [3

ERROR: Error in file(filename, "r", encoding = encoding): cannot open the connection


In [16]:
players <- read_csv('players.csv')

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [18]:
players_tidy <- players |>
    na.omit() |>
    mutate(experience = as_factor(experience)) |>
    select(experience, Age, played_hours)
players_tidy

experience,Age,played_hours
<fct>,<dbl>,<dbl>
Pro,9,30.3
Veteran,17,3.8
Veteran,17,0.0
Amateur,21,0.7
Regular,21,0.1
Amateur,17,0.0
Regular,19,0.0
Amateur,21,0.0
Amateur,17,0.1
Veteran,22,0.0


In [24]:
experience_playtime_means <- players_tidy |>
    group_by(experience) |>
    summarize(mean_player_hours = mean(played_hours))
experience_playtime_means

experience,mean
<fct>,<dbl>
Pro,2.7846154
Veteran,0.6479167
Amateur,6.0174603
Regular,18.7257143
Beginner,1.2485714


In [25]:
experience_age_means <- players_tidy |>
    group_by(experience) |>
    summarize(mean_age = mean(Age))
experience_age_means

experience,mean
<fct>,<dbl>
Pro,16.92308
Veteran,20.95833
Amateur,20.25397
Regular,20.6
Beginner,21.65714


In [29]:
players_split <- initial_split(players_tidy, prop = 0.75, strata = played_hours)
players_train <- training(players_split)
players_test <- testing(players_split)

In [39]:
players_recipe <- recipe(played_hours ~ Age, data = players_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

players_spec_tune <- nearest_neighbor(weight_func = 'rectangular', neighbors = tune()) |>
    set_engine('kknn') |>
    set_mode('regression')

players_vfold = vfold_cv(players_train, v = 5, strata = played_hours)

players_workflow <- workflow() |>
    add_recipe(players_recipe) |>
    add_model(players_spec_tune)

players_workflow

══ Workflow ════════════════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_scale()
• step_center()

── Model ───────────────────────────────────────────────────────────────────────
K-Nearest Neighbor Model Specification (regression)

Main Arguments:
  neighbors = tune()
  weight_func = rectangular

Computational engine: kknn 


In [40]:
gridvals <- tibble(neighbors = seq(from = 1, to = 12, by = 1))

players_results <- players_workflow |>
  tune_grid(resamples = players_vfold, grid = gridvals) |>
  collect_metrics() |>
  filter(.metric == "rmse")
players_results

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,rmse,standard,25.7265,5,7.958687,Preprocessor1_Model01
2,rmse,standard,27.7126,5,8.692335,Preprocessor1_Model02
3,rmse,standard,28.07281,5,7.402618,Preprocessor1_Model03
4,rmse,standard,28.80036,5,6.693474,Preprocessor1_Model04
5,rmse,standard,26.11925,5,8.15489,Preprocessor1_Model05
6,rmse,standard,29.27849,5,6.408616,Preprocessor1_Model06
7,rmse,standard,28.53754,5,6.686223,Preprocessor1_Model07
8,rmse,standard,28.46131,5,6.658614,Preprocessor1_Model08
9,rmse,standard,27.92661,5,7.265369,Preprocessor1_Model09
10,rmse,standard,27.67335,5,7.286079,Preprocessor1_Model10


In [41]:
players_min <- players_results |>
    filter(mean == min(mean))
players_min

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,rmse,standard,25.7265,5,7.958687,Preprocessor1_Model01


In [45]:
players_spec <-  nearest_neighbor(weight_func = 'rectangular', neighbors = 4) |>
    set_engine('kknn') |>
    set_mode('regression')

players_fit <- workflow() |>
    add_recipe(players_recipe) |>
    add_model(players_spec) |>
    fit(data = players_train)

players_summary <- players_fit |>
  predict(players_test) |>
  bind_cols(players_test) |>
  metrics(truth = played_hours, estimate = .pred) |>
  filter(.metric == 'rmse')

players_summary

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,24.30197


In [47]:
players_preds <- predict(marathon_best_fit, marathon_training) |>
        bind_cols(marathon_training)

players_plot <- ggplot(players_tidy, aes(x = Age, y = played_hours)) +
  geom_point() +
  geom_line(data = players_preds,
            mapping = aes(x = Age, y = .pred),
            color = "steelblue",
            linewidth = 1) +
  xlab("House size (square feet)") +
  ylab("Price (USD)")

players_plot

ERROR while rich displaying an object: [1m[33mError[39m in `geom_line()`:[22m
[1m[22m[33m![39m Problem while computing aesthetics.
[36mℹ[39m Error occurred in the 2nd layer.
[1mCaused by error:[22m
[33m![39m object 'Age' not found

Traceback:
1. tryCatch(withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler), error = outer_handler)
2. tryCatchList(expr, classes, parentenv, handlers)
3. tryCatchOne(expr, names, parentenv, handlers[[1L]])
4. doTryCatch(return(expr), name, parentenv, handler)
5. withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .   