In [9]:
library(tidyverse)
library(tidymodels)
library(cowplot)
library(scales)
library(repr)

In [10]:
players <- read_csv("https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz") |> 
    as_tibble() |> 
    select(-individualId, -organizationName)
players

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, age
[33mlgl[39m (3): subscribe, individualId, organizationName

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,TRUE,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,TRUE,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,FALSE,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
Amateur,TRUE,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,Female,21
Regular,TRUE,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,Male,21
Amateur,TRUE,f58aad5996a435f16b0284a3b267f973f9af99e7a89bee0430055a44fa92f977,0.0,Adrian,Female,17
Regular,TRUE,8e594b8953193b26f498db95a508b03c6fe1c24bb5251d392c18a0da9a722807,0.0,Luna,Female,19
Amateur,FALSE,1d2371d8a35c8831034b25bda8764539ab7db0f63938696917c447128a2540dd,0.0,Emerson,Male,21
Amateur,TRUE,8b71f4d66a38389b7528bb38ba6eb71157733df7d1740371852a797ae97d82d1,0.1,Natalie,Male,17
Veteran,TRUE,bbe2d83de678f519c4b3daa7265e683b4fe2d814077f9094afd11d8f217039ec,0.0,Nyla,Female,22


In [11]:
players |> distinct(gender)

gender
<chr>
Male
Female
Non-binary
Prefer not to say
Agender
Two-Spirited
Other


In [12]:
num_players <- players |> 
 mutate(experience = as.factor(experience)) |>
 mutate(experience = fct_recode(experience, "1"="Beginner", "2"="Amateur" , "3"="Regular", "4"="Pro","5"="Veteran")) |>
 mutate(experience = as.integer(experience)) |> 
 mutate(gender = as.factor(gender)) |>
 mutate(gender = fct_recode(gender, "1"="Male", 
                                    "2"="Female" , 
                                    "3"="Non-binary", 
                                    "4"="Prefer not to say",
                                    "5"="Agender",
                                    "6"="Two-Spirited",
                                    "7"="Other")) |>
 mutate(gender = as.integer(gender)) |> 
 mutate(subscribe = as.factor(subscribe)) |>
 mutate(subscribe = fct_recode(subscribe, 
                               "1"="TRUE", 
                              "2"="FALSE")) |>
 mutate(subscribe = as.integer(subscribe))
num_players

experience,subscribe,hashedEmail,played_hours,name,gender,age
<int>,<int>,<chr>,<dbl>,<chr>,<int>,<dbl>
3,2,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,3,9
5,2,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,3,17
5,1,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,3,17
1,2,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,2,21
4,2,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,3,21
1,2,f58aad5996a435f16b0284a3b267f973f9af99e7a89bee0430055a44fa92f977,0.0,Adrian,2,17
4,2,8e594b8953193b26f498db95a508b03c6fe1c24bb5251d392c18a0da9a722807,0.0,Luna,2,19
1,1,1d2371d8a35c8831034b25bda8764539ab7db0f63938696917c447128a2540dd,0.0,Emerson,3,21
1,2,8b71f4d66a38389b7528bb38ba6eb71157733df7d1740371852a797ae97d82d1,0.1,Natalie,3,17
5,2,bbe2d83de678f519c4b3daa7265e683b4fe2d814077f9094afd11d8f217039ec,0.0,Nyla,2,22


In [22]:
set.seed(3)
#making regression model
gridvals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))


players_split <- initial_split(num_players, prop = 0.70, strata = played_hours)

training_players <- training(players_split)

testing_players <- testing(players_split)

players_recipe <- recipe(played_hours ~ experience + age + gender + subscribe, data = training_players) |> 
    step_scale(all_predictors()) |> 
    step_center(all_predictors()) 

players_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |> 
    set_engine("kknn") |> 
    set_mode("regression")

vfold <- training_players |> 
    vfold_cv(v = 5, strata = played_hours)

players_test_workflow <- workflow() |> 
    add_recipe(players_recipe) |> 
    add_model(players_spec) |> 
    tune_grid(resamples = vfold, grid = gridvals) |> 
    collect_metrics()

players_min <- players_test_workflow |> 
    filter(.metric == "rmse") |> 
    slice_min(mean, n = 1)
players_min
#use K = 10

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
10,rmse,standard,22.09107,5,6.750646,Preprocessor1_Model10
