In [None]:
library(tidyverse)
library(tidymodels)

In [None]:
# Load the raw player table.
players <- read_csv("data/players.csv")

# Load the session table, parse the day-month-year timestamps, and derive the
# length of every session.
# Subtracting two date-times yields a difftime whose unit R chooses
# automatically (secs, mins, hours or days) from the data, so the unit is
# pinned to minutes to keep `session_length_minutes` true to its name.
sessions <- read_csv("data/sessions.csv") |>
    mutate(
        start_time = dmy_hm(start_time),
        end_time = dmy_hm(end_time),
        session_length_minutes = as.numeric(
            difftime(end_time, start_time, units = "mins")
        )
    ) |>
    select(-original_start_time, -original_end_time)

head(players)
head(sessions)

In [None]:
# One row per hashedEmail: the number of sessions, the mean session length,
# and the summed session length (same units as `session_length_minutes`).
sessions_summary <- summarise(
    group_by(sessions, hashedEmail),
    sessions_num = n(),
    average_session_length = mean(session_length_minutes),
    total_play_length = sum(session_length_minutes)
)

sessions_summary |> head()

In [None]:
# Combine the player table with the per-player session summary. merge() with
# its defaults keeps only hashedEmail values present in both tables.
# Rows are then restricted to the listed gender categories, the categorical
# columns are converted to factors, and `Age` is exposed as lower-case `age`.
players_sessions <- players |>
    merge(sessions_summary, by = "hashedEmail") |>
    filter(
        gender %in% c("Male", "Female", "Non-binary", "Prefer not to say")
    ) |>
    mutate(
        across(c(subscribe, gender, experience), as.factor),
        age = Age
    ) |>
    select(-c(played_hours, Age))

players_sessions |> head()

In [None]:
# Scatter plot of total playtime against number of sessions, with both axes on
# a log10 scale and comma-formatted tick labels.
quick_point <- ggplot(
    players_sessions,
    aes(x = sessions_num, y = total_play_length)
) +
    geom_point(alpha = 0.7) +
    scale_x_log10(labels = label_comma()) +
    scale_y_log10(labels = label_comma()) +
    labs(
        x = "Number of Sessions",
        y = "Total Playtime (minutes)",
        title = "Total Playtime vs. Number of Sessions"
    ) +
    theme(text = element_text(size = 16))

quick_point

In [None]:
# Seed the RNG so the random train/test partition below is reproducible.
set.seed(0)

# 75% / 25% split, stratified on the outcome variable.
data_split <- players_sessions |>
    initial_split(prop = 0.75, strata = total_play_length)
training <- data_split |> training()
testing <- data_split |> testing()

# Ordinary least-squares linear regression.
lm_spec <- linear_reg(mode = "regression", engine = "lm")

# Outcome and predictors for the linear model; no preprocessing steps.
lm_recipe <- recipe(
    total_play_length ~ sessions_num + average_session_length + age +
        experience + subscribe + gender,
    data = training
)

# Bundle recipe and model, then fit on the training rows.
lm_fit <- workflow(preprocessor = lm_recipe, spec = lm_spec) |>
    fit(data = training)
lm_fit

In [None]:
# RMSE of the linear model on the data it was fitted to (training error).
lm_rmse <- predict(lm_fit, training) |>
    bind_cols(training) |>
    rmse(truth = total_play_length, estimate = .pred) |>
    pull(.estimate)
lm_rmse

In [None]:
# RMSE of the linear model's predictions on the held-out testing rows
# (prediction error, hence "RMSPE").
lm_rmspe <- predict(lm_fit, testing) |>
    bind_cols(testing) |>
    rmse(truth = total_play_length, estimate = .pred) |>
    pull(.estimate)
lm_rmspe

In [None]:
# K-nearest-neighbours regression with unweighted ("rectangular") neighbours;
# the number of neighbours is left as a tuning parameter.
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("regression")

# Numeric predictors are scaled and centred so that no single predictor
# dominates the distance calculation.
knn_recipe <- recipe(
    total_play_length ~ sessions_num + average_session_length + age +
        experience + subscribe + gender,
    data = training
) |>
    step_scale(all_numeric_predictors()) |>
    step_center(all_numeric_predictors())

# 5-fold cross-validation on the training rows, stratified on the outcome.
vfold <- vfold_cv(training, v = 5, strata = total_play_length)

# Candidate neighbour counts: 1 through 20.
gridvals <- tibble(neighbors = seq(1, 20))

# Tune over the grid and keep the single row with the lowest mean
# cross-validated RMSE. `with_ties = FALSE` guarantees exactly one row even
# when several values of k share the minimum, so `best_k` is always a scalar
# (comparing with `mean == min(mean)` could return several rows).
knn_multi <- workflow() |>
    add_recipe(knn_recipe) |>
    add_model(knn_spec) |>
    tune_grid(vfold, grid = gridvals) |>
    collect_metrics() |>
    filter(.metric == "rmse") |>
    slice_min(mean, n = 1, with_ties = FALSE)

best_k <- knn_multi |>
    pull(neighbors)
best_k

In [None]:
# Final KNN specification using the neighbour count selected by tuning.
knn_spec_new <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
  set_engine("kknn") |>
  set_mode("regression")

# Fit the tuned KNN model on the training rows. The recipe, model spec and
# data are the ones built in this notebook (`knn_recipe`, `knn_spec_new`,
# `training`); the `sacr_*` / `sacramento_train` names used previously are
# not defined in the visible code.
knn_mult_fit <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec_new) |>
  fit(data = training)