## Project Final Report

In [None]:
# Attach the packages used by the analysis cells below.
library(tidyverse)
library(repr)
library(tidymodels)
# Seed R's random number generator so the randomised steps further down
# (initial_split(), vfold_cv()) give the same result on every run.
set.seed(1234)

In [None]:
# Download links for the two project files (players.csv and sessions.csv).
url_players <- "https://drive.google.com/uc?export=download&id=1M8-duCFfUh3MYjEn0j2WWRXzVMzTG97x"
url_sessions <- "https://drive.google.com/uc?export=download&id=1DNxV79eDG7KkIWn6hIwgWeAeUN074SCS"

# Read both files straight from their URLs into tibbles.
players <- url_players |> read_csv()
sessions <- url_sessions |> read_csv()

### Introduction

A research group in Computer Science at UBC, led by Frank Wood, collected data about how people play video games. They set up a Minecraft server and recorded players' activity on it, producing two datasets: players.csv and sessions.csv. Our report looks at how well player characteristics predict the amount of time players spend on the server.

Our group was interested in the “kinds” of players that are most likely to contribute a large amount of data, so that they can be targeted in recruiting efforts. To investigate this, we chose the specific research question: Can a player's gender, experience, and age predict their total number of minutes of playtime on the server? On a broader scale, these insights can help video game companies develop strategies that enhance user experience and engagement across diverse populations.

To answer the question, we must use both datasets provided (players.csv and sessions.csv). The UBC research group gathered this data by recording play sessions, capturing information on player skill level, past play sessions, and player demographics. The players dataset contains our explanatory variables (gender, experience, age), whilst the sessions dataset contains the session start and end times from which we derive the response variable (total playtime in minutes).



### Methods and Results

In [None]:
# Clean the player table: treat gender and experience as categorical
# variables, and keep only the rows that have both a played_hours value
# and an Age.
players_tidy <- players |>
  mutate(
    gender = as_factor(gender),
    experience = as_factor(experience)
  ) |>
  filter(!is.na(played_hours), !is.na(Age))
               

# Derive the length of every recorded session in minutes.
#
# Each timestamp is split at the space into a date part and a clock part, and
# the clock parts are parsed as hours:minutes. The duration is the difference
# between the two clock times taken modulo 24 hours, so a session that runs
# past midnight (e.g. 23:50 -> 00:10) counts as 20 minutes. Taking abs() of
# the negative difference would report that session as 1420 minutes instead.
# NOTE(review): this assumes no single session lasts 24 hours or longer --
# confirm against the start_date / end_date columns.
tidy_sessions <- sessions |>
  separate(
    col = start_time,
    into = c("start_date", "start_time"),
    sep = " "
  ) |>
  separate(
    col = end_time,
    into = c("end_date", "end_time"),
    sep = " "
  ) |>
  mutate(
    # Namespace-qualified because tidyverse releases before 2.0.0 do not
    # attach lubridate when library(tidyverse) is called.
    start_time = lubridate::hm(start_time),
    end_time = lubridate::hm(end_time),
    # as.numeric() on a lubridate Period yields seconds; convert to minutes.
    played_minutes = as.numeric(end_time - start_time) / 60,
    # Wrap negative differences (end clock time earlier than start clock
    # time) into the 0..1439 minute range.
    played_minutes = played_minutes %% (24 * 60)
  ) |>
  # Drop implausibly long sessions; rows whose duration is NA (unparseable or
  # missing timestamp) are removed by this filter as well.
  filter(played_minutes <= 1000)


# Attach every retained session to the player who played it. Players with no
# matching session row are dropped by the inner join. This table stays at
# session level (one row per session).
data_combined <- inner_join(players_tidy, tidy_sessions, by = "hashedEmail")

# Modelling table: one row per player with that player's TOTAL minutes.
# The research question and the plot axis labels are about total playtime per
# player; modelling the raw session rows would instead predict the length of
# a single session and would repeat each player (with identical predictors)
# once per session, letting the same player fall into both the training and
# the test set.
# NOTE(review): assumes hashedEmail identifies exactly one player -- confirm.
data_selected <- data_combined |>
  group_by(hashedEmail, Age, experience, gender) |>
  summarise(played_minutes = sum(played_minutes), .groups = "drop") |>
  select(Age, experience, played_minutes, gender)

# Summary statistics of the modelling table, so `n` is the number of players
# and the age statistics are not weighted by how many sessions a player has.
data_summarized <- data_selected |>
  summarise(
    n = n(),
    min_age = min(Age, na.rm = TRUE),
    max_age = max(Age, na.rm = TRUE),
    mean_age = mean(Age, na.rm = TRUE),
    median_age = median(Age, na.rm = TRUE),
    min_minutes = min(played_minutes, na.rm = TRUE),
    max_minutes = max(played_minutes, na.rm = TRUE),
    mean_minutes = mean(played_minutes, na.rm = TRUE),
    median_minutes = median(played_minutes, na.rm = TRUE)
  )
data_summarized

# Split the modelling table 70/30 into training and testing sets, stratified
# on the response so both sets cover a similar range of played_minutes.
data_split <- data_selected |>
  initial_split(prop = 0.70, strata = played_minutes)
data_training <- data_split |> training()
data_testing <- data_split |> testing()
 
# Scatter plot of the training data: age against minutes played, with the
# points coloured by experience level.
players_plot1 <- data_training |>
  ggplot(aes(x = Age, y = played_minutes, color = experience)) +
  geom_point() +
  labs(
    x = "Age",
    y = "Total Time Played (Minutes)",
    # The mapped aesthetic is `color`, so the legend title must be set through
    # `color`; a `fill` label is ignored and the legend would read "experience".
    color = "Experience",
    title = "Age vs Total Minutes Played"
  )
players_plot1

# Scatter plot of the training data: age against minutes played, with the
# points coloured by gender.
players_plot2 <- data_training |>
  ggplot(aes(x = Age, y = played_minutes, color = gender)) +
  geom_point() +
  labs(
    x = "Age",
    y = "Total Time Played (Minutes)",
    # The mapped aesthetic is `color`, so the legend title must be set through
    # `color`; a `fill` label is ignored and the legend would read "gender".
    color = "Gender",
    title = "Age vs Total Minutes Played"
  )
players_plot2

In [None]:
# Preprocessing: dummy-encode the categorical predictors, drop any
# zero-variance columns, then centre and scale every predictor so that no
# single column dominates the kNN distance.
knn_recipe <- recipe(
  played_minutes ~ Age + experience + gender,
  data = data_training
) |>
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |>
  step_normalize(all_predictors())

# kNN regression specification; the number of neighbours is left as a
# placeholder to be chosen by tuning.
knn_model <- nearest_neighbor(neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("regression")

# Bundle the preprocessing and the model into one workflow.
knn_workflow <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_model)

# 5-fold cross-validation on the training set, stratified on the response.
folds <- data_training |>
  vfold_cv(v = 5, strata = played_minutes)

# Candidate values of k: 10 evenly spaced settings between 1 and 25.
knn_grid <- grid_regular(
  neighbors(range = c(1, 25)),
  levels = 10
)

# Evaluate every candidate k on the folds, scoring by RMSE.
knn_results <- knn_workflow |>
  tune_grid(
    resamples = folds,
    grid = knn_grid,
    metrics = metric_set(rmse)
  )

# Pick the k with the lowest cross-validated RMSE and plug it into the
# workflow in place of the tune() placeholder.
best_knn <- select_best(knn_results, metric = "rmse")
final_knn_workflow <- knn_workflow |>
  finalize_workflow(best_knn)

# Refit the finalized workflow on the full training set.
final_knn_fit <- fit(final_knn_workflow, data = data_training)

# Predict the held-out test rows and keep the predictions (.pred) next to
# the original test columns.
final_preds <- bind_cols(
  predict(final_knn_fit, data_testing),
  data_testing
)

# Test-set RMSE of the final model.
final_preds |>
  rmse(truth = played_minutes, estimate = .pred)

# Compare test-set predictions with the observed values.
# x carries the model's prediction (.pred) and y the observed played_minutes,
# so the axis labels must read Predicted (x) and Actual (y); they were swapped.
knn_plot <- ggplot(final_preds, aes(x = .pred, y = played_minutes)) +
  geom_point(alpha = 0.6) +
  # Reference line y = x: a point on this line was predicted exactly.
  geom_line(aes(y = .pred), color = "blue") +
  labs(
    x = "Predicted Played Minutes",
    y = "Actual Played Minutes",
    title = "kNN Regression: Actual vs Predicted"
  )
knn_plot