# DSCI 100 Project Final Report

#### Names: Jasleen Jhajj, Josh Mitra, Angelina Chen

#### Student IDs: 98671837,

---
## Introduction

A UBC Computer Science research group is studying how people play video games, focusing on Minecraft. They set up a Minecraft server to track player actions. The data is divided into two sets: players.csv and sessions.csv. The players.csv data set has 7 variables and 194 observations, while the sessions.csv data set has 5 variables and 1533 observations. These data sets have several potential issues: they are not very human-readable; the hashedEmail columns are extremely hard to read, as they are just long sequences of numbers and letters; in the sessions.csv data set, it is unclear what the original_start_time and original_end_time columns represent; the start_time and end_time columns hold two values in each cell, making the data untidy; and multiple players in the players.csv data set have 0 played hours, which causes issues when trying to create visuals for this data. The question we tried to answer in our project was: can a player's playtime (played_hours), experience level (experience), and number of sessions (number_of_sessions) predict their likelihood of subscribing to the newsletter (subscribe)? played_hours, experience, and number_of_sessions are key factors in predicting whether a player will subscribe to the newsletter: played_hours measures overall engagement, experience indicates skill level and familiarity with the game, and number_of_sessions reflects playing habits. Preparing the data involves scaling numerical values, encoding categorical variables, and handling missing data to ensure the model can accurately identify patterns and make reliable predictions.

---
## Methods & Results

In [2]:
# Importing libraries
library(tidyverse)
library(tidymodels)
library(repr)
library(RColorBrewer)
library(ggplot2)
library(forcats)
library(lubridate)

# Limit the printed output of data frames to 6 rows
options(repr.matrix.max.rows = 6)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

In [None]:
# Read both raw data sets directly from the project's GitHub repository
# and display each one after it is loaded.
players_url <- "https://raw.githubusercontent.com/JasleenJhajj/toy_ds_project/refs/heads/main/players.csv"
players <- read_csv(file = players_url)
players

sessions_url <- "https://raw.githubusercontent.com/JasleenJhajj/toy_ds_project/refs/heads/main/sessions.csv"
sessions <- read_csv(file = sessions_url)
sessions

In [None]:
# Count the missing values per column in each data set.
# Both data sets are checked because both are cleaned of NA values later on.
cat("\nMissing values in players.csv:\n")
print(colSums(is.na(players)))

cat("\nMissing values in sessions.csv:\n")
print(colSums(is.na(sessions)))

# List the distinct values of the categorical variables in players
cat("\nUnique values in 'experience':\n")
print(unique(players$experience))

cat("\nUnique values in 'gender':\n")
print(unique(players$gender))

In [None]:
# One-row summary of the quantitative variables in the players data set
# (Age and played_hours). Missing values are ignored by every statistic.
players_summarized <- summarize(
  players,
  total_players = n(),
  # Age
  mean_age = mean(Age, na.rm = TRUE),
  median_age = median(Age, na.rm = TRUE),
  sd_age = sd(Age, na.rm = TRUE),
  min_age = min(Age, na.rm = TRUE),
  max_age = max(Age, na.rm = TRUE),
  # Hours played
  mean_hours = mean(played_hours, na.rm = TRUE),
  median_hours = median(played_hours, na.rm = TRUE),
  sd_hours = sd(played_hours, na.rm = TRUE),
  min_hours = min(played_hours, na.rm = TRUE),
  max_hours = max(played_hours, na.rm = TRUE)
)
players_summarized

In [None]:
# Keep only the player rows that have a value in every listed column
clean_players <- players |>
  drop_na(experience, subscribe, hashedEmail, played_hours, Age)

# Keep only the session rows that have a player id and both original timestamps
clean_sessions <- sessions |>
  drop_na(hashedEmail, original_start_time, original_end_time)

In [None]:
# number_of_sessions: total number of sessions recorded for each player
# (one row per hashedEmail)
sessions_player <- clean_sessions |>
  count(hashedEmail, name = "number_of_sessions")

# Attach the session counts to the player records. The inner join keeps only
# players that appear in the session counts, i.e. players without any
# sessions are removed. `experience` is converted from character to factor.
players_combined <- clean_players |>
  inner_join(sessions_player, by = "hashedEmail") |>
  mutate(experience = as.factor(experience))

# Final data set: the response plus the three predictors
players_final <- players_combined |>
  select(played_hours, number_of_sessions, subscribe, experience)
players_final

In [None]:
# Wrangle the modelling columns.
#
# `experience` is encoded as an ordinal number for the distance-based model.
# Converting with as.numeric(as.factor(...)) numbers the levels alphabetically
# (Amateur = 1, Beginner = 2, Pro = 3, ...), which scrambles the skill order,
# so the level order is fixed explicitly instead.
# NOTE(review): assumed skill order, lowest to highest - confirm against the
# data description.
experience_levels <- c("Beginner", "Amateur", "Regular", "Pro", "Veteran")

# Skip the conversion when the column is already numeric (cell re-run).
if (!is.numeric(players_final$experience)) {
  experience_labels <- as.character(players_final$experience)
  # Fail loudly on an unexpected label instead of silently encoding it as NA.
  stopifnot(all(is.na(experience_labels) | experience_labels %in% experience_levels))
  players_final$experience <- as.numeric(
    factor(experience_labels, levels = experience_levels)
  )
}

# The response is stored as a factor for classification.
players_final$subscribe <- as.factor(players_final$subscribe)
players_final

In [None]:
# K-nearest neighbours classification
# Question: can a player's playtime (played_hours), experience level
# (experience), and number of sessions (number_of_sessions) predict their
# likelihood of subscribing to the newsletter (subscribe)?

# 75% / 25% train/test split, stratified on the response
set.seed(3456)
players_split <- players_final |>
  initial_split(prop = 0.75, strata = subscribe)
players_train <- players_split |> training()
players_test <- players_split |> testing()

players_train
players_test

In [None]:
set.seed(2020)

# Preprocessing recipe built from the training data: model `subscribe` on the
# three predictors, then scale and center every predictor.
players_recipe <- recipe(
  subscribe ~ played_hours + number_of_sessions + experience,
  data = players_train
)
players_recipe <- step_scale(players_recipe, all_predictors())
players_recipe <- step_center(players_recipe, all_predictors())

players_recipe

In [None]:
set.seed(2020)

# K-nearest neighbours classifier with k = 3 and the "rectangular" weight
# function, run through the kknn engine
knn_spec <- nearest_neighbor(neighbors = 3, weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")

# Bundle the preprocessing recipe with the model, then train on the training set
knn_workflow <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(players_recipe)
players_fit <- fit(knn_workflow, data = players_train)
players_fit

In [None]:
# Predict the class of every test observation and place the predictions
# (.pred_class) in front of the original test columns
players_test_predictions <- bind_cols(
  predict(players_fit, players_test),
  players_test
)

players_test_predictions

In [None]:
set.seed(2020)
# Compare the predicted class with the true `subscribe` label on the test set
players_prediction_accuracy <- metrics(
  players_test_predictions,
  truth = subscribe,
  estimate = .pred_class
)

players_prediction_accuracy

In [None]:
set.seed(2020)

# Confusion matrix of predicted versus true `subscribe` on the test set
players_mat <- conf_mat(
  players_test_predictions,
  truth = subscribe,
  estimate = .pred_class
)

players_mat

In [None]:
set.seed(2020)
# 10-fold cross-validation folds of the training set, stratified on the response
players_vfold <- players_train |>
  vfold_cv(v = 10, strata = subscribe)

In [None]:
set.seed(2020)
# Fit the recipe + `knn_spec` workflow on every cross-validation fold
cv_workflow <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(players_recipe)
players_resample_fit <- fit_resamples(cv_workflow, resamples = players_vfold)

In [None]:
# Collect the performance metrics from the resampled fits
players_metrics <- collect_metrics(players_resample_fit)
players_metrics

In [None]:
set.seed(2020)
# Model specification whose number of neighbours is left open for tuning
knn_tune <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")
knn_tune

In [None]:
set.seed(1234)

# Candidate values of k: 1 through 10
k_vals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

# Tune the number of neighbours over the cross-validation folds and collect
# the metrics for every candidate k
tuning_workflow <- workflow() |>
  add_model(knn_tune) |>
  add_recipe(players_recipe)
knn_results <- collect_metrics(
  tune_grid(tuning_workflow, resamples = players_vfold, grid = k_vals)
)

knn_results

In [None]:
# Keep only the accuracy rows of the tuning results
accuracies <- knn_results |>
  filter(.metric == "accuracy")

# Plot the accuracy estimate against the number of neighbours.
# The x-axis breaks are taken from the tuned values of k instead of a
# hard-coded 0-14 range, so the axis always matches the tuning grid.
accuracy_versus_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  scale_x_continuous(breaks = sort(unique(accuracies$neighbors))) +
  scale_y_continuous(limits = c(0.2, 1.0)) # fixed y-axis range

accuracy_versus_k

In [None]:
set.seed(2020)

# Model specification with the number of neighbours fixed at 8.
# NOTE(review): the `_7` suffix in the object names does not match
# neighbors = 8 - confirm which value of k was intended.
knn_spec_7 <- nearest_neighbor(neighbors = 8, weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")

# Bundle the preprocessing recipe with this model and train on the training set
final_workflow <- workflow() |>
  add_model(knn_spec_7) |>
  add_recipe(players_recipe)
players_fit_7 <- fit(final_workflow, data = players_train)
players_fit_7

In [None]:
# Predict the class of every test observation with the `players_fit_7` model
# and place the predictions (.pred_class) in front of the test columns
players_test_predictions_7 <- bind_cols(
  predict(players_fit_7, players_test),
  players_test
)

players_test_predictions_7

In [3]:
set.seed(2020)

# Confusion matrix of predicted versus true `subscribe` for the
# `players_fit_7` predictions on the test set
players_mat_7 <- conf_mat(
  players_test_predictions_7,
  truth = subscribe,
  estimate = .pred_class
)

players_mat_7

ERROR: Error in eval(expr, envir, enclos): object 'players_test_predictions_7' not found
