In [9]:
#library needed for this project
library(tidyverse)
library(dplyr)
library(RColorBrewer)
library(tidyr)
library(tidymodels)
library(repr)

**Load Data**

In [10]:
url<-"https://www.kaggle.com/datasets/gregorut/videogamesales/download"

raw_vgdata <- read_csv(url)
summary(raw_vgdata)

Parsed with column specification:
cols(
  `<!DOCTYPE html>` = [31mcol_character()[39m
)

“75 parsing failures.
row col           expected    actual                                                               file
  5  -- 1 columns          2 columns 'https://www.kaggle.com/datasets/gregorut/videogamesales/download'
  8  -- 1 columns          4 columns 'https://www.kaggle.com/datasets/gregorut/videogamesales/download'
 21  -- 1 columns          8 columns 'https://www.kaggle.com/datasets/gregorut/videogamesales/download'
 25  -- 1 columns          8 columns 'https://www.kaggle.com/datasets/gregorut/videogamesales/download'
 28  -- delimiter or quote ]         'https://www.kaggle.com/datasets/gregorut/videogamesales/download'
... ... .................. ......... ..................................................................
See problems(...) for more details.
”


 <!DOCTYPE html>   
 Length:70         
 Class :character  
 Mode  :character  

**Clean Data**

In [11]:
vg <- na.omit(raw_vgdata) %>%
      filter(Year < 2017) %>%
    filter(Genre == "Sport" | Genre == "Action")    

ERROR: Error: Problem with `filter()` input `..1`.
[31m✖[39m object 'Year' not found
[34mℹ[39m Input `..1` is `Year < 2017`.


**Split Data**


In [None]:
set.seed(9999)

vgsplit<-initial_split(vg, prop=0.75, strata=EU_Sales) 
vgtrain<-training(vgsplit)
vgtest<-testing(vgsplit)

**Forward Selection**

In [None]:
set.seed(9999)

vg_formula <- paste("EU_Sales", "~", paste(names, collapse="+"))
vg_formula

# create an empty tibble to store the results
accuracies <- tibble(size = integer(),
                     model_string = character(),
                     accuracy = numeric())

# create a model specification
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
     set_engine("kknn") %>%
     set_mode("regression")

# create a 5-fold cross-validation object
vg_vfold <- vfold_cv(vg_train, v = 5, strata = Class)

# store the total number of predictors
n_total <- length(names)

# stores selected predictors
selected <- c()

# for every size from 1 to the total number of predictors 
for (i in 1:n_total) {
# for every predictor still not added yet
accs <- list()
    models <- list()
for (j in 1:length(names)) {

# create a model string for this combination of predictors        
        preds_new <- c(selected, names[[j]])
        model_string <- paste("EU_Sales", "~", paste(preds_new, collapse="+"))

# create a recipe from the model string  
vg_recipe <- recipe(as.formula(model_string),
                                data = vg_train) %>%                          
step_scale(all_predictors()) %>%                          
step_center(all_predictors())

# tune the KNN regression model with these predictors,
# and collect the accuracy for the best K        
acc <- workflow() %>%          
add_recipe(vg_recipe)%>%           
add_model(knn_spec)%>%          
tune_grid(resamples = vg_vfold, grid = 10) %>%          
collect_metrics() %>%           
filter(.metric == "accuracy") %>%          
summarize(mx = max(mean))
        acc <- acc$mx %>%  unlist()

# add this result to the dataframe        
accs[[j]] <- acc
        models[[j]] <- model_string
    }

    jstar <- which.max(unlist(accs))
    accuracies <- accuracies %>% 
      add_row(size = i,
              model_string = models[[jstar]],
              accuracy = accs[[jstar]])
    selected <- c(selected, names[[jstar]])
    names <- names[-jstar]
}
accuracies

**Choose K**

In [None]:
gridvals <- tibble(neighbors = seq(from = 1, to = 200))

vg_recipe <- recipe(EU_Sales ~ NA_Sales + Other_sales, data = vgtrain) %>%  
step_scale(all_predictors()) %>%  
step_center(all_predictors())


vg_spec <- nearest_neighbor(weight_func = "rectangular",
                    neighbors = tune()) %>%  
set_engine("kknn") %>%  
set_mode("regression")


vg_vfold <- vfold_cv(vgtrain, v = 5, strata = EU_Sales)


vg_wkflw <- workflow() %>%  
add_recipe(vg_recipe) %>%  
add_model(vg_spec)
vg_wkflw


vg_results <- vg_wkflw %>%  
tune_grid(resamples = vg_vfold, grid = gridvals) %>%  
collect_metrics() %>%  
filter(.metric == "rmse")

**Create a model with best predictors and chosen K**

In [1]:
set.seed(9999)

vg_spec_k <- nearest_neighbor(weight_func = "rectangular",
                    neighbors = ?) %>%  
set_engine("kknn") %>%  
set_mode("regression")


vg_fit <- workflow() %>%  
add_recipe(vg_recipe) %>%  
add_model(vg_spec_k) %>%
fit(data=vgtrain)
                                 
vg_fit

ERROR: Error in parse(text = x, srcfile = src): <text>:4:34: unexpected ')'
3: vg_spec_k <- nearest_neighbor(weight_func = "rectangular",
4:                     neighbors = ?)
                                    ^


**Evaluate how good the model is on test data**

In [None]:
vg_summary <- vg_fit %>%  
predict(vgtest) %>%  
bind_cols(vgtest) %>%  
metrics(truth = EU_Sales, estimate = .pred) %>%  
filter(.metric == 'rmse')

vg_summary