In [2]:
#library needed for this project
library(tidyverse)
library(dplyr)
library(RColorBrewer)
library(tidyr)
library(tidymodels)
library(repr)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

**Load Data**

In [3]:
raw_vgdata <- read_csv("vgsales.csv")
summary(raw_vgdata)

Parsed with column specification:
cols(
  Rank = [32mcol_double()[39m,
  Name = [31mcol_character()[39m,
  Platform = [31mcol_character()[39m,
  Year = [31mcol_character()[39m,
  Genre = [31mcol_character()[39m,
  Publisher = [31mcol_character()[39m,
  NA_Sales = [32mcol_double()[39m,
  EU_Sales = [32mcol_double()[39m,
  JP_Sales = [32mcol_double()[39m,
  Other_Sales = [32mcol_double()[39m,
  Global_Sales = [32mcol_double()[39m
)



      Rank           Name             Platform             Year          
 Min.   :    1   Length:16598       Length:16598       Length:16598      
 1st Qu.: 4151   Class :character   Class :character   Class :character  
 Median : 8300   Mode  :character   Mode  :character   Mode  :character  
 Mean   : 8301                                                           
 3rd Qu.:12450                                                           
 Max.   :16600                                                           
    Genre            Publisher            NA_Sales          EU_Sales      
 Length:16598       Length:16598       Min.   : 0.0000   Min.   : 0.0000  
 Class :character   Class :character   1st Qu.: 0.0000   1st Qu.: 0.0000  
 Mode  :character   Mode  :character   Median : 0.0800   Median : 0.0200  
                                       Mean   : 0.2647   Mean   : 0.1467  
                                       3rd Qu.: 0.2400   3rd Qu.: 0.1100  
                                

**Clean Data**

In [4]:
vg <- na.omit(raw_vgdata) %>%
      filter(Year < 2017) %>%
    filter(Genre == "Sport" | Genre == "Action")    

**Split Data**


In [5]:
set.seed(9999)

vgsplit<-initial_split(vg, prop=0.75, strata=EU_Sales) 
vgtrain<-training(vgsplit)
vgtest<-testing(vgsplit)

**Choose K**

In [8]:
gridvals <- tibble(neighbors = seq(from = 1, to = 200))

vg_recipe <- recipe(EU_Sales ~ NA_Sales + Other_Sales, data = vgtrain) %>%  
step_scale(all_predictors()) %>%  
step_center(all_predictors())


vg_spec <- nearest_neighbor(weight_func = "rectangular",
                    neighbors = tune()) %>%  
set_engine("kknn") %>%  
set_mode("regression")


vg_vfold <- vfold_cv(vgtrain, v = 5, strata = EU_Sales)


In [10]:
vg_wkflw <- workflow() %>%  
add_recipe(vg_recipe) %>%  
add_model(vg_spec)


vg_results <- vg_wkflw %>%  
tune_grid(resamples = vg_vfold, grid = gridvals) %>%  
collect_metrics() %>%  
filter(.metric == "rmse")%>%
filter(mean == min(mean)) %>%
select(neighbors)

vg_results

neighbors
<int>
12


**Create a model with best predictors and chosen K**

In [11]:
set.seed(9999)

vg_spec_k <- nearest_neighbor(weight_func = "rectangular",
                    neighbors = 12) %>%  
set_engine("kknn") %>%  
set_mode("regression")


vg_fit <- workflow() %>%  
add_recipe(vg_recipe) %>%  
add_model(vg_spec_k) %>%
fit(data=vgtrain)
                                 
vg_fit

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

● step_scale()
● step_center()

── Model ───────────────────────────────────────────────────────────────────────

Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = ~12, kernel = ~"rectangular")

Type of response variable: continuous
minimal mean absolute error: 0.06330841
Minimal mean squared error: 0.05406003
Best kernel: rectangular
Best k: 12

**Evaluate how good the model is on test data**

In [12]:
vg_summary <- vg_fit %>%  
predict(vgtest) %>%  
bind_cols(vgtest) %>%  
metrics(truth = EU_Sales, estimate = .pred) %>%  
filter(.metric == 'rmse')

vg_summary

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,0.1854805
