In [17]:
# Import packages
library(tidyverse)
library(tidymodels)

In [18]:
# Load the HTRU2 data set.
# Source: https://archive.ics.uci.edu/ml/datasets/HTRU2
#
# The file is read without a header row (col_names = FALSE) and the nine
# columns are then named, in file order:
#   1. mean_ip        - mean of the integrated profile
#   2. sd_ip          - standard deviation of the integrated profile
#   3. kurtosis_ip    - excess kurtosis of the integrated profile
#   4. skew_ip        - skewness of the integrated profile
#   5. mean_dmsnr     - mean of the DM-SNR curve
#   6. sd_dmsnr       - standard deviation of the DM-SNR curve
#   7. kurtosis_dmsnr - excess kurtosis of the DM-SNR curve
#   8. skew_dmsnr     - skewness of the DM-SNR curve
#   9. class          - class label
htru_data <- read_csv(
  'https://raw.githubusercontent.com/iaujla01/DSCI100Project/main/data/HTRU_2.csv',
  col_names = FALSE
) %>%
  setNames(c(
    'mean_ip', 'sd_ip', 'kurtosis_ip', 'skew_ip',
    'mean_dmsnr', 'sd_dmsnr', 'kurtosis_dmsnr', 'skew_dmsnr',
    'class'
  )) %>%
  # class is a dummy variable (1 = positive, 0 = negative), so it is stored
  # as a factor rather than a number.
  mutate(class = as.factor(class))

head(htru_data)

Parsed with column specification:
cols(
  X1 = col_double(),
  X2 = col_double(),
  X3 = col_double(),
  X4 = col_double(),
  X5 = col_double(),
  X6 = col_double(),
  X7 = col_double(),
  X8 = col_double(),
  X9 = col_double()
)



mean_ip,sd_ip,kurtosis_ip,skew_ip,mean_dmsnr,sd_dmsnr,kurtosis_dmsnr,skew_dmsnr,class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
140.5625,55.68378,-0.23457141,-0.6996484,3.199833,19.11043,7.975532,74.24222,0
102.50781,58.88243,0.46531815,-0.5150879,1.677258,14.86015,10.576487,127.39358,0
103.01562,39.34165,0.32332837,1.0511644,3.121237,21.74467,7.735822,63.17191,0
136.75,57.17845,-0.06841464,-0.6362384,3.642977,20.95928,6.896499,53.59366,0
88.72656,40.67223,0.60086608,1.1234917,1.17893,11.46872,14.269573,252.56731,0
93.57031,46.69811,0.53190485,0.4167211,1.636288,14.54507,10.621748,131.394,0


In [19]:
# 17,898 total examples; 1,639 positive examples; 16,259 negative examples
# Split the data set into 75% training and 25% testing, stratified on class
# so that both partitions keep roughly the same positive/negative balance.

# initial_split() assigns rows at random, so seed the RNG first; otherwise
# the train/test partition (and every result derived from it) changes on
# each run of the notebook.
set.seed(3456)

htru_split <- initial_split(htru_data, prop = 0.75, strata = class)
htru_train <- training(htru_split)
htru_test <- testing(htru_split)

# Quick structural check of both partitions (row counts and column types).
glimpse(htru_train)
glimpse(htru_test)

Rows: 13,424
Columns: 9
$ mean_ip        <dbl> 140.56250, 102.50781, 103.01562, 136.75000, 130.38281,…
$ sd_ip          <dbl> 55.68378, 58.88243, 39.34165, 57.17845, 39.84406, 39.4…
$ kurtosis_ip    <dbl> -0.23457141, 0.46531815, 0.32332837, -0.06841464, -0.1…
$ skew_ip        <dbl> -0.69964840, -0.51508791, 1.05116443, -0.63623837, 0.3…
$ mean_dmsnr     <dbl> 3.1998328, 1.6772575, 3.1212375, 3.6429766, 1.2207358,…
$ sd_dmsnr       <dbl> 19.110426, 14.860146, 21.744669, 20.959280, 14.378941,…
$ kurtosis_dmsnr <dbl> 7.975532, 10.576487, 7.735822, 6.896499, 13.539456, 7.…
$ skew_dmsnr     <dbl> 74.24222, 127.39358, 63.17191, 53.59366, 198.23646, 57…
$ class          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
Rows: 4,474
Columns: 9
$ mean_ip        <dbl> 88.72656, 93.57031, 119.48438

In [20]:
# Notebook plot size used by the figures drawn in later cells.
options(repr.plot.width = 5, repr.plot.height = 10)
set.seed(3456)

# Preprocessing: scale, then center, every predictor of `class`, with the
# statistics learned from the training partition.
htru_recipe <- recipe(class ~ ., data = htru_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# K-nearest-neighbours classifier with unweighted ("rectangular") votes.
# `neighbors` is left as tune() so it can be chosen by cross-validation.
htru_tune <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) %>%
  set_engine("kknn") %>%
  set_mode("classification")

In [None]:
set.seed(3456)

# 5-fold cross-validation on the training set, stratified by class.
# More folds were tried, but they made no statistically significant
# difference to the results.
htru_vfold <- vfold_cv(htru_train, v = 5, strata = class)

# Candidate values of k: 1 through 25.
gridvals <- tibble(neighbors = seq_len(25))

# Fit the recipe + model for every candidate k on every fold and collect
# the metrics averaged across the folds.
htru_results <- tune_grid(
  workflow() %>%
    add_model(htru_tune) %>%
    add_recipe(htru_recipe),
  resamples = htru_vfold,
  grid = gridvals
) %>%
  collect_metrics()

htru_results

# Keep only the accuracy rows for plotting.
accuracies <- filter(htru_results, .metric == "accuracy")

In [None]:
set.seed(3456)

# Cross-validated accuracy estimate as a function of the number of
# neighbours, with one axis tick per integer from 0 to 25.
cross_val_plot <- ggplot(
  data = accuracies,
  mapping = aes(x = neighbors, y = mean)
) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(from = 0, to = 25, by = 1)) +
  labs(x = "Neighbors", y = "Accuracy Estimate")

cross_val_plot



In [None]:
set.seed(3456)

# Find the value(s) of k with the best cross-validated accuracy.
# htru_results holds one row per (k, metric) pair, so restrict to the
# accuracy rows first; otherwise max(mean) is taken across all metrics and
# k could be selected on a different metric (e.g. roc_auc) by accident.
htru_max <- htru_results %>%
  filter(.metric == "accuracy") %>%
  filter(mean == max(mean))

htru_max

# Several values of k can tie on mean accuracy; take the first row.
htru_max_number <- htru_max %>%
  slice(1) %>%
  pull(neighbors) %>%
  as.integer()

# Final model specification: same KNN settings as during tuning, with the
# selected number of neighbours fixed.
htru_spec_max <- nearest_neighbor(weight_func = "rectangular", neighbors = htru_max_number) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Refit the preprocessing recipe and the model on the full training set.
htru_fit <- workflow() %>%
  add_recipe(htru_recipe) %>%
  add_model(htru_spec_max) %>%
  fit(data = htru_train)

# Predict the held-out test set and report accuracy against the true class.
htru_summary <- htru_fit %>%
  predict(htru_test) %>%
  bind_cols(htru_test) %>%
  metrics(truth = class, estimate = .pred_class) %>%
  filter(.metric == 'accuracy')

htru_summary

In [None]:
# Run cleanup.R from the working directory.
# NOTE(review): the script's contents are not visible here - confirm what it
# does (and that the file exists) before relying on it.
source('cleanup.R')