In [None]:
# Import packages
library(tidyverse)
library(tidymodels)

In [None]:
# Load the HTRU2 data set.
# Source: https://archive.ics.uci.edu/ml/datasets/HTRU2
# col_names = FALSE: the first row of the file is read as data, not as a header.
htru_data <- read_csv(
  "https://raw.githubusercontent.com/iaujla01/DSCI100Project/main/data/HTRU_2.csv",
  col_names = FALSE
)

# Name the nine columns, in file order:
#   mean_ip         mean of the integrated profile
#   sd_ip           standard deviation of the integrated profile
#   kurtosis_ip     excess kurtosis of the integrated profile
#   skew_ip         skewness of the integrated profile
#   mean_dmsnr      mean of the DM-SNR curve
#   sd_dmsnr        standard deviation of the DM-SNR curve
#   kurtosis_dmsnr  excess kurtosis of the DM-SNR curve
#   skew_dmsnr      skewness of the DM-SNR curve
#   class           label: positive (1) or negative (0)
names(htru_data) <- c(
  "mean_ip", "sd_ip", "kurtosis_ip", "skew_ip",
  "mean_dmsnr", "sd_dmsnr", "kurtosis_dmsnr", "skew_dmsnr",
  "class"
)

# The label is a 0/1 indicator, so store it as a factor rather than a number.
htru_data <- mutate(htru_data, class = as.factor(class))

head(htru_data)

In [None]:
# 17,898 total examples; 1,639 positive examples; 16,259 negative examples
# Split the dataset into 75% training and 25% testing, stratified by class so
# that both partitions keep a similar positive/negative ratio.

# initial_split() assigns rows at random, so the RNG must be seeded before it
# runs; otherwise every run yields a different train/test partition and
# therefore different tuning and test results.
set.seed(3456)

htru_split <- initial_split(htru_data, prop = 0.75, strata = class)
htru_train <- training(htru_split)
htru_test <- testing(htru_split)

# Quick look at the column types and sizes of both partitions
glimpse(htru_train)
glimpse(htru_test)

In [None]:
# Size of plots rendered in the notebook
options(repr.plot.width = 6, repr.plot.height = 5)

# Seed the RNG so that random steps run after this point are repeatable
set.seed(3456)

# K-nearest-neighbours classifier with unweighted ("rectangular") votes;
# the number of neighbours is left as a placeholder to be tuned.
htru_tune <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Preprocessing: predict class from all other columns, scaling every
# predictor first and then centering it.
htru_recipe <- recipe(class ~ ., data = htru_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

In [None]:

# Stratified 5-fold cross-validation on the training set.
# Author's note: more folds were tried, with no statistically significant
# difference in the results.
htru_vfold <- vfold_cv(htru_train, v = 5, strata = class)

# Candidate values of K: every integer from 1 to 200
gridvals <- tibble(neighbors = seq_len(200))

# Tune K over the grid and collect the per-K resampled metrics
htru_results <- workflow() %>%
  add_recipe(htru_recipe) %>%
  add_model(htru_tune) %>%
  tune_grid(resamples = htru_vfold, grid = gridvals) %>%
  collect_metrics()

# Keep only the accuracy rows
accuracies <- filter(htru_results, .metric == "accuracy")

In [None]:

# Cross-validated accuracy estimate as a function of the number of neighbours.
# The x-axis breaks are derived from the K values actually present in
# `accuracies`, so the tick marks span the whole tuned range instead of
# stopping at a hard-coded upper bound.
cross_val_plot <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  scale_x_continuous(breaks = pretty(accuracies$neighbors, n = 10))

cross_val_plot



In [None]:
# Select the K with the highest cross-validated accuracy.
# htru_results holds rows for every metric returned by collect_metrics(), not
# only accuracy, so restrict to accuracy before looking for the maximum.
# Sorting and taking the first row guarantees exactly one K is chosen; when
# several K values tie on accuracy, the smallest K is preferred.
htru_max <- htru_results %>%
  filter(.metric == "accuracy") %>%
  arrange(desc(mean), neighbors) %>%
  slice(1)

# Best K as a single integer
htru_max_number <- htru_max %>%
  pull(neighbors) %>%
  as.integer()

# Same model as used for tuning, with K fixed at the selected value
htru_spec_max <- nearest_neighbor(weight_func = "rectangular", neighbors = htru_max_number) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Refit the preprocessing recipe and model on the full training set
htru_fit <- workflow() %>%
  add_recipe(htru_recipe) %>%
  add_model(htru_spec_max) %>%
  fit(data = htru_train)

# Predict on the held-out test set and report accuracy
htru_summary <- htru_fit %>%
  predict(htru_test) %>%
  bind_cols(htru_test) %>%
  metrics(truth = class, estimate = .pred_class) %>%
  filter(.metric == 'accuracy')

htru_summary
