In [None]:
# Load packages
library(tidyverse)
library(tidymodels)
library(readxl)
library(repr)

In [None]:
# Fix the random seed so that any later random step is reproducible.
set.seed(1234)

# Download the workbook once; sheet 2 is read as the training set and sheet 3
# as the testing set, so no further splitting is done here.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls"
user_file <- "user.xls"

# mode = "wb" keeps the binary .xls intact on Windows, where the default text
# mode would corrupt the file and make read_excel() fail.
download_status <- download.file(url, destfile = user_file, mode = "wb")

# download.file() signals success with a status code of 0.
if (download_status != 0) {
  stop(
    "Download of ", url, " failed with status ", download_status,
    call. = FALSE
  )
}

# Read one sheet of the workbook, keeping only the two predictors (LPR, PEG)
# and the class label (UNS), with the label converted to a factor.
read_user_sheet <- function(path, sheet) {
  read_excel(path, sheet = sheet) %>%
    select(LPR, PEG, UNS) %>%
    mutate(UNS = as.factor(UNS))
}

# NOTE(review): the factor levels are derived independently per sheet; verify
# that the UNS labels are spelled identically in both sheets before comparing
# predictions against the test labels.
user_train <- read_user_sheet(user_file, sheet = 2)
user_test <- read_user_sheet(user_file, sheet = 3)

# Preview the first rows of each set.
head(user_train)
head(user_test)

In [None]:
# Scatter plot of the training data: PEG against LPR, coloured by class label.
# The repr options only control the rendered figure size in the notebook.
options(repr.plot.width = 12, repr.plot.height = 8)

axis_label_lpr <-
  "Exam performance of user for related objects with goal object (LPR)"
axis_label_peg <- "Exam performance of user for goal objects (PEG)"

user_train_plot <- ggplot(
  user_train,
  aes(x = LPR, y = PEG, colour = UNS)
) +
  geom_point() +
  labs(
    title = "PEG vs. LPR",
    x = axis_label_lpr,
    y = axis_label_peg,
    colour = "User Knowledge Level"
  ) +
  theme(text = element_text(size = 17))

user_train_plot

In [None]:
# Preprocessing recipe: predict UNS from LPR and PEG, with both predictors
# scaled and then centred so neither dominates the distance calculation.
user_recipe <- recipe(UNS ~ LPR + PEG, data = user_train) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

# K-nearest-neighbours classifier with unweighted ("rectangular") votes; the
# number of neighbours is left as tune() so cross-validation can choose it.
knn_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# 5-fold cross-validation on the training set, stratified by the class label.
user_vfold <- vfold_cv(user_train, v = 5, strata = "UNS")

# Candidate values of K: 1, 6, 11, ..., 96.
k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 5))

# Fit the workflow for every candidate K on every fold and collect the
# cross-validated metrics.
user_results <- workflow() %>%
  add_recipe(user_recipe) %>%
  add_model(knn_spec) %>%
  tune_grid(user_vfold, grid = k_vals) %>%
  collect_metrics()
user_results

# Keep only the accuracy rows so the best K can be read off.
# (Previously this filtered `knn_results`, which is never defined.)
accuracies <- user_results %>%
  filter(.metric == "accuracy")
accuracies

# Accuracy estimate against K. The x-axis ticks sit on the K values that were
# actually evaluated; the old breaks (0 to 14) covered only the first few.
accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 12)) +
  scale_x_continuous(breaks = k_vals$neighbors) +
  scale_y_continuous(limits = c(0.4, 1.0))
accuracy_vs_k

In [None]:
# Pick the K with the highest cross-validated accuracy from the tuning
# results; ties are broken in favour of the smallest K so the choice is
# deterministic.
best_k <- user_results %>%
  filter(.metric == "accuracy") %>%
  arrange(desc(mean), neighbors) %>%
  slice(1) %>%
  pull(neighbors)
best_k

# Final model specification using the selected K. (The original left
# `neighbors` empty and ended the pipeline with a dangling `%>%`.)
final_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = best_k
) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Refit on the full training set (the original referenced an undefined
# `data_train`).
final_fit <- workflow() %>%
  add_recipe(user_recipe) %>%
  add_model(final_spec) %>%
  fit(data = user_train)

# Predict labels for the testing data and attach them to the test rows.
final_predictions <- predict(final_fit, user_test) %>%
  bind_cols(user_test)
final_predictions

# Summary statistics describing the quality of the predictions.
final_metrics <- final_predictions %>%
  metrics(truth = UNS, estimate = .pred_class)
final_metrics

# Confusion matrix of true versus predicted labels.
final_conf_mat <- final_predictions %>%
  conf_mat(truth = UNS, estimate = .pred_class)
final_conf_mat