In [None]:
# Load packages
library(tidyverse)
library(tidymodels)
library(recipes)
library(readxl)
library(repr)
library(ggplot2)

In [None]:
# Fix the RNG seed so later random steps (e.g. cross-validation fold
# assignment) are reproducible.
set.seed(1234)

# Download the workbook. The training and testing splits are read from
# separate sheets below (sheet 2 and sheet 3).
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls"
# mode = "wb" is required for binary files: without it, Windows downloads the
# .xls in text mode and read_excel() then fails on a corrupted file.
# download.file() only returns a status code, so its result is not kept.
download.file(url, destfile = "user.xls", mode = "wb")

# Keep only the two predictors (LPR, PEG) and the class label (UNS), and make
# the label a factor as required for classification.
user_train <- read_excel("user.xls", sheet = 2) %>%
  select(LPR, PEG, UNS) %>%
  mutate(UNS = as.factor(UNS))
user_test <- read_excel("user.xls", sheet = 3) %>%
  select(LPR, PEG, UNS) %>%
  mutate(UNS = as.factor(UNS))

# Rename the training label `very_low` to "Very Low" (presumably to match the
# spelling used in the testing sheet -- confirm against the data).
# Note: recode_factor() moves the recoded level to the front of the level
# order, so training and testing factors may list levels in different orders.
user_train$UNS <- recode_factor(user_train$UNS, very_low = "Very Low")

Table 1. First Portion of Training Data

In [None]:
# Preview the first rows of the training data
user_train %>% head()

Table 2. First Portion of Testing Data

In [None]:
# Preview the first rows of the testing data
user_test %>% head()

Table 3. Summary of Training Data

In [None]:
# Per-column summary of the training data
user_train %>% summary()

Table 4. Summary of Testing Data

In [None]:
# Per-column summary of the testing data
user_test %>% summary()

Figure 1. Training Data Plot

In [None]:
# Scatter plot of the training data: PEG against LPR, coloured by the
# knowledge-level label (UNS).
options(repr.plot.width = 12, repr.plot.height = 8) # plot size for notebook display
user_train_plot <- ggplot(user_train, aes(x = LPR, y = PEG, colour = UNS)) +
  geom_point() +
  labs(
    x = "Exam performance of user for related objects with goal object (LPR)",
    y = "Exam performance of user for goal objects (PEG)",
    colour = "User Knowledge Level"
  ) +
  ggtitle("PEG vs. LPR") +
  theme(text = element_text(size = 17))
user_train_plot

In [None]:
# MODEL 1: predict UNS from BOTH predictors, LPR and PEG
user_recipe <- recipe(UNS ~ LPR + PEG, data = user_train)

# K-nearest-neighbours classifier; the number of neighbours (K) is left as
# tune() so it can be selected by cross-validation.
knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

# 5-fold cross-validation on the training data, stratified by the label
user_vfold <- vfold_cv(user_train, v = 5, strata = "UNS")

# Candidate K values: 1, 6, 11, ..., 96
k_vals <- tibble(neighbors = seq(1, 100, by = 5))

# Evaluate every candidate K on the folds and collect the averaged metrics
user_results <- workflow() %>%
  add_recipe(user_recipe) %>%
  add_model(knn_spec) %>%
  tune_grid(resamples = user_vfold, grid = k_vals) %>%
  collect_metrics()

Table 5. Accuracies of Classifier for Model 1

In [None]:
# Keep only the accuracy rows of the Model 1 tuning metrics
accuracies <- filter(user_results, .metric == "accuracy")
accuracies %>% head()

Figure 2. Accuracy vs K for Model 1

In [None]:
# Plot the cross-validated accuracy estimate against K to find the best K
accuracy_vs_k <- accuracies %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 12))
accuracy_vs_k

In [None]:
# K = 6 was read off the accuracy_vs_k plot as the best-performing value.
# Rebuild the classifier with that fixed K ...
final_spec <- nearest_neighbor(neighbors = 6, weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

# ... and fit it on the full training set.
final_fit <- workflow() %>%
  add_recipe(user_recipe) %>%
  add_model(final_spec) %>%
  fit(data = user_train)

Table 6. Prediction Labels of Model 1 Classifier for Testing Data

In [None]:
# Predict a label for every testing row and place the predictions next to
# the original testing columns
final_predictions <- bind_cols(predict(final_fit, user_test), user_test)
final_predictions %>% head()

Table 7. Statistics of Model 1 Quality

In [None]:
# Give the predicted classes the same levels, in the same order, as the truth
# column. The levels are taken from UNS itself rather than from a hard-coded
# vector, so the estimate and truth factors cannot drift apart if the labels
# in the data change.
final_predictions <- final_predictions %>%
  mutate(.pred_class = factor(.pred_class, levels = levels(UNS)))

# Summary statistics of Model 1 quality on the testing data
final_metrics <- final_predictions %>%
  metrics(truth = UNS, estimate = .pred_class)
final_metrics

Table 8. Confusion Matrix for Model 1 Classifier

In [None]:
# Cross-tabulate predicted against true labels for Model 1
final_conf_mat <- conf_mat(final_predictions, truth = UNS, estimate = .pred_class)
final_conf_mat

Figure 3. Classification Mosaic Diagram of Model 1

In [None]:
# Mosaic plot of the Model 1 confusion matrix
final_conf_mat %>% autoplot(type = "mosaic")

In [None]:
# MODEL 2: predict UNS from LPR ONLY
lpr_recipe <- recipe(UNS ~ LPR, data = user_train)

# K-nearest-neighbours classifier with K left as tune() for cross-validation
lpr_knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Evaluate every candidate K (k_vals) on the existing folds (user_vfold) and
# collect the averaged metrics
lpr_results <- workflow() %>%
  add_recipe(lpr_recipe) %>%
  add_model(lpr_knn_spec) %>%
  tune_grid(resamples = user_vfold, grid = k_vals) %>%
  collect_metrics()

Table 9. Accuracies of Classifier for Model 2

In [None]:
# Keep only the accuracy rows of the Model 2 tuning metrics
lpr_accuracies <- filter(lpr_results, .metric == "accuracy")
lpr_accuracies %>% head()

Figure 4. Accuracy vs K for Model 2

In [None]:
# Plot the cross-validated accuracy estimate against K to find the best K
lpr_accuracy_vs_k <- lpr_accuracies %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 12))
lpr_accuracy_vs_k

In [None]:
# K = 16 was read off the lpr_accuracy_vs_k plot as the best-performing value.
# Rebuild the classifier with that fixed K ...
final_lpr_spec <- nearest_neighbor(neighbors = 16, weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

# ... and fit it on the full training set.
final_lpr_fit <- workflow() %>%
  add_recipe(lpr_recipe) %>%
  add_model(final_lpr_spec) %>%
  fit(data = user_train)

Table 10. Prediction Labels of Model 2 Classifier for Testing Data

In [None]:
# Predict a label for every testing row and place the predictions next to
# the original testing columns
final_lpr_predictions <- bind_cols(predict(final_lpr_fit, user_test), user_test)
final_lpr_predictions %>% head()

Table 11. Statistics of Model 2 Quality

In [None]:
# Give the predicted classes the same levels, in the same order, as the truth
# column. The levels are taken from UNS itself rather than from a hard-coded
# vector, so the estimate and truth factors cannot drift apart if the labels
# in the data change.
final_lpr_predictions <- final_lpr_predictions %>%
  mutate(.pred_class = factor(.pred_class, levels = levels(UNS)))

# Summary statistics of Model 2 quality on the testing data
final_lpr_metrics <- final_lpr_predictions %>%
  metrics(truth = UNS, estimate = .pred_class)
final_lpr_metrics

Table 12. Confusion Matrix for Model 2 Classifier

In [None]:
# Cross-tabulate predicted against true labels for Model 2
final_lpr_conf_mat <- conf_mat(final_lpr_predictions, truth = UNS, estimate = .pred_class)
final_lpr_conf_mat

Figure 5. Classification Mosaic Diagram of Model 2

In [None]:
# Mosaic plot of the Model 2 confusion matrix
final_lpr_conf_mat %>% autoplot(type = "mosaic")

In [None]:
# MODEL 3: predict UNS from PEG ONLY
peg_recipe <- recipe(UNS ~ PEG, data = user_train)

# K-nearest-neighbours classifier with K left as tune() for cross-validation
peg_knn_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Evaluate every candidate K (k_vals) on the existing folds (user_vfold) and
# collect the averaged metrics
peg_results <- workflow() %>%
  add_recipe(peg_recipe) %>%
  add_model(peg_knn_spec) %>%
  tune_grid(resamples = user_vfold, grid = k_vals) %>%
  collect_metrics()

Table 13. Accuracies of Classifier for Model 3

In [None]:
# Keep only the accuracy rows of the Model 3 tuning metrics
peg_accuracies <- filter(peg_results, .metric == "accuracy")
peg_accuracies %>% head()

Figure 6. Accuracy vs K for Model 3

In [None]:
# Plot the cross-validated accuracy estimate against K to find the best K
peg_accuracy_vs_k <- peg_accuracies %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 12))
peg_accuracy_vs_k

In [None]:
# K = 11 was read off the peg_accuracy_vs_k plot as the best-performing value.
# Rebuild the classifier with that fixed K ...
final_peg_spec <- nearest_neighbor(neighbors = 11, weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification")

# ... and fit it on the full training set.
final_peg_fit <- workflow() %>%
  add_recipe(peg_recipe) %>%
  add_model(final_peg_spec) %>%
  fit(data = user_train)

Table 14. Prediction Labels of Model 3 Classifier for Testing Data

In [None]:
# Predict a label for every testing row and place the predictions next to
# the original testing columns
final_peg_predictions <- bind_cols(predict(final_peg_fit, user_test), user_test)
final_peg_predictions %>% head()

Table 15. Statistics of Model 3 Quality

In [None]:
# Give the predicted classes the same levels, in the same order, as the truth
# column. The levels are taken from UNS itself rather than from a hard-coded
# vector, so the estimate and truth factors cannot drift apart if the labels
# in the data change.
final_peg_predictions <- final_peg_predictions %>%
  mutate(.pred_class = factor(.pred_class, levels = levels(UNS)))

# Summary statistics of Model 3 quality on the testing data
final_peg_metrics <- final_peg_predictions %>%
  metrics(truth = UNS, estimate = .pred_class)
final_peg_metrics

Table 16. Confusion Matrix for Model 3 Classifier

In [None]:
# Cross-tabulate predicted against true labels for Model 3
final_peg_conf_mat <- conf_mat(final_peg_predictions, truth = UNS, estimate = .pred_class)
final_peg_conf_mat

Figure 7. Classification Mosaic Diagram of Model 3

In [None]:
# Mosaic plot of the Model 3 confusion matrix
final_peg_conf_mat %>% autoplot(type = "mosaic")