In [None]:
# To train a classifier, the label (target) column must be of type factor.
# Example:

In [None]:
# Convert the label column to a factor so it can be used for classification.
# `cancer` is the data table and `Class` is the label column.
cancer <- mutate(cancer, Class = as_factor(Class))

In [None]:
# pull() extracts one column as a plain vector; combined with slice() it
# reads a single cell. Here: the Symmetry value in the first row of cancer.
xa <- cancer |>
  slice(1) |>
  pull(Symmetry)

In [None]:
# dist() computes the distance between rows of a data frame.
# slice(1, 2) keeps the first two rows of the fruit dataset (a range such as
# 1:2 can be used to take a run of rows), and select() restricts the
# distance calculation to the mass and width columns.
fruit_dist_2 <- fruit_data |>
  slice(1, 2) |>
  select(mass, width) |>
  dist()

In [None]:
# add_row() appends a new observation (row) to a data frame.
newData <- add_row(
  small_sample,
  Symmetry = 0.5,
  Radius = 0,
  Class = "unknown"
)

In [None]:
# Creating a recipe: predict fruit_name from four predictors, then
# standardize every predictor (scale, then center).
# Predictors in a formula must be joined with `+`. Separated by commas, only
# `mass` is part of the formula and the remaining names are passed to
# recipe() as extra positional arguments instead of being used as predictors.
fruit_data_recipe <- recipe(
  fruit_name ~ mass + width + height + color_score,
  data = fruit_data
) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

In [None]:
# Model specification for K-nearest neighbors classification.
# `n` is the number of neighbors to use; the "kknn" engine does the fitting.
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = n) |>
  set_engine("kknn") |>
  set_mode("classification")

In [None]:
# Fitting the model specification to data using fit().
# Template: replace Class, Predictor1, Predictor2 and dataset with the real
# label column, predictor columns and data frame; add further predictors to
# the formula with `+`.
knn_fit <- knn_spec %>%
  fit(Class ~ Predictor1 + Predictor2, data = dataset)

In [None]:
# Predicting the class of an unknown observation.
# Template: replace value1 / value2 with the new observation's predictor
# values. The column names must match the predictors used when fitting.
new_obs <- tibble(Predictor1 = value1, Predictor2 = value2)
class_prediction <- predict(knn_fit, new_obs)

In [None]:
# To drop unwanted columns (variables) from the data, do it inside the
# recipe with step_rm(). Replace the placeholder string with the column(s)
# to remove.
knn_recipe <- recipe(Class ~ ., data = data) |>
  step_rm("what u want to remove")

In [None]:
# Inspect what a recipe does to the data: prep() prepares the recipe's
# steps and bake() applies them to `cancer`.
preprocessed_data <- bake(prep(knn_recipe), cancer)
preprocessed_data

In [None]:
# Bundle the recipe and the model specification into a single workflow.
knn_workflow <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec)

In [None]:
# Fitting the workflow: applies the recipe and trains the model on `data`.
knn_fit_all <- knn_workflow %>%
  fit(data = data)

In [None]:
# Assign column names while reading the file by passing them as col_names.
# NOTE(review): read_table2() is deprecated in readr >= 2.0 in favour of
# read_table(); switch once the installed readr version is confirmed.
names <- c(
  "area", "perimeter", "compactness", "length", "width",
  "asymmetry_coefficient", "groove_length", "variety"
)
seed_data <- read_table2(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt",
  col_names = names
)

In [None]:
# Scale and center the data for display as a table, keeping the existing
# unscaled columns.
# scale() returns a one-column matrix; as.numeric() turns it into a plain
# vector so each new column is an ordinary numeric column rather than a
# matrix column.
fruit_data_scaled <- fruit_data %>%
  mutate(
    scaled_mass = as.numeric(scale(mass, center = TRUE)),
    scaled_width = as.numeric(scale(width, center = TRUE)),
    scaled_height = as.numeric(scale(height, center = TRUE)),
    scaled_color_score = as.numeric(scale(color_score, center = TRUE))
  )

In [None]:
# Predict labels for the test set, then use bind_cols() to place the
# prediction column alongside the original test data.
cancer_test_predictions <- bind_cols(
  predict(knn_fit, cancer_test),
  cancer_test
)

In [None]:
# Compute accuracy: metrics() compares the true labels with the predicted
# ones; keep only the row for the accuracy metric.
cancer_test_predictions %>%
  metrics(truth = Class, estimate = .pred_class) %>%
  filter(.metric == "accuracy")

In [None]:
# Confusion matrix: rows are the predicted labels, columns the true labels.
confusion <- conf_mat(
  cancer_test_predictions,
  truth = Class,
  estimate = .pred_class
)
# Example output:
##           Truth
## Prediction  M  B
##          M 39  6
##          B 14 84
# The diagonal holds the correct predictions: 39 observations were correctly
# predicted as malignant and 84 were correctly predicted as benign.

In [None]:
# Cross-validation: set v to the number of folds you want (5 here), with the
# folds stratified by fruit_name.
fruit_vfold <- vfold_cv(fruit_train, v = 5, strata = fruit_name)
# Build the workflow as before, but train with fit_resamples() instead of
# fit(): it runs cross-validation on each train/validation split in
# fruit_vfold.
fruit_resample_fit <- workflow() |>
  add_recipe(fruit_recipe) |>
  add_model(knn_spec) |>
  fit_resamples(resamples = fruit_vfold)

In [None]:
# To see how accurate the classifier was across the validation folds,
# collect_metrics() aggregates the mean and standard error of each metric.
fruit_metrics <- fruit_resample_fit |>
  collect_metrics()

In [None]:
# Selecting the parameter value: choosing K (the number of neighbors)

In [None]:
# Step 1
# Create a nearest-neighbors specification with neighbors = tune(), which
# marks the number of neighbors as a parameter to be tuned.
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

In [None]:
# Step 2
# Create a workflow that combines the recipe with the tunable model spec.
# Instead of fit() or fit_resamples(), use tune_grid(): `resamples` takes
# the cross-validation folds created earlier, and `grid` is the number of
# k values it should try.
knn_results <- workflow() |>
  add_recipe(fruit_recipe) |>
  add_model(knn_tune) |>
  tune_grid(resamples = fruit_vfold, grid = 10) |>
  collect_metrics()

In [None]:
# nrow() returns the number of rows in a data frame.
number_of_rows <- training_data |>
  nrow()

In [None]:
# bind_cols() combines data sets side by side (column-wise); here the labels
# are attached to the testing data.
testing_set <- testing_data |>
  bind_cols(testing_labels)

In [None]:
# Example: find the best k by plotting cross-validation accuracy against k.
knn_recipe <- recipe(y ~ ., data = training_set)
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")
knn_vfold <- vfold_cv(training_set, v = 5, strata = y)

# Tune over 10 candidate values of k and keep only the accuracy rows.
knn_results <- workflow() %>%
  add_recipe(knn_recipe) %>%
  add_model(knn_tune) %>%
  tune_grid(resamples = knn_vfold, grid = 10) %>%
  collect_metrics() %>%
  filter(.metric == "accuracy")

# `neighbors` is on the x-axis and the mean accuracy on the y-axis, so the
# axis labels must follow that order.
cross_val_plot <- knn_results %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Number of Neighbors", y = "Accuracy") +
  scale_x_continuous(breaks = seq(0, 14, by = 1)) + # adjusting the x-axis
  scale_y_continuous(limits = c(0.4, 1.0)) # adjusting the y-axis
cross_val_plot

In [None]:
# Collecting results from prediction on the testing set.

# 1. Predictions, with the original testing columns attached.
mnist_predictions <- bind_cols(predict(mnist_fit, testing_set), testing_set)
mnist_predictions

# 2. Metrics comparing the true label y with the predicted class
#    (used to read off the model's accuracy).
mnist_metrics <- metrics(mnist_predictions, truth = y, estimate = .pred_class)
mnist_metrics

# 3. Confusion matrix of true vs. predicted labels.
mnist_conf_mat <- conf_mat(mnist_predictions, truth = y, estimate = .pred_class)
mnist_conf_mat

In [None]:
# Manual KNN regression: predict time_hrs at max = 100 by averaging the
# response of the 4 nearest neighbors.
answer3 <- marathon_50 |>
  mutate(diff = abs(100 - max)) |> # distance of each observation from 100
  arrange(diff) |> # closest observations first
  slice(1:4) |> # keep the 4 nearest neighbors
  summarise(predicted = mean(time_hrs)) |>
  pull() # extract the single predicted value

answer3

In [None]:
# KNN regression, choosing k.
# The model spec and recipe are built the same way as for classification,
# except that set_mode() uses "regression".
marathon_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("regression")

marathon_recipe <- recipe(time_hrs ~ max, data = marathon_training) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

In [None]:
# KNN regression, choosing k: 5-fold cross-validation (stratified on the
# response time_hrs) plus a workflow combining the recipe and model spec.
marathon_vfold <- vfold_cv(marathon_training, v = 5, strata = time_hrs)
marathon_workflow <- workflow() |>
  add_recipe(marathon_recipe) |>
  add_model(marathon_spec)

In [None]:
# Run cross-validation for KNN regression, testing 200 values of k
# (neighbors = 1 to 200) supplied as an explicit grid.
gridvals <- tibble(neighbors = seq(from = 1, to = 200))
marathon_results <- workflow() |>
  add_recipe(marathon_recipe) |>
  add_model(marathon_spec) |>
  tune_grid(resamples = marathon_vfold, grid = gridvals) |>
  collect_metrics()

In [None]:
# Two ways to find the row with the minimum RMSE (along with its other
# columns).

# Way 1: keep the rmse rows first, then keep those whose mean equals the
# smallest mean. The two filters stay separate so that min(mean) is taken
# over the rmse rows only.
marathon_min <- marathon_results |>
  filter(.metric == "rmse") |>
  filter(mean == min(mean))

# Way 2: sort the rmse rows by mean (ties broken by std_err) and take the
# first row.
marathon_min <- marathon_results |>
  filter(.metric == "rmse") |>
  arrange(mean, std_err) |>
  slice(1)

In [None]:
# Build a one-column tibble whose `neighbors` column holds 1 through 200.
gridvals <- tibble(neighbors = seq(from = 1, to = 200))

In [None]:
# Assessing our model on the testing data.

# k_min is the number of neighbors with the lowest cross-validation RMSE,
# read from the single row kept in marathon_min.
k_min <- marathon_min %>%
  pull(neighbors)

# New model spec using the best k value.
marathon_best_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = k_min) %>%
  set_engine("kknn") %>%
  set_mode("regression")

# Refit on the full training set with the chosen k.
marathon_best_fit <- workflow() %>%
  add_recipe(marathon_recipe) %>%
  add_model(marathon_best_spec) %>%
  fit(data = marathon_training)

# Predict on the testing data, attach the true values, and use metrics() to
# compute the RMSPE (the rmse row, evaluated on test data).
marathon_summary <- marathon_best_fit %>%
  predict(marathon_testing) %>%
  bind_cols(marathon_testing) %>%
  metrics(truth = time_hrs, estimate = .pred)

In [None]:
# Visualize the KNN regression model: the training observations as points,
# with the model's predicted values (.pred) drawn as a blue line.
marathon_preds <- marathon_best_fit |>
  predict(marathon_training) |>
  bind_cols(marathon_training)
marathon_plot <- marathon_preds |>
  ggplot(aes(x = max, y = time_hrs)) +
  labs(
    x = "Maximum Distance Ran per \n Week During Training (mi)",
    y = "Race Time (hours)",
    title = "Max Distance ran per week vs marathon time"
  ) +
  geom_point() +
  geom_line(
    data = marathon_preds,
    mapping = aes(x = max, y = .pred),
    color = "blue"
  )