In [9]:
library(tidyverse)
library(repr)
library(tidymodels)

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔[39m [34mbroom    [39m 0.7.0      [32m✔[39m [34mrecipes  [39m 0.1.13
[32m✔[39m [34mdials    [39m 0.0.9      [32m✔[39m [34mrsample  [39m 0.0.7 
[32m✔[39m [34minfer    [39m 0.5.4      [32m✔[39m [34mtune     [39m 0.1.1 
[32m✔[39m [34mmodeldata[39m 0.0.2      [32m✔[39m [34mworkflows[39m 0.2.0 
[32m✔[39m [34mparsnip  [39m 0.1.3      [32m✔[39m [34myardstick[39m 0.0.7 

“package ‘broom’ was built under R version 4.0.2”
“package ‘dials’ was built under R version 4.0.2”
“package ‘infer’ was built under R version 4.0.3”
“package ‘modeldata’ was built under R version 4.0.1”
“package ‘parsnip’ was built under R version 4.0.2”
“package ‘recipes’ was built under R version 4.0.1”
“package ‘tune’ was built under R version 4.0.2”
“package ‘workflows’ was built under R version 4.0.2”
“package ‘yardstick’ was built u

In [11]:
weather_train <- read_csv("data/training_set2.csv")
weather_train

Parsed with column specification:
cols(
  Temp = [32mcol_double()[39m,
  Humid = [32mcol_double()[39m,
  Wind_Spd = [32mcol_double()[39m,
  Vis = [32mcol_double()[39m,
  Press = [32mcol_double()[39m,
  Weather = [31mcol_character()[39m
)



Temp,Humid,Wind_Spd,Vis,Press,Weather
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
13.8,94,4,32.2,102.04,Mainly Clear
13.4,90,3,32.2,101.98,Mainly Clear
18.1,74,9,40.2,101.94,Mainly Clear
19.6,69,11,40.2,101.82,Clear
20.6,73,12,40.2,101.61,Clear
22.5,65,13,40.2,101.36,Clear
21.0,76,8,32.2,101.14,Clear
17.5,91,6,32.2,101.06,Clear
16.9,91,9,32.2,101.08,Clear
14.4,96,7,32.2,101.15,Clear


In [None]:
set.seed(2022)


#check to see if proportions of data are close enough to use or if we should scale up / remove some
#of the other weather classifications

number_rows <- nrow(weather_train)

proportions <- weather_train %>%
    group_by(Weather) %>%
    summarize(n = n())
proportions

#make recipe and model with cross validation to find k

weather_recipe <- recipe(Weather ~ ., data = weather_train) %>%
    step_center(all_predictors()) %>%
    step_scale(all_predictors())

weather_knn <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("classification")

weather_vfold <- vfold_cv(weather_train, v = 5, strata = Weather)

k_grid <- tibble(neighbors = seq(from = 1, to = 100))

weather_fit <- workflow() %>%
    add_recipe(weather_recipe) %>%
    add_model(weather_knn) %>%
    tune_grid(resamples = weather_vfold, grid = k_grid) %>%
    collect_metrics()

#find accuracy for different values of k on our model

weather_accuracy <- weather_fit %>%
    filter(.metric == "accuracy")

#create line plot with data points for accuracy estimate vs neighbors to find optimal k value
#(looking for high accuracy estimate with high values of accuracy for nearby values of k as well)

options(repr.plot.height = 8, repr.plot.width = 12)
finding_k_plot <- ggplot(weather_accuracy, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Number of Neighbors", y = "Accuracy Estimate",
        title = "Accuracy Estimates for Different Numbers of Neighbors") +
    theme(text = element_text(size = 20)) +
    scale_x_continuous(breaks = 1:100)
finding_k_plot

#retrain our model with the newfound value of k and fit it to our training data

weather_model <- nearest_neighbor(weight_func = "rectangular", neighbors = ...) %>%  ## must use value of k determined above
    set_engine("kknn") %>%
    set_mode("classification")

weather_fit_final <- workflow() %>%
    add_recipe(weather_recipe) %>%
    add_model(weather_model) %>%
    fit(data = weather_train)

#we can now use our fitted model to predict the weather for our testing data set

weather_predictions <- predict(weather_fit_final, weather_test) %>%
    bind_cols(weather_test) %>%
    select(.pred_class, Weather) %>%
weather_predictions

#we can use the metrics function to see how accurate our model was at predicting weather values 
#on the testing dataset

weather_metrics <- weather_predictions %>%
    metrics(truth = Weather, estimate = .pred_class) %>%
    filter(.metric == "accuracy")
weather_metrics

`summarise()` ungrouping output (override with `.groups` argument)



Weather,n
<chr>,<int>
Clear,2250
Cloudy,750
Mainly Clear,2010
Moderate Rain,60
Mostly Cloudy,1680
Rain,1350
Rain Showers,90


In [None]:
# Model 1 - Temp
temp_recipe <- recipe(Weather ~ Temp, data = weather_train) %>%
    step_center(all_predictors()) %>%
    step_scale(all_predictors())

temp_fit <- workflow() %>%
    add_recipe(temp_recipe) %>%
    add_model(weather_knn) %>%
    tune_grid(resamples = weather_vfold, grid = k_grid) %>%
    collect_metrics()

temp_accuracy <- temp_fit %>%
    filter(.metric == "accuracy")

options(repr.plot.height = 8, repr.plot.width = 12)
finding_k_plot <- ggplot(temp_accuracy, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Number of Neighbors", y = "Accuracy Estimate",
        title = "Accuracy Estimates for Different Numbers of Neighbors") +
    theme(text = element_text(size = 20)) +
    scale_x_continuous(breaks = 1:100)

finding_k_plot