# Can we predict math grades using student's characteristics? 

## Reading the data

In [5]:
library(tidyverse)
library(tidymodels)
options(repr.matrix.max.rows = 6)
student <- read_csv2("https://raw.githubusercontent.com/kx-chen/dsci-contract/master/student-mat.csv")
student

Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.

Parsed with column specification:
cols(
  .default = col_character(),
  age = [32mcol_double()[39m,
  Medu = [32mcol_double()[39m,
  Fedu = [32mcol_double()[39m,
  traveltime = [32mcol_double()[39m,
  studytime = [32mcol_double()[39m,
  failures = [32mcol_double()[39m,
  famrel = [32mcol_double()[39m,
  freetime = [32mcol_double()[39m,
  goout = [32mcol_double()[39m,
  Dalc = [32mcol_double()[39m,
  Walc = [32mcol_double()[39m,
  health = [32mcol_double()[39m,
  absences = [32mcol_double()[39m,
  G1 = [32mcol_double()[39m,
  G2 = [32mcol_double()[39m,
  G3 = [32mcol_double()[39m
)

See spec(...) for full column specifications.



school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,⋯,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GP,F,18,U,GT3,A,4,4,at_home,teacher,⋯,4,3,4,1,1,3,6,5,6,6
GP,F,17,U,GT3,T,1,1,at_home,other,⋯,5,3,3,1,1,3,4,5,5,6
GP,F,15,U,LE3,T,1,1,at_home,other,⋯,4,3,2,2,3,3,10,7,8,10
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
MS,M,21,R,GT3,T,1,1,other,other,⋯,5,5,3,3,3,3,3,10,8,7
MS,M,18,R,LE3,T,3,2,services,other,⋯,4,4,1,3,4,5,0,11,12,10
MS,M,19,U,LE3,T,1,1,other,at_home,⋯,3,2,3,3,3,5,5,8,9,9


## Cleaning the Data

In [6]:
student_clean <- student %>%
    select(school, sex, age, paid, schoolsup, internet, studytime, failures, health, absences, G1, G2, G3)
student_clean

school,sex,age,paid,schoolsup,internet,studytime,failures,health,absences,G1,G2,G3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GP,F,18,no,yes,no,2,0,3,6,5,6,6
GP,F,17,no,no,yes,2,0,3,4,5,5,6
GP,F,15,yes,yes,yes,2,3,3,10,7,8,10
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
MS,M,21,no,no,no,1,3,3,3,10,8,7
MS,M,18,no,no,yes,1,0,5,0,11,12,10
MS,M,19,no,no,yes,1,0,5,5,8,9,9


## Setting up

In [7]:
student_split <- initial_split(student_clean, props = 0.80, strata = G3)
student_train <- training(student_split)
student_test <- testing(student_split)

student_split
student_train
student_test

<Analysis/Assess/Total>
<298/97/395>

school,sex,age,paid,schoolsup,internet,studytime,failures,health,absences,G1,G2,G3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GP,F,17,no,no,yes,2,0,3,4,5,5,6
GP,F,15,yes,no,yes,3,0,5,2,15,14,15
GP,F,16,yes,no,no,2,0,5,4,6,10,10
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
MS,F,19,no,no,yes,3,1,5,0,7,5,0
MS,F,18,yes,no,yes,2,0,1,0,7,9,8
MS,M,18,no,no,yes,1,0,5,0,11,12,10


school,sex,age,paid,schoolsup,internet,studytime,failures,health,absences,G1,G2,G3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GP,F,18,no,yes,no,2,0,3,6,5,6,6
GP,F,15,yes,yes,yes,2,3,3,10,7,8,10
GP,F,17,no,yes,no,2,0,1,6,6,5,6
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
MS,M,17,no,no,yes,1,0,2,3,14,16,16
MS,M,21,no,no,no,1,3,3,3,10,8,7
MS,M,19,no,no,yes,1,0,5,5,8,9,9


## Setting up recipe and model

In [9]:
student_recipe <- recipe(G3 ~ age + absences + failures, data = student_train) %>%
                    step_scale(all_predictors()) %>% step_center(all_predictors())

student_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>% set_engine("kknn") %>% set_mode("regression")

student_recipe
student_spec

Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor          3

Operations:

Scaling for all_predictors()
Centering for all_predictors()

K-Nearest Neighbor Model Specification (regression)

Main Arguments:
  neighbors = tune()
  weight_func = rectangular

Computational engine: kknn 


## Tuning/Finding ideal K

In [10]:
gridvals <- tibble(neighbors = seq(1, 200))
student_vfold <- vfold_cv(student_train, v = 5, strata = G3)

student_k <- workflow() %>% add_recipe(student_recipe) %>% add_model(student_spec) %>%
             tune_grid(student_vfold, grid = gridvals) %>% collect_metrics() %>% filter(.metric == "rmse") %>% filter(mean == min(mean)) %>% pull(neighbors)


student_k

## Evaluating on the test set

In [11]:
student_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = student_k) %>%
  set_engine("kknn") %>%
  set_mode("regression")

student_fit <- workflow() %>%
  add_recipe(student_recipe) %>%
  add_model(student_spec) %>%
  fit(data = student_train)

predictions <- student_fit %>%
  predict(student_test) %>%
  bind_cols(student_test)

student_metrics <- metrics(predictions, truth = G3, estimate = .pred)
student_metrics


.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,3.7991432
rsq,standard,0.1906105
mae,standard,2.953305


## Final Model