# Methods

## Our Method:
1. Import the data set.

2. Clean and wrangle the data set into a tidy data format.
3. Visualize relationships between variables of interest:
    1. Investment Activity vs Label (<= 50k, > 50k annual income)
        1. Bar graph
        2. X-Axis: Label 
        3. Y-Axis: Count of Capital Gains and Capital Losses
    2. Capital Gains vs Age
        1. Scatter plot
        2. X-Axis: Age 
        3. Y-Axis: Capital Gains (USD)
    3. Working Hours per Week vs Age
        1. Scatter plot
        2. X-Axis: Age
        3. Y-Axis: Working Hours per Week
        
4. Summarize the data set and address class imbalance if one label is more prevalent than the other.
5. Tune our classification model (k-nearest neighbours) using predictors of interest.
    1. Our dataset provided training and testing data, so we do not have to split our data set.
    2. Pre-process our training data (standardize and center).
    3. Create a 5-fold cross-validation data split using vfold_cv.
    4. Determine specifications for the nearest neighbour function.
        1. weight_func = "rectangular"
        2. neighbors = tune()
    5. Fit our model for each fold in our cross validation.
        1. tune_grid(resamples=vfold,grid=10)
    6. Create a scatter plot of Accuracy vs k to determine the best k
    
6. Retrain our classification model (k-nearest neighbours) using our tuned k value and predictors of interest.
7. Evaluate the estimated accuracy of our classification model using our test data set.
    1. We may need to go back and try different numbers of cross-validation folds and/or different combinations of predictors.

## Visualizing Our Results:
One way to visualize our results is to place several test points on 2D plots of various combinations of our predictors (e.g., Capital Gains vs Age). We can use our model to classify these test points and outline them on the 2D plot to see whether the classification visually makes sense and to gauge the weights of the variables in our classification model.


In [9]:
library(tidyverse)
library(tidymodels)
library(ggplot2)

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔[39m [34mbroom    [39m 0.7.0      [32m✔[39m [34mrecipes  [39m 0.1.13
[32m✔[39m [34mdials    [39m 0.0.9      [32m✔[39m [34mrsample  [39m 0.0.7 
[32m✔[39m [34minfer    [39m 0.5.4      [32m✔[39m [34mtune     [39m 0.1.1 
[32m✔[39m [34mmodeldata[39m 0.0.2      [32m✔[39m [34mworkflows[39m 0.2.0 
[32m✔[39m [34mparsnip  [39m 0.1.3      [32m✔[39m [34myardstick[39m 0.0.7 

“package ‘broom’ was built under R version 4.0.2”
“package ‘dials’ was built under R version 4.0.2”
“package ‘infer’ was built under R version 4.0.3”
“package ‘modeldata’ was built under R version 4.0.1”
“package ‘parsnip’ was built under R version 4.0.2”
“package ‘recipes’ was built under R version 4.0.1”
“package ‘tune’ was built under R version 4.0.2”
“package ‘workflows’ was built under R version 4.0.2”
“package ‘yardstick’ was built u

In [10]:
## Reads in data
# Load the comma-delimited adult data from the local copy. Column names are
# supplied explicitly, so the first row of the file is treated as data.
# NOTE(review): the parse output shows every column except `age` read as
# character, and the cleaning step matches " ?" with a leading space; the
# fields may carry leading whitespace -- confirm before relying on types.
adult <- read_delim(
  "data/adult.data.txt",
  delim = ",",
  col_names = c(
    "age", "workclass", "fnl_wgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hrs_per_week", "native_country",
    "label"
  )
)

Parsed with column specification:
cols(
  age = [32mcol_double()[39m,
  workclass = [31mcol_character()[39m,
  fnl_wgt = [31mcol_character()[39m,
  education = [31mcol_character()[39m,
  education_num = [31mcol_character()[39m,
  marital_status = [31mcol_character()[39m,
  occupation = [31mcol_character()[39m,
  relationship = [31mcol_character()[39m,
  race = [31mcol_character()[39m,
  sex = [31mcol_character()[39m,
  capital_gain = [31mcol_character()[39m,
  capital_loss = [31mcol_character()[39m,
  hrs_per_week = [31mcol_character()[39m,
  native_country = [31mcol_character()[39m,
  label = [31mcol_character()[39m
)



In [11]:
## Cleaning and Wrangling
# 1. Turn `label` into a factor and the three text columns holding numbers
#    into numeric columns.
# 2. Keep only rows in which no column equals " ?" (presumably the data set's
#    missing-value marker -- confirm against the data description).
# 3. Retain just the columns used for plotting and modelling.
adult_tidy <- adult %>%
  mutate(
    label = as_factor(label),
    capital_gain = as.numeric(capital_gain),
    capital_loss = as.numeric(capital_loss),
    hrs_per_week = as.numeric(hrs_per_week)
  ) %>%
  filter_all(all_vars(. != " ?")) %>%
  select(age, capital_gain, capital_loss, hrs_per_week, label)

# Preview the first rows of the cleaned table.
head(adult_tidy)

age,capital_gain,capital_loss,hrs_per_week,label
<dbl>,<dbl>,<dbl>,<dbl>,<fct>
39,2174,0,40,<=50K
50,0,0,13,<=50K
38,0,0,40,<=50K
53,0,0,40,<=50K
28,0,0,40,<=50K
37,0,0,40,<=50K


In [12]:
# Show only the rows that report a positive capital gain.
filter(adult_tidy, capital_gain > 0)

age,capital_gain,capital_loss,hrs_per_week,label
<dbl>,<dbl>,<dbl>,<dbl>,<fct>
39,2174,0,40,<=50K
31,14084,0,50,>50K
42,5178,0,40,>50K
30,5013,0,40,<=50K
30,2407,0,40,<=50K
44,14344,0,40,>50K
44,15024,0,60,>50K
32,7688,0,40,>50K
28,4064,0,25,<=50K
38,4386,0,35,<=50K


In [5]:
# Reading Adult Data
# Read the comma-delimited adult data directly from the UCI repository URL.
# Column names are supplied explicitly, so the first row is treated as data.
adult_data <- read_delim(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  delim = ",",
  col_names = c(
    "age", "workclass", "fnl_wgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hrs_per_week", "native_country",
    "label"
  )
)

# Preview the first rows of the raw table.
head(adult_data)

Parsed with column specification:
cols(
  age = [32mcol_double()[39m,
  workclass = [31mcol_character()[39m,
  fnl_wgt = [31mcol_character()[39m,
  education = [31mcol_character()[39m,
  education_num = [31mcol_character()[39m,
  marital_status = [31mcol_character()[39m,
  occupation = [31mcol_character()[39m,
  relationship = [31mcol_character()[39m,
  race = [31mcol_character()[39m,
  sex = [31mcol_character()[39m,
  capital_gain = [31mcol_character()[39m,
  capital_loss = [31mcol_character()[39m,
  hrs_per_week = [31mcol_character()[39m,
  native_country = [31mcol_character()[39m,
  label = [31mcol_character()[39m
)



age,workclass,fnl_wgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,label
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [13]:
## Cleaning and Wrangling
# Same cleaning steps as before, applied to `adult_data`: convert `label` to a
# factor and the numeric-looking text columns to numbers, keep only rows in
# which no column equals " ?", then retain just the class label and
# `capital_gain` (the single predictor used by the model below).
adult_tidy <- adult_data %>%
  mutate(
    label = as_factor(label),
    capital_gain = as.numeric(capital_gain),
    capital_loss = as.numeric(capital_loss),
    hrs_per_week = as.numeric(hrs_per_week)
  ) %>%
  filter_all(all_vars(. != " ?")) %>%
  select(label, capital_gain)

# Preview the first rows of the reduced table.
head(adult_tidy)

label,capital_gain
<fct>,<dbl>
<=50K,2174
<=50K,0
<=50K,0
<=50K,0
<=50K,0
<=50K,0


In [22]:
# Fix the random seed for the steps that follow.
set.seed(100)

# K-nearest-neighbours classifier specification: rectangular weighting, with
# the number of neighbours left as a tuning parameter.
knn_spec <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) %>%
  set_engine("kknn") %>%
  set_mode("classification")

In [25]:
# 5-fold cross-validation split of the cleaned data, stratified by `label`.
adult_vfold <- vfold_cv(data = adult_tidy, v = 5, strata = label)

In [32]:
# with upsample
# Recipe predicting `label` from `capital_gain`: scale and centre the
# predictor, upsample `label` with over_ratio = 1, then prep the recipe.
# NOTE(review): `skip = FALSE` means the upsampling step is not skipped when
# the recipe is applied to other data, so assessment/test rows would be
# upsampled too -- confirm this is intended before using it for evaluation.
# NOTE(review): the recipe is already prepped here; workflows normally take an
# unprepped recipe -- confirm how `adult_recipe_2` is used downstream.
adult_recipe_2 <- recipe(label ~ capital_gain, data = adult_tidy) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_upsample(label, over_ratio = 1, skip = FALSE) %>%
  prep()

In [30]:
# Tune the number of neighbours: combine the recipe and the KNN specification
# in a workflow, evaluate 10 candidate values on the cross-validation folds,
# and collect the resulting metrics.
# NOTE(review): this uses `adult_recipe`, which is not defined in the visible
# cells, rather than `adult_recipe_2`; confirm which recipe these "_2" results
# are meant to reflect.
knn_results_2 <- workflow() %>%
  add_recipe(adult_recipe) %>%
  add_model(knn_spec) %>%
  tune_grid(resamples = adult_vfold, grid = 10) %>%
  collect_metrics()

In [31]:
# Display the cross-validation metrics for each candidate number of neighbours.
knn_results_2

neighbors,.metric,.estimator,mean,n,std_err,.config
<int>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
2,accuracy,binary,0.8021684,5,0.0007264466,Model1
2,roc_auc,binary,0.6057121,5,0.0017136674,Model1
4,accuracy,binary,0.8024667,5,0.0005425558,Model2
4,roc_auc,binary,0.6083964,5,0.003960434,Model2
6,accuracy,binary,0.8025661,5,0.0008335775,Model3
6,roc_auc,binary,0.6146082,5,0.0035343964,Model3
8,accuracy,binary,0.8022678,5,0.0009625781,Model4
8,roc_auc,binary,0.6204985,5,0.0018069809,Model4
10,accuracy,binary,0.8022346,5,0.0009858191,Model5
10,roc_auc,binary,0.6201081,5,0.0018265707,Model5
