In [1]:
library(tidyverse)
library(caret)
library(randomForest)
library(doParallel)

# Session-wide display settings: 5 significant digits, 100-character output
# width. NOTE(review): warn = -1 silences every warning for the whole session.
options(digits = 5, warn = -1, width = 100)

# Record the R build and the versions of the main packages used below.
R.version$version.string
print(
  map_chr(
    c('tidyverse', 'caret', 'randomForest'),
    function(pkg) paste0(pkg, ': ', packageVersion(pkg), ', ')
  ),
  quote = FALSE
)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
√ ggplot2 3.1.1     √ purrr   0.3.2
√ tibble  2.1.1     √ dplyr   0.8.1
√ tidyr   0.8.3     √ stringr 1.4.0
√ readr   1.3.1     √ forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.

Attaching package:

[1] tidyverse: 1.2.1,      caret: 6.0.84,         randomForest: 4.6.14, 


In [2]:
#============================
# Load the raw CSV files

# Both paths are relative to the working directory; column types are left to
# read_csv()'s own guessing.
train <- read_csv(file = 'pml-training.csv')
test  <- read_csv(file = 'pml-testing.csv')

Parsed with column specification:
cols(
  .default = col_double(),
  user_name = [31mcol_character()[39m,
  cvtd_timestamp = [31mcol_character()[39m,
  new_window = [31mcol_character()[39m,
  kurtosis_roll_belt = [31mcol_character()[39m,
  kurtosis_picth_belt = [31mcol_character()[39m,
  kurtosis_yaw_belt = [31mcol_character()[39m,
  skewness_roll_belt = [31mcol_character()[39m,
  skewness_roll_belt.1 = [31mcol_character()[39m,
  skewness_yaw_belt = [31mcol_character()[39m,
  max_yaw_belt = [31mcol_character()[39m,
  min_yaw_belt = [31mcol_character()[39m,
  amplitude_yaw_belt = [31mcol_character()[39m,
  kurtosis_picth_arm = [31mcol_character()[39m,
  kurtosis_yaw_arm = [31mcol_character()[39m,
  skewness_pitch_arm = [31mcol_character()[39m,
  skewness_yaw_arm = [31mcol_character()[39m,
  kurtosis_yaw_dumbbell = [31mcol_character()[39m,
  skewness_yaw_dumbbell = [31mcol_character()[39m,
  kurtosis_roll_forearm = [31mcol_character()[39m,
  kurtosis

In [3]:
#============================
# Pre-process the sample data

# Tag every row with its origin so both sets can be stacked, cleaned with the
# same column filter, and separated again afterwards.
train <- train %>% mutate(flag = 'train')
# The test file has `problem_id` where the training file has `classe`; cast it
# to character and rename it so the two frames line up in bind_rows().
test  <- test %>%
  mutate(flag = 'test') %>%
  mutate(problem_id = as.character(problem_id)) %>%
  rename(classe = problem_id)

# Discard columns 1 and 3:7 of the training layout
# (presumably the row id and timestamp/window bookkeeping -- TODO confirm).
all <- bind_rows(train, test) %>% select(names(train)[-c(1, 3:7)])

# Drop columns that contain NA.
# Only the columns between the first one and the trailing `classe`/`flag` pair
# are tested; their positions are derived from ncol(all) instead of being
# hard-coded, so the step keeps working if the number of columns changes.
drop <- map_lgl(all[, -c(1, ncol(all) - 1, ncol(all))], anyNA)
# Keep-mask layout: first column, surviving tested columns, `classe`, `flag`.
all <- all[, c(TRUE, !drop, TRUE, TRUE)]
print(ncol(all))

# Split back into the two sets; only the training target becomes a factor
# (the test rows carry problem ids in `classe`, not class labels).
train <- all %>% filter(flag == 'train') %>% select(-flag) %>% mutate(classe = factor(classe))
test  <- all %>% filter(flag == 'test')  %>% select(-flag)

[1] 55


In [4]:
#============================
# Hold out part of the training data for validation
set.seed(0)

# One partition built from `classe`: 80% of the rows go to the fitting set
# (train.t), the remaining rows to the validation set (train.v).
index <- createDataPartition(y = train$classe, times = 1, p = 0.8, list = FALSE)
train.t <- train[index, ]
train.v <- train[-index, ]

In [5]:
#============================
# Fit randomForest (10-fold cross-validation on a parallel backend)
#

set.seed(0)

# One worker per detected core. detectCores() returns NA when the count is
# unknown, so fall back to a single worker instead of failing in makeCluster().
cl <- makeCluster(max(1L, detectCores(), na.rm = TRUE)) # for Parallel processing
registerDoParallel(cl)

#-- Fit ----
# The `finally` clause runs whether train() succeeds or fails, so the worker
# processes are always shut down. registerDoSEQ() then resets the foreach
# backend; otherwise later %dopar% / train() calls would try to use the
# stopped cluster and fail.
modFit <- tryCatch(
  train(classe ~ ., method = 'rf', data = train.t,
        trControl = trainControl(method = 'cv', number = 10)),
  finally = {
    stopCluster(cl)
    registerDoSEQ()
  }
)
#-----------


In [8]:
# In sample: predictions on the rows the model was fitted on
cat('=== In Sample ===')
predict(modFit, newdata = train.t) %>%
  confusionMatrix(reference = train.t$classe)

# Out of sample: predictions on the held-out validation rows
cat('=== Out of Sample ===')
predict(modFit, newdata = train.v) %>%
  confusionMatrix(reference = train.v$classe)

=== In Sample ===

Confusion Matrix and Statistics

          Reference
Prediction    A    B    C    D    E
         A 4464    0    0    0    0
         B    0 3038    0    0    0
         C    0    0 2738    0    0
         D    0    0    0 2573    0
         E    0    0    0    0 2886

Overall Statistics
                                
               Accuracy : 1     
                 95% CI : (1, 1)
    No Information Rate : 0.284 
    P-Value [Acc > NIR] : <2e-16
                                
                  Kappa : 1     
                                
 Mcnemar's Test P-Value : NA    

Statistics by Class:

                     Class: A Class: B Class: C Class: D Class: E
Sensitivity             1.000    1.000    1.000    1.000    1.000
Specificity             1.000    1.000    1.000    1.000    1.000
Pos Pred Value          1.000    1.000    1.000    1.000    1.000
Neg Pred Value          1.000    1.000    1.000    1.000    1.000
Prevalence              0.284    0.194    0.174    0.164    0

=== Out of Sample ===

Confusion Matrix and Statistics

          Reference
Prediction    A    B    C    D    E
         A 1114    8    0    0    0
         B    2  749    1    0    0
         C    0    2  682    5    1
         D    0    0    1  637    1
         E    0    0    0    1  719

Overall Statistics
                                        
               Accuracy : 0.994         
                 95% CI : (0.992, 0.996)
    No Information Rate : 0.284         
    P-Value [Acc > NIR] : <2e-16        
                                        
                  Kappa : 0.993         
                                        
 Mcnemar's Test P-Value : NA            

Statistics by Class:

                     Class: A Class: B Class: C Class: D Class: E
Sensitivity             0.998    0.987    0.997    0.991    0.997
Specificity             0.997    0.999    0.998    0.999    1.000
Pos Pred Value          0.993    0.996    0.988    0.997    0.999
Neg Pred Value          0.999    0.997    0.999    0.99

In [7]:
#============================
# Predict the classes of the test set

# Apply the fitted model to the test rows and show the predicted labels.
pred <- modFit %>% predict(newdata = test)
print(pred)

 [1] B A B A A E D B A A B C B A E E A B B B
Levels: A B C D E
