# Diabetes dataset in R

### Load data

In [6]:
# Load the ML library
library(caret)
library(randomForest)

# Read the dataset
dataset <- read.csv("data/diabetes.csv")

# Split into 20% validation and 80% training
val_index <- createDataPartition(dataset$Diabetes, p=0.80, list=FALSE)
validation <- dataset[-val_index,]
training <- dataset[val_index,]

# Train ML models and estimate accuracy on test data
# Setup 10-fold cross validation
control <- trainControl(method="cv", number=10)
metric <- "Accuracy"

# Set randomizer seed
set.seed(7)

### Train classifiers

In [10]:
# Support Vector Machines
fit.svm <- train(Diabetes~., data=training, method="svmRadial", metric=metric, trControl=control)
# k-Nearest Neighbors (kNN)
fit.nn <- train(Diabetes~., data=training, method="nnet", metric=metric, trControl=control, trace=FALSE)
# Random Forest
fit.rf <- train(Diabetes~., data=training, method="rf", metric=metric, trControl=control)

### Evaluate results

In [11]:
# Evaluate the models and check which one is best
# Summarize accuracy of the models
results <- resamples(list(svm=fit.svm, nn=fit.nn, rf=fit.rf))
summary(results)


Call:
summary.resamples(object = results)

Models: svm, nn, rf 
Number of resamples: 10 

Accuracy 
         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
svm 0.6612903 0.6734532 0.6774194 0.6895029 0.7049180 0.7377049    0
nn  0.6065574 0.6316103 0.7071655 0.7056584 0.7812665 0.8196721    0
rf  0.6774194 0.7550899 0.7741935 0.7643046 0.7894632 0.8360656    0

Kappa 
          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
svm 0.17771883 0.2014659 0.2301454 0.2408175 0.2721525 0.3612565    0
nn  0.07575758 0.1546760 0.3098351 0.3288915 0.5011757 0.5960265    0
rf  0.29545455 0.4594563 0.4801791 0.4705629 0.5235120 0.6369048    0


### Evaluate best model on validation set

In [12]:
# Use the best model and evaluate on the validation dataset
predictions <- predict(fit.rf, validation)
confusionMatrix(predictions, validation$Diabetes)

Confusion Matrix and Statistics

          Reference
Prediction NO YES
       NO  84  21
       YES 16  32
                                          
               Accuracy : 0.7582          
                 95% CI : (0.6824, 0.8237)
    No Information Rate : 0.6536          
    P-Value [Acc > NIR] : 0.003479        
                                          
                  Kappa : 0.4538          
 Mcnemar's Test P-Value : 0.510798        
                                          
            Sensitivity : 0.8400          
            Specificity : 0.6038          
         Pos Pred Value : 0.8000          
         Neg Pred Value : 0.6667          
             Prevalence : 0.6536          
         Detection Rate : 0.5490          
   Detection Prevalence : 0.6863          
      Balanced Accuracy : 0.7219          
                                          
       'Positive' Class : NO              
                                          