# Diabetes dataset in R

### Load data

In [2]:
# Load the ML library
library(caret)
library(randomForest)

# Read the dataset
dataset <- read.csv("../data/diabetes.csv")

# The outcome has to be a factor for classification. Since R 4.0 read.csv()
# keeps text columns as character, which confusionMatrix() refuses to accept.
dataset$Diabetes <- as.factor(dataset$Diabetes)

# Seed the RNG *before* the random split so the partition is reproducible
set.seed(7)

# Split into 80% training and 20% validation.
# NOTE: despite its name, val_index holds the row numbers of the TRAINING
# rows (the p = 0.80 sample); the validation set is everything else.
val_index <- createDataPartition(dataset$Diabetes, p = 0.80, list = FALSE)
training <- dataset[val_index, ]
validation <- dataset[-val_index, ]

# Train ML models and estimate their accuracy by resampling the training data
# Setup 10-fold cross validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# Reset the randomizer seed right before model training
set.seed(7)

### Train classifiers

In [4]:
# Fit three classifiers that predict Diabetes from all remaining columns.
# The seed is reset before every train() call so that each model is
# cross-validated on the same folds; resamples() compares the models fold by
# fold and therefore expects matching resampling indices.

# Support Vector Machine (radial basis kernel)
set.seed(7)
fit.svm <- train(Diabetes ~ ., data = training, method = "svmRadial",
                 metric = metric, trControl = control)
# Neural Network (nnet); trace = FALSE is forwarded to nnet to mute its
# optimisation log
set.seed(7)
fit.nn <- train(Diabetes ~ ., data = training, method = "nnet",
                metric = metric, trControl = control, trace = FALSE)
# Random Forest
set.seed(7)
fit.rf <- train(Diabetes ~ ., data = training, method = "rf",
                metric = metric, trControl = control)

### Evaluate results

In [5]:
# Compare the fitted models to find the best one.
# Gather them in a named list; the names become the row labels of the summary.
models <- list(
  svm = fit.svm,
  nn = fit.nn,
  rf = fit.rf
)
# Pool the cross-validation results and print the Accuracy / Kappa distribution
results <- resamples(models)
summary(results)


Call:
summary.resamples(object = results)

Models: svm, nn, rf 
Number of resamples: 10 

Accuracy 
         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
svm 0.6393443 0.6612903 0.6749075 0.6943416 0.7246827 0.7868852    0
nn  0.6451613 0.7004892 0.7377049 0.7448440 0.7974617 0.8360656    0
rf  0.6612903 0.7387626 0.7641460 0.7609995 0.8000397 0.8225806    0

Kappa 
          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
svm 0.07575758 0.1852315 0.2162299 0.2590222 0.3604199 0.4873950    0
nn  0.15594059 0.3224361 0.3898749 0.4030895 0.5432294 0.6104725    0
rf  0.25258324 0.4072828 0.4938244 0.4641234 0.5361268 0.6084960    0


### Evaluate best model on validation set

In [6]:
# Use the best model (random forest) and evaluate on the validation dataset
predictions <- predict(fit.rf, validation)

# confusionMatrix() needs both arguments to be factors with identical levels,
# so coerce the reference labels using the levels of the predictions (this is
# a no-op when the column is already such a factor).
reference <- factor(validation$Diabetes, levels = levels(predictions))

# NOTE: caret treats the first factor level as the "positive" class, which is
# NO here, so Sensitivity refers to the NO class. Pass positive = "YES" to
# report the metrics for the diabetic class instead.
confusionMatrix(predictions, reference)

Confusion Matrix and Statistics

          Reference
Prediction NO YES
       NO  88  24
       YES 12  29
                                          
               Accuracy : 0.7647          
                 95% CI : (0.6894, 0.8294)
    No Information Rate : 0.6536          
    P-Value [Acc > NIR] : 0.001988        
                                          
                  Kappa : 0.4512          
 Mcnemar's Test P-Value : 0.066753        
                                          
            Sensitivity : 0.8800          
            Specificity : 0.5472          
         Pos Pred Value : 0.7857          
         Neg Pred Value : 0.7073          
             Prevalence : 0.6536          
         Detection Rate : 0.5752          
   Detection Prevalence : 0.7320          
      Balanced Accuracy : 0.7136          
                                          
       'Positive' Class : NO              
                                          