# Machine Learning in R

In [None]:
library(caret)

In [None]:
# Path of the raw iris data, resolved relative to the working directory.
filename <- file.path("iris.csv")

# load the CSV file from the local directory

In [None]:
# Read the headerless CSV into a data frame. Since R 4.0 read.csv() keeps
# text columns as character by default, so factors are requested explicitly:
# the class column has to be a factor for the later levels(), plot() and
# confusionMatrix() calls to behave as intended.
dataset <- read.csv(filename, header = FALSE, stringsAsFactors = TRUE)

# set the column names in the dataset

In [None]:
# Give the five unnamed columns descriptive names (four measurements + class).
names(dataset) <- c(
  "Sepal.Length",
  "Sepal.Width",
  "Petal.Length",
  "Petal.Width",
  "Species"
)

# create an index of 80% of the rows in the original dataset that we can use for training

In [None]:
# Seed the RNG so the random 80/20 partition is reproducible between runs
# (same seed value the model fits in this script use).
set.seed(7)
# NOTE: despite its name, validation_index holds the row indices of the 80%
# that are KEPT for training; the validation set is every other row.
validation_index <- createDataPartition(
  dataset$Species,
  p = 0.80,
  list = FALSE
)

# select 20% of the data for validation

In [None]:
# Hold out every row that is NOT in the partition index as the validation set.
validation <- dataset[-as.vector(validation_index), , drop = FALSE]

# use the remaining 80% of data for training and testing the models

In [None]:
# Overwrite dataset with only the partitioned rows used for model building.
dataset <- dataset[as.vector(validation_index), , drop = FALSE]

# dimensions of dataset

In [None]:
# Number of rows (instances) followed by number of columns (attributes).
c(nrow(dataset), ncol(dataset))

# list types for each attribute

In [None]:
# Report the class of every column, keyed by column name.
sapply(dataset, function(column) class(column))

# take a peek at the first 6 rows of the data

In [None]:
# Show the leading rows; 6 is head()'s default, spelled out for clarity.
head(dataset, n = 6L)

# list the levels for the class

In [None]:
# List the distinct class labels. as.factor() is a no-op when Species is
# already a factor, and prevents levels() from returning NULL when the
# column was read in as plain character.
levels(as.factor(dataset$Species))

# summarize the class distribution

In [None]:
# Share of each class, expressed in percent of all rows.
percentage <- 100 * prop.table(table(dataset[["Species"]]))
# Side-by-side table: absolute count and percentage per class.
cbind(
  freq = table(dataset[["Species"]]),
  percentage = percentage
)

# summarize attribute distributions

In [None]:
# Per-column summary of the training data.
summary(object = dataset)

# split input and output

In [None]:
# Inputs: the four measurement columns.
x <- dataset[, 1:4]
# Output: the class column. as.factor() leaves an existing factor untouched
# and converts a character column, so the plotting calls that receive y
# always get a factor.
y <- as.factor(dataset[, 5])

# boxplot for each attribute on one image

In [None]:
# Draw one boxplot per input attribute, side by side in a single row.
# par() returns the previous settings so the layout can be restored afterwards.
old_par <- par(mfrow = c(1, ncol(x)))
for (i in seq_along(x)) {
  # Title each panel from x itself rather than from the built-in iris data,
  # so the labels always match the columns actually being plotted.
  boxplot(x[[i]], main = names(x)[i])
}
# Put the graphics layout back so later plots are not squeezed into one panel.
par(old_par)

# barplot for class breakdown

In [None]:
# Bar chart of the number of rows per class. Tabulating explicitly works
# whether y is a factor or a character vector, whereas plot(y) only draws
# a bar chart for factors.
barplot(table(y))

# scatterplot matrix

In [None]:
# Pairwise scatterplots of the inputs, grouped by class, with ellipses.
featurePlot(
  x = x,
  y = y,
  plot = "ellipse"
)

# box and whisker plots for each attribute

In [None]:
# Box-and-whisker plot of every input attribute, split by class.
featurePlot(
  x = x,
  y = y,
  plot = "box"
)

# density plots for each attribute by class value

In [None]:
# Let both axes of every panel choose their own range.
scales <- lapply(
  c(x = "free", y = "free"),
  function(rel) list(relation = rel)
)
# Density curve of each attribute, one line per class.
featurePlot(
  x = x,
  y = y,
  plot = "density",
  scales = scales
)

# Run algorithms using 10-fold cross validation

In [None]:
# Models are compared on classification accuracy ...
metric <- "Accuracy"
# ... estimated with 10-fold cross-validation.
control <- trainControl(
  method = "cv",
  number = 10
)

# a) linear algorithms

In [None]:
# Linear Discriminant Analysis.
# Reset the RNG to the seed used before every model fit in this script.
set.seed(seed = 7)
fit.lda <- train(
  Species ~ .,
  data = dataset,
  method = "lda",
  metric = metric,
  trControl = control
)

# b) nonlinear algorithms
# CART

In [None]:
# Classification tree (rpart).
# Reset the RNG to the seed used before every model fit in this script.
set.seed(seed = 7)
fit.cart <- train(
  Species ~ .,
  data = dataset,
  method = "rpart",
  metric = metric,
  trControl = control
)

# kNN

In [None]:
# k-Nearest Neighbours.
# Reset the RNG to the seed used before every model fit in this script.
set.seed(seed = 7)
fit.knn <- train(
  Species ~ .,
  data = dataset,
  method = "knn",
  metric = metric,
  trControl = control
)

# c) advanced algorithms
# SVM

In [None]:
# Support Vector Machine with a radial kernel (caret method "svmRadial").
# Reset the RNG to the seed used before every model fit in this script.
set.seed(seed = 7)
fit.svm <- train(
  Species ~ .,
  data = dataset,
  method = "svmRadial",
  metric = metric,
  trControl = control
)

# Random Forest

In [None]:
# Random Forest.
# Reset the RNG to the seed used before every model fit in this script.
set.seed(seed = 7)
fit.rf <- train(
  Species ~ .,
  data = dataset,
  method = "rf",
  metric = metric,
  trControl = control
)

# summarize accuracy of models

In [None]:
# Gather the resampling results of all five fitted models under short labels.
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
# Tabulate the resampled performance of each model.
summary(object = results)

# compare accuracy of models

In [None]:
# Visual comparison of the collected resampling results.
dotplot(x = results)

# summarize Best Model

In [None]:
# Print the fitted LDA model.
print(x = fit.lda)

# estimate skill of LDA on the validation dataset

In [None]:
# Predict the class of every held-out validation row with the LDA model.
predictions <- predict(fit.lda, validation)
# confusionMatrix() needs both arguments to be factors with the same levels.
# Coerce the true labels onto the levels of the predictions so the call also
# works when Species was read in as character; an existing factor with the
# same levels is left unchanged.
confusionMatrix(
  predictions,
  factor(validation$Species, levels = levels(predictions))
)