# Assignment 5 – Machine Learning in R

**Author**: Jeff Barbee  
**Email**: jeff.barbee@osumc.edu 
**Date**: [Submission Date]  
**Environment**: See `environment.yml`  
**Description**:  Assignment 5A
This notebook explores basic machine learning in R using the `caret` package, including data preparation, model training, and evaluation.

In [None]:
#Assignment 5B R 

## Install packages

In [None]:
# Install caret only when it is missing, so re-running this cell does not
# download and reinstall the package every time; then attach it.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)


In [None]:
# Load the iris measurements. header = FALSE makes read.csv treat the first
# line of the file as data rather than as column names (names are assigned in
# the next cell). FALSE is spelled out because `F` is an ordinary variable
# that can be reassigned.
dataset <- read.csv("iris.csv", header = FALSE)

In [None]:
# Label the four measurement columns and the class column.
names(dataset) <- c(
  "Sepal.Length",
  "Sepal.Width",
  "Petal.Length",
  "Petal.Width",
  "Species"
)

In [None]:
# Inspect the imported data: column names, structure, and row count per
# species value.
colnames(dataset)
str(dataset)
table(dataset$Species)
# Convert the class label to a factor so it is handled as a categorical
# outcome, then re-check the structure to confirm the new column type.
dataset$Species <- as.factor(dataset$Species)
str(dataset)

# Create Validation Data

In [None]:
# Seed the random number generator so the train/validation split (and every
# result derived from it) is the same on each run. The value 7 matches the
# seed used for model training later in this notebook.
set.seed(7)
# Randomly select 80% of the rows (p = 0.80) based on the Species column; the
# returned indices are used below to subset `dataset`.
validation_index <- createDataPartition(
  dataset$Species,
  p = 0.80,
  list = FALSE
)

In [None]:
# Hold out the rows that were NOT selected by the partition as the validation
# set. This must happen before `dataset` is overwritten on the next line.
validation <- dataset[-validation_index, ]
# Use the remaining 80% of the data for training and testing the models.
dataset <- dataset[validation_index, ]

# Dimensions

In [None]:
# Show the number of rows and columns in the training portion of the data.
dim(dataset)

In [None]:
# List the column names of the training data.
colnames(dataset)

In [None]:
# Report the class of each column.
sapply(dataset, class)

In [None]:
# Preview the first rows of the training data.
head(dataset)

In [None]:
# List the column names again (same output as the earlier colnames() cell).
colnames(dataset)

In [None]:
# List the distinct class labels of the Species factor.
levels(dataset$Species)

In [None]:
# Class distribution of the training data: the share of rows per species
# as a percentage, shown next to the raw counts.
percentage <- 100 * prop.table(table(dataset$Species))
cbind(
  freq = table(dataset$Species),
  percentage = percentage
)

In [None]:
# Per-column summary of the training data.
summary(dataset)

# Visualizations 

In [None]:
# Separate the predictors from the outcome:
# columns 1-4 are the measurements, column 5 is the Species label.
x <- dataset[1:4]
y <- dataset[[5]]


In [None]:
# Boxplot for each attribute on one image.
# The panel titles are taken from the columns of `x` itself (not from the
# built-in `iris` data set), so they always describe the data being plotted.
old_par <- par(mfrow = c(1, ncol(x)))
for (i in seq_along(x)) {
  boxplot(x[, i], main = names(x)[i])
}
# Restore the previous layout so later plots are not drawn into this grid.
par(old_par)


In [None]:
# Install Seurat only when it is missing, so re-running this cell does not
# reinstall it (and its dependencies) every time; then attach it.
# NOTE(review): no Seurat function is called in the visible cells. It was
# presumably installed to pull in a dependency needed by the ellipse plot
# below; confirm, and if so consider installing only that dependency.
if (!requireNamespace("Seurat", quietly = TRUE)) {
  install.packages("Seurat")
}
library(Seurat)

In [None]:
# Bar plot of the class breakdown: `y` is the Species factor, and plotting a
# factor draws one bar per level with its count.
plot(y)


In [None]:
# Scatterplot matrix of the predictors, with an ellipse drawn per species.
featurePlot(
  x = x,
  y = y,
  plot = "ellipse"
)


In [None]:
# Box-and-whisker plot of every predictor, split by species.
featurePlot(
  x = x,
  y = y,
  plot = "box"
)


In [None]:
# Density plot of every predictor, split by species. Each panel gets its own
# x and y axis range ("free") instead of sharing one scale across panels.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
featurePlot(x = x, y = y, plot = "density", scales = scales)


In [None]:
# Resampling setup shared by all models: 10-fold cross-validation, with
# models compared on classification accuracy.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"


# Build Models

In [None]:

# Train five classifiers on the same data with the same resampling setup.
# The seed is reset to the same value before every fit so that each model is
# evaluated on identical cross-validation folds.

# a) Linear algorithm: linear discriminant analysis
set.seed(7)
fit.lda <- train(
  Species ~ .,
  data = dataset,
  method = "lda",
  metric = metric,
  trControl = control
)

# b) Nonlinear algorithms
# Classification and regression tree (CART)
set.seed(7)
fit.cart <- train(
  Species ~ .,
  data = dataset,
  method = "rpart",
  metric = metric,
  trControl = control
)

# k-nearest neighbours
set.seed(7)
fit.knn <- train(
  Species ~ .,
  data = dataset,
  method = "knn",
  metric = metric,
  trControl = control
)

# c) Advanced algorithms
# Support vector machine with a radial kernel
set.seed(7)
fit.svm <- train(
  Species ~ .,
  data = dataset,
  method = "svmRadial",
  metric = metric,
  trControl = control
)

# Random forest
set.seed(7)
fit.rf <- train(
  Species ~ .,
  data = dataset,
  method = "rf",
  metric = metric,
  trControl = control
)


In [None]:
# Collect the cross-validation results of all five models and summarise
# their accuracy side by side.
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
summary(results)


In [None]:
# Compare the accuracy of the models graphically, using the resampling
# results collected in `results`.
dotplot(results)


In [None]:
# Summarize the model chosen as best (LDA): print its training summary.
print(fit.lda)


In [None]:
# Estimate the skill of the LDA model on the held-out validation set:
# predict its species labels, then compare them with the true labels.
predictions <- predict(fit.lda, newdata = validation)
confusionMatrix(
  data = predictions,
  reference = validation$Species
)
