# ID5059 2022-23: Lecture 2

## Acknowledgments

This notebook is based on:

* the corresponding lecture by [Tom Kelsey](https://www.st-andrews.ac.uk/computer-science/people/twk/) in the 2020-21 delivery of ID5059
* [Predicting wine quality using Random Forests](https://www.r-bloggers.com/2016/02/predicting-wine-quality-using-random-forests/) by Teja Kodali

## Setup

See [Software_Setup.ipynb](Software_Setup.ipynb) for details of options for running Jupyter notebooks.

To run R in a VS Code Jupyter notebook, first install R if necessary.

Then inside an R shell, run:

```R
install.packages('IRkernel')
IRkernel::installspec()
```

Restart VS Code and select R from the kernel list.

## Classification

Download data and read into a table.

In [None]:
# Address of the white wine quality CSV in the UCI repository.
url <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Read the semicolon-separated file (first row holds the column names) into a
# data frame; rows with missing trailing fields are padded instead of rejected.
wine <- read.table(file = url, header = TRUE, sep = ";", fill = TRUE)

# Show the first few rows as a sanity check.
head(wine)

Define some utility functions.

In [None]:
# The three category labels in sorted (alphabetical) order: high, low, normal.
# Code that maps a probability position to a label relies on this order.
quality_categories <- sort(c('low', 'normal', 'high'))

# Map numeric quality scores to category labels (vectorised).
#
# quality: numeric vector of quality scores.
# Returns a character vector of the same length: 'low' for scores below 6,
# 'normal' for scores exactly equal to 6, and 'high' for everything else.
quality_to_category <- function(quality) {
    label_when_not_low <- ifelse(quality == 6, 'normal', 'high')
    ifelse(quality < 6, 'low', label_when_not_low)
}

# Compute classification accuracy from a confusion matrix, formatted as a
# whole-number percentage string such as "69%".
#
# confusion_matrix:     two-dimensional table or matrix of counts, with
#                       predictions on one axis and true labels on the other.
# number_of_test_cases: total number of classified cases (the denominator).
#
# When both axes carry labels, the correct predictions are the cells whose row
# label equals their column label. Matching by label rather than by position
# keeps the result right when a classifier never predicts one of the classes
# (so the table is not square) or when the two axes list labels in different
# orders. Without labels on both axes, the positional diagonal is used.
classification_accuracy <- function(confusion_matrix, number_of_test_cases) {

    row_labels <- rownames(confusion_matrix)
    column_labels <- colnames(confusion_matrix)

    if (!is.null(row_labels) && !is.null(column_labels)) {
        shared_labels <- intersect(row_labels, column_labels)
        matched_cells <- confusion_matrix[shared_labels, shared_labels, drop = FALSE]
        correct_predictions <- sum(diag(matched_cells))
    } else {
        # diag() stops at the shorter dimension, so non-square input is safe.
        correct_predictions <- sum(diag(as.matrix(confusion_matrix)))
    }

    sprintf("%1.f%%", (correct_predictions / number_of_test_cases) * 100)
}

# Return the 1-based position of the largest element of a vector.
#
# vector: non-empty numeric vector.
# Returns a single unnamed index. On ties the first maximum wins. NA entries
# are ignored by which.max().
# Stops with an error when the vector is empty.
#
# which.max() replaces a hand-written scan that started at `2 : length(vector)`;
# for a one-element vector that sequence counts down (2, 1) and the scan failed.
index_of_highest_value <- function(vector) {

    if (length(vector) == 0) {
        stop("index_of_highest_value() requires a non-empty vector", call. = FALSE)
    }

    unname(which.max(vector))
}

# Pick the category label with the largest of three class probabilities.
#
# The arguments are given in the same order as the labels in
# 'quality_categories' (high, low, normal), so the position of the largest
# probability indexes directly into that vector.
highest_probability_label <- function(probability_of_high, probability_of_low, probability_of_normal) {

    probabilities <- c(probability_of_high, probability_of_low, probability_of_normal)
    winning_position <- index_of_highest_value(probabilities)

    quality_categories[winning_position]
}

# Linearly rescale a numeric vector so its minimum maps to 0 and its maximum to 1.
#
# x:     numeric vector.
# na.rm: if TRUE, NA entries are ignored when finding the minimum and maximum
#        (they stay NA in the result). The default FALSE keeps the previous
#        behaviour, where any NA makes the whole result NA.
# Returns a numeric vector of the same length as x.
#
# A vector whose values are all equal has zero range; it is mapped to all
# zeros instead of dividing by zero and producing NaN.
scale_to_zero_one <- function(x, na.rm = FALSE) {
    value_range <- range(x, na.rm = na.rm)
    lowest <- value_range[1]
    span <- value_range[2] - lowest

    # isTRUE() keeps an NA span (NA input with na.rm = FALSE) out of this branch.
    if (isTRUE(span == 0)) {
        return(x - lowest)
    }

    (x - lowest) / span
}

Check the distribution of the quality attribute.

In [None]:
# Count the wines at each quality score and draw the counts as a bar chart.
barplot(table(wine[["quality"]]))

Create three (arbitrary) categories for wine quality, and check the distribution of the new category attribute.

In [None]:
# Derive the category label from each numeric score and store it as a factor
# column (levels in alphabetical order).
wine$quality_category <- as.factor(quality_to_category(wine$quality))

# Preview the data with the new column.
head(wine)

# Plot how many wines fall into each category.
barplot(table(wine$quality_category))

Split 60-40 into training and test data.

In [None]:
# Seed the random number generator so that the same split is drawn on every run.
set.seed(123)

# Draw the row numbers of the training set: 60% of the rows, without replacement.
samp <- sample(x = nrow(wine), size = 0.6 * nrow(wine))

# The sampled rows form the training set; every remaining row goes to the test set.
train <- wine[samp, ]
test <- wine[-samp, ]

### Simple regression

In [None]:
library(gam)

# Rebuild the category column on 'wine' as an ordered factor (low < normal < high).
# NOTE(review): 'train' and 'test' were extracted from 'wine' before this point,
# so this reset does not change them.
wine$quality_category <- factor(
    quality_to_category(wine$quality),
    levels = c('low', 'normal', 'high'),
    ordered = TRUE)

# Fit a regression of the numeric 'quality' score on every other column of the
# training data except 'quality_category'.
simple_regression_model <- gam(quality ~ . - quality_category, data = train)

# Print the model summary; the Anova table near the end shows which attributes
# are significantly associated with quality.
summary(simple_regression_model)

Test on the test data and calculate the confusion matrix, using the category corresponding to predicted quality. The categories are in alphabetical order. There's no third row: because the regression outputs continuous values, none of the quality predictions were exactly equal to 6, and so there are no predicted 'normal' categories.

In [None]:
# Predicted (continuous) quality score for every test wine.
predictions <- predict(simple_regression_model, test)

# quality_to_category() is vectorised, so all scores are converted in one call;
# as.vector() drops any names carried over from the predictions.
predicted_categories <- as.vector(quality_to_category(predictions))

# Cross-tabulate predicted categories (rows) against true categories (columns).
simple_regression_confusion_matrix <- table(predicted_categories, test$quality_category)
simple_regression_confusion_matrix

Calculate the accuracy.

In [None]:
# Share of test wines whose predicted category matches the true category.
classification_accuracy(
    confusion_matrix = simple_regression_confusion_matrix,
    number_of_test_cases = nrow(test))

### Ordinal logistic regression

In [None]:
library(MASS)

# The intended ordering of the categories.
ordinal_levels <- c('low', 'normal', 'high')

# Reset the quality category column to use ordered factors.
wine$quality_category <- factor(quality_to_category(wine$quality), levels = ordinal_levels, ordered = TRUE)

# polr() interprets the response as ordinal using the order of its factor
# levels. 'train' was extracted from 'wine' while 'quality_category' was still a
# plain factor with alphabetical levels (high, low, normal), and resetting the
# column on 'wine' does not change 'train'. Fit on a copy of the training data
# whose response carries the intended order low < normal < high; 'train' itself
# is left untouched for the other models.
ordinal_train <- train
ordinal_train$quality_category <- factor(quality_to_category(ordinal_train$quality), levels = ordinal_levels, ordered = TRUE)

# Train an ordinal logistic regression model predicting 'quality_category' from all other variables excluding 'quality'.
ordinal_logistic_regression_model <- polr(quality_category ~ . - quality, data = ordinal_train)

# Summarise the model.
summary(ordinal_logistic_regression_model)

Test on the test data and calculate the confusion matrix.

In [None]:
# One row of class probabilities per test case; the columns are named after the
# categories, in the level order used when fitting the model.
predictions <- predict(ordinal_logistic_regression_model, test, type = 'probs')

# Record the prediction based on the highest probability. The label is looked up
# from the column names, so no particular column order is assumed.
most_probable_column <- apply(predictions, 1, which.max)
predicted_categories <- factor(colnames(predictions)[most_probable_column], levels = colnames(predictions))

# Give the true labels the same level order as the predictions so that the
# diagonal of the table holds the correct classifications, even if one of the
# categories is never predicted.
ordinal_logistic_regression_confusion_matrix <- table(
	predicted_categories,
	factor(as.character(test$quality_category), levels = colnames(predictions)))
ordinal_logistic_regression_confusion_matrix

Calculate the accuracy.

In [None]:
# Share of test wines whose predicted category matches the true category.
classification_accuracy(
    confusion_matrix = ordinal_logistic_regression_confusion_matrix,
    number_of_test_cases = nrow(test))

### Random forest

In [None]:
library(randomForest)

# Rebuild the category column on 'wine' as a plain (unordered) factor.
# NOTE(review): 'train' and 'test' were extracted from 'wine' earlier and are
# not changed by this reset.
wine$quality_category <- as.factor(quality_to_category(wine$quality))

# Fit a random forest that predicts 'quality_category' from every other column
# of the training data except the numeric 'quality' score.
random_forest_model <- randomForest(quality_category ~ . - quality, data = train)

# Print a summary of the fitted model object.
summary(random_forest_model)

Test on the test data and calculate the confusion matrix.

In [None]:
# predict() returns one 'quality_category' label per test wine.
predictions <- predict(random_forest_model, newdata = test)

# Cross-tabulate predicted labels (rows) against true labels (columns).
random_forest_confusion_matrix <- table(predictions, test$quality_category)
random_forest_confusion_matrix

Calculate the accuracy.

In [None]:
# Share of test wines whose predicted category matches the true category.
classification_accuracy(
    confusion_matrix = random_forest_confusion_matrix,
    number_of_test_cases = nrow(test))

### Gradient boosting

In [None]:
library(caret)
library(Ecdat)
library(gbm)

# Search a small grid of gradient boosting parameters with cross-validation.
# Expect this cell to run for several minutes.

# Rebuild the category column on 'wine' as a plain (unordered) factor.
wine$quality_category <- as.factor(quality_to_category(wine$quality))

# Evaluate every parameter combination with 10-fold cross-validation.
control <- trainControl(method = "CV", number = 10)

# Candidate values: 2 tree counts x 2 interaction depths x 3 shrinkage rates
# x 3 minimum node sizes = 36 combinations.
grid <- expand.grid(
    .n.trees = seq(200, 500, by = 200),
    .interaction.depth = seq(1, 3, by = 2),
    .shrinkage = seq(.01, .09, by = .04),
    .n.minobsinnode = seq(1, 5, by = 2))

gbm_exploration <- train(
    quality_category ~ . - quality,
    data = train,
    method = 'gbm',
    trControl = control,
    tuneGrid = grid)
gbm_exploration

# The recommended parameter values are reported at the end of the printed output.

Plug in the parameter values output by the previous cell.

In [None]:
# Fit a generalized boosted model that predicts 'quality_category' from every
# other column of the training data except the numeric 'quality' score, using
# the parameter values chosen from the exploration output.
generalized_boosted_model <- gbm(
    quality_category ~ . - quality,
    data = train,
    n.trees = 400,
    interaction.depth = 3,
    shrinkage = .09,
    n.minobsinnode = 3)

# Print a summary of the fitted model.
summary(generalized_boosted_model)

Test on the test data and calculate the confusion matrix.

In [None]:
# The predictions are class probabilities.
predictions <- predict(generalized_boosted_model, newdata = test, type = 'response', n.trees = 400)

# Lay the probabilities out with one row per test wine. The first three columns
# are read as the probabilities of 'high', 'low' and 'normal', in that order.
class_probabilities <- matrix(predictions, nrow = nrow(test))

# For every test wine, record the label with the highest probability.
predicted_categories <- mapply(
	highest_probability_label,
	class_probabilities[, 1],
	class_probabilities[, 2],
	class_probabilities[, 3])

# Cross-tabulate predicted labels (rows) against true labels (columns).
generalized_boosted_model_confusion_matrix <- table(predicted_categories, test$quality_category)
generalized_boosted_model_confusion_matrix

Calculate the accuracy.

In [None]:
# Share of test wines whose predicted category matches the true category.
classification_accuracy(
    confusion_matrix = generalized_boosted_model_confusion_matrix,
    number_of_test_cases = nrow(test))

### Neural net

In [None]:
library(nnet)

# Reset the quality category column to use unordered factors.
wine$quality_category <- as.factor(quality_to_category(wine$quality))

# Work on a copy so that the scaling below leaves 'wine' untouched.
wine1 <- data.frame(wine)

# Scale predictor values to [0,1] range (this helps convergence when learning weights).
wine1[, 1:11] <- data.frame(lapply(wine1[, 1:11], scale_to_zero_one))

# Extract the training and test sets using the existing split indices 'samp',
# so this model is trained and tested on the same rows as the earlier models.
# (A second set of indices used to be drawn here but was never used.)
train1 <- wine1[samp, ]
test1 <- wine1[-samp, ]

# Train a neural net model predicting 'quality_category' from all other variables excluding 'quality'.
# Use a one hidden layer NN with ten hidden nodes.
neural_net_model <- nnet(quality_category ~ . - quality, data = train1,
    size = 10, decay = 1.0e-5, maxit = 1000)

# Summarise the model.
summary(neural_net_model)

Test on the test data and calculate the confusion matrix.

In [None]:
# With type = "class", predict() returns one 'quality_category' label per test wine.
predictions <- predict(neural_net_model, newdata = test1, type = "class")

# Cross-tabulate predicted labels (rows) against true labels (columns).
neural_net_confusion_matrix <- table(predictions, test1$quality_category)
neural_net_confusion_matrix

Calculate the accuracy.

In [None]:
# Share of test wines whose predicted category matches the true category.
classification_accuracy(
    confusion_matrix = neural_net_confusion_matrix,
    number_of_test_cases = nrow(test1))

### Neural net 2

Try a different neural net classifier.

In [None]:
library(RSNNS)

# Reset the quality category column to use unordered factors.
wine$quality_category <- as.factor(quality_to_category(wine$quality))

wine2 <- data.frame(wine)

# splitForTrainingAndTest() takes the test set from the end of the data without
# shuffling, so put the rows in random order first.
wine2 <- wine2[sample(nrow(wine2)), ]

# Predictors are the first 11 columns; targets are the categories encoded as
# one indicator column per class.
wine_values <- wine2[, 1:11]
wine_targets <- decodeClassLabels(wine2$quality_category)

# Hold out 15% of the rows for testing, then normalise the inputs.
wine2 <- splitForTrainingAndTest(wine_values, wine_targets, ratio = 0.15)
wine2 <- normTrainingAndTestSet(wine2)

# Train a neural net model predicting 'quality_category' from all other variables excluding 'quality'.
# Use two hidden layers, with 5 and 7 nodes respectively.
neural_net_model2 <- mlp(wine2$inputsTrain, wine2$targetsTrain,
    size = c(5, 7),
    learnFunc = "Std_Backpropagation", learnFuncParams = c(0.1),
    hiddenActFunc = "Act_Logistic",
    maxit = 1000, inputsTest = wine2$inputsTest, targetsTest = wine2$targetsTest)

Test on the test data and calculate the confusion matrix.

In [None]:
# The predictions are probabilities.
predictions <- predict(neural_net_model2, wine2$inputsTest)

# Use the confusion matrix function from RSNNS. The call is qualified because
# caret, which this notebook also attaches, exports a function with the same
# name; an unqualified call depends on which package was attached last.
neural_net_confusion_matrix2 <- RSNNS::confusionMatrix(wine2$targetsTest, predictions)
neural_net_confusion_matrix2

In [None]:
# Share of test cases whose predicted class matches the target class.
classification_accuracy(
    confusion_matrix = neural_net_confusion_matrix2,
    number_of_test_cases = nrow(wine2$targetsTest))

## Conclusions

Results from one run:

* simple regression: 46%
* ordinal logistic regression: 43%
* random forest: 69%
* gradient boosting: 64%
* neural net 1: 57%
* neural net 2: 55%