In [0]:
import os
import math
import random
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg # used to create png image files

In [2]:
# Colab-only: mount Google Drive so the CSV below is reachable from the runtime
# (the mount call asks for an authorization code interactively).
from google.colab import drive

drive.mount('/content/drive')

# The file is semicolon-separated, hence the explicit `sep`.
wine_csv_path = "/content/drive/My Drive/Colab Notebooks/winequality-red.csv"
wine_data = pd.read_csv(wine_csv_path, sep=';')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## **V.5 Adventures in the Nth dimension**

In [0]:
def ft_dot(a, b):
    """Return the sum of a[i] * b[i] over every position i of `a`.

    The length of `a` drives the loop: extra trailing items of `b` are
    ignored, and indexing `b` past its end raises whatever `b[i]` raises
    (IndexError for lists). An empty `a` yields the integer 0.
    """
    total = 0
    for position, left in enumerate(a):
        total += left * b[position]
    return total

In [0]:
class Adaline(object):
  """Single-unit classifier with a sigmoid activation, trained by error-driven updates.

  The model has one weight per feature and no separate bias term: the net
  input is just ft_dot(w, x). Predictions are 1 when the sigmoid output is
  above 0.5 and 0 otherwise.

  Attributes:
    w: weight vector; None until train() initialises it. It starts as a
       list of small random floats and is rebound to `w + lr * update` on
       every update (presumably a NumPy array once X is one - the update
       arithmetic relies on element-wise `*` / `+` of the sample rows).
    lr: learning rate multiplied into every weight update.
    stats: one (epoch, num_misclassified, weights) tuple per trained epoch,
       accumulated across train() calls.
  """

  def __init__(self, lr):
    self.w = None
    self.lr = lr
    self.stats = []

  def _net_input(self, X):
    """Return the weighted sum of one sample's features."""
    return ft_dot(self.w, X)

  def _activation_function(self, X):
    """Return sigmoid(net input) for one sample, without overflowing.

    math.exp is only ever called on a non-positive argument, so a strongly
    negative net input yields ~0.0 instead of raising OverflowError.
    """
    net_input = self._net_input(X)
    if net_input >= 0:
      return 1 / (1 + math.exp(-net_input))
    exp_net = math.exp(net_input)
    return exp_net / (1 + exp_net)

  def predict(self, X):
    """Return the class label (1 or 0) for one sample."""
    return 1 if self._activation_function(X) > 0.5 else 0

  def _eval_epoch(self, X, y):
    """Return how many samples of (X, y) are currently misclassified."""
    num_misclass = 0
    for xi, yi in zip(X, y):
      num_misclass += int(self.predict(xi) != int(yi))
    return num_misclass

  def _train_epoch(self, X, y, epoch, mode, verbose):
    """Run one pass over (X, y), update the weights and record epoch stats.

    Returns the list of continuous errors (target minus sigmoid output),
    one per sample, in iteration order.
    """
    # continuous (not thresholded) error of every sample in this epoch
    epoch_errors = []

    # zip yields one row of X together with the matching element of y
    for xi, yi in zip(X, y):
      yhat = self._activation_function(xi)
      error = yi - yhat
      epoch_errors.append(error)

      # stochastic: update after every sample, no aggregation over the epoch
      if mode == 'online':
        # Rebind instead of `+=`: an in-place add would mutate the very
        # object already stored in self.stats for earlier epochs.
        self.w = self.w + self.lr * error * xi

    if mode == 'batch':
      # single update from the errors aggregated over the whole epoch
      self.w = self.w + self.lr * ft_dot(X, epoch_errors)

    num_misclass = self._eval_epoch(X, y)
    if verbose and epoch % 10 == 0:
      print('Epoch {}: {} errors'.format(epoch, num_misclass))
    # self.w is never mutated in place, so this snapshot stays valid
    self.stats.append((epoch, num_misclass, self.w))
    return epoch_errors

  def train(self, X, y, epochs, mode='batch', verbose=False, seed=None):
    """Train on (X, y) and return the accumulated per-epoch stats.

    Args:
      X: 2-D sample matrix; must expose `.shape[1]` (the feature count).
      y: target labels, one per row of X.
      epochs: number of epochs to run. 0 means "run until an epoch ends
        with no misclassified training sample" - this may never happen if
        the data cannot be separated by the model.
      mode: 'batch' (one update per epoch) or 'online' (one per sample).
      verbose: print the misclassification count every 10th epoch.
      seed: if not None, seeds the `random` module before the weights are
        initialised (0 is a valid seed).

    Raises:
      ValueError: if `mode` is neither 'batch' nor 'online'.
    """
    if mode not in ('batch', 'online'):
      # any other value used to run every epoch without a single update
      raise ValueError("mode must be 'batch' or 'online', got {!r}".format(mode))
    if seed is not None:
      random.seed(seed)
    if self.w is None:
      self.w = [0.001 * random.uniform(-1, 1) for i in range(X.shape[1])]
    epoch = 0
    while True:
      self._train_epoch(X, y, epoch, mode, verbose)
      epoch += 1
      if epochs != 0:
        if epoch == epochs:
          break
      elif self.stats[-1][1] == 0:
        # epochs == 0: stop once the epoch just finished misclassified nothing
        break
    return self.stats

  def test_accuracy(self, X, y):
    """Print and return the fraction of samples in (X, y) predicted correctly.

    Raises ZeroDivisionError when `y` is empty.
    """
    num_correct = 0
    for xi, yi in zip(X, y):
      num_correct += int(int(yi) == self.predict(xi))
    accuracy = num_correct / len(y)
    print('Test accuracy {} {} {}%'.format(num_correct, len(y), 100*accuracy))
    return accuracy

In [0]:
def feature_scaling(series):
  """Mean-normalise `series`: (value - mean) / (max - min).

  A constant series has a zero range; the original 0/0 division turned the
  whole column into NaN. In that case the centred values (all zeros) are
  returned instead.
  """
  centered = series - series.mean()
  value_range = series.max() - series.min()
  # Only guard a scalar range; a non-scalar range (e.g. per-column ranges
  # when a DataFrame is passed) keeps the original element-wise division.
  if getattr(value_range, 'ndim', 0) == 0 and value_range == 0:
    return centered
  return centered / value_range

In [0]:
def k_fold_split(data, k, shuffle=True, seed=None):
  """Split `data` into k (train_data, test_data) pairs for cross-validation.

  The test parts are consecutive, non-overlapping row ranges that together
  cover every row exactly once; the first `len(data) % k` folds get one
  extra row. Each train part is every row outside that fold's test range.

  Args:
    data: DataFrame to split.
    k: number of folds.
    shuffle: shuffle all rows (and reset the index) before splitting.
    seed: optional `random_state` for the shuffle, for reproducible folds.

  Returns:
    List of k (train_data, test_data) tuples.
  """
  if shuffle:
    # frac=1 samples 100% of the rows, i.e. a full shuffle
    data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

  num_rows = data.shape[0]
  folds = []
  # Running offset: fold sizes differ, so `i * size` would make folds overlap
  # and leave the last rows untested.
  start = 0
  for i in range(k):
    size = num_rows // k + (1 if i < num_rows % k else 0)
    stop = start + size
    test_data = data.iloc[start:stop, :]
    # Select the complement by position, not by index label, so the split is
    # also correct when the index is not 0..n-1 (e.g. shuffle=False on a
    # filtered frame).
    train_positions = list(range(start)) + list(range(stop, num_rows))
    train_data = data.iloc[train_positions, :]
    folds.append((train_data, test_data))
    start = stop
  return folds

In [0]:
def k_fold_adaline(folds, features, lr=0.05, epochs=500, mode='batch', verbose=False):
  sum_accuracy = 0

  for i, fold in enumerate(folds):
    X_train = fold[0][features]
    y_train = fold[0]['y']
    ad = Adaline(lr=lr)
    train_stats = ad.train(X_train.values, y_train.values, epochs, mode, verbose)
    X_test = fold[1][features]
    y_test = fold[1]['y']
    accuracy = ad.test_accuracy(X_test.values, y_test.values)
    sum_accuracy += accuracy

  print('K Fold: Mean accuracy {0:.3f}'.format(sum_accuracy / len(folds)))

a) Try training your perceptron/ADALINE with different numbers and types of chemical factors. Under what circumstances can your perceptron/ADALINE successfully train?

In [0]:
good_threshold = 8
bad_threshold = 3

# Keep only the wines at or beyond either quality threshold, label them with
# y (True when quality >= good_threshold), then rescale two input columns.
# Scaling is applied to X inputs only, never to the y target, and only
# 'pH' and 'alcohol' are rescaled here - every other column keeps its raw values.
selected_wine_data = (
    wine_data
    .loc[lambda df: (df['quality'] >= good_threshold) | (df['quality'] <= bad_threshold)]
    .assign(y=lambda df: df['quality'] >= good_threshold)
    .assign(
        pH=lambda df: feature_scaling(df['pH']),
        alcohol=lambda df: feature_scaling(df['alcohol']),
    )
)


In [10]:
num_folds = 5
folds = k_fold_split(selected_wine_data, num_folds)

# (train rows, validation rows) for every fold, as a quick sanity check
fold_sizes = [(len(train), len(val)) for train, val in folds]
print('train, val lengths for {} folds: {}'.format(num_folds, fold_sizes))

train, val lengths for 5 folds: [(22, 6), (22, 6), (22, 6), (23, 5), (23, 5)]


In [11]:
# Experiment: pH + alcohol, with 'volatile acidity' added
features = ['pH', 'alcohol', 'volatile acidity']
lr, epochs, mode = .005, 200, 'batch'
k_fold_adaline(
    folds,
    features,
    lr=lr,
    epochs=epochs,
    mode=mode,
    verbose=False,
)


Test accuracy 2 6 33.33333333333333%
Test accuracy 5 6 83.33333333333334%
Test accuracy 6 6 100.0%
Test accuracy 5 5 100.0%
Test accuracy 4 5 80.0%
K Fold: Mean accuracy 0.793


In [12]:
# Experiment: pH + alcohol + volatile acidity, with 'sulphates' added
features = ['pH', 'alcohol', 'volatile acidity', 'sulphates']
lr, epochs, mode = .005, 200, 'batch'
k_fold_adaline(
    folds,
    features,
    lr=lr,
    epochs=epochs,
    mode=mode,
    verbose=False,
)


Test accuracy 6 6 100.0%
Test accuracy 4 6 66.66666666666666%
Test accuracy 6 6 100.0%
Test accuracy 4 5 80.0%
Test accuracy 5 5 100.0%
K Fold: Mean accuracy 0.893


In [13]:
# Experiment: pH + alcohol + volatile acidity, with 'fixed acidity' added
features = ['pH', 'alcohol', 'volatile acidity', 'fixed acidity']
lr, epochs, mode = .005, 200, 'batch'
k_fold_adaline(
    folds,
    features,
    lr=lr,
    epochs=epochs,
    mode=mode,
    verbose=False,
)


Test accuracy 5 6 83.33333333333334%
Test accuracy 3 6 50.0%
Test accuracy 6 6 100.0%
Test accuracy 4 5 80.0%
Test accuracy 5 5 100.0%
K Fold: Mean accuracy 0.827


In [14]:
# Experiment: ONLY the two acidity columns, 'volatile acidity' and 'fixed acidity'
features = ['volatile acidity', 'fixed acidity']
lr, epochs, mode = .005, 200, 'batch'
k_fold_adaline(
    folds,
    features,
    lr=lr,
    epochs=epochs,
    mode=mode,
    verbose=False,
)



Test accuracy 3 6 50.0%
Test accuracy 2 6 33.33333333333333%
Test accuracy 6 6 100.0%
Test accuracy 4 5 80.0%
Test accuracy 5 5 100.0%
K Fold: Mean accuracy 0.727


When I include the features pH, alcohol, volatile acidity, and sulphates, the mean 5-fold validation accuracy rises to about 89% (0.893), the highest of the feature sets tried above.

b) You know what the decision boundary for 2 wine chemical factors looks like, but what does the decision boundary for 3 factors look like? What about if you use 7 factors? How about if you use all 11 wine chemical factors?

The decision boundary is a 2-dimensional hyperplane (an ordinary plane) when we have 3 factors.

The decision boundary is a 6-dimensional hyperplane when we have 7 factors.

The decision boundary is a 10-dimensional hyperplane when we have 11 factors.

In general, the decision boundary is an (n - 1)-dimensional hyperplane when we have n factors.
