In [1]:
%%html
<style>
.CodeMirror pre, .CodeMirror-dialog, .CodeMirror-dialog .CodeMirror-search-field, .terminal-app .terminal {
    font-family: "Courier New", Courier, monospace;
    font-size: 12pt;
}
</style> 

In [None]:
# In this notebook I run Stephen Marsland's MLP code through and OR function, an AND
# function and an XOR function and show that the homemade multi-layer perceptron
# is able to solve all three functions, whereas the single layer perceptron, being
# purely a linear function, would have failed to solve the XOR function, which is 
# non-linear in nature. By moving to the MLP, we avoid the AI winter and fast forward
# AI research by 50 years...well, in theory at least.
# code from Stephen Marsland, Machine Learning An Algorithmic Perspective, 2nd edition

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set 

<function seaborn.rcmod.set(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None)>

In [3]:
# Code from Stephen Marsland, Machine Learning An Algorithmic Perspective, 2nd edition
# Chapter 4, multi-layer perceptron. The original code is in Python 2.7, I've made some
# minor changes to the iterators to make the code run in python 3

import numpy as np


class Mlp:
    """This class implements a multi-layer perceptron

    Attributes:
        inputs: input vector
        target: target vector
        n_in: number of input features
        n_out: number of output classes

    Methods:
        early_stopping(self, inputs, targets, valid, validation_targets, learning_rate, n_iter=100)
        train(self, inputs, targets, learning_rate, n_iter)
        fwd(self, inputs)
        confusion_matrix(self, inputs, targets)

    """

    def __init__(self, inputs, targets, n_hidden, beta=1, momentum=0.9, output_type='logistic'):
        # Set up the network parameters
        self.n_in = np.shape(inputs)[1]
        self.n_out = np.shape(targets)[1]
        self.n_data = np.shape(inputs)[0]
        self.n_hidden = n_hidden

        self.beta = beta
        self.momentum = momentum
        self.output_type = output_type
        self.outputs = []
        self.hidden = []

        # Initialize the network
        # Input layer (add 1 for the biases)
        self.weights1 = (np.random.rand(self.n_in + 1, self.n_hidden) - 0.5) \
            * 2 / np.sqrt(self.n_in)
        # Hidden layers (add 1 for the biases)
        self.weights2 = (np.random.rand(self.n_hidden + 1, self.n_out) - 0.5) \
            * 2 / np.sqrt(self.n_hidden)

    def early_stopping(self, inputs, targets, valid, validation_targets, learning_rate, n_iter=100):
        valid = np.concatenate((valid, -np.ones((np.shape(valid)[0], 1))), axis=1)

        old_val_error1 = 100002
        old_val_error2 = 100001
        new_val_error = 100000

        count = 0
        while (((old_val_error1 - new_val_error) > 0.001) \
               or ((old_val_error2 - old_val_error1) > 0.001)):
            count += 1
            print("Epoch: ", count)
            self.train(inputs, targets, learning_rate, n_iter)
            old_val_error2 = old_val_error1
            old_val_error1 = new_val_error
            validation_out = self.fwd(valid)
            new_val_error = 0.5 * np.sum((validation_targets - validation_out)**2)

        print("Stopped", new_val_error, old_val_error1, old_val_error2)
        return new_val_error

    def train(self, inputs, targets, learning_rate, n_iter):
        # Add a bias term to each of the input nodes, use 1 as the bias
        inputs = np.concatenate((inputs, -np.ones((self.n_data, 1))), axis=1)
        # change = range(self.n_data)

        updated_weights1 = np.zeros((np.shape(self.weights1)))
        updated_weights2 = np.zeros((np.shape(self.weights2)))

        for n in range(n_iter):
            self.outputs = self.fwd(inputs)
            """print("shape of self.outputs", np.shape(self.outputs))
            print("shape of inputs", np.shape(inputs))
            print("shape of targets", np.shape(targets))"""

            error = 0.5 * np.sum((self.outputs - targets)**2)

            if np.mod(n, 10) == 0:
               print("Iteration: ", n, " Error: ", error)

            # Here, we allow for different types of output neurons
            # namely linear, logistic/sigmoid, and softmax
            if self.output_type == 'linear':
                delta_out = (self.outputs - targets) / self.n_data
            elif self.output_type == 'logistic':
                delta_out = self.beta * (self.outputs - targets) * \
                             self.outputs * (1.0 - self.outputs)
            elif self.output_type == 'softmax':
                delta_out = (self.outputs - targets) * \
                            (self.outputs * (-self.outputs) + self.outputs) / self.n_data
            else:
                print("Error")

            delta_full = self.hidden * self.beta * (1.0 - self.hidden) * \
                (np.dot(delta_out, np.transpose(self.weights2)))

            updated_weights1 = learning_rate * \
                (np.dot(np.transpose(inputs), delta_full[:, :-1])) \
                + self.momentum * updated_weights1
            updated_weights2 = learning_rate * \
                (np.dot(np.transpose(self.hidden), delta_out)) \
                + self.momentum * updated_weights2
            self.weights1 -= updated_weights1
            self.weights2 -= updated_weights2

            # You can add a randomnization step here if you want to

    def fwd(self, inputs):
        """Run the network forward"""

        self.hidden = np.dot(inputs, self.weights1)
        self.hidden = 1.0 / (1.0 + np.exp(-self.beta * self.hidden))
        self.hidden = np.concatenate((self.hidden, -np.ones((np.shape(inputs)[0], 1))), axis=1)

        outputs = np.dot(self.hidden, self.weights2)
        """print("Shape of outputs inside fwd: ", np.shape(outputs))
        print("Shape of hidden: ", np.shape(self.hidden))
        print("Shape of weights1: ", np.shape(self.weights1))
        print("Shape of weights2: ", np.shape(self.weights2))"""
        # Different types of output neurons
        if self.output_type == 'linear':
            return outputs
        elif self.output_type == 'logistic':
            return 1.0 / (1.0 + np.exp(-self.beta * outputs))
        elif self.output_type == 'softmax':
            normalizers = np.sum(np.exp(outputs), axis=1) * np.ones((1, np.shape(outputs)[0]))
            return np.transpose(np.transpose(np.exp(outputs))/normalizers)
        else:
            print("Error")

    def confusion_matrix(self, inputs, targets, verbose=False):
        """Confusion matrix"""
        inputs = np.concatenate((inputs, -np.ones((np.shape(inputs)[0], 1))), axis=1)
        outputs = self.fwd(inputs)

        n_classes = np.shape(targets)[1]
        print("n_classes: ", n_classes)

        if n_classes == 1:
            n_classes == 2
            outputs = np.where(outputs > 0.5, 1, 0)
        else:
            # 1-of-N encoding
            outputs = np.argmax(outputs, 1)
            targets = np.argmax(targets, 1)

        conf_mat = np.zeros((n_classes, n_classes))
        for i in range(n_classes):
            for j in range(n_classes):
                conf_mat[i, j] = np.sum(np.where(outputs == i, 1, 0)
                                        * np.where(targets == j, 1, 0))

        if verbose:
            print("Outputs at the end of the iterations are: ")
            print(outputs[:, -1])
            print("The targets for these were: ")
            print(np.transpose(targets))
            print()

        print("Confusion matrix is: ")
        print(conf_mat)
        print("Percentage correct: ", np.trace(conf_mat)/np.sum(conf_mat) * 100)

        return conf_mat

    def confusion_matrix_alt(self, inputs, targets, verbose=False):
        # Add the inputs that match the bias node
        inputs = np.concatenate((inputs, -np.ones((self.n_data, 1))), axis=1)

        outputs = self.fwd(inputs)
        num_classes = np.shape(targets)[1]

        if num_classes == 1:
            num_classes = 2
            outputs = np.where(outputs > 0, 1, 0)
        else:
            # 1 of N encoding
            outputs = np.argmax(outputs, 1)
            targets = np.argmax(targets, 1)

        if verbose:
            print("Outputs at the end of the iterations are: ")
            print(outputs[:, -1])
            print("The targets for these were: ")
            print(np.transpose(targets))
            print()

        conf_mat = np.zeros((num_classes, num_classes))
        for i in range(num_classes):
            for j in range(num_classes):
                conf_mat[i, j] = np.sum(np.where(outputs == i, 1, 0) * np.where(targets == j, 1, 0))

        print(conf_mat)
        print(np.trace(conf_mat) / np.sum(conf_mat))
        print("----------------------------------------------")


In [4]:
# I've added the Mlp class directly into the code so no need for imports

In [5]:
# OR
a = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# AND
b = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 1]])
# XOR
c = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0]])

In [6]:
# OR function
print("Running the OR function through the multi-layer perceptron")
p = Mlp(a[:, 0:2], a[:, 2:3], 2)
p.train(a[:, 0:2], a[:, 2:3], 0.25, 1001)
p.confusion_matrix(a[:, 0:2], a[:, 2:3], True)

Running the OR function through the multi-layer perceptron
Iteration:  0  Error:  0.6366151198807215
Iteration:  10  Error:  0.4103099364374195
Iteration:  20  Error:  0.41681959017139864
Iteration:  30  Error:  0.3656792804756345
Iteration:  40  Error:  0.35450731533455754
Iteration:  50  Error:  0.32685943505235426
Iteration:  60  Error:  0.2726191380267489
Iteration:  70  Error:  0.1918907651575373
Iteration:  80  Error:  0.10829949052170823
Iteration:  90  Error:  0.05495686310574678
Iteration:  100  Error:  0.030130067041185368
Iteration:  110  Error:  0.019118799928722657
Iteration:  120  Error:  0.01374074093748451
Iteration:  130  Error:  0.010747138196452186
Iteration:  140  Error:  0.008869117714040463
Iteration:  150  Error:  0.007575667174188232
Iteration:  160  Error:  0.0066225462217102534
Iteration:  170  Error:  0.0058857716925809775
Iteration:  180  Error:  0.005296504948264927
Iteration:  190  Error:  0.004813249499257789
Iteration:  200  Error:  0.004409239057208216


array([[1.]])

In [7]:
# AND function
print("Running the AND function through the multi-layer perceptron")
q = Mlp(b[:, 0:2], b[:, 2:3], 2)
q.train(b[:, 0:2], b[:, 2:3], 0.25, 5001)
ans = q.confusion_matrix(b[:, 0:2], b[:, 2:3], True)

:  920  Error:  0.0006637223185746321
Iteration:  930  Error:  0.0006551821345652114
Iteration:  940  Error:  0.0006468502774612323
Iteration:  950  Error:  0.0006387193398984818
Iteration:  960  Error:  0.000630782258421441
Iteration:  970  Error:  0.0006230322938675371
Iteration:  980  Error:  0.0006154630130736367
Iteration:  990  Error:  0.0006080682718021262
Iteration:  1000  Error:  0.000600842198793062
Iteration:  1010  Error:  0.0005937791808569933
Iteration:  1020  Error:  0.0005868738489302732
Iteration:  1030  Error:  0.0005801210650213012
Iteration:  1040  Error:  0.0005735159099822399
Iteration:  1050  Error:  0.0005670536720461073
Iteration:  1060  Error:  0.0005607298360740297
Iteration:  1070  Error:  0.0005545400734621157
Iteration:  1080  Error:  0.0005484802326611939
Iteration:  1090  Error:  0.0005425463302667533
Iteration:  1100  Error:  0.0005367345426394092
Iteration:  1110  Error:  0.0005310411980195933
Iteration:  1120  Error:  0.0005254627691029303
Iteration: 

In [8]:
# XOR function
print("Running the AND function through the multi-layer perceptron")
r = Mlp(c[:, 0:2], c[:, 2:3], 2)
r.train(c[:, 0:2], c[:, 2:3], 0.25, 5001)
ans = r.confusion_matrix(c[:, 0:2], c[:, 2:3], True)

:  0.001989992960165293
Iteration:  900  Error:  0.00195367479962918
Iteration:  910  Error:  0.0019186287403522612
Iteration:  920  Error:  0.0018847897006152685
Iteration:  930  Error:  0.001852096943030975
Iteration:  940  Error:  0.0018204937189857451
Iteration:  950  Error:  0.0017899269473808477
Iteration:  960  Error:  0.0017603469238785462
Iteration:  970  Error:  0.0017317070573297719
Iteration:  980  Error:  0.0017039636304676184
Iteration:  990  Error:  0.0016770755823024848
Iteration:  1000  Error:  0.0016510043099595521
Iteration:  1010  Error:  0.0016257134879640838
Iteration:  1020  Error:  0.0016011689032102532
Iteration:  1030  Error:  0.0015773383040499312
Iteration:  1040  Error:  0.0015541912621135246
Iteration:  1050  Error:  0.0015316990456286062
Iteration:  1060  Error:  0.0015098345031366292
Iteration:  1070  Error:  0.0014885719566267348
Iteration:  1080  Error:  0.0014678871032098263
Iteration:  1090  Error:  0.0014477569245481426
Iteration:  1100  Error:  0.0

In [9]:
# AND function
print("Running the AND function through the multi-layer perceptron")
q = Mlp(b[:, 0:2], b[:, 2:3], 2, output_type='logistic')
q.train(b[:, 0:2], b[:, 2:3], 0.25, 5001)
ans = q.confusion_matrix(b[:, 0:2], b[:, 2:3], True)


Iteration:  930  Error:  0.0006828041060024843
Iteration:  940  Error:  0.000674097114277802
Iteration:  950  Error:  0.0006656008312393615
Iteration:  960  Error:  0.0006573078200813904
Iteration:  970  Error:  0.0006492109868546049
Iteration:  980  Error:  0.0006413035610423755
Iteration:  990  Error:  0.0006335790774375113
Iteration: 1000  Error:  0.0006260313592195497
Iteration:  1010  Error:  0.0006186545021409817
Iteration:  1020  Error:  0.0006114428597387793
Iteration:  1030  Error:  0.0006043910294947826
Iteration:  1040  Error:  0.00059749383987481
Iteration:  1050  Error:  0.0005907463381822685
Iteration:  1060  Error:  0.0005841437791673735
Iteration:  1070  Error:  0.0005776816143377712
Iteration:  1080  Error:  0.0005713554819209322
Iteration:  1090  Error:  0.0005651611974324657
Iteration: 1100  Error:  0.0005590947448082789
Iteration:  1110  Error:  0.0005531522680616687
Iteration:  1120  Error:  0.0005473300634296929
Iteration:  1130  Error:  0.0005416245719756015
Ite

In [10]:
# XOR function
print("Running the AND function through the multi-layer perceptron")
r = Mlp(c[:, 0:2], c[:, 2:3], 2, output_type='logistic')
r.train(c[:, 0:2], c[:, 2:3], 0.25, 5001)
ans = r.confusion_matrix(c[:, 0:2], c[:, 2:3], True)

Error:  0.0016806424206919856
Iteration:  900  Error:  0.0016545147729657107
Iteration:  910  Error:  0.0016291677095859676
Iteration:  920  Error:  0.001604567123897514
Iteration:  930  Error:  0.0015806808563661643
Iteration:  940  Error:  0.0015574785580839048
Iteration:  950  Error:  0.001534931565569491
Iteration:  960  Error:  0.001513012785790594
Iteration:  970  Error:  0.0014916965904488123
Iteration:  980  Error:  0.0014709587186698093
Iteration:  990  Error:  0.001450776187330624
Iteration:  1000  Error:  0.0014311272083349855
Iteration:  1010  Error:  0.0014119911122178065
Iteration:  1020  Error:  0.0013933482775218072
Iteration:  1030  Error:  0.00137518006544494
Iteration:  1040  Error:  0.0013574687593058464
Iteration:  1050  Error:  0.0013401975084189682
Iteration:  1060  Error:  0.0013233502760094515
Iteration:  1070  Error:  0.001306911790833435
Iteration:  1080  Error:  0.0012908675022002168
Iteration:  1090  Error:  0.0012752035381209375
Iteration:  1100  Error:  0